diff --git a/.github/workflows/run_tests.yml b/.github/workflows/run_tests.yml index 1f0b729ed..089493d3b 100644 --- a/.github/workflows/run_tests.yml +++ b/.github/workflows/run_tests.yml @@ -54,7 +54,7 @@ jobs: strategy: fail-fast: false matrix: - subset: [backends, slow_tests, group_a, group_b] + subset: [backends, slow_tests, group_a, group_b, dragon] os: [macos-12, macos-14, ubuntu-22.04] # Operating systems compiler: [8] # GNU compiler version rai: [1.2.7] # Redis AI versions @@ -112,9 +112,17 @@ jobs: python -m pip install .[dev,mypy,ml] - name: Install ML Runtimes with Smart (with pt, tf, and onnx support) - if: contains( matrix.os, 'ubuntu' ) || contains( matrix.os, 'macos-12') + if: (contains( matrix.os, 'ubuntu' ) || contains( matrix.os, 'macos-12')) && ( matrix.subset != 'dragon' ) run: smart build --device cpu --onnx -v + - name: Install ML Runtimes with Smart (with pt, tf, dragon, and onnx support) + if: (contains( matrix.os, 'ubuntu' ) || contains( matrix.os, 'macos-12')) && ( matrix.subset == 'dragon' ) + run: | + smart build --device cpu --onnx --dragon -v + SP=$(python -c 'import site; print(site.getsitepackages()[0])')/smartsim/_core/config/dragon/.env + LLP=$(cat $SP | grep LD_LIBRARY_PATH | awk '{split($0, array, "="); print array[2]}') + echo "LD_LIBRARY_PATH=$LLP:$LD_LIBRARY_PATH" >> $GITHUB_ENV + - name: Install ML Runtimes with Smart (no ONNX,TF on Apple Silicon) if: contains( matrix.os, 'macos-14' ) run: smart build --device cpu --no_tf -v @@ -142,9 +150,16 @@ jobs: echo "SMARTSIM_LOG_LEVEL=debug" >> $GITHUB_ENV py.test -s --import-mode=importlib -o log_cli=true --cov=$(smart site) --cov-report=xml --cov-config=./tests/test_configs/cov/local_cov.cfg --ignore=tests/full_wlm/ ./tests/backends + # Run pytest (dragon subtests) + - name: Run Dragon Pytest + if: (matrix.subset == 'dragon' && matrix.os == 'ubuntu-22.04') + run: | + echo "SMARTSIM_LOG_LEVEL=debug" >> $GITHUB_ENV + dragon -s py.test -s --import-mode=importlib -o log_cli=true --cov=$(smart site) --cov-report=xml --cov-config=./tests/test_configs/cov/local_cov.cfg --ignore=tests/full_wlm/ -m ${{ matrix.subset }} ./tests + # Run pytest (test subsets) - name: Run Pytest - if: "!contains(matrix.subset, 'backends')" # if not running backend tests + if: (matrix.subset != 'backends' && matrix.subset != 'dragon') # if not running backend tests or dragon tests run: | echo "SMARTSIM_LOG_LEVEL=debug" >> $GITHUB_ENV py.test -s --import-mode=importlib -o log_cli=true --cov=$(smart site) --cov-report=xml --cov-config=./tests/test_configs/cov/local_cov.cfg --ignore=tests/full_wlm/ -m ${{ matrix.subset }} ./tests diff --git a/Makefile b/Makefile index bddbda722..3ab83da89 100644 --- a/Makefile +++ b/Makefile @@ -164,22 +164,22 @@ tutorials-prod: # help: test - Run all tests .PHONY: test test: - @python -m pytest --ignore=tests/full_wlm/ + @python -m pytest --ignore=tests/full_wlm/ --ignore=tests/dragon # help: test-verbose - Run all tests verbosely .PHONY: test-verbose test-verbose: - @python -m pytest -vv --ignore=tests/full_wlm/ + @python -m pytest -vv --ignore=tests/full_wlm/ --ignore=tests/dragon # help: test-debug - Run all tests with debug output .PHONY: test-debug test-debug: - @SMARTSIM_LOG_LEVEL=developer python -m pytest -s -o log_cli=true -vv --ignore=tests/full_wlm/ + @SMARTSIM_LOG_LEVEL=developer python -m pytest -s -o log_cli=true -vv --ignore=tests/full_wlm/ --ignore=tests/dragon # help: test-cov - Run all tests with coverage .PHONY: test-cov test-cov: - @python -m pytest -vv --cov=./smartsim 
--cov-config=${COV_FILE} --ignore=tests/full_wlm/ + @python -m pytest -vv --cov=./smartsim --cov-config=${COV_FILE} --ignore=tests/full_wlm/ --ignore=tests/dragon # help: test-full - Run all WLM tests with Python coverage (full test suite) @@ -192,3 +192,8 @@ test-full: .PHONY: test-wlm test-wlm: @python -m pytest -vv tests/full_wlm/ tests/on_wlm + +# help: test-dragon - Run dragon-specific tests +.PHONY: test-dragon +test-dragon: + @dragon pytest tests/dragon diff --git a/doc/changelog.md b/doc/changelog.md index 740197ce5..ac09ecf60 100644 --- a/doc/changelog.md +++ b/doc/changelog.md @@ -9,6 +9,28 @@ Jump to: ## SmartSim +### MLI branch + +Description + +- Add RequestDispatcher and the possibility of batching inference requests +- Enable hostname selection for dragon tasks +- Remove pydantic dependency from MLI code +- Update MLI environment variables using new naming convention +- Reduce a copy by using torch.from_numpy instead of torch.tensor +- Enable dynamic feature store selection +- Fix dragon package installation bug +- Adjust schemas for better performance +- Add TorchWorker first implementation and mock inference app example +- Add error handling in Worker Manager pipeline +- Add EnvironmentConfigLoader for ML Worker Manager +- Add Model schema with model metadata included +- Removed device from schemas, MessageHandler and tests +- Add ML worker manager, sample worker, and feature store +- Add schemas and MessageHandler class for de/serialization of + inference requests and response messages + + ### Development branch To be released at some future point in time diff --git a/ex/high_throughput_inference/mli_driver.py b/ex/high_throughput_inference/mli_driver.py new file mode 100644 index 000000000..807a70b21 --- /dev/null +++ b/ex/high_throughput_inference/mli_driver.py @@ -0,0 +1,77 @@ +import os +import base64 +import cloudpickle +import sys +from smartsim import Experiment +from smartsim._core.mli.infrastructure.worker.torch_worker import TorchWorker +from smartsim.status import TERMINAL_STATUSES +from smartsim.settings import DragonRunSettings +import time +import typing as t + +DEVICE = "gpu" +NUM_RANKS = 4 +NUM_WORKERS = 1 +filedir = os.path.dirname(__file__) +worker_manager_script_name = os.path.join(filedir, "standalone_workermanager.py") +app_script_name = os.path.join(filedir, "mock_app.py") +model_name = os.path.join(filedir, f"resnet50.{DEVICE}.pt") + +transport: t.Literal["hsta", "tcp"] = "hsta" + +os.environ["SMARTSIM_DRAGON_TRANSPORT"] = transport + +exp_path = os.path.join(filedir, f"MLI_proto_{transport.upper()}") +os.makedirs(exp_path, exist_ok=True) +exp = Experiment("MLI_proto", launcher="dragon", exp_path=exp_path) + +torch_worker_str = base64.b64encode(cloudpickle.dumps(TorchWorker)).decode("ascii") + +worker_manager_rs: DragonRunSettings = exp.create_run_settings( + sys.executable, + [ + worker_manager_script_name, + "--device", + DEVICE, + "--worker_class", + torch_worker_str, + "--batch_size", + str(NUM_RANKS//NUM_WORKERS), + "--batch_timeout", + str(0.00), + "--num_workers", + str(NUM_WORKERS) + ], +) + +aff = [] + +worker_manager_rs.set_cpu_affinity(aff) + +worker_manager = exp.create_model("worker_manager", run_settings=worker_manager_rs) +worker_manager.attach_generator_files(to_copy=[worker_manager_script_name]) + +app_rs: DragonRunSettings = exp.create_run_settings( + sys.executable, + exe_args=[app_script_name, "--device", DEVICE, "--log_max_batchsize", str(6)], +) +app_rs.set_tasks_per_node(NUM_RANKS) + + +app = exp.create_model("app", 
run_settings=app_rs) +app.attach_generator_files(to_copy=[app_script_name], to_symlink=[model_name]) + +exp.generate(worker_manager, app, overwrite=True) +exp.start(worker_manager, app, block=False) + +while True: + if exp.get_status(app)[0] in TERMINAL_STATUSES: + time.sleep(10) + exp.stop(worker_manager) + break + if exp.get_status(worker_manager)[0] in TERMINAL_STATUSES: + time.sleep(10) + exp.stop(app) + break + +print("Exiting.") diff --git a/ex/high_throughput_inference/mock_app.py b/ex/high_throughput_inference/mock_app.py new file mode 100644 index 000000000..517d18fb2 --- /dev/null +++ b/ex/high_throughput_inference/mock_app.py @@ -0,0 +1,189 @@ +# BSD 2-Clause License +# +# Copyright (c) 2021-2024, Hewlett Packard Enterprise +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
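The mock application below attaches to the dragon backbone that standalone_workermanager.py (added later in this diff) publishes through the _SMARTSIM_INFRA_BACKBONE environment variable, polling the backbone dictionary until the worker manager has stored its FLI descriptor under the "to_worker_fli" key. A minimal sketch of both sides of that handshake, assembled only from calls that appear elsewhere in this diff (variable names are illustrative, not part of the patch):

    import os
    import time

    from dragon import fli
    from dragon.channels import Channel
    from dragon.data.ddict.ddict import DDict
    from dragon.globalservices.api_setup import connect_to_infrastructure

    # worker manager side: publish the FLI descriptor in the backbone ddict
    connect_to_infrastructure()
    backbone = DDict.attach(os.environ["_SMARTSIM_INFRA_BACKBONE"])
    to_worker_channel = Channel.make_process_local()
    to_worker_fli = fli.FLInterface(main_ch=to_worker_channel, manager_ch=None)
    backbone["to_worker_fli"] = to_worker_fli.serialize()

    # client side: poll the backbone until the descriptor appears, then attach
    while True:
        try:
            worker_queue = fli.FLInterface.attach(backbone["to_worker_fli"])
            break
        except KeyError:
            time.sleep(1)
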
+ +# isort: off +import dragon +from dragon import fli +from dragon.channels import Channel +import dragon.channels +from dragon.data.ddict.ddict import DDict +from dragon.globalservices.api_setup import connect_to_infrastructure +from dragon.utils import b64decode, b64encode + +# isort: on + +import argparse +import io +import numpy +import os +import time +import torch + +from mpi4py import MPI +from smartsim._core.mli.infrastructure.storage.dragonfeaturestore import ( + DragonFeatureStore, +) +from smartsim._core.mli.message_handler import MessageHandler +from smartsim.log import get_logger +from smartsim._core.utils.timings import PerfTimer + +torch.set_num_interop_threads(16) +torch.set_num_threads(1) + +logger = get_logger("App") +logger.info("Started app") + +CHECK_RESULTS_AND_MAKE_ALL_SLOWER = False + +class ProtoClient: + def __init__(self, timing_on: bool): + comm = MPI.COMM_WORLD + rank = comm.Get_rank() + connect_to_infrastructure() + ddict_str = os.environ["_SMARTSIM_INFRA_BACKBONE"] + self._ddict = DDict.attach(ddict_str) + self._backbone_descriptor = DragonFeatureStore(self._ddict).descriptor + to_worker_fli_str = None + while to_worker_fli_str is None: + try: + to_worker_fli_str = self._ddict["to_worker_fli"] + self._to_worker_fli = fli.FLInterface.attach(to_worker_fli_str) + except KeyError: + time.sleep(1) + self._from_worker_ch = Channel.make_process_local() + self._from_worker_ch_serialized = self._from_worker_ch.serialize() + self._to_worker_ch = Channel.make_process_local() + + self.perf_timer: PerfTimer = PerfTimer(debug=False, timing_on=timing_on, prefix=f"a{rank}_") + + def run_model(self, model: bytes | str, batch: torch.Tensor): + tensors = [batch.numpy()] + self.perf_timer.start_timings("batch_size", batch.shape[0]) + built_tensor_desc = MessageHandler.build_tensor_descriptor( + "c", "float32", list(batch.shape) + ) + self.perf_timer.measure_time("build_tensor_descriptor") + if isinstance(model, str): + model_arg = MessageHandler.build_model_key(model, self._backbone_descriptor) + else: + model_arg = MessageHandler.build_model(model, "resnet-50", "1.0") + request = MessageHandler.build_request( + reply_channel=self._from_worker_ch_serialized, + model=model_arg, + inputs=[built_tensor_desc], + outputs=[], + output_descriptors=[], + custom_attributes=None, + ) + self.perf_timer.measure_time("build_request") + request_bytes = MessageHandler.serialize_request(request) + self.perf_timer.measure_time("serialize_request") + with self._to_worker_fli.sendh(timeout=None, stream_channel=self._to_worker_ch) as to_sendh: + to_sendh.send_bytes(request_bytes) + self.perf_timer.measure_time("send_request") + for tensor in tensors: + to_sendh.send_bytes(tensor.tobytes()) #TODO NOT FAST ENOUGH!!! + self.perf_timer.measure_time("send_tensors") + with self._from_worker_ch.recvh(timeout=None) as from_recvh: + resp = from_recvh.recv_bytes(timeout=None) + self.perf_timer.measure_time("receive_response") + response = MessageHandler.deserialize_response(resp) + self.perf_timer.measure_time("deserialize_response") + # list of data blobs? recv depending on the len(response.result.descriptors)? 
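One possible answer to the open question in the comment above, shown as a hypothetical generalization rather than part of the patch: receive one blob per descriptor listed in the response instead of assuming a single output tensor (all names used here are already in scope at this point in run_model):

    results = [
        torch.from_numpy(
            numpy.frombuffer(from_recvh.recv_bytes(timeout=None),
                             dtype=str(desc.dataType))
        )
        for desc in response.result.descriptors
    ]
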
+ data_blob: bytes = from_recvh.recv_bytes(timeout=None) + self.perf_timer.measure_time("receive_tensor") + result = torch.from_numpy( + numpy.frombuffer( + data_blob, + dtype=str(response.result.descriptors[0].dataType), + ) + ) + self.perf_timer.measure_time("deserialize_tensor") + + self.perf_timer.end_timings() + return result + + def set_model(self, key: str, model: bytes): + self._ddict[key] = model + + + +class ResNetWrapper: + def __init__(self, name: str, model: str): + self._model = torch.jit.load(model) + self._name = name + buffer = io.BytesIO() + scripted = torch.jit.trace(self._model, self.get_batch()) + torch.jit.save(scripted, buffer) + self._serialized_model = buffer.getvalue() + + def get_batch(self, batch_size: int = 32): + return torch.randn((batch_size, 3, 224, 224), dtype=torch.float32) + + @property + def model(self): + return self._serialized_model + + @property + def name(self): + return self._name + +if __name__ == "__main__": + + parser = argparse.ArgumentParser("Mock application") + parser.add_argument("--device", default="cpu", type=str) + parser.add_argument("--log_max_batchsize", default=8, type=int) + args = parser.parse_args() + + resnet = ResNetWrapper("resnet50", f"resnet50.{args.device}.pt") + + client = ProtoClient(timing_on=True) + client.set_model(resnet.name, resnet.model) + + if CHECK_RESULTS_AND_MAKE_ALL_SLOWER: + # TODO: adapt to non-Nvidia devices + torch_device = args.device.replace("gpu", "cuda") + pt_model = torch.jit.load(io.BytesIO(initial_bytes=(resnet.model))).to(torch_device) + + TOTAL_ITERATIONS = 100 + + for log2_bsize in range(args.log_max_batchsize+1): + b_size: int = 2**log2_bsize + logger.info(f"Batch size: {b_size}") + for iteration_number in range(TOTAL_ITERATIONS + int(b_size==1)): + logger.info(f"Iteration: {iteration_number}") + sample_batch = resnet.get_batch(b_size) + remote_result = client.run_model(resnet.name, sample_batch) + logger.info(client.perf_timer.get_last("total_time")) + if CHECK_RESULTS_AND_MAKE_ALL_SLOWER: + local_res = pt_model(sample_batch.to(torch_device)) + err_norm = torch.linalg.vector_norm(torch.flatten(remote_result).to(torch_device)-torch.flatten(local_res), ord=1).cpu() + res_norm = torch.linalg.vector_norm(remote_result, ord=1).item() + local_res_norm = torch.linalg.vector_norm(local_res, ord=1).item() + logger.info(f"Avg norm of error {err_norm.item()/b_size} compared to result norm of {res_norm/b_size}:{local_res_norm/b_size}") + torch.cuda.synchronize() + + client.perf_timer.print_timings(to_file=True) \ No newline at end of file diff --git a/ex/high_throughput_inference/mock_app_redis.py b/ex/high_throughput_inference/mock_app_redis.py new file mode 100644 index 000000000..8978bcea2 --- /dev/null +++ b/ex/high_throughput_inference/mock_app_redis.py @@ -0,0 +1,90 @@ +# BSD 2-Clause License +# +# Copyright (c) 2021-2024, Hewlett Packard Enterprise +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. 
+# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import argparse +import io +import numpy +import time +import torch +from mpi4py import MPI +from smartsim.log import get_logger +from smartsim._core.utils.timings import PerfTimer +from smartredis import Client + +logger = get_logger("App") + +class ResNetWrapper(): + def __init__(self, name: str, model: str): + self._model = torch.jit.load(model) + self._name = name + buffer = io.BytesIO() + scripted = torch.jit.trace(self._model, self.get_batch()) + torch.jit.save(scripted, buffer) + self._serialized_model = buffer.getvalue() + + def get_batch(self, batch_size: int=32): + return torch.randn((batch_size, 3, 224, 224), dtype=torch.float32) + + @property + def model(self): + return self._serialized_model + + @property + def name(self): + return self._name + +if __name__ == "__main__": + + comm = MPI.COMM_WORLD + rank = comm.Get_rank() + + parser = argparse.ArgumentParser("Mock application") + parser.add_argument("--device", default="cpu") + args = parser.parse_args() + + resnet = ResNetWrapper("resnet50", f"resnet50.{args.device.upper()}.pt") + + client = Client(cluster=False, address=None) + client.set_model(resnet.name, resnet.model, backend='TORCH', device=args.device.upper()) + + perf_timer: PerfTimer = PerfTimer(debug=False, timing_on=timing_on, prefix=f"redis{rank}_") + + total_iterations = 100 + timings=[] + for batch_size in [1, 2, 4, 8, 16, 32, 64, 128]: + logger.info(f"Batch size: {batch_size}") + for iteration_number in range(total_iterations + int(batch_size==1)): + perf_timer.start_timings("batch_size", batch_size) + logger.info(f"Iteration: {iteration_number}") + input_name = f"batch_{rank}" + output_name = f"result_{rank}" + client.put_tensor(name=input_name, data=resnet.get_batch(batch_size).numpy()) + client.run_model(name=resnet.name, inputs=[input_name], outputs=[output_name]) + result = client.get_tensor(name=output_name) + perf_timer.end_timings() + + + perf_timer.print_timings(True) diff --git a/ex/high_throughput_inference/redis_driver.py b/ex/high_throughput_inference/redis_driver.py new file mode 100644 index 000000000..ff57725d4 --- /dev/null +++ b/ex/high_throughput_inference/redis_driver.py @@ -0,0 +1,66 @@ +# BSD 2-Clause License +# +# Copyright (c) 2021-2024, Hewlett Packard Enterprise +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# 2. 
Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import os +import sys +from smartsim import Experiment +from smartsim.status import TERMINAL_STATUSES +import time + +DEVICE = "gpu" +filedir = os.path.dirname(__file__) +app_script_name = os.path.join(filedir, "mock_app_redis.py") +model_name = os.path.join(filedir, f"resnet50.{DEVICE}.pt") + + +exp_path = os.path.join(filedir, "redis_ai_multi") +os.makedirs(exp_path, exist_ok=True) +exp = Experiment("redis_ai_multi", launcher="slurm", exp_path=exp_path) + +db = exp.create_database(interface="hsn0") + +app_rs = exp.create_run_settings( + sys.executable, exe_args = [app_script_name, "--device", DEVICE] + ) +app_rs.set_nodes(1) +app_rs.set_tasks(4) +app = exp.create_model("app", run_settings=app_rs) +app.attach_generator_files(to_copy=[app_script_name], to_symlink=[model_name]) + +exp.generate(db, app, overwrite=True) + +exp.start(db, app, block=False) + +while True: + if exp.get_status(app)[0] in TERMINAL_STATUSES: + exp.stop(db) + break + if exp.get_status(db)[0] in TERMINAL_STATUSES: + exp.stop(app) + break + time.sleep(5) + +print("Exiting.") \ No newline at end of file diff --git a/ex/high_throughput_inference/standalone_workermanager.py b/ex/high_throughput_inference/standalone_workermanager.py new file mode 100644 index 000000000..0b8c61251 --- /dev/null +++ b/ex/high_throughput_inference/standalone_workermanager.py @@ -0,0 +1,223 @@ +# BSD 2-Clause License +# +# Copyright (c) 2021-2024, Hewlett Packard Enterprise +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + + +import dragon + +# pylint disable=import-error +import dragon.infrastructure.policy as dragon_policy +import dragon.infrastructure.process_desc as dragon_process_desc +import dragon.native.process as dragon_process +from dragon import fli +from dragon.channels import Channel +from dragon.data.ddict.ddict import DDict +from dragon.globalservices.api_setup import connect_to_infrastructure +from dragon.managed_memory import MemoryPool +from dragon.utils import b64decode, b64encode +# pylint enable=import-error + +# isort: off +# isort: on + +import argparse +import base64 +import multiprocessing as mp +import os +import pickle +import socket +import sys +import time +import typing as t + +import cloudpickle +import optparse +import os + +from smartsim._core.entrypoints.service import Service +from smartsim._core.mli.comm.channel.channel import CommChannelBase +from smartsim._core.mli.comm.channel.dragonchannel import DragonCommChannel +from smartsim._core.mli.comm.channel.dragonfli import DragonFLIChannel +from smartsim._core.mli.infrastructure.storage.dragonfeaturestore import ( + DragonFeatureStore, +) +from smartsim._core.mli.infrastructure.control.requestdispatcher import ( + RequestDispatcher, +) +from smartsim._core.mli.infrastructure.control.workermanager import WorkerManager +from smartsim._core.mli.infrastructure.environmentloader import EnvironmentConfigLoader +from smartsim._core.mli.infrastructure.storage.dragonfeaturestore import ( + DragonFeatureStore, +) +from smartsim._core.mli.infrastructure.worker.worker import MachineLearningWorkerBase + +from smartsim.log import get_logger + +logger = get_logger("Worker Manager Entry Point") + +mp.set_start_method("dragon") + +pid = os.getpid() +affinity = os.sched_getaffinity(pid) +logger.info(f"Entry point: {socket.gethostname()}, {affinity}") +logger.info(f"CPUS: {os.cpu_count()}") + + + +def service_as_dragon_proc( + service: Service, cpu_affinity: list[int], gpu_affinity: list[int] +) -> dragon_process.Process: + + options = dragon_process_desc.ProcessOptions(make_inf_channels=True) + local_policy = dragon_policy.Policy( + placement=dragon_policy.Policy.Placement.HOST_NAME, + host_name=socket.gethostname(), + cpu_affinity=cpu_affinity, + gpu_affinity=gpu_affinity, + ) + return dragon_process.Process( + target=service.execute, + args=[], + cwd=os.getcwd(), + policy=local_policy, + options=options, + stderr=dragon_process.Popen.STDOUT, + stdout=dragon_process.Popen.STDOUT, + ) + + + + +if __name__ == "__main__": + parser = argparse.ArgumentParser("Worker Manager") + parser.add_argument( + "--device", + type=str, + default="gpu", + choices="gpu cpu".split(), + help="Device on which the inference takes place", + ) + parser.add_argument( + "--worker_class", + type=str, + required=True, + help="Serialized class of worker to run", + ) + parser.add_argument( + "--num_workers", type=int, default=1, help="Number of workers to run" + ) + parser.add_argument( + "--batch_size", + type=int, + default=1, + help="How many 
requests the workers will try to aggregate before processing them", + ) + parser.add_argument( + "--batch_timeout", + type=float, + default=0.001, + help="How much time (in seconds) should be waited before processing an incomplete aggregated request", + ) + args = parser.parse_args() + + connect_to_infrastructure() + ddict_str = os.environ["_SMARTSIM_INFRA_BACKBONE"] + ddict = DDict.attach(ddict_str) + + to_worker_channel = Channel.make_process_local() + to_worker_fli = fli.FLInterface(main_ch=to_worker_channel, manager_ch=None) + to_worker_fli_serialized = to_worker_fli.serialize() + ddict["to_worker_fli"] = to_worker_fli_serialized + + arg_worker_type = cloudpickle.loads( + base64.b64decode(args.worker_class.encode("ascii")) + ) + + dfs = DragonFeatureStore(ddict) + comm_channel = DragonFLIChannel(to_worker_fli_serialized) + + descriptor = base64.b64encode(to_worker_fli_serialized).decode("utf-8") + os.environ["_SMARTSIM_REQUEST_QUEUE"] = descriptor + + config_loader = EnvironmentConfigLoader( + featurestore_factory=DragonFeatureStore.from_descriptor, + callback_factory=DragonCommChannel, + queue_factory=DragonFLIChannel.from_descriptor, + ) + + dispatcher = RequestDispatcher( + batch_timeout=args.batch_timeout, + batch_size=args.batch_size, + config_loader=config_loader, + worker_type=arg_worker_type, + ) + + wms = [] + worker_device = args.device + for wm_idx in range(args.num_workers): + + worker_manager = WorkerManager( + config_loader=config_loader, + worker_type=arg_worker_type, + as_service=True, + cooldown=10, + device=worker_device, + dispatcher_queue=dispatcher.task_queue, + ) + + wms.append(worker_manager) + + wm_affinity: list[int] = [] + disp_affinity: list[int] = [] + + # This is hardcoded for a specific type of node: + # the GPU-to-CPU mapping is taken from the nvidia-smi tool + # TODO can this be computed on the fly? + gpu_to_cpu_aff: dict[int, list[int]] = {} + gpu_to_cpu_aff[0] = list(range(48,64)) + list(range(112,128)) + gpu_to_cpu_aff[1] = list(range(32,48)) + list(range(96,112)) + gpu_to_cpu_aff[2] = list(range(16,32)) + list(range(80,96)) + gpu_to_cpu_aff[3] = list(range(0,16)) + list(range(64,80)) + + worker_manager_procs = [] + for worker_idx in range(args.num_workers): + wm_cpus = len(gpu_to_cpu_aff[worker_idx]) - 4 + wm_affinity = gpu_to_cpu_aff[worker_idx][:wm_cpus] + disp_affinity.extend(gpu_to_cpu_aff[worker_idx][wm_cpus:]) + worker_manager_procs.append(service_as_dragon_proc( + worker_manager, cpu_affinity=wm_affinity, gpu_affinity=[worker_idx] + )) + + dispatcher_proc = service_as_dragon_proc(dispatcher, cpu_affinity=disp_affinity, gpu_affinity=[]) + + # TODO: use ProcessGroup and restart=True? 
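A possible shape for the TODO above, sketched against the dragon ProcessGroup calls used by dragonBackend.py later in this diff; the per-service template construction, the restart behavior, and the service_templates name are assumptions for illustration, not part of the patch:

    import dragon.native.process_group as dragon_process_group

    # hypothetical: wrap each service (dispatcher and worker managers) in a
    # ProcessTemplate and let one restartable group supervise all of them
    grp = dragon_process_group.ProcessGroup(restart=True)
    for template in service_templates:  # one template per service (illustrative)
        grp.add_process(nproc=1, template=template)
    grp.init()
    grp.start()
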
+ all_procs = [dispatcher_proc, *worker_manager_procs] + + print(f"Dispatcher proc: {dispatcher_proc}") + for proc in all_procs: + proc.start() + + while all(proc.is_alive for proc in all_procs): + time.sleep(1) diff --git a/pyproject.toml b/pyproject.toml index 62df92f0c..61e17891b 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -62,6 +62,7 @@ markers = [ "group_a: fast test subset a", "group_b: fast test subset b", "slow_tests: tests that take a long duration to complete", + "dragon: tests that must be executed in a dragon runtime", ] [tool.isort] diff --git a/setup.py b/setup.py index 328bf1ffb..709913eda 100644 --- a/setup.py +++ b/setup.py @@ -177,7 +177,8 @@ class BuildError(Exception): "filelock>=3.4.2", "protobuf~=3.20", "jinja2>=3.1.2", - "watchdog>=4.0.0", + "watchdog>=4.0.0,<5", + "pycapnp==2.0.0", "pydantic==1.10.14", "pyzmq>=25.1.2", "pygithub>=2.3.0", diff --git a/smartsim/_core/_cli/scripts/dragon_install.py b/smartsim/_core/_cli/scripts/dragon_install.py index a2e8ed36f..f88af4eb4 100644 --- a/smartsim/_core/_cli/scripts/dragon_install.py +++ b/smartsim/_core/_cli/scripts/dragon_install.py @@ -1,7 +1,9 @@ import os import pathlib +import shutil import sys import typing as t +from urllib.request import urlretrieve from github import Github from github.GitReleaseAsset import GitReleaseAsset @@ -155,38 +157,53 @@ def retrieve_asset(working_dir: pathlib.Path, asset: GitReleaseAsset) -> pathlib :param working_dir: location in file system where assets should be written :param asset: GitHub release asset to retrieve - :returns: path to the downloaded asset""" - if working_dir.exists() and list(working_dir.rglob("*.whl")): - return working_dir + :returns: path to the directory containing the extracted release asset""" + download_dir = working_dir / str(asset.id) - archive = WebTGZ(asset.browser_download_url) - archive.extract(working_dir) + # if we've previously downloaded the release and still have + # wheels laying around, use that cached version instead + if download_dir.exists() or list(download_dir.rglob("*.whl")): + return download_dir - logger.debug(f"Retrieved {asset.browser_download_url} to {working_dir}") - return working_dir + download_dir.mkdir(parents=True, exist_ok=True) + + # grab a copy of the complete asset + asset_path = download_dir / str(asset.name) + download_url = asset.browser_download_url + + try: + urlretrieve(download_url, str(asset_path)) + logger.debug(f"Retrieved asset {asset.name} from {download_url}") + except Exception: + logger.exception(f"Unable to download asset from: {download_url}") + + # extract the asset + archive = WebTGZ(download_url) + archive.extract(download_dir) + + logger.debug(f"Extracted {download_url} to {download_dir}") + return download_dir def install_package(asset_dir: pathlib.Path) -> int: """Install the package found in `asset_dir` into the current python environment :param asset_dir: path to a decompressed archive contents for a release asset""" - wheels = asset_dir.rglob("*.whl") - wheel_path = next(wheels, None) - if not wheel_path: - logger.error(f"No wheel found for package in {asset_dir}") + found_wheels = list(asset_dir.rglob("*.whl")) + if not found_wheels: + logger.error(f"No wheel(s) found for package in {asset_dir}") return 1 - create_dotenv(wheel_path.parent) + create_dotenv(found_wheels[0].parent) - while wheel_path is not None: - logger.info(f"Installing package: {wheel_path.absolute()}") + try: + wheels = list(map(str, found_wheels)) + logger.info("Installing packages:\n%s", "\n".join(wheels)) - try: - 
pip("install", "--force-reinstall", str(wheel_path), "numpy<2") - wheel_path = next(wheels, None) - except Exception: - logger.error(f"Unable to install from {asset_dir}") - return 1 + pip("install", *wheels) + except Exception: + logger.error(f"Unable to install from {asset_dir}") + return 1 return 0 diff --git a/smartsim/_core/entrypoints/service.py b/smartsim/_core/entrypoints/service.py new file mode 100644 index 000000000..6b4ef74b6 --- /dev/null +++ b/smartsim/_core/entrypoints/service.py @@ -0,0 +1,136 @@ +# BSD 2-Clause License +# +# Copyright (c) 2021-2024, Hewlett Packard Enterprise +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import datetime +import time +import typing as t +from abc import ABC, abstractmethod + +from smartsim.log import get_logger + +logger = get_logger(__name__) + + +class Service(ABC): + """Base contract for standalone entrypoint scripts. Defines API for entrypoint + behaviors (event loop, automatic shutdown, cooldown) as well as simple + hooks for status changes""" + + def __init__( + self, as_service: bool = False, cooldown: int = 0, loop_delay: int = 0 + ) -> None: + """Initialize the ServiceHost + :param as_service: Determines if the host will run until shutdown criteria + are met or as a run-once instance + :param cooldown: Period of time to allow service to run before automatic + shutdown, in seconds. A non-zero, positive integer. + :param loop_delay: delay between iterations of the event loop""" + self._as_service = as_service + """If the service should run until shutdown function returns True""" + self._cooldown = abs(cooldown) + """Duration of a cooldown period between requests to the service + before shutdown""" + self._loop_delay = abs(loop_delay) + """Forced delay between iterations of the event loop""" + + @abstractmethod + def _on_iteration(self) -> None: + """The user-defined event handler. Executed repeatedly until shutdown + conditions are satisfied and cooldown is elapsed. + """ + + @abstractmethod + def _can_shutdown(self) -> bool: + """Return true when the criteria to shut down the service are met.""" + + def _on_start(self) -> None: + """Empty hook method for use by subclasses. 
Called on initial entry into + ServiceHost `execute` event loop before `_on_iteration` is invoked.""" + logger.debug(f"Starting {self.__class__.__name__}") + + def _on_shutdown(self) -> None: + """Empty hook method for use by subclasses. Called immediately after exiting + the main event loop during automatic shutdown.""" + logger.debug(f"Shutting down {self.__class__.__name__}") + + def _on_cooldown_elapsed(self) -> None: + """Empty hook method for use by subclasses. Called on every event loop + iteration immediately upon exceeding the cooldown period""" + logger.debug(f"Cooldown exceeded by {self.__class__.__name__}") + + def _on_delay(self) -> None: + """Empty hook method for use by subclasses. Called on every event loop + iteration immediately before executing a delay before the next iteration""" + logger.debug(f"Service iteration waiting for {self.__class__.__name__}s") + + def _log_cooldown(self, elapsed: float) -> None: + """Log the remaining cooldown time, if any""" + remaining = self._cooldown - elapsed + if remaining > 0: + logger.debug(f"{abs(remaining):.2f}s remains of {self._cooldown}s cooldown") + else: + logger.info(f"exceeded cooldown {self._cooldown}s by {abs(remaining):.2f}s") + + def execute(self) -> None: + """The main event loop of a service host. Evaluates shutdown criteria and + combines with a cooldown period to allow automatic service termination. + Responsible for executing calls to subclass implementation of `_on_iteration`""" + self._on_start() + + running = True + cooldown_start: t.Optional[datetime.datetime] = None + + while running: + self._on_iteration() + + # allow immediate shutdown if not set to run as a service + if not self._as_service: + running = False + continue + + # reset cooldown period if shutdown criteria are not met + if not self._can_shutdown(): + cooldown_start = None + + # start tracking cooldown elapsed once eligible to quit + if cooldown_start is None: + cooldown_start = datetime.datetime.now() + + # change running state if cooldown period is exceeded + if self._cooldown > 0: + elapsed = datetime.datetime.now() - cooldown_start + running = elapsed.total_seconds() < self._cooldown + self._log_cooldown(elapsed.total_seconds()) + if not running: + self._on_cooldown_elapsed() + elif self._cooldown < 1 and self._can_shutdown(): + running = False + + if self._loop_delay: + self._on_delay() + time.sleep(self._loop_delay) + + self._on_shutdown() diff --git a/smartsim/_core/launcher/dragon/dragonBackend.py b/smartsim/_core/launcher/dragon/dragonBackend.py index 4aba60d55..7526af14a 100644 --- a/smartsim/_core/launcher/dragon/dragonBackend.py +++ b/smartsim/_core/launcher/dragon/dragonBackend.py @@ -36,13 +36,17 @@ # pylint: disable=import-error # isort: off +import dragon.data.ddict.ddict as dragon_ddict import dragon.infrastructure.connection as dragon_connection import dragon.infrastructure.policy as dragon_policy +import dragon.infrastructure.process_desc as dragon_process_desc import dragon.native.group_state as dragon_group_state import dragon.native.process as dragon_process import dragon.native.process_group as dragon_process_group import dragon.native.machine as dragon_machine +from smartsim._core.launcher.dragon.pqueue import NodePrioritizer, PrioritizerFilter + # pylint: enable=import-error # isort: on from ...._core.config import get_config @@ -153,7 +157,6 @@ def __init__(self, pid: int) -> None: self._step_ids = (f"{create_short_id_str()}-{id}" for id in itertools.count()) """Incremental ID to assign to new steps prior to execution""" - 
self._initialize_hosts() self._queued_steps: "collections.OrderedDict[str, DragonRunRequest]" = ( collections.OrderedDict() ) @@ -184,9 +187,23 @@ def __init__(self, pid: int) -> None: else 5 ) """Time in seconds needed to server to complete shutdown""" + self._infra_ddict: t.Optional[dragon_ddict.DDict] = None + + self._nodes: t.List["dragon_machine.Node"] = [] + """Node capability information for hosts in the allocation""" + self._hosts: t.List[str] = [] + """List of hosts available in allocation""" + self._cpus: t.List[int] = [] + """List of cpu-count by node""" + self._gpus: t.List[int] = [] + """List of gpu-count by node""" + self._allocated_hosts: t.Dict[str, t.Set[str]] = {} + """Mapping with hostnames as keys and a set of running step IDs as the value""" + self._initialize_hosts() self._view = DragonBackendView(self) logger.debug(self._view.host_desc) + self._prioritizer = NodePrioritizer(self._nodes, self._queue_lock) @property def hosts(self) -> list[str]: @@ -194,34 +211,39 @@ def hosts(self) -> list[str]: return self._hosts @property - def allocated_hosts(self) -> dict[str, str]: + def allocated_hosts(self) -> dict[str, t.Set[str]]: + """A map of host names to the step id executing on a host + + :returns: Dictionary with host name as key and step id as value""" with self._queue_lock: return self._allocated_hosts @property - def free_hosts(self) -> t.Deque[str]: + def free_hosts(self) -> t.Sequence[str]: + """Find hosts that do not have a step assigned + + :returns: List of host names""" with self._queue_lock: - return self._free_hosts + return list(map(lambda x: x.hostname, self._prioritizer.unassigned())) @property def group_infos(self) -> dict[str, ProcessGroupInfo]: + """Find information pertaining to process groups executing on a host + + :returns: Dictionary with host name as key and group information as value""" with self._queue_lock: return self._group_infos def _initialize_hosts(self) -> None: + """Prepare metadata about the allocation""" with self._queue_lock: self._nodes = [ dragon_machine.Node(node) for node in dragon_machine.System().nodes ] - self._hosts: t.List[str] = sorted(node.hostname for node in self._nodes) + self._hosts = sorted(node.hostname for node in self._nodes) self._cpus = [node.num_cpus for node in self._nodes] self._gpus = [node.num_gpus for node in self._nodes] - - """List of hosts available in allocation""" - self._free_hosts: t.Deque[str] = collections.deque(self._hosts) - """List of hosts on which steps can be launched""" - self._allocated_hosts: t.Dict[str, str] = {} - """Mapping of hosts on which a step is already running to step ID""" + self._allocated_hosts = collections.defaultdict(set) def __str__(self) -> str: return self.status_message @@ -230,7 +252,7 @@ def __str__(self) -> str: def status_message(self) -> str: """Message with status of available nodes and history of launched jobs. - :returns: Status message + :returns: a status message """ return ( "Dragon server backend update\n" @@ -242,9 +264,8 @@ def _heartbeat(self) -> None: @property def cooldown_period(self) -> int: - """Time (in seconds) the server will wait before shutting down - - when exit conditions are met (see ``should_shutdown()`` for further details). + """Time (in seconds) the server will wait before shutting down when + exit conditions are met (see ``should_shutdown()`` for further details). 
""" return self._cooldown_period @@ -278,6 +299,8 @@ def should_shutdown(self) -> bool: and it requested immediate shutdown, or if it did not request immediate shutdown, but all jobs have been executed. In both cases, a cooldown period may need to be waited before shutdown. + + :returns: `True` if the server should terminate, otherwise `False` """ if self._shutdown_requested and self._can_shutdown: return self._has_cooled_down @@ -285,7 +308,9 @@ def should_shutdown(self) -> bool: @property def current_time(self) -> float: - """Current time for DragonBackend object, in seconds since the Epoch""" + """Current time for DragonBackend object, in seconds since the Epoch + + :returns: the current timestamp""" return time.time() def _can_honor_policy( @@ -293,63 +318,149 @@ def _can_honor_policy( ) -> t.Tuple[bool, t.Optional[str]]: """Check if the policy can be honored with resources available in the allocation. - :param request: DragonRunRequest containing policy information + + :param request: `DragonRunRequest` to validate :returns: Tuple indicating if the policy can be honored and an optional error message""" # ensure the policy can be honored if request.policy: + logger.debug(f"{request.policy=}{self._cpus=}{self._gpus=}") + if request.policy.cpu_affinity: # make sure some node has enough CPUs - available = max(self._cpus) + last_available = max(self._cpus or [-1]) requested = max(request.policy.cpu_affinity) - - if requested >= available: + if not any(self._cpus) or requested >= last_available: return False, "Cannot satisfy request, not enough CPUs available" - if request.policy.gpu_affinity: # make sure some node has enough GPUs - available = max(self._gpus) + last_available = max(self._gpus or [-1]) requested = max(request.policy.gpu_affinity) - - if requested >= available: + if not any(self._gpus) or requested >= last_available: + logger.warning( + f"failed check w/{self._gpus=}, {requested=}, {last_available=}" + ) return False, "Cannot satisfy request, not enough GPUs available" - return True, None def _can_honor(self, request: DragonRunRequest) -> t.Tuple[bool, t.Optional[str]]: - """Check if request can be honored with resources available in the allocation. - - Currently only checks for total number of nodes, - in the future it will also look at other constraints - such as memory, accelerators, and so on. + """Check if request can be honored with resources available in + the allocation. Currently only checks for total number of nodes, + in the future it will also look at other constraints such as memory, + accelerators, and so on. + + :param request: `DragonRunRequest` to validate + :returns: Tuple indicating if the request can be honored and + an optional error message """ - if request.nodes > len(self._hosts): - message = f"Cannot satisfy request. Requested {request.nodes} nodes, " - message += f"but only {len(self._hosts)} nodes are available." - return False, message - if self._shutdown_requested: - message = "Cannot satisfy request, server is shutting down." - return False, message + honorable, err = self._can_honor_state(request) + if not honorable: + return False, err honorable, err = self._can_honor_policy(request) if not honorable: return False, err + honorable, err = self._can_honor_hosts(request) + if not honorable: + return False, err + + return True, None + + def _can_honor_hosts( + self, request: DragonRunRequest + ) -> t.Tuple[bool, t.Optional[str]]: + """Check if the current state of the backend process inhibits executing + the request. 
+ + :param request: `DragonRunRequest` to validate + :returns: Tuple indicating if the request can be honored and + an optional error message""" + all_hosts = frozenset(self._hosts) + num_nodes = request.nodes + + # fail if requesting more nodes than the total number available + if num_nodes > len(all_hosts): + message = f"Cannot satisfy request. {num_nodes} requested nodes" + message += f" exceeds {len(all_hosts)} available." + return False, message + + requested_hosts = all_hosts + if request.hostlist: + requested_hosts = frozenset( + {host.strip() for host in request.hostlist.split(",")} + ) + + valid_hosts = all_hosts.intersection(requested_hosts) + invalid_hosts = requested_hosts - valid_hosts + + logger.debug(f"{num_nodes=}{valid_hosts=}{invalid_hosts=}") + + if invalid_hosts: + logger.warning(f"Some invalid hostnames were requested: {invalid_hosts}") + + # fail if requesting specific hostnames and there aren't enough available + if num_nodes > len(valid_hosts): + message = f"Cannot satisfy request. Requested {num_nodes} nodes, " + message += f"but only {len(valid_hosts)} named hosts are available." + return False, message + + return True, None + + def _can_honor_state( + self, _request: DragonRunRequest + ) -> t.Tuple[bool, t.Optional[str]]: + """Check if the current state of the backend process inhibits executing + the request. + :param _request: the DragonRunRequest to verify + :returns: Tuple indicating if the request can be honored and + an optional error message""" + if self._shutdown_requested: + message = "Cannot satisfy request, server is shutting down." + return False, message + return True, None def _allocate_step( self, step_id: str, request: DragonRunRequest ) -> t.Optional[t.List[str]]: + """Identify the hosts on which the request will be executed + :param step_id: The identifier of a step that will be executed on the host + :param request: The request to be executed + :returns: A list of selected hostnames""" + # ensure at least one host is selected num_hosts: int = request.nodes + with self._queue_lock: - if num_hosts <= 0 or num_hosts > len(self._free_hosts): + if num_hosts <= 0 or num_hosts > len(self._hosts): + logger.debug( + f"The number of requested hosts ({num_hosts}) is invalid or" + f" cannot be satisfied with {len(self._hosts)} available nodes" + ) + return None + + hosts = [] + if request.hostlist: + # convert the comma-separated argument into a real list + hosts = [host for host in request.hostlist.split(",") if host] + + filter_on: t.Optional[PrioritizerFilter] = None + if request.policy and request.policy.gpu_affinity: + filter_on = PrioritizerFilter.GPU + + nodes = self._prioritizer.next_n(num_hosts, filter_on, step_id, hosts) + + if len(nodes) < num_hosts: + # exit if the prioritizer can't identify enough nodes return None - to_allocate = [] - for _ in range(num_hosts): - host = self._free_hosts.popleft() - self._allocated_hosts[host] = step_id - to_allocate.append(host) + + to_allocate = [node.hostname for node in nodes] + + for hostname in to_allocate: + # track assigning this step to each node + self._allocated_hosts[hostname].add(step_id) + return to_allocate @staticmethod @@ -389,6 +500,7 @@ def _create_redirect_workers( return grp_redir def _stop_steps(self) -> None: + """Trigger termination of all currently executing steps""" self._heartbeat() with self._queue_lock: while len(self._stop_requests) > 0: @@ -427,18 +539,34 @@ def _stop_steps(self) -> None: self._group_infos[step_id].status = SmartSimStatus.STATUS_CANCELLED 
self._group_infos[step_id].return_codes = [-9] + @property + def infra_ddict(self) -> str: + """Create a Dragon distributed dictionary and return its + serialized descriptor + """ + if self._infra_ddict is None: + logger.info("Creating DDict") + self._infra_ddict = dragon_ddict.DDict( + n_nodes=len(self._hosts), total_mem=len(self._hosts) * 1024**3 + ) # todo: parametrize + logger.info("Created DDict") + self._infra_ddict["creation"] = str(time.time()) + logger.info(self._infra_ddict["creation"]) + + return str(self._infra_ddict.serialize()) + @staticmethod def create_run_policy( request: DragonRequest, node_name: str ) -> "dragon_policy.Policy": """Create a dragon Policy from the request and node name + :param request: DragonRunRequest containing policy information :param node_name: Name of the node on which the process will run :returns: dragon_policy.Policy object mapped from request properties""" if isinstance(request, DragonRunRequest): run_request: DragonRunRequest = request - affinity = dragon_policy.Policy.Affinity.DEFAULT cpu_affinity: t.List[int] = [] gpu_affinity: t.List[int] = [] @@ -446,25 +574,20 @@ def create_run_policy( if run_request.policy is not None: # Affinities are not mutually exclusive. If specified, both are used if run_request.policy.cpu_affinity: - affinity = dragon_policy.Policy.Affinity.SPECIFIC cpu_affinity = run_request.policy.cpu_affinity if run_request.policy.gpu_affinity: - affinity = dragon_policy.Policy.Affinity.SPECIFIC gpu_affinity = run_request.policy.gpu_affinity logger.debug( - f"Affinity strategy: {affinity}, " f"CPU affinity mask: {cpu_affinity}, " f"GPU affinity mask: {gpu_affinity}" ) - if affinity != dragon_policy.Policy.Affinity.DEFAULT: - return dragon_policy.Policy( - placement=dragon_policy.Policy.Placement.HOST_NAME, - host_name=node_name, - affinity=affinity, - cpu_affinity=cpu_affinity, - gpu_affinity=gpu_affinity, - ) + return dragon_policy.Policy( + placement=dragon_policy.Policy.Placement.HOST_NAME, + host_name=node_name, + cpu_affinity=cpu_affinity, + gpu_affinity=gpu_affinity, + ) return dragon_policy.Policy( placement=dragon_policy.Policy.Placement.HOST_NAME, @@ -482,10 +605,8 @@ def _start_steps(self) -> None: logger.debug(f"Step id {step_id} allocated on {hosts}") - global_policy = dragon_policy.Policy( - placement=dragon_policy.Policy.Placement.HOST_NAME, - host_name=hosts[0], - ) + global_policy = self.create_run_policy(request, hosts[0]) + options = dragon_process_desc.ProcessOptions(make_inf_channels=True) grp = dragon_process_group.ProcessGroup( restart=False, pmi_enabled=request.pmi_enabled, policy=global_policy ) @@ -498,10 +619,15 @@ def _start_steps(self) -> None: target=request.exe, args=request.exe_args, cwd=request.path, - env={**request.current_env, **request.env}, + env={ + **request.current_env, + **request.env, + "_SMARTSIM_INFRA_BACKBONE": self.infra_ddict, + }, stdout=dragon_process.Popen.PIPE, stderr=dragon_process.Popen.PIPE, policy=local_policy, + options=options, ) grp.add_process(nproc=request.tasks_per_node, template=tmp_proc) @@ -567,9 +693,11 @@ def _start_steps(self) -> None: logger.error(e) def _refresh_statuses(self) -> None: + """Query underlying management system for step status and update + stored assigned and unassigned task information""" self._heartbeat() with self._queue_lock: - terminated = [] + terminated: t.Set[str] = set() for step_id in self._running_steps: group_info = self._group_infos[step_id] grp = group_info.process_group @@ -603,11 +731,15 @@ def _refresh_statuses(self) -> None: ) if 
group_info.status in TERMINAL_STATUSES: - terminated.append(step_id) + terminated.add(step_id) if terminated: logger.debug(f"{terminated=}") + # remove all the terminated steps from all hosts + for host in list(self._allocated_hosts.keys()): + self._allocated_hosts[host].difference_update(terminated) + for step_id in terminated: self._running_steps.remove(step_id) self._completed_steps.append(step_id) @@ -615,11 +747,13 @@ def _refresh_statuses(self) -> None: if group_info is not None: for host in group_info.hosts: logger.debug(f"Releasing host {host}") - try: - self._allocated_hosts.pop(host) - except KeyError: + if host not in self._allocated_hosts: logger.error(f"Tried to free a non-allocated host: {host}") - self._free_hosts.append(host) + else: + # remove any hosts that have had all their steps terminated + if not self._allocated_hosts[host]: + self._allocated_hosts.pop(host) + self._prioritizer.decrement(host, step_id) group_info.process_group = None group_info.redir_workers = None @@ -643,6 +777,7 @@ def _should_print_status(self) -> bool: return False def _update(self) -> None: + """Trigger all update queries and update local state database""" self._stop_steps() self._start_steps() self._refresh_statuses() @@ -730,8 +865,12 @@ def _(self, request: DragonShutdownRequest) -> DragonShutdownResponse: class DragonBackendView: - def __init__(self, backend: DragonBackend): + def __init__(self, backend: DragonBackend) -> None: + """Initialize the instance + + :param backend: A dragon backend used to produce the view""" self._backend = backend + """A dragon backend used to produce the view""" @property def host_desc(self) -> str: @@ -793,9 +932,7 @@ def step_table(self) -> str: @property def host_table(self) -> str: """Table representation of current state of nodes available - - in the allocation. - """ + in the allocation.""" headers = ["Host", "Status"] hosts = self._backend.hosts free_hosts = self._backend.free_hosts diff --git a/smartsim/_core/launcher/dragon/dragonLauncher.py b/smartsim/_core/launcher/dragon/dragonLauncher.py index 9078fed54..e8391410b 100644 --- a/smartsim/_core/launcher/dragon/dragonLauncher.py +++ b/smartsim/_core/launcher/dragon/dragonLauncher.py @@ -170,6 +170,7 @@ def run(self, step: Step) -> t.Optional[str]: merged_env = self._connector.merge_persisted_env(os.environ.copy()) nodes = int(run_args.get("nodes", None) or 1) tasks_per_node = int(run_args.get("tasks-per-node", None) or 1) + hosts = str(run_args.get("host-list", "")) policy = DragonRunPolicy.from_run_args(run_args) @@ -187,6 +188,7 @@ def run(self, step: Step) -> t.Optional[str]: output_file=out, error_file=err, policy=policy, + hostlist=hosts, ) ), DragonRunResponse, diff --git a/smartsim/_core/launcher/dragon/pqueue.py b/smartsim/_core/launcher/dragon/pqueue.py new file mode 100644 index 000000000..8c14a828f --- /dev/null +++ b/smartsim/_core/launcher/dragon/pqueue.py @@ -0,0 +1,461 @@ +# BSD 2-Clause License +# +# Copyright (c) 2021-2024, Hewlett Packard Enterprise +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. 
+# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# import collections +import enum +import heapq +import threading +import typing as t + +from smartsim.error.errors import SmartSimError +from smartsim.log import get_logger + +logger = get_logger(__name__) + + +class Node(t.Protocol): + """Base Node API required to support the NodePrioritizer""" + + @property + def hostname(self) -> str: + """The hostname of the node""" + + @property + def num_cpus(self) -> int: + """The number of CPUs in the node""" + + @property + def num_gpus(self) -> int: + """The number of GPUs in the node""" + + +class NodeReferenceCount(t.Protocol): + """Contains details pertaining to references to a node""" + + @property + def hostname(self) -> str: + """The hostname of the node""" + + @property + def num_refs(self) -> int: + """The number of jobs assigned to the node""" + + +class _TrackedNode: + """Node API required to have support in the NodePrioritizer""" + + def __init__(self, node: Node) -> None: + self._node = node + """The node being tracked""" + self._num_refs = 0 + """The number of references to the tracked node""" + self._assigned_tasks: t.Set[str] = set() + """The unique identifiers of processes using this node""" + self._is_dirty = False + """Flag indicating that tracking information has been modified""" + + @property + def hostname(self) -> str: + """Returns the hostname of the node""" + return self._node.hostname + + @property + def num_cpus(self) -> int: + """Returns the number of CPUs in the node""" + return self._node.num_cpus + + @property + def num_gpus(self) -> int: + """Returns the number of GPUs attached to the node""" + return self._node.num_gpus + + @property + def num_refs(self) -> int: + """Returns the number of processes currently running on the node""" + return self._num_refs + + @property + def is_assigned(self) -> bool: + """Returns `True` if no references are currently counted, `False` otherwise""" + return self._num_refs > 0 + + @property + def assigned_tasks(self) -> t.Set[str]: + """Returns the set of unique IDs for currently running processes""" + return self._assigned_tasks + + @property + def is_dirty(self) -> bool: + """Returns a flag indicating if the reference counter has changed. `True` + if references have been added or removed, `False` otherwise.""" + return self._is_dirty + + def clean(self) -> None: + """Marks the node as unmodified""" + self._is_dirty = False + + def add( + self, + tracking_id: t.Optional[str] = None, + ) -> None: + """Update the node to indicate the addition of a process that must be + reference counted. 
+ + :param tracking_id: a unique task identifier executing on the node + to add + :raises ValueError: if tracking_id is already assigned to this node""" + if tracking_id in self.assigned_tasks: + raise ValueError("Attempted adding task more than once") + + self._num_refs = self._num_refs + 1 + if tracking_id: + self._assigned_tasks = self._assigned_tasks.union({tracking_id}) + self._is_dirty = True + + def remove( + self, + tracking_id: t.Optional[str] = None, + ) -> None: + """Update the reference counter to indicate the removal of a process. + + :param tracking_id: a unique task identifier executing on the node + to remove + :raises ValueError: if tracking_id is already assigned to this node""" + self._num_refs = max(self._num_refs - 1, 0) + if tracking_id: + self._assigned_tasks = self._assigned_tasks - {tracking_id} + self._is_dirty = True + + def __lt__(self, other: "_TrackedNode") -> bool: + """Comparison operator used to evaluate the ordering of nodes within + the prioritizer. This comparison only considers reference counts. + + :param other: Another node to compare against + :returns: True if this node has fewer references than the other node""" + if self.num_refs < other.num_refs: + return True + + return False + + +class PrioritizerFilter(str, enum.Enum): + """A filter used to select a subset of nodes to be queried""" + + CPU = enum.auto() + GPU = enum.auto() + + +class NodePrioritizer: + def __init__(self, nodes: t.List[Node], lock: threading.RLock) -> None: + """Initialize the prioritizer + + :param nodes: node attribute information for initializing the priorizer + :param lock: a lock used to ensure threadsafe operations + :raises SmartSimError: if the nodes collection is empty + """ + if not nodes: + raise SmartSimError("Missing nodes to prioritize") + + self._lock = lock + """Lock used to ensure thread safe changes of the reference counters""" + self._cpu_refs: t.List[_TrackedNode] = [] + """Track reference counts to CPU-only nodes""" + self._gpu_refs: t.List[_TrackedNode] = [] + """Track reference counts to GPU nodes""" + self._nodes: t.Dict[str, _TrackedNode] = {} + + self._initialize_reference_counters(nodes) + + def _initialize_reference_counters(self, nodes: t.List[Node]) -> None: + """Perform initialization of reference counters for nodes in the allocation + + :param nodes: node attribute information for initializing the priorizer""" + for node in nodes: + # create a set of reference counters for the nodes + tracked = _TrackedNode(node) + + self._nodes[node.hostname] = tracked # for O(1) access + + if node.num_gpus: + self._gpu_refs.append(tracked) + else: + self._cpu_refs.append(tracked) + + def increment( + self, host: str, tracking_id: t.Optional[str] = None + ) -> NodeReferenceCount: + """Directly increment the reference count of a given node and ensure the + ref counter is marked as dirty to trigger a reordering on retrieval + + :param host: a hostname that should have a reference counter selected + :param tracking_id: a unique task identifier executing on the node + to add""" + with self._lock: + tracked_node = self._nodes[host] + tracked_node.add(tracking_id) + return tracked_node + + def _heapify_all_refs(self) -> t.List[_TrackedNode]: + """Combine the CPU and GPU nodes into a single heap + + :returns: list of all reference counters""" + refs = [*self._cpu_refs, *self._gpu_refs] + heapq.heapify(refs) + return refs + + def get_tracking_info(self, host: str) -> NodeReferenceCount: + """Returns the reference counter information for a single node + + :param host: a 
hostname that should have a reference counter selected + :returns: a reference counter for the node + :raises ValueError: if the hostname is not in the set of managed nodes""" + if host not in self._nodes: + raise ValueError("The supplied hostname was not found") + + return self._nodes[host] + + def decrement( + self, host: str, tracking_id: t.Optional[str] = None + ) -> NodeReferenceCount: + """Directly decrement the reference count of a given node and ensure the + ref counter is marked as dirty to trigger a reordering + + :param host: a hostname that should have a reference counter decremented + :param tracking_id: unique task identifier to remove""" + with self._lock: + tracked_node = self._nodes[host] + tracked_node.remove(tracking_id) + + return tracked_node + + def _create_sub_heap( + self, + hosts: t.Optional[t.List[str]] = None, + filter_on: t.Optional[PrioritizerFilter] = None, + ) -> t.List[_TrackedNode]: + """Create a new heap from the primary heap with user-specified nodes + + :param hosts: a list of hostnames used to filter the available nodes + :returns: a list of assigned reference counters + """ + nodes_tracking_info: t.List[_TrackedNode] = [] + heap = self._get_filtered_heap(filter_on) + + # Collect all the tracking info for the requested nodes... + for node in heap: + if not hosts or node.hostname in hosts: + nodes_tracking_info.append(node) + + # ... and use it to create a new heap from a specified subset of nodes + heapq.heapify(nodes_tracking_info) + + return nodes_tracking_info + + def unassigned( + self, heap: t.Optional[t.List[_TrackedNode]] = None + ) -> t.Sequence[Node]: + """Select nodes that are currently not assigned a task + + :param heap: a subset of the node heap to consider + :returns: a list of reference counts for all unassigned nodes""" + if heap is None: + heap = list(self._nodes.values()) + + nodes: t.List[_TrackedNode] = [] + for item in heap: + if item.num_refs == 0: + nodes.append(item) + return nodes + + def assigned( + self, heap: t.Optional[t.List[_TrackedNode]] = None + ) -> t.Sequence[Node]: + """Helper method to identify the nodes that are currently assigned + + :param heap: a subset of the node heap to consider + :returns: a list of reference counts for all assigned nodes""" + if heap is None: + heap = list(self._nodes.values()) + + nodes: t.List[_TrackedNode] = [] + for item in heap: + if item.num_refs > 0: + nodes.append(item) + return nodes + + def _check_satisfiable_n( + self, num_items: int, heap: t.Optional[t.List[_TrackedNode]] = None + ) -> bool: + """Validates that a request for some number of nodes `n` can be + satisfied by the prioritizer given the set of nodes available + + :param num_items: the desired number of nodes to allocate + :param heap: a subset of the node heap to consider + :returns: True if the request can be fulfilled, False otherwise""" + num_nodes = len(self._nodes.keys()) + + if num_items < 1: + msg = "Cannot handle request; request requires a positive integer" + logger.warning(msg) + return False + + if num_nodes < num_items: + msg = f"Cannot satisfy request for {num_items} nodes; {num_nodes} in pool" + logger.warning(msg) + return False + + num_open = len(self.unassigned(heap)) + if num_open < num_items: + msg = f"Cannot satisfy request for {num_items} nodes; {num_open} available" + logger.warning(msg) + return False + + return True + + def _get_next_unassigned_node( + self, + heap: t.List[_TrackedNode], + tracking_id: t.Optional[str] = None, + ) -> t.Optional[Node]: + """Finds the next node with no running 
processes and + ensures that any elements that were directly updated are updated in + the priority structure before being made available + + :param heap: a subset of the node heap to consider + :param tracking_id: unique task identifier to track + :returns: a reference counter for an available node if an unassigned node + exists, `None` otherwise""" + tracking_info: t.Optional[_TrackedNode] = None + + with self._lock: + # re-sort the heap to handle any tracking changes + if any(node.is_dirty for node in heap): + heapq.heapify(heap) + + # grab the min node from the heap + tracking_info = heapq.heappop(heap) + + # the node is available if it has no assigned tasks + is_assigned = tracking_info.is_assigned + if not is_assigned: + # track the new process on the node + tracking_info.add(tracking_id) + + # add the node that was popped back into the heap + heapq.heappush(heap, tracking_info) + + # mark all nodes as clean now that everything is updated & sorted + for node in heap: + node.clean() + + # next available must only return previously unassigned nodes + if is_assigned: + return None + + return tracking_info + + def _get_next_n_available_nodes( + self, + num_items: int, + heap: t.List[_TrackedNode], + tracking_id: t.Optional[str] = None, + ) -> t.List[Node]: + """Find the next N available nodes w/least amount of references using + the supplied filter to target a specific node capability + + :param num_items: number of nodes to reserve + :param heap: a subset of the node heap to consider + :param tracking_id: unique task identifier to track + :returns: a list of reference counters for a available nodes if enough + unassigned nodes exists, `None` otherwise + :raises ValueError: if the number of requested nodes is not a positive integer + """ + next_nodes: t.List[Node] = [] + + if num_items < 1: + raise ValueError(f"Number of items requested {num_items} is invalid") + + if not self._check_satisfiable_n(num_items, heap): + return next_nodes + + while len(next_nodes) < num_items: + if next_node := self._get_next_unassigned_node(heap, tracking_id): + next_nodes.append(next_node) + continue + break + + return next_nodes + + def _get_filtered_heap( + self, filter_on: t.Optional[PrioritizerFilter] = None + ) -> t.List[_TrackedNode]: + """Helper method to select the set of nodes to include in a filtered + heap. + + :param filter_on: A list of nodes that satisfy the filter. 
If no + filter is supplied, all nodes are returned""" + if filter_on == PrioritizerFilter.GPU: + return self._gpu_refs + if filter_on == PrioritizerFilter.CPU: + return self._cpu_refs + + return self._heapify_all_refs() + + def next( + self, + filter_on: t.Optional[PrioritizerFilter] = None, + tracking_id: t.Optional[str] = None, + hosts: t.Optional[t.List[str]] = None, + ) -> t.Optional[Node]: + """Find the next unsassigned node using the supplied filter to target + a specific node capability + + :param filter_on: the subset of nodes to query for available nodes + :param tracking_id: unique task identifier to track + :param hosts: a list of hostnames used to filter the available nodes + :returns: a reference counter for an available node if an unassigned node + exists, `None` otherwise""" + if results := self.next_n(1, filter_on, tracking_id, hosts): + return results[0] + return None + + def next_n( + self, + num_items: int = 1, + filter_on: t.Optional[PrioritizerFilter] = None, + tracking_id: t.Optional[str] = None, + hosts: t.Optional[t.List[str]] = None, + ) -> t.List[Node]: + """Find the next N available nodes w/least amount of references using + the supplied filter to target a specific node capability + + :param num_items: number of nodes to reserve + :param filter_on: the subset of nodes to query for available nodes + :param tracking_id: unique task identifier to track + :param hosts: a list of hostnames used to filter the available nodes + :returns: Collection of reserved nodes + :raises ValueError: if the hosts parameter is an empty list""" + heap = self._create_sub_heap(hosts, filter_on) + return self._get_next_n_available_nodes(num_items, heap, tracking_id) diff --git a/smartsim/_core/launcher/step/dragonStep.py b/smartsim/_core/launcher/step/dragonStep.py index dd93d7910..21fdc697c 100644 --- a/smartsim/_core/launcher/step/dragonStep.py +++ b/smartsim/_core/launcher/step/dragonStep.py @@ -169,6 +169,7 @@ def _write_request_file(self) -> str: env = run_settings.env_vars nodes = int(run_args.get("nodes", None) or 1) tasks_per_node = int(run_args.get("tasks-per-node", None) or 1) + hosts_csv = str(run_args.get("host-list", "")) policy = DragonRunPolicy.from_run_args(run_args) @@ -187,6 +188,7 @@ def _write_request_file(self) -> str: output_file=out, error_file=err, policy=policy, + hostlist=hosts_csv, ) requests.append(request_registry.to_string(request)) with open(request_file, "w", encoding="utf-8") as script_file: diff --git a/smartsim/_core/mli/__init__.py b/smartsim/_core/mli/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/smartsim/_core/mli/comm/channel/__init__.py b/smartsim/_core/mli/comm/channel/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/smartsim/_core/mli/comm/channel/channel.py b/smartsim/_core/mli/comm/channel/channel.py new file mode 100644 index 000000000..d91859126 --- /dev/null +++ b/smartsim/_core/mli/comm/channel/channel.py @@ -0,0 +1,59 @@ +# BSD 2-Clause License +# +# Copyright (c) 2021-2024, Hewlett Packard Enterprise +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# 2. 
Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import typing as t +from abc import ABC, abstractmethod + +from smartsim.log import get_logger + +logger = get_logger(__name__) + + +class CommChannelBase(ABC): + """Base class for abstracting a message passing mechanism""" + + def __init__(self, descriptor: t.Union[str, bytes]) -> None: + """Initialize the CommChannel instance""" + self._descriptor = descriptor + + @abstractmethod + def send(self, value: bytes) -> None: + """Send a message through the underlying communication channel + + :param value: The value to send""" + + @abstractmethod + def recv(self) -> t.List[bytes]: + """Receieve a message through the underlying communication channel + + :returns: the received message""" + + @property + def descriptor(self) -> bytes: + """Return the channel descriptor for the underlying dragon channel""" + if isinstance(self._descriptor, str): + return self._descriptor.encode("utf-8") + return self._descriptor diff --git a/smartsim/_core/mli/comm/channel/dragonchannel.py b/smartsim/_core/mli/comm/channel/dragonchannel.py new file mode 100644 index 000000000..89b90f2e6 --- /dev/null +++ b/smartsim/_core/mli/comm/channel/dragonchannel.py @@ -0,0 +1,74 @@ +# BSD 2-Clause License +# +# Copyright (c) 2021-2024, Hewlett Packard Enterprise +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import base64 +import sys +import typing as t + +import smartsim._core.mli.comm.channel.channel as cch +from smartsim.log import get_logger + +logger = get_logger(__name__) + +import dragon.channels as dch + + +class DragonCommChannel(cch.CommChannelBase): + """Passes messages by writing to a Dragon channel""" + + def __init__(self, key: bytes) -> None: + """Initialize the DragonCommChannel instance""" + super().__init__(key) + self._channel: dch.Channel = dch.Channel.attach(key) + + def send(self, value: bytes) -> None: + """Send a message throuh the underlying communication channel + :param value: The value to send""" + with self._channel.sendh(timeout=None) as sendh: + sendh.send_bytes(value) + + def recv(self) -> t.List[bytes]: + """Receieve a message through the underlying communication channel + + :returns: the received message""" + with self._channel.recvh(timeout=None) as recvh: + message_bytes: bytes = recvh.recv_bytes(timeout=None) + return [message_bytes] + + @classmethod + def from_descriptor( + cls, + descriptor: str, + ) -> "DragonCommChannel": + """A factory method that creates an instance from a descriptor string + + :param descriptor: The descriptor that uniquely identifies the resource + :returns: An attached DragonCommChannel""" + try: + return DragonCommChannel(base64.b64decode(descriptor)) + except: + logger.error(f"Failed to create dragon comm channel: {descriptor}") + raise diff --git a/smartsim/_core/mli/comm/channel/dragonfli.py b/smartsim/_core/mli/comm/channel/dragonfli.py new file mode 100644 index 000000000..130c5cf5e --- /dev/null +++ b/smartsim/_core/mli/comm/channel/dragonfli.py @@ -0,0 +1,96 @@ +# BSD 2-Clause License +# +# Copyright (c) 2021-2024, Hewlett Packard Enterprise +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +# isort: off +from dragon import fli +import dragon.channels as dch + +# isort: on + +import base64 +import typing as t + +import smartsim._core.mli.comm.channel.channel as cch +from smartsim.log import get_logger + +logger = get_logger(__name__) + + +class DragonFLIChannel(cch.CommChannelBase): + """Passes messages by writing to a Dragon FLI Channel""" + + def __init__(self, fli_desc: bytes, sender_supplied: bool = True) -> None: + """Initialize the DragonFLIChannel instance + + :param fli_desc: the descriptor of the FLI channel to attach + :param sender_supplied: flag indicating if the FLI uses sender-supplied streams + """ + super().__init__(fli_desc) + # todo: do we need memory pool information to construct the channel correctly? + self._fli: "fli" = fli.FLInterface.attach(fli_desc) + self._channel: t.Optional["dch"] = ( + dch.Channel.make_process_local() if sender_supplied else None + ) + + def send(self, value: bytes) -> None: + """Send a message through the underlying communication channel + + :param value: The value to send""" + with self._fli.sendh(timeout=None, stream_channel=self._channel) as sendh: + sendh.send_bytes(value) + + def recv(self) -> t.List[bytes]: + """Receieve a message through the underlying communication channel + + :returns: the received message""" + messages = [] + eot = False + with self._fli.recvh(timeout=0.001) as recvh: + while not eot: + try: + message, _ = recvh.recv_bytes(timeout=None) + messages.append(message) + except fli.FLIEOT: + eot = True + return messages + + @classmethod + def from_descriptor( + cls, + descriptor: str, + ) -> "DragonFLIChannel": + """A factory method that creates an instance from a descriptor string + + :param descriptor: The descriptor that uniquely identifies the resource + :returns: An attached DragonFLIChannel""" + try: + return DragonFLIChannel( + fli_desc=base64.b64decode(descriptor), + sender_supplied=True, + ) + except: + logger.error(f"Error while creating DragonFLIChannel: {descriptor}") + raise diff --git a/smartsim/_core/mli/infrastructure/__init__.py b/smartsim/_core/mli/infrastructure/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/smartsim/_core/mli/infrastructure/control/__init__.py b/smartsim/_core/mli/infrastructure/control/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/smartsim/_core/mli/infrastructure/control/devicemanager.py b/smartsim/_core/mli/infrastructure/control/devicemanager.py new file mode 100644 index 000000000..3570bd51e --- /dev/null +++ b/smartsim/_core/mli/infrastructure/control/devicemanager.py @@ -0,0 +1,146 @@ +# BSD 2-Clause License +# +# Copyright (c) 2021-2024, Hewlett Packard Enterprise +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# 2. 
Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import typing as t +from contextlib import _GeneratorContextManager, contextmanager + +from .....log import get_logger +from ...infrastructure.storage.featurestore import FeatureStore +from ..worker.worker import MachineLearningWorkerBase, RequestBatch + +logger = get_logger(__name__) + + +class WorkerDevice: + def __init__(self, name: str) -> None: + """Wrapper around a device to keep track of loaded Models and availability + :param name: name used by the toolkit to identify this device, e.g. ``cuda:0`` + """ + self._name = name + """The name used by the toolkit to identify this device""" + self._models: dict[str, t.Any] = {} + """Dict of keys to models which are loaded on this device""" + + @property + def name(self) -> str: + """The identifier of the device represented by this object""" + return self._name + + def add_model(self, key: str, model: t.Any) -> None: + """Add a reference to a model loaded on this device and assign it a key + + :param key: The key under which the model is saved + :param model: The model which is added + """ + self._models[key] = model + + def remove_model(self, key: str) -> None: + """Remove the reference to a model loaded on this device + + :param key: The key of the model to remove + """ + self._models.pop(key) + + def get_model(self, key: str) -> t.Any: + """Get the model corresponding to a given key + + :param key: the model key + :returns: the model for the given key + """ + return self._models[key] + + def __contains__(self, key: str) -> bool: + """Check if model with a given key is available on the device + + :param key: the key of the model to check for existence + :returns: whether the model is available on the device + """ + return key in self._models + + @contextmanager + def get(self, key_to_remove: t.Optional[str]) -> t.Iterator["WorkerDevice"]: + yield self + if key_to_remove is not None: + self.remove_model(key_to_remove) + + +class DeviceManager: + def __init__(self, device: WorkerDevice): + """An object to manage devices such as GPUs and CPUs. + + The main goal of the ``DeviceManager`` is to ensure that + the managed device is ready to be used by a worker to + run a given model + :param device: The managed device + """ + self._device = device + """Device managed by this object""" + + def _load_model_on_device( + self, + worker: MachineLearningWorkerBase, + batch: RequestBatch, + feature_stores: dict[str, FeatureStore], + ) -> None: + """Load the model needed to execute on a batch on the managed device. + + The model is loaded by the worker. 
+
+        :param worker: the worker that loads the model
+        :param batch: the batch for which the model is needed
+        :param feature_stores: feature stores where the model could be stored
+        """
+
+        model_bytes = worker.fetch_model(batch, feature_stores)
+        loaded_model = worker.load_model(batch, model_bytes, self._device.name)
+        self._device.add_model(batch.model_id.key, loaded_model.model)
+
+    def get_device(
+        self,
+        worker: MachineLearningWorkerBase,
+        batch: RequestBatch,
+        feature_stores: dict[str, FeatureStore],
+    ) -> _GeneratorContextManager[WorkerDevice]:
+        """Get the device managed by this object
+
+        The model needed to run the batch of requests is
+        guaranteed to be available on the device
+
+        :param worker: The worker that wants to access the device
+        :param batch: The batch of requests
+        :param feature_stores: The feature stores on which part of the
+        data needed by the requests may be stored
+        :return: A generator yielding the device
+        """
+        model_in_request = batch.has_raw_model
+
+        # Load the model if it is not already on the device, or if a raw
+        # model was sent with the request
+        if model_in_request or batch.model_id.key not in self._device:
+            self._load_model_on_device(worker, batch, feature_stores)
+
+        key_to_remove = batch.model_id.key if model_in_request else None
+        return self._device.get(key_to_remove)
diff --git a/smartsim/_core/mli/infrastructure/control/error_handling.py b/smartsim/_core/mli/infrastructure/control/error_handling.py
new file mode 100644
index 000000000..e2c5bcd9e
--- /dev/null
+++ b/smartsim/_core/mli/infrastructure/control/error_handling.py
@@ -0,0 +1,70 @@
+# BSD 2-Clause License
+#
+# Copyright (c) 2021-2024, Hewlett Packard Enterprise
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# 1. Redistributions of source code must retain the above copyright notice, this
+# list of conditions and the following disclaimer.
+#
+# 2. Redistributions in binary form must reproduce the above copyright notice,
+# this list of conditions and the following disclaimer in the documentation
+# and/or other materials provided with the distribution.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
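#
# Illustrative usage sketch (not part of this changeset; `worker`, `batch`,
# `stores`, and `request` are assumed caller-side names): the helpers defined
# in this module are meant to be called by the request dispatcher and worker
# manager whenever a stage of the inference pipeline fails. A caller holding
# an InferenceRequest whose optional `callback` is a CommChannelBase might use
# them roughly as follows:
#
#     try:
#         fetch_results = worker.fetch_inputs(batch=batch, feature_stores=stores)
#     except Exception as exc:
#         # log the failure and, when a reply channel exists, send back a
#         # serialized "fail" response built via build_failure_reply()
#         exception_handler(exc, request.callback, "Error fetching input.")
#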
+ +import typing as t + +from .....log import get_logger +from ...comm.channel.channel import CommChannelBase +from ...message_handler import MessageHandler +from ...mli_schemas.response.response_capnp import ResponseBuilder + +if t.TYPE_CHECKING: + from smartsim._core.mli.mli_schemas.response.response_capnp import Status + +logger = get_logger(__file__) + + +def build_failure_reply(status: "Status", message: str) -> ResponseBuilder: + return MessageHandler.build_response( + status=status, + message=message, + result=[], + custom_attributes=None, + ) + + +def exception_handler( + exc: Exception, reply_channel: t.Optional[CommChannelBase], failure_message: str +) -> None: + """ + Logs exceptions and sends a failure response. + + :param exc: The exception to be logged + :param reply_channel: The channel used to send replies + :param failure_message: Failure message to log and send back + """ + logger.exception( + f"{failure_message}\n" + f"Exception type: {type(exc).__name__}\n" + f"Exception message: {str(exc)}" + ) + serialized_resp = MessageHandler.serialize_response( + build_failure_reply("fail", failure_message) + ) + if reply_channel: + reply_channel.send(serialized_resp) + else: + logger.warning("Unable to notify client of error without reply_channel") diff --git a/smartsim/_core/mli/infrastructure/control/requestdispatcher.py b/smartsim/_core/mli/infrastructure/control/requestdispatcher.py new file mode 100644 index 000000000..d56912a8f --- /dev/null +++ b/smartsim/_core/mli/infrastructure/control/requestdispatcher.py @@ -0,0 +1,504 @@ +# BSD 2-Clause License +# +# Copyright (c) 2021-2024, Hewlett Packard Enterprise +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
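#
# Illustrative usage sketch (not part of this changeset; `config_loader` and
# `MyWorker` are assumed names): the RequestDispatcher defined below stages
# incoming requests in per-model BatchQueue instances and flushes a queue to
# the worker managers once it is full or its batch_timeout has elapsed. A
# driver process might construct and run it roughly as follows:
#
#     dispatcher = RequestDispatcher(
#         batch_timeout=0.01,   # flush an incomplete batch after 10 ms
#         batch_size=4,         # flush as soon as 4 requests are queued
#         config_loader=config_loader,   # an EnvironmentConfigLoader
#         worker_type=MyWorker,          # a MachineLearningWorkerBase subclass
#     )
#     dispatcher.execute()      # run the Service event loop
#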
+ +# pylint: disable=import-error +# pylint: disable-next=unused-import +import dragon +import dragon.globalservices.pool as dragon_gs_pool +from dragon.managed_memory import MemoryPool +from dragon.mpbridge.queues import DragonQueue + +# pylint: enable=import-error + +# isort: off +# isort: on + +import multiprocessing as mp +import time +import typing as t +import uuid +from queue import Empty, Full, Queue + +from smartsim._core.entrypoints.service import Service + +from .....error import SmartSimError +from .....log import get_logger +from ....utils.timings import PerfTimer +from ...infrastructure.environmentloader import EnvironmentConfigLoader +from ...infrastructure.storage.featurestore import FeatureStore +from ...infrastructure.worker.worker import ( + InferenceRequest, + MachineLearningWorkerBase, + ModelIdentifier, + RequestBatch, +) +from .error_handling import exception_handler + +if t.TYPE_CHECKING: + from smartsim._core.mli.mli_schemas.response.response_capnp import Status + +logger = get_logger("Request Dispatcher") + + +class BatchQueue(Queue[InferenceRequest]): + def __init__( + self, batch_timeout: float, batch_size: int, model_id: ModelIdentifier + ) -> None: + """Queue used to store inference requests waiting to be batched and + sent to Worker Managers. + :param batch_timeout: Time in seconds that has to be waited before flushing a + non-full queue. The time of the first item put is 0 seconds. + :param batch_size: Total capacity of the queue. + :param model_id: Key of the model which needs to be executed on the queued + requests + """ + super().__init__(maxsize=batch_size) + self._batch_timeout = batch_timeout + """Time in seconds that has to be waited before flushing a non-full queue. + The time of the first item put is 0 seconds.""" + self._batch_size = batch_size + """Total capacity of the queue.""" + self._first_put: t.Optional[float] = None + """Time at which the first item was put on the queue""" + self._disposable = False + """Whether the queue will not be used again and can be deleted. 
+ A disposable queue is always full.""" + self._model_id: ModelIdentifier = model_id + """Key of the model which needs to be executed on the queued requests""" + self._uid = str(uuid.uuid4()) + """Unique ID of queue""" + + @property + def uid(self) -> str: + """ID of this queue""" + return self._uid + + @property + def model_id(self) -> ModelIdentifier: + """Key of the model which needs to be run on the queued requests""" + return self._model_id + + def put( + self, + item: InferenceRequest, + block: bool = False, + timeout: t.Optional[float] = 0.0, + ) -> None: + """Put an inference request in the queue + :param item: The request + :param block: Whether to block when trying to put the item + :param timeout: Time (in seconds) to wait if block==True + :raises Full: If an item cannot be put on the queue + """ + super().put(item, block=block, timeout=timeout) + if self._first_put is None: + self._first_put = time.time() + + @property + def _elapsed_time(self) -> float: + """Time elapsed since the first item was put on this queue""" + if self.empty() or self._first_put is None: + return 0 + return time.time() - self._first_put + + @property + def ready(self) -> bool: + """True if the queue can be flushed""" + if self.empty(): + return False + + timed_out = ( + self._batch_timeout > 0 and self._elapsed_time >= self._batch_timeout + ) + logger.debug(f"Is full: {self.full()} or has timed out: {timed_out}") + return self.full() or timed_out + + def make_disposable(self) -> None: + """Set this queue as disposable, and never use it again after it gets flushed""" + self._disposable = True + + @property + def can_be_removed(self) -> bool: + """Whether this queue can be deleted and garbage collected""" + return self.empty() and self._disposable + + def flush(self) -> list[t.Any]: + """Get all requests from queue + :return: Requests waiting to be executed + """ + num_items = self.qsize() + self._first_put = None + items = [] + for _ in range(num_items): + try: + items.append(self.get()) + except Empty: + break + + return items + + def full(self) -> bool: + """Return True if the queue has reached its maximum capacity""" + if self._disposable: + return True + return self.qsize() >= self._batch_size + + def empty(self) -> bool: + """Return True if the queue has 0 elements""" + return self.qsize() == 0 + + +class RequestDispatcher(Service): + def __init__( + self, + batch_timeout: float, + batch_size: int, + config_loader: EnvironmentConfigLoader, + worker_type: t.Type[MachineLearningWorkerBase], + mem_pool_size: int = 2 * 1024**3, + ) -> None: + """The RequestDispatcher intercepts inference requests, stages them in + queues and batches them together before making them available to Worker + Managers. + :param batch_timeout: Maximum elapsed time before flushing a complete or + incomplete batch + :param batch_size: Total capacity of each batch queue. 
+ :param mem_pool: Memory pool used to share batched input tensors with worker + managers + :param config_loader: Object to load configuration from environment + :param worker_type: Type of worker to instantiate to batch inputs + :param mem_pool_size: Size of the memory pool used to allocate tensors + :raises SmartSimError: If config_loaded.get_queue() does not return a channel + """ + super().__init__(as_service=True, cooldown=1) + self._queues: dict[str, list[BatchQueue]] = {} + """Dict of all batch queues available for a given model id""" + self._active_queues: dict[str, BatchQueue] = {} + """Mapping telling which queue is the recipient of requests for a given model + key""" + self._batch_timeout = batch_timeout + """Time in seconds that has to be waited before flushing a non-full queue""" + self._batch_size = batch_size + """Total capacity of each batch queue.""" + incoming_channel = config_loader.get_queue() + if incoming_channel is None: + raise SmartSimError("No incoming channel for dispatcher") + self._incoming_channel = incoming_channel + """The channel the dispatcher monitors for new tasks""" + self._outgoing_queue: DragonQueue = mp.Queue(maxsize=0) + """The queue on which batched inference requests are placed""" + self._feature_stores: t.Dict[str, FeatureStore] = {} + """A collection of attached feature stores""" + self._featurestore_factory = config_loader._featurestore_factory + """A factory method to create a desired feature store client type""" + self._backbone: t.Optional[FeatureStore] = config_loader.get_backbone() + """A standalone, system-created feature store used to share internal + information among MLI components""" + self._callback_factory = config_loader._callback_factory + """The type of communication channel to construct for callbacks""" + self._worker = worker_type() + """The worker used to batch inputs""" + self._mem_pool = MemoryPool.attach(dragon_gs_pool.create(mem_pool_size).sdesc) + """Memory pool used to share batched input tensors with the Worker Managers""" + self._perf_timer = PerfTimer(prefix="r_", debug=False, timing_on=True) + """Performance timer""" + + def _check_feature_stores(self, request: InferenceRequest) -> bool: + """Ensures that all feature stores required by the request are available + + :param request: The request to validate + :returns: False if feature store validation fails for the request, True + otherwise + """ + # collect all feature stores required by the request + fs_model: t.Set[str] = set() + if request.model_key: + fs_model = {request.model_key.descriptor} + fs_inputs = {key.descriptor for key in request.input_keys} + fs_outputs = {key.descriptor for key in request.output_keys} + + # identify which feature stores are requested and unknown + fs_desired = fs_model.union(fs_inputs).union(fs_outputs) + fs_actual = {item.descriptor for item in self._feature_stores.values()} + fs_missing = fs_desired - fs_actual + + if self._featurestore_factory is None: + logger.error("No feature store factory configured") + return False + + # create the feature stores we need to service request + if fs_missing: + logger.debug(f"Adding feature store(s): {fs_missing}") + for descriptor in fs_missing: + feature_store = self._featurestore_factory(descriptor) + self._feature_stores[descriptor] = feature_store + + return True + + # pylint: disable-next=no-self-use + def _check_model(self, request: InferenceRequest) -> bool: + """Ensure that a model is available for the request + + :param request: The request to validate + :returns: False if model 
validation fails for the request, True otherwise + """ + if request.model_key or request.raw_model: + return True + + logger.error("Unable to continue without model bytes or feature store key") + return False + + # pylint: disable-next=no-self-use + def _check_inputs(self, request: InferenceRequest) -> bool: + """Ensure that inputs are available for the request + + :param request: The request to validate + :returns: False if input validation fails for the request, True otherwise + """ + if request.input_keys or request.raw_inputs: + return True + + logger.error("Unable to continue without input bytes or feature store keys") + return False + + # pylint: disable-next=no-self-use + def _check_callback(self, request: InferenceRequest) -> bool: + """Ensure that a callback channel is available for the request + + :param request: The request to validate + :returns: False if callback validation fails for the request, True otherwise + """ + if request.callback is not None: + return True + + logger.error("No callback channel provided in request") + return False + + def _validate_request(self, request: InferenceRequest) -> bool: + """Ensure the request can be processed + + :param request: The request to validate + :return: False if the request fails any validation checks, True otherwise""" + checks = [ + self._check_feature_stores(request), + self._check_model(request), + self._check_inputs(request), + self._check_callback(request), + ] + + return all(checks) + + def _on_iteration(self) -> None: + """This method is executed repeatedly until ``Service`` shutdown + conditions are satisfied and cooldown is elapsed. + """ + try: + self._perf_timer.set_active(True) + bytes_list: t.List[bytes] = self._incoming_channel.recv() + except Exception: + self._perf_timer.set_active(False) + else: + if not bytes_list: + exception_handler( + ValueError("No request data found"), + None, + "No request data found.", + ) + + request_bytes = bytes_list[0] + tensor_bytes_list = bytes_list[1:] + self._perf_timer.start_timings() + + request = self._worker.deserialize_message( + request_bytes, self._callback_factory + ) + if request.input_meta and tensor_bytes_list: + request.raw_inputs = tensor_bytes_list + + self._perf_timer.measure_time("deserialize_message") + + if not self._validate_request(request): + exception_handler( + ValueError("Error validating the request"), + request.callback, + "Error validating the request.", + ) + self._perf_timer.measure_time("validate_request") + else: + self._perf_timer.measure_time("validate_request") + self.dispatch(request) + self._perf_timer.measure_time("dispatch") + finally: + self.flush_requests() + self.remove_queues() + + self._perf_timer.end_timings() + + if self._perf_timer.max_length == 801 and self._perf_timer.is_active: + self._perf_timer.print_timings(True) + + def remove_queues(self) -> None: + """Remove references to queues that can be removed + and allow them to be garbage collected""" + queue_lists_to_remove = [] + for key, queues in self._queues.items(): + queues_to_remove = [] + for queue in queues: + if queue.can_be_removed: + queues_to_remove.append(queue) + + for queue_to_remove in queues_to_remove: + queues.remove(queue_to_remove) + if ( + key in self._active_queues + and self._active_queues[key] == queue_to_remove + ): + del self._active_queues[key] + + if len(queues) == 0: + queue_lists_to_remove.append(key) + + for key in queue_lists_to_remove: + del self._queues[key] + + @property + def task_queue(self) -> DragonQueue: + """The queue on which batched requests 
are placed""" + return self._outgoing_queue + + def _swap_queue(self, model_id: ModelIdentifier) -> None: + """Get an empty queue or create a new one + + and make it the active one for a given model. + :param model_id: The id of the model for which the + queue has to be swapped + """ + if model_id.key in self._queues: + for queue in self._queues[model_id.key]: + if not queue.full(): + self._active_queues[model_id.key] = queue + return + + new_queue = BatchQueue(self._batch_timeout, self._batch_size, model_id) + if model_id.key in self._queues: + self._queues[model_id.key].append(new_queue) + else: + self._queues[model_id.key] = [new_queue] + self._active_queues[model_id.key] = new_queue + return + + def dispatch(self, request: InferenceRequest) -> None: + """Assign a request to a batch queue + :param request: the request to place + """ + if request.raw_model is not None: + logger.debug("Direct inference requested, creating tmp queue") + tmp_id = f"_tmp_{str(uuid.uuid4())}" + tmp_queue: BatchQueue = BatchQueue( + batch_timeout=0, + batch_size=1, + model_id=ModelIdentifier(key=tmp_id, descriptor="TMP"), + ) + self._active_queues[tmp_id] = tmp_queue + self._queues[tmp_id] = [tmp_queue] + tmp_queue.put_nowait(request) + tmp_queue.make_disposable() + return + + if request.model_key: + success = False + while not success: + try: + self._active_queues[request.model_key.key].put_nowait(request) + success = True + except (Full, KeyError): + self._swap_queue(request.model_key) + + def flush_requests(self) -> None: + """Get all requests from queues which are ready to be flushed. Place all + avaliable request batches in the outgoing queue. + """ + for queue_list in self._queues.values(): + for queue in queue_list: + if queue.ready: + self._perf_timer.measure_time("find_queue") + try: + batch = RequestBatch( + requests=queue.flush(), + inputs=None, + model_id=queue.model_id, + ) + finally: + self._perf_timer.measure_time("flush_requests") + try: + fetch_results = self._worker.fetch_inputs( + batch=batch, feature_stores=self._feature_stores + ) + except Exception as exc: + exception_handler( + exc, + None, + "Error fetching input.", + ) + continue + self._perf_timer.measure_time("fetch_input") + try: + transformed_inputs = self._worker.transform_input( + batch=batch, + fetch_results=fetch_results, + mem_pool=self._mem_pool, + ) + except Exception as exc: + exception_handler( + exc, + None, + "Error Transforming input.", + ) + continue + + self._perf_timer.measure_time("transform_input") + batch.inputs = transformed_inputs + for request in batch.requests: + request.raw_inputs = [] + request.input_meta = [] + + try: + self._outgoing_queue.put(batch) + except Exception as exc: + exception_handler( + exc, + None, + "Error placing batch on task queue.", + ) + continue + self._perf_timer.measure_time("put") + + def _can_shutdown(self) -> bool: + """Whether the Service can be shut down""" + return False + + def __del__(self) -> None: + self._mem_pool.destroy() diff --git a/smartsim/_core/mli/infrastructure/control/workermanager.py b/smartsim/_core/mli/infrastructure/control/workermanager.py new file mode 100644 index 000000000..54a245b81 --- /dev/null +++ b/smartsim/_core/mli/infrastructure/control/workermanager.py @@ -0,0 +1,321 @@ +# BSD 2-Clause License +# +# Copyright (c) 2021-2024, Hewlett Packard Enterprise +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. 
Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +# pylint: disable=import-error +# pylint: disable-next=unused-import +import dragon + +# pylint: enable=import-error + +# isort: off +# isort: on + +import multiprocessing as mp +import time +import typing as t +from queue import Empty + +from smartsim._core.mli.infrastructure.storage.featurestore import FeatureStore + +from .....log import get_logger +from ....entrypoints.service import Service +from ....utils.timings import PerfTimer +from ...infrastructure.environmentloader import EnvironmentConfigLoader +from ...infrastructure.worker.worker import ( + InferenceReply, + LoadModelResult, + MachineLearningWorkerBase, + RequestBatch, +) +from ...message_handler import MessageHandler +from .devicemanager import DeviceManager, WorkerDevice +from .error_handling import build_failure_reply, exception_handler + +if t.TYPE_CHECKING: + from smartsim._core.mli.mli_schemas.response.response_capnp import Status + +logger = get_logger(__name__) + + +class WorkerManager(Service): + """An implementation of a service managing distribution of tasks to + machine learning workers""" + + def __init__( + self, + config_loader: EnvironmentConfigLoader, + worker_type: t.Type[MachineLearningWorkerBase], + dispatcher_queue: "mp.Queue[RequestBatch]", + as_service: bool = False, + cooldown: int = 0, + device: t.Literal["cpu", "gpu"] = "cpu", + ) -> None: + """Initialize the WorkerManager + + :param config_loader: Environment config loader for loading queues + and feature stores + :param worker_type: The type of worker to manage + :param dispatcher_queue: Queue from which the batched requests are pulled + :param as_service: Specifies run-once or run-until-complete behavior of service + :param cooldown: Number of seconds to wait before shutting down after + shutdown criteria are met + :param device: The device on which the Worker should run. Every worker manager + is assigned one single GPU (if available), thus the device should have no index. 
+ """ + super().__init__(as_service, cooldown) + + self._dispatcher_queue = dispatcher_queue + """The Dispatcher queue that the WorkerManager monitors for new batches""" + self._worker = worker_type() + """The ML Worker implementation""" + self._callback_factory = config_loader._callback_factory + """The type of communication channel to construct for callbacks""" + self._device = device + """Device on which workers need to run""" + self._cached_models: dict[str, t.Any] = {} + """Dictionary of previously loaded models""" + self._feature_stores: t.Dict[str, FeatureStore] = {} + """A collection of attached feature stores""" + self._featurestore_factory = config_loader._featurestore_factory + """A factory method to create a desired feature store client type""" + self._backbone: t.Optional[FeatureStore] = config_loader.get_backbone() + """A standalone, system-created feature store used to share internal + information among MLI components""" + self._device_manager: t.Optional[DeviceManager] = None + """Object responsible for model caching and device access""" + self._perf_timer = PerfTimer(prefix="w_", debug=False, timing_on=True) + """Performance timer""" + + def _on_start(self) -> None: + """Called on initial entry into Service `execute` event loop before + `_on_iteration` is invoked.""" + self._device_manager = DeviceManager(WorkerDevice(self._device)) + + def _check_feature_stores(self, batch: RequestBatch) -> bool: + """Ensures that all feature stores required by the request are available + + :param batch: The batch of requests to validate + :returns: False if feature store validation fails for the batch, True otherwise + """ + # collect all feature stores required by the request + fs_model: t.Set[str] = set() + if batch.model_id.key: + fs_model = {batch.model_id.descriptor} + fs_inputs = {key.descriptor for key in batch.input_keys} + fs_outputs = {key.descriptor for key in batch.output_keys} + + # identify which feature stores are requested and unknown + fs_desired = fs_model.union(fs_inputs).union(fs_outputs) + fs_actual = {item.descriptor for item in self._feature_stores.values()} + fs_missing = fs_desired - fs_actual + + if self._featurestore_factory is None: + logger.error("No feature store factory configured") + return False + + # create the feature stores we need to service request + if fs_missing: + logger.debug(f"Adding feature store(s): {fs_missing}") + for descriptor in fs_missing: + feature_store = self._featurestore_factory(descriptor) + self._feature_stores[descriptor] = feature_store + + return True + + def _validate_batch(self, batch: RequestBatch) -> bool: + """Ensure the request can be processed + + :param batch: The batch of requests to validate + :return: False if the request fails any validation checks, True otherwise""" + + if batch is None or len(batch.requests) == 0: + return False + + return self._check_feature_stores(batch) + + # remove this when we are done with time measurements + # pylint: disable-next=too-many-statements + def _on_iteration(self) -> None: + """Executes calls to the machine learning worker implementation to complete + + the inference pipeline""" + + pre_batch_time = time.perf_counter() + try: + batch: RequestBatch = self._dispatcher_queue.get(timeout=0.0001) + except Empty: + return + + self._perf_timer.start_timings( + "flush_requests", time.perf_counter() - pre_batch_time + ) + + if not self._validate_batch(batch): + exception_handler( + ValueError("An invalid batch was received"), + None, + "Error batching inputs, the batch was invalid.", + ) 
+ return + + if self._device_manager is None: + for request in batch.requests: + msg = "No Device Manager found. WorkerManager._on_start() " + "must be called after initialization. If possible, " + "you should use `WorkerManager.execute()` instead of " + "directly calling `_on_iteration()`." + try: + self._dispatcher_queue.put(batch) + except Exception: + msg += "\nThe batch could not be put back in the queue " + "and will not be processed." + exception_handler( + RuntimeError(msg), + request.callback, + "Error acquiring device manager", + ) + return + + try: + device_cm = self._device_manager.get_device( + worker=self._worker, + batch=batch, + feature_stores=self._feature_stores, + ) + except Exception as exc: + for request in batch.requests: + exception_handler( + exc, + request.callback, + "Error loading model on device or getting device.", + ) + return + self._perf_timer.measure_time("fetch_model") + + with device_cm as device: + + try: + model_result = LoadModelResult(device.get_model(batch.model_id.key)) + except Exception as exc: + for request in batch.requests: + exception_handler( + exc, request.callback, "Error getting model from device." + ) + return + self._perf_timer.measure_time("load_model") + + if batch.inputs is None: + for request in batch.requests: + exception_handler( + ValueError("Error batching inputs"), + request.callback, + "Error batching inputs.", + ) + return + transformed_input = batch.inputs + + try: + execute_result = self._worker.execute( + batch, model_result, transformed_input, device.name + ) + except Exception as e: + for request in batch.requests: + exception_handler(e, request.callback, "Failed while executing.") + return + self._perf_timer.measure_time("execute") + + try: + transformed_outputs = self._worker.transform_output( + batch, execute_result + ) + except Exception as e: + for request in batch.requests: + exception_handler( + e, request.callback, "Failed while transforming the output." + ) + return + + for request, transformed_output in zip(batch.requests, transformed_outputs): + reply = InferenceReply() + if request.output_keys: + try: + reply.output_keys = self._worker.place_output( + request, + transformed_output, + self._feature_stores, + ) + except Exception as e: + exception_handler( + e, request.callback, "Failed while placing the output." + ) + continue + else: + reply.outputs = transformed_output.outputs + self._perf_timer.measure_time("assign_output") + + if reply.outputs is None or not reply.outputs: + response = build_failure_reply("fail", "Outputs not found.") + else: + reply.status_enum = "complete" + reply.message = "Success" + + results = self._worker.prepare_outputs(reply) + response = MessageHandler.build_response( + status=reply.status_enum, + message=reply.message, + result=results, + custom_attributes=None, + ) + + self._perf_timer.measure_time("build_reply") + + serialized_resp = MessageHandler.serialize_response(response) + + self._perf_timer.measure_time("serialize_resp") + + if request.callback: + request.callback.send(serialized_resp) + if reply.outputs: + # send tensor data after response + for output in reply.outputs: + request.callback.send(output) + self._perf_timer.measure_time("send") + + self._perf_timer.end_timings() + + if self._perf_timer.max_length == 801: + self._perf_timer.print_timings(True) + + def _can_shutdown(self) -> bool: + """Return true when the criteria to shut down the service are met.""" + # todo: determine shutdown criteria + # will we receive a completion message? 
+ # will we let MLI mgr just kill this? + # time_diff = self._last_event - datetime.datetime.now() + # if time_diff.total_seconds() > self._cooldown: + # return True + # return False + return self._worker is None diff --git a/smartsim/_core/mli/infrastructure/environmentloader.py b/smartsim/_core/mli/infrastructure/environmentloader.py new file mode 100644 index 000000000..99202ef2e --- /dev/null +++ b/smartsim/_core/mli/infrastructure/environmentloader.py @@ -0,0 +1,105 @@ +# BSD 2-Clause License +# +# Copyright (c) 2021-2024, Hewlett Packard Enterprise +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import os +import typing as t + +from smartsim._core.mli.comm.channel.channel import CommChannelBase +from smartsim._core.mli.infrastructure.storage.featurestore import FeatureStore +from smartsim.log import get_logger + +logger = get_logger(__name__) + + +class EnvironmentConfigLoader: + """ + Facilitates the loading of a FeatureStore and Queue into the WorkerManager. + """ + + def __init__( + self, + featurestore_factory: t.Callable[[str], FeatureStore], + callback_factory: t.Callable[[bytes], CommChannelBase], + queue_factory: t.Callable[[str], CommChannelBase], + ) -> None: + """Initialize the config loader instance with the factories necessary for + creating additional objects. 
+ + :param featurestore_factory: A factory method that produces a feature store + given a descriptor + :param callback_factory: A factory method that produces a callback + channel given a descriptor + :param queue_factory: A factory method that produces a queue + channel given a descriptor""" + self.queue: t.Optional[CommChannelBase] = None + """The attached incoming event queue channel""" + self.backbone: t.Optional[FeatureStore] = None + """The attached backbone feature store""" + self._featurestore_factory = featurestore_factory + """A factory method to instantiate a FeatureStore""" + self._callback_factory = callback_factory + """A factory method to instantiate a concrete CommChannelBase + for inference callbacks""" + self._queue_factory = queue_factory + """A factory method to instantiate a concrete CommChannelBase + for inference requests""" + + def get_backbone(self) -> t.Optional[FeatureStore]: + """Attach to the backbone feature store using the descriptor found in + an environment variable. The backbone is a standalone, system-created + feature store used to share internal information among MLI components + + :returns: The attached feature store via _SMARTSIM_INFRA_BACKBONE""" + descriptor = os.getenv("_SMARTSIM_INFRA_BACKBONE", "") + + if not descriptor: + logger.warning("No backbone descriptor is configured") + return None + + if self._featurestore_factory is None: + logger.warning("No feature store factory is configured") + return None + + self.backbone = self._featurestore_factory(descriptor) + return self.backbone + + def get_queue(self) -> t.Optional[CommChannelBase]: + """Attach to a queue-like communication channel using the descriptor + found in an environment variable. + + :returns: The attached queue specified via `_SMARTSIM_REQUEST_QUEUE`""" + descriptor = os.getenv("_SMARTSIM_REQUEST_QUEUE", "") + + if not descriptor: + logger.warning("No queue descriptor is configured") + return None + + if self._queue_factory is None: + logger.warning("No queue factory is configured") + return None + + self.queue = self._queue_factory(descriptor) + return self.queue diff --git a/smartsim/_core/mli/infrastructure/storage/__init__.py b/smartsim/_core/mli/infrastructure/storage/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/smartsim/_core/mli/infrastructure/storage/dragonfeaturestore.py b/smartsim/_core/mli/infrastructure/storage/dragonfeaturestore.py new file mode 100644 index 000000000..e89abcd2a --- /dev/null +++ b/smartsim/_core/mli/infrastructure/storage/dragonfeaturestore.py @@ -0,0 +1,108 @@ +# BSD 2-Clause License +# +# Copyright (c) 2021-2024, Hewlett Packard Enterprise +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import typing as t + +# pylint: disable=import-error +# isort: off +import dragon.data.ddict.ddict as dragon_ddict + +# isort: on + +from smartsim._core.mli.infrastructure.storage.featurestore import FeatureStore +from smartsim.error import SmartSimError +from smartsim.log import get_logger + +logger = get_logger(__name__) + + +class DragonFeatureStore(FeatureStore): + """A feature store backed by a dragon distributed dictionary""" + + def __init__(self, storage: "dragon_ddict.DDict") -> None: + """Initialize the DragonFeatureStore instance + + :param storage: A distributed dictionary to be used as the underlying + storage mechanism of the feature store""" + self._storage = storage + + def __getitem__(self, key: str) -> t.Union[str, bytes]: + """Retrieve an item using key + + :param key: Unique key of an item to retrieve from the feature store + :returns: The value identified by the supplied key + :raises KeyError: if the key is not found in the feature store + :raises SmartSimError: if retrieval from the feature store fails""" + try: + value: t.Union[str, bytes] = self._storage[key] + return value + except KeyError: + logger.warning(f"An unknown key was requested: {key}") + raise + except Exception as ex: + # note: explicitly avoid round-trip to check for key existence + raise SmartSimError( + f"Could not get value for existing key {key}, error:\n{ex}" + ) from ex + + def __setitem__(self, key: str, value: t.Union[str, bytes]) -> None: + """Assign a value using key + + :param key: Unique key of an item to set in the feature store + :param value: Value to persist in the feature store""" + self._storage[key] = value + + def __contains__(self, key: str) -> bool: + """Membership operator to test for a key existing within the feature store. 
+ + :param key: Unique key of an item to retrieve from the feature store + :returns: `True` if the key is found, `False` otherwise""" + return key in self._storage + + @property + def descriptor(self) -> str: + """A unique identifier enabling a client to connect to the feature store + + :returns: A descriptor encoded as a string""" + return str(self._storage.serialize()) + + @classmethod + def from_descriptor( + cls, + descriptor: str, + ) -> "DragonFeatureStore": + """A factory method that creates an instance from a descriptor string + + :param descriptor: The descriptor that uniquely identifies the resource + :returns: An attached DragonFeatureStore + :raises SmartSimError: if attachment to DragonFeatureStore fails""" + try: + return DragonFeatureStore(dragon_ddict.DDict.attach(descriptor)) + except Exception as ex: + logger.error(f"Error creating dragon feature store: {descriptor}") + raise SmartSimError( + f"Error creating dragon feature store: {descriptor}" + ) from ex diff --git a/smartsim/_core/mli/infrastructure/storage/featurestore.py b/smartsim/_core/mli/infrastructure/storage/featurestore.py new file mode 100644 index 000000000..31e3866e7 --- /dev/null +++ b/smartsim/_core/mli/infrastructure/storage/featurestore.py @@ -0,0 +1,85 @@ +# BSD 2-Clause License +# +# Copyright (c) 2021-2024, Hewlett Packard Enterprise +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
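+
+# Illustrative usage sketch of the dict-like FeatureStore contract defined in
+# this module (assumes a concrete implementation such as DragonFeatureStore
+# from dragonfeaturestore.py; `descriptor` and `tensor_bytes` are placeholder
+# values, not additional API):
+#
+#     fs = DragonFeatureStore.from_descriptor(descriptor)
+#     fs["my_tensor"] = tensor_bytes       # __setitem__ persists str or bytes
+#     if "my_tensor" in fs:                # __contains__ checks membership
+#         value = fs["my_tensor"]          # __getitem__ returns the stored value
+#     peer_descriptor = fs.descriptor      # identifier other clients can attach to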
+ +import typing as t +from abc import ABC, abstractmethod +from dataclasses import dataclass + +from smartsim.log import get_logger + +logger = get_logger(__name__) + + +@dataclass(frozen=True) +class FeatureStoreKey: + """A key,descriptor pair enabling retrieval of an item from a feature store""" + + key: str + """The unique key of an item in a feature store""" + descriptor: str + """The unique identifier of the feature store containing the key""" + + def __post_init__(self) -> None: + """Ensure the key and descriptor have at least one character + + :raises ValueError: if key or descriptor are empty strings + """ + if len(self.key) < 1: + raise ValueError("Key must have at least one character.") + if len(self.descriptor) < 1: + raise ValueError("Descriptor must have at least one character.") + + +class FeatureStore(ABC): + """Abstract base class providing the common interface for retrieving + values from a feature store implementation""" + + @abstractmethod + def __getitem__(self, key: str) -> t.Union[str, bytes]: + """Retrieve an item using key + + :param key: Unique key of an item to retrieve from the feature store""" + + @abstractmethod + def __setitem__(self, key: str, value: t.Union[str, bytes]) -> None: + """Assign a value using key + + :param key: Unique key of an item to set in the feature store + :param value: Value to persist in the feature store""" + + @abstractmethod + def __contains__(self, key: str) -> bool: + """Membership operator to test for a key existing within the feature store. + + :param key: Unique key of an item to retrieve from the feature store + :returns: `True` if the key is found, `False` otherwise""" + + @property + @abstractmethod + def descriptor(self) -> str: + """Unique identifier enabling a client to connect to the feature store + + :returns: A descriptor encoded as a string""" diff --git a/smartsim/_core/mli/infrastructure/worker/__init__.py b/smartsim/_core/mli/infrastructure/worker/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/smartsim/_core/mli/infrastructure/worker/torch_worker.py b/smartsim/_core/mli/infrastructure/worker/torch_worker.py new file mode 100644 index 000000000..0639d5969 --- /dev/null +++ b/smartsim/_core/mli/infrastructure/worker/torch_worker.py @@ -0,0 +1,208 @@ +# BSD 2-Clause License +# +# Copyright (c) 2021-2024, Hewlett Packard Enterprise +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import io + +import numpy as np +import torch + +# pylint: disable=import-error +from dragon.managed_memory import MemoryAlloc, MemoryPool + +from .....error import SmartSimError +from .....log import get_logger +from ...mli_schemas.tensor import tensor_capnp +from .worker import ( + ExecuteResult, + FetchInputResult, + FetchModelResult, + LoadModelResult, + MachineLearningWorkerBase, + RequestBatch, + TransformInputResult, + TransformOutputResult, +) + +# pylint: enable=import-error + + +torch.set_num_threads(1) +torch.set_num_interop_threads(4) +logger = get_logger(__name__) + + +class TorchWorker(MachineLearningWorkerBase): + """A worker that executes a PyTorch model.""" + + @staticmethod + def load_model( + batch: RequestBatch, fetch_result: FetchModelResult, device: str + ) -> LoadModelResult: + if fetch_result.model_bytes: + model_bytes = fetch_result.model_bytes + elif batch.raw_model and batch.raw_model.data: + model_bytes = batch.raw_model.data + else: + raise ValueError("Unable to load model without reference object") + + device_to_torch = {"cpu": "cpu", "gpu": "cuda"} + for old, new in device_to_torch.items(): + device = device.replace(old, new) + + buffer = io.BytesIO(initial_bytes=model_bytes) + with torch.no_grad(): + model = torch.jit.load(buffer, map_location=device) # type: ignore + model.eval() + result = LoadModelResult(model) + return result + + @staticmethod + def transform_input( + batch: RequestBatch, + fetch_results: list[FetchInputResult], + mem_pool: MemoryPool, + ) -> TransformInputResult: + results: list[torch.Tensor] = [] + total_samples = 0 + slices: list[slice] = [] + + all_dims: list[list[int]] = [] + all_dtypes: list[str] = [] + if fetch_results[0].meta is None: + raise ValueError("Cannot reconstruct tensor without meta information") + # Traverse inputs to get total number of samples and compute slices + # Assumption: first dimension is samples, all tensors in the same input + # have same number of samples + # thus we only look at the first tensor for each input + for res_idx, fetch_result in enumerate(fetch_results): + if fetch_result.meta is None or any( + item_meta is None for item_meta in fetch_result.meta + ): + raise ValueError("Cannot reconstruct tensor without meta information") + first_tensor_desc: tensor_capnp.TensorDescriptor = fetch_result.meta[0] + num_samples = first_tensor_desc.dimensions[0] + slices.append(slice(total_samples, total_samples + num_samples)) + total_samples = total_samples + num_samples + + if res_idx == len(fetch_results) - 1: + # For each tensor in the last input, get remaining dimensions + # Assumptions: all inputs have the same number of tensors and + # last N-1 dimensions match across inputs for corresponding tensors + # thus: resulting array will be of size (num_samples, all_other_dims) + for item_meta in fetch_result.meta: + tensor_desc: tensor_capnp.TensorDescriptor = item_meta + tensor_dims = list(tensor_desc.dimensions) + all_dims.append([total_samples, *tensor_dims[1:]]) + 
all_dtypes.append(str(tensor_desc.dataType)) + + for result_tensor_idx, (dims, dtype) in enumerate(zip(all_dims, all_dtypes)): + itemsize = np.empty((1), dtype=dtype).itemsize + alloc_size = int(np.prod(dims) * itemsize) + mem_alloc = mem_pool.alloc(alloc_size) + mem_view = mem_alloc.get_memview() + mem_view[:alloc_size] = b"".join( + [ + fetch_result.inputs[result_tensor_idx] + for fetch_result in fetch_results + ] + ) + + results.append(mem_alloc.serialize()) + + return TransformInputResult(results, slices, all_dims, all_dtypes) + + # pylint: disable-next=unused-argument + @staticmethod + def execute( + batch: RequestBatch, + load_result: LoadModelResult, + transform_result: TransformInputResult, + device: str, + ) -> ExecuteResult: + if not load_result.model: + raise SmartSimError("Model must be loaded to execute") + device_to_torch = {"cpu": "cpu", "gpu": "cuda"} + for old, new in device_to_torch.items(): + device = device.replace(old, new) + + tensors = [] + mem_allocs = [] + for transformed, dims, dtype in zip( + transform_result.transformed, transform_result.dims, transform_result.dtypes + ): + mem_alloc = MemoryAlloc.attach(transformed) + mem_allocs.append(mem_alloc) + itemsize = np.empty((1), dtype=dtype).itemsize + tensors.append( + torch.from_numpy( + np.frombuffer( + mem_alloc.get_memview()[0 : np.prod(dims) * itemsize], + dtype=dtype, + ).reshape(dims) + ) + ) + + model: torch.nn.Module = load_result.model + with torch.no_grad(): + model.eval() + results = [ + model( + *[ + tensor.to(device, non_blocking=True).detach() + for tensor in tensors + ] + ) + ] + + transform_result.transformed = [] + + execute_result = ExecuteResult(results, transform_result.slices) + for mem_alloc in mem_allocs: + mem_alloc.free() + return execute_result + + @staticmethod + def transform_output( + batch: RequestBatch, + execute_result: ExecuteResult, + ) -> list[TransformOutputResult]: + transformed_list: list[TransformOutputResult] = [] + cpu_predictions = [ + prediction.cpu() for prediction in execute_result.predictions + ] + for result_slice in execute_result.slices: + transformed = [] + for cpu_item in cpu_predictions: + transformed.append(cpu_item[result_slice].numpy().tobytes()) + + # todo: need the shape from latest schemas added here. + transformed_list.append( + TransformOutputResult(transformed, None, "c", "float32") + ) # fixme + + execute_result.predictions = [] + + return transformed_list diff --git a/smartsim/_core/mli/infrastructure/worker/worker.py b/smartsim/_core/mli/infrastructure/worker/worker.py new file mode 100644 index 000000000..25e4dc49f --- /dev/null +++ b/smartsim/_core/mli/infrastructure/worker/worker.py @@ -0,0 +1,477 @@ +# BSD 2-Clause License +# +# Copyright (c) 2021-2024, Hewlett Packard Enterprise +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. 
+# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +# pylint: disable=import-error +from dragon.managed_memory import MemoryPool + +# isort: off +# isort: on + +import typing as t +from abc import ABC, abstractmethod +from dataclasses import dataclass + +from .....error import SmartSimError +from .....log import get_logger +from ...comm.channel.channel import CommChannelBase +from ...infrastructure.storage.featurestore import FeatureStore, FeatureStoreKey +from ...message_handler import MessageHandler +from ...mli_schemas.model.model_capnp import Model + +if t.TYPE_CHECKING: + from smartsim._core.mli.mli_schemas.response.response_capnp import Status + from smartsim._core.mli.mli_schemas.tensor.tensor_capnp import TensorDescriptor + +logger = get_logger(__name__) + +# Placeholder +ModelIdentifier = FeatureStoreKey + + +class InferenceRequest: + """Internal representation of an inference request from a client""" + + def __init__( + self, + model_key: t.Optional[FeatureStoreKey] = None, + callback: t.Optional[CommChannelBase] = None, + raw_inputs: t.Optional[t.List[bytes]] = None, + input_keys: t.Optional[t.List[FeatureStoreKey]] = None, + input_meta: t.Optional[t.List[t.Any]] = None, + output_keys: t.Optional[t.List[FeatureStoreKey]] = None, + raw_model: t.Optional[Model] = None, + batch_size: int = 0, + ): + """Initialize the object""" + self.model_key = model_key + """A tuple containing a (key, descriptor) pair""" + self.raw_model = raw_model + """Raw bytes of an ML model""" + self.callback = callback + """The channel used for notification of inference completion""" + self.raw_inputs = raw_inputs or [] + """Raw bytes of tensor inputs""" + self.input_keys = input_keys or [] + """A list of tuples containing a (key, descriptor) pair""" + self.input_meta = input_meta or [] + """Metadata about the input data""" + self.output_keys = output_keys or [] + """A list of tuples containing a (key, descriptor) pair""" + self.batch_size = batch_size + """The batch size to apply when batching""" + + +class InferenceReply: + """Internal representation of the reply to a client request for inference""" + + def __init__( + self, + outputs: t.Optional[t.Collection[t.Any]] = None, + output_keys: t.Optional[t.Collection[FeatureStoreKey]] = None, + status_enum: "Status" = "running", + message: str = "In progress", + ) -> None: + """Initialize the object""" + self.outputs: t.Collection[t.Any] = outputs or [] + self.output_keys: t.Collection[t.Optional[FeatureStoreKey]] = output_keys or [] + self.status_enum = status_enum + self.message = message + + +class LoadModelResult: + """A wrapper around a loaded model""" + + def __init__(self, model: t.Any) -> None: + """Initialize the object""" + self.model = model + + +class TransformInputResult: + """A wrapper around a transformed batch of 
input tensors""" + + def __init__( + self, + result: t.Any, + slices: list[slice], + dims: list[list[int]], + dtypes: list[str], + ) -> None: + """Initialize the object""" + self.transformed = result + """List of Dragon MemoryAlloc objects on which the tensors are stored""" + self.slices = slices + """Each slice represents which portion of the input tensors belongs to + which request""" + self.dims = dims + """Dimension of the transformed tensors""" + self.dtypes = dtypes + """Data type of transformed tensors""" + + +class ExecuteResult: + """A wrapper around inference results""" + + def __init__(self, result: t.Any, slices: list[slice]) -> None: + """Initialize the object""" + self.predictions = result + self.slices = slices + + +class FetchInputResult: + """A wrapper around fetched inputs""" + + def __init__(self, result: t.List[bytes], meta: t.Optional[t.List[t.Any]]) -> None: + """Initialize the object""" + self.inputs = result + self.meta = meta + + +class TransformOutputResult: + """A wrapper around inference results transformed for transmission""" + + def __init__( + self, result: t.Any, shape: t.Optional[t.List[int]], order: str, dtype: str + ) -> None: + """Initialize the OutputTransformResult""" + self.outputs = result + self.shape = shape + self.order = order + self.dtype = dtype + + +class CreateInputBatchResult: + """A wrapper around inputs batched into a single request""" + + def __init__(self, result: t.Any) -> None: + """Initialize the object""" + self.batch = result + + +class FetchModelResult: + """A wrapper around raw fetched models""" + + def __init__(self, result: bytes) -> None: + """Initialize the object""" + self.model_bytes: bytes = result + + +@dataclass +class RequestBatch: + """A batch of aggregated inference requests""" + + requests: list[InferenceRequest] + inputs: t.Optional[TransformInputResult] + model_id: ModelIdentifier + + @property + def has_valid_requests(self) -> bool: + """Returns whether the batch contains at least one request. + + :return: True if at least one request is available + """ + return len(self.requests) > 0 + + @property + def has_raw_model(self) -> bool: + """Returns whether the batch has a raw model + + :return: True if the batch has a raw model + """ + return self.raw_model is not None + + @property + def raw_model(self) -> t.Optional[t.Any]: + """Returns the raw model to use to execute for this batch + if it is available. 
+ :return: A model if available, otherwise None""" + if self.has_valid_requests: + return self.requests[0].raw_model + return None + + @property + def input_keys(self) -> t.List[FeatureStoreKey]: + """All input keys available in this batch's requests + + :return: All input keys belonging to requests in this batch""" + keys = [] + for request in self.requests: + keys.extend(request.input_keys) + + return keys + + @property + def output_keys(self) -> t.List[FeatureStoreKey]: + """All output keys available in this batch's requests + + :return: All output keys belonging to requests in this batch""" + keys = [] + for request in self.requests: + keys.extend(request.output_keys) + + return keys + + +class MachineLearningWorkerCore: + """Basic functionality of ML worker that is shared across all worker types""" + + @staticmethod + def deserialize_message( + data_blob: bytes, + callback_factory: t.Callable[[bytes], CommChannelBase], + ) -> InferenceRequest: + """Deserialize a message from a byte stream into an InferenceRequest + :param data_blob: The byte stream to deserialize + :param callback_factory: A factory method that can create an instance + of the desired concrete comm channel type + :returns: The raw input message deserialized into an InferenceRequest + """ + request = MessageHandler.deserialize_request(data_blob) + model_key: t.Optional[FeatureStoreKey] = None + model_bytes: t.Optional[Model] = None + + if request.model.which() == "key": + model_key = FeatureStoreKey( + key=request.model.key.key, + descriptor=request.model.key.featureStoreDescriptor, + ) + elif request.model.which() == "data": + model_bytes = request.model.data + + callback_key = request.replyChannel.descriptor + comm_channel = callback_factory(callback_key) + input_keys: t.Optional[t.List[FeatureStoreKey]] = None + input_bytes: t.Optional[t.List[bytes]] = None + output_keys: t.Optional[t.List[FeatureStoreKey]] = None + input_meta: t.Optional[t.List[TensorDescriptor]] = None + + if request.input.which() == "keys": + input_keys = [ + FeatureStoreKey(key=value.key, descriptor=value.featureStoreDescriptor) + for value in request.input.keys + ] + elif request.input.which() == "descriptors": + input_meta = request.input.descriptors # type: ignore + + if request.output: + output_keys = [ + FeatureStoreKey(key=value.key, descriptor=value.featureStoreDescriptor) + for value in request.output + ] + + inference_request = InferenceRequest( + model_key=model_key, + callback=comm_channel, + raw_inputs=input_bytes, + input_meta=input_meta, + input_keys=input_keys, + output_keys=output_keys, + raw_model=model_bytes, + batch_size=0, + ) + return inference_request + + @staticmethod + def prepare_outputs(reply: InferenceReply) -> t.List[t.Any]: + prepared_outputs: t.List[t.Any] = [] + if reply.output_keys: + for value in reply.output_keys: + if not value: + continue + msg_key = MessageHandler.build_tensor_key(value.key, value.descriptor) + prepared_outputs.append(msg_key) + elif reply.outputs: + for _ in reply.outputs: + msg_tensor_desc = MessageHandler.build_tensor_descriptor( + "c", + "float32", + [1], + ) + prepared_outputs.append(msg_tensor_desc) + return prepared_outputs + + @staticmethod + def fetch_model( + batch: RequestBatch, feature_stores: t.Dict[str, FeatureStore] + ) -> FetchModelResult: + """Given a resource key, retrieve the raw model from a feature store + :param batch: The batch of requests that triggered the pipeline + :param feature_stores: Available feature stores used for persistence + :return: Raw bytes of the model 
+        :raises SmartSimError: if neither a key nor a model is provided, or the
+        model cannot be retrieved from the feature store
+        :raises ValueError: if a feature store is not available and a raw
+        model is not provided"""
+
+        # All requests in the same batch share the model
+        if batch.raw_model:
+            return FetchModelResult(batch.raw_model.data)
+
+        if not feature_stores:
+            raise ValueError("Feature store is required for model retrieval")
+
+        if batch.model_id is None:
+            raise SmartSimError(
+                "Key must be provided to retrieve model from feature store"
+            )
+
+        key, fsd = batch.model_id.key, batch.model_id.descriptor
+
+        try:
+            feature_store = feature_stores[fsd]
+            raw_bytes: bytes = t.cast(bytes, feature_store[key])
+            return FetchModelResult(raw_bytes)
+        except FileNotFoundError as ex:
+            logger.exception(ex)
+            raise SmartSimError(f"Model could not be retrieved with key {key}") from ex
+
+    @staticmethod
+    def fetch_inputs(
+        batch: RequestBatch, feature_stores: t.Dict[str, FeatureStore]
+    ) -> t.List[FetchInputResult]:
+        """Given a collection of ResourceKeys, identify the physical location
+        and input metadata
+        :param batch: The batch of requests that triggered the pipeline
+        :param feature_stores: Available feature stores used for persistence
+        :return: The fetched inputs
+        :raises ValueError: If neither an input key nor an input tensor is provided
+        :raises SmartSimError: If a tensor for a given key cannot be retrieved"""
+        fetch_results = []
+        for request in batch.requests:
+            if request.raw_inputs:
+                fetch_results.append(
+                    FetchInputResult(request.raw_inputs, request.input_meta)
+                )
+                continue
+
+            if not feature_stores:
+                raise ValueError("No input and no feature store provided")
+
+            if request.input_keys:
+                data: t.List[bytes] = []
+
+                for fs_key in request.input_keys:
+                    try:
+                        feature_store = feature_stores[fs_key.descriptor]
+                        tensor_bytes = t.cast(bytes, feature_store[fs_key.key])
+                        data.append(tensor_bytes)
+                    except KeyError as ex:
+                        logger.exception(ex)
+                        raise SmartSimError(
+                            f"Tensor could not be retrieved with key {fs_key.key}"
+                        ) from ex
+                fetch_results.append(
+                    FetchInputResult(data, meta=None)
+                )  # fixme: need to get both tensor and descriptor
+                continue
+
+            raise ValueError("No input source")
+
+        return fetch_results
+
+    @staticmethod
+    def place_output(
+        request: InferenceRequest,
+        transform_result: TransformOutputResult,
+        feature_stores: t.Dict[str, FeatureStore],
+    ) -> t.Collection[t.Optional[FeatureStoreKey]]:
+        """Given a collection of data, make it available as a shared resource in the
+        feature store
+        :param request: The request that triggered the pipeline
+        :param transform_result: The transformed outputs from inference
+        :param feature_stores: Available feature stores used for persistence
+        :return: A collection of keys that were placed in the feature store
+        :raises ValueError: If a feature store is not provided
+        """
+        if not feature_stores:
+            raise ValueError("Feature store is required for output persistence")
+
+        keys: t.List[t.Optional[FeatureStoreKey]] = []
+        # need to decide how to get back to original sub-batch inputs so they can be
+        # accurately placed, datum might need to include this.
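+        # Note: the loop below assumes request.output_keys and
+        # transform_result.outputs have the same length and matching order, so
+        # each transformed output is stored under the key at the same position.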
+ + # Consider parallelizing all PUT feature_store operations + for fs_key, v in zip(request.output_keys, transform_result.outputs): + feature_store = feature_stores[fs_key.descriptor] + feature_store[fs_key.key] = v + keys.append(fs_key) + + return keys + + +class MachineLearningWorkerBase(MachineLearningWorkerCore, ABC): + """Abstract base class providing contract for a machine learning + worker implementation.""" + + @staticmethod + @abstractmethod + def load_model( + batch: RequestBatch, fetch_result: FetchModelResult, device: str + ) -> LoadModelResult: + """Given a loaded MachineLearningModel, ensure it is loaded into + device memory + :param request: The request that triggered the pipeline + :param device: The device on which the model must be placed + :return: ModelLoadResult wrapping the model loaded for the request""" + + @staticmethod + @abstractmethod + def transform_input( + batch: RequestBatch, + fetch_results: list[FetchInputResult], + mem_pool: MemoryPool, + ) -> TransformInputResult: + """Given a collection of data, perform a transformation on the data and put + the raw tensor data on a MemoryPool allocation. + :param request: The request that triggered the pipeline + :param fetch_result: Raw outputs from fetching inputs out of a feature store + :param mem_pool: The memory pool used to access batched input tensors + :return: The transformed inputs wrapped in a InputTransformResult""" + + @staticmethod + @abstractmethod + def execute( + batch: RequestBatch, + load_result: LoadModelResult, + transform_result: TransformInputResult, + device: str, + ) -> ExecuteResult: + """Execute an ML model on inputs transformed for use by the model + :param batch: The batch of requests that triggered the pipeline + :param load_result: The result of loading the model onto device memory + :param transform_result: The result of transforming inputs for model consumption + :param device: The device on which the model will be executed + :return: The result of inference wrapped in an ExecuteResult""" + + @staticmethod + @abstractmethod + def transform_output( + batch: RequestBatch, execute_result: ExecuteResult + ) -> t.List[TransformOutputResult]: + """Given inference results, perform transformations required to + transmit results to the requestor. + :param batch: The batch of requests that triggered the pipeline + :param execute_result: The result of inference wrapped in an ExecuteResult + :return: A list of transformed outputs""" diff --git a/smartsim/_core/mli/message_handler.py b/smartsim/_core/mli/message_handler.py new file mode 100644 index 000000000..ee632e24e --- /dev/null +++ b/smartsim/_core/mli/message_handler.py @@ -0,0 +1,552 @@ +# BSD 2-Clause License +# +# Copyright (c) 2021-2024, Hewlett Packard Enterprise +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+import typing as t
+
+from .mli_schemas.data import data_references_capnp
+from .mli_schemas.model import model_capnp
+from .mli_schemas.request import request_capnp
+from .mli_schemas.request.request_attributes import request_attributes_capnp
+from .mli_schemas.response import response_capnp
+from .mli_schemas.response.response_attributes import response_attributes_capnp
+from .mli_schemas.tensor import tensor_capnp
+
+
+class MessageHandler:
+    @staticmethod
+    def build_tensor_descriptor(
+        order: "tensor_capnp.Order",
+        data_type: "tensor_capnp.NumericalType",
+        dimensions: t.List[int],
+    ) -> tensor_capnp.TensorDescriptor:
+        """
+        Builds a TensorDescriptor message using the provided
+        order, data type, and dimensions.
+
+        :param order: Order of the tensor, such as row-major (c) or column-major (f)
+        :param data_type: Data type of the tensor
+        :param dimensions: Dimensions of the tensor
+        :raises ValueError: if building fails
+        """
+        try:
+            description = tensor_capnp.TensorDescriptor.new_message()
+            description.order = order
+            description.dataType = data_type
+            description.dimensions = dimensions
+        except Exception as e:
+            raise ValueError(
+                "Error building tensor descriptor."
+            ) from e  # TODO: create custom exception
+
+        return description
+
+    @staticmethod
+    def build_output_tensor_descriptor(
+        order: "tensor_capnp.Order",
+        keys: t.List["data_references_capnp.TensorKey"],
+        data_type: "tensor_capnp.ReturnNumericalType",
+        dimensions: t.List[int],
+    ) -> tensor_capnp.OutputDescriptor:
+        """
+        Builds an OutputDescriptor message using the provided
+        order, data type, and dimensions.
+
+        :param order: Order of the tensor, such as row-major (c) or column-major (f)
+        :param keys: List of TensorKeys to apply the transform descriptor to
+        :param data_type: Transform data type of the tensor
+        :param dimensions: Transform dimensions of the tensor
+        :raises ValueError: if building fails
+        """
+        try:
+            description = tensor_capnp.OutputDescriptor.new_message()
+            description.order = order
+            description.optionalKeys = keys
+            description.optionalDatatype = data_type
+            description.optionalDimension = dimensions
+
+        except Exception as e:
+            raise ValueError("Error building output tensor descriptor.") from e
+
+        return description
+
+    @staticmethod
+    def build_tensor_key(
+        key: str, feature_store_descriptor: str
+    ) -> data_references_capnp.TensorKey:
+        """
+        Builds a new TensorKey message with the provided key.
+ + :param key: String to set the TensorKey + :param feature_store_descriptor: A descriptor identifying the feature store + containing the key + :raises ValueError: if building fails + """ + try: + tensor_key = data_references_capnp.TensorKey.new_message() + tensor_key.key = key + tensor_key.featureStoreDescriptor = feature_store_descriptor + except Exception as e: + raise ValueError("Error building tensor key.") from e + return tensor_key + + @staticmethod + def build_model(data: bytes, name: str, version: str) -> model_capnp.Model: + """ + Builds a new Model message with the provided data, name, and version. + + :param data: Model data + :param name: Model name + :param version: Model version + :raises ValueError: if building fails + """ + try: + model = model_capnp.Model.new_message() + model.data = data + model.name = name + model.version = version + except Exception as e: + raise ValueError("Error building model.") from e + return model + + @staticmethod + def build_model_key( + key: str, feature_store_descriptor: str + ) -> data_references_capnp.ModelKey: + """ + Builds a new ModelKey message with the provided key. + + :param key: String to set the ModelKey + :param feature_store_descriptor: A descriptor identifying the feature store + containing the key + :raises ValueError: if building fails + """ + try: + model_key = data_references_capnp.ModelKey.new_message() + model_key.key = key + model_key.featureStoreDescriptor = feature_store_descriptor + except Exception as e: + raise ValueError("Error building model key.") from e + return model_key + + @staticmethod + def build_torch_request_attributes( + tensor_type: "request_attributes_capnp.TorchTensorType", + ) -> request_attributes_capnp.TorchRequestAttributes: + """ + Builds a new TorchRequestAttributes message with the provided tensor type. + + :param tensor_type: Type of the tensor passed in + :raises ValueError: if building fails + """ + try: + attributes = request_attributes_capnp.TorchRequestAttributes.new_message() + attributes.tensorType = tensor_type + except Exception as e: + raise ValueError("Error building Torch request attributes.") from e + return attributes + + @staticmethod + def build_tf_request_attributes( + name: str, tensor_type: "request_attributes_capnp.TFTensorType" + ) -> request_attributes_capnp.TensorFlowRequestAttributes: + """ + Builds a new TensorFlowRequestAttributes message with + the provided name and tensor type. + + :param name: Name of the tensor + :param tensor_type: Type of the tensor passed in + :raises ValueError: if building fails + """ + try: + attributes = ( + request_attributes_capnp.TensorFlowRequestAttributes.new_message() + ) + attributes.name = name + attributes.tensorType = tensor_type + except Exception as e: + raise ValueError("Error building TensorFlow request attributes.") from e + return attributes + + @staticmethod + def build_torch_response_attributes() -> ( + response_attributes_capnp.TorchResponseAttributes + ): + """ + Builds a new TorchResponseAttributes message. + """ + return response_attributes_capnp.TorchResponseAttributes.new_message() + + @staticmethod + def build_tf_response_attributes() -> ( + response_attributes_capnp.TensorFlowResponseAttributes + ): + """ + Builds a new TensorFlowResponseAttributes message. 
+ """ + return response_attributes_capnp.TensorFlowResponseAttributes.new_message() + + @staticmethod + def _assign_model( + request: request_capnp.Request, + model: t.Union[data_references_capnp.ModelKey, model_capnp.Model], + ) -> None: + """ + Assigns a model to the supplied request. + + :param request: Request being built + :param model: Model to be assigned + :raises ValueError: if building fails + """ + try: + class_name = model.schema.node.displayName.split(":")[-1] # type: ignore + if class_name == "Model": + request.model.data = model # type: ignore + elif class_name == "ModelKey": + request.model.key = model # type: ignore + else: + raise ValueError("""Invalid custom attribute class name. + Expected 'Model' or 'ModelKey'.""") + except Exception as e: + raise ValueError("Error building model portion of request.") from e + + @staticmethod + def _assign_reply_channel( + request: request_capnp.Request, reply_channel: bytes + ) -> None: + """ + Assigns a reply channel to the supplied request. + + :param request: Request being built + :param reply_channel: Reply channel to be assigned + :raises ValueError: if building fails + """ + try: + request.replyChannel.descriptor = reply_channel + except Exception as e: + raise ValueError("Error building reply channel portion of request.") from e + + @staticmethod + def _assign_inputs( + request: request_capnp.Request, + inputs: t.Union[ + t.List[data_references_capnp.TensorKey], + t.List[tensor_capnp.TensorDescriptor], + ], + ) -> None: + """ + Assigns inputs to the supplied request. + + :param request: Request being built + :param inputs: Inputs to be assigned + :raises ValueError: if building fails + """ + try: + if inputs: + display_name = inputs[0].schema.node.displayName # type: ignore + input_class_name = display_name.split(":")[-1] + if input_class_name == "TensorDescriptor": + request.input.descriptors = inputs # type: ignore + elif input_class_name == "TensorKey": + request.input.keys = inputs # type: ignore + else: + raise ValueError("""Invalid input class name. Expected + 'TensorDescriptor' or 'TensorKey'.""") + except Exception as e: + raise ValueError("Error building inputs portion of request.") from e + + @staticmethod + def _assign_outputs( + request: request_capnp.Request, + outputs: t.List[data_references_capnp.TensorKey], + ) -> None: + """ + Assigns outputs to the supplied request. + + :param request: Request being built + :param outputs: Outputs to be assigned + :raises ValueError: if building fails + """ + try: + request.output = outputs + + except Exception as e: + raise ValueError("Error building outputs portion of request.") from e + + @staticmethod + def _assign_output_descriptors( + request: request_capnp.Request, + output_descriptors: t.List[tensor_capnp.OutputDescriptor], + ) -> None: + """ + Assigns a list of output tensor descriptors to the supplied request. + + :param request: Request being built + :param output_descriptors: Output descriptors to be assigned + :raises ValueError: if building fails + """ + try: + request.outputDescriptors = output_descriptors + except Exception as e: + raise ValueError( + "Error building the output descriptors portion of request." + ) from e + + @staticmethod + def _assign_custom_request_attributes( + request: request_capnp.Request, + custom_attrs: t.Union[ + request_attributes_capnp.TorchRequestAttributes, + request_attributes_capnp.TensorFlowRequestAttributes, + None, + ], + ) -> None: + """ + Assigns request attributes to the supplied request. 
+ + :param request: Request being built + :param custom_attrs: Custom attributes to be assigned + :raises ValueError: if building fails + """ + try: + if custom_attrs is None: + request.customAttributes.none = custom_attrs + else: + custom_attribute_class_name = ( + custom_attrs.schema.node.displayName.split(":")[-1] # type: ignore + ) + if custom_attribute_class_name == "TorchRequestAttributes": + request.customAttributes.torch = custom_attrs # type: ignore + elif custom_attribute_class_name == "TensorFlowRequestAttributes": + request.customAttributes.tf = custom_attrs # type: ignore + else: + raise ValueError("""Invalid custom attribute class name. + Expected 'TensorFlowRequestAttributes' or + 'TorchRequestAttributes'.""") + except Exception as e: + raise ValueError( + "Error building custom attributes portion of request." + ) from e + + @staticmethod + def build_request( + reply_channel: bytes, + model: t.Union[data_references_capnp.ModelKey, model_capnp.Model], + inputs: t.Union[ + t.List[data_references_capnp.TensorKey], + t.List[tensor_capnp.TensorDescriptor], + ], + outputs: t.List[data_references_capnp.TensorKey], + output_descriptors: t.List[tensor_capnp.OutputDescriptor], + custom_attributes: t.Union[ + request_attributes_capnp.TorchRequestAttributes, + request_attributes_capnp.TensorFlowRequestAttributes, + None, + ], + ) -> request_capnp.RequestBuilder: + """ + Builds the request message. + + :param reply_channel: Reply channel to be assigned to request + :param model: Model to be assigned to request + :param inputs: Inputs to be assigned to request + :param outputs: Outputs to be assigned to request + :param output_descriptors: Output descriptors to be assigned to request + :param custom_attributes: Custom attributes to be assigned to request + """ + request = request_capnp.Request.new_message() + MessageHandler._assign_reply_channel(request, reply_channel) + MessageHandler._assign_model(request, model) + MessageHandler._assign_inputs(request, inputs) + MessageHandler._assign_outputs(request, outputs) + MessageHandler._assign_output_descriptors(request, output_descriptors) + MessageHandler._assign_custom_request_attributes(request, custom_attributes) + return request + + @staticmethod + def serialize_request(request: request_capnp.RequestBuilder) -> bytes: + """ + Serializes a built request message. + + :param request: Request to be serialized + """ + return request.to_bytes() + + @staticmethod + def deserialize_request(request_bytes: bytes) -> request_capnp.Request: + """ + Deserializes a serialized request message. + + :param request_bytes: Bytes to be deserialized into a Request + """ + bytes_message = request_capnp.Request.from_bytes( + request_bytes, traversal_limit_in_words=2**63 + ) + + with bytes_message as message: + return message + + @staticmethod + def _assign_status( + response: response_capnp.Response, status: "response_capnp.Status" + ) -> None: + """ + Assigns a status to the supplied response. + + :param response: Response being built + :param status: Status to be assigned + :raises ValueError: if building fails + """ + try: + response.status = status + except Exception as e: + raise ValueError("Error assigning status to response.") from e + + @staticmethod + def _assign_message(response: response_capnp.Response, message: str) -> None: + """ + Assigns a message to the supplied response. 
+ + :param response: Response being built + :param message: Message to be assigned + :raises ValueError: if building fails + """ + try: + response.message = message + except Exception as e: + raise ValueError("Error assigning message to response.") from e + + @staticmethod + def _assign_result( + response: response_capnp.Response, + result: t.Union[ + t.List[tensor_capnp.TensorDescriptor], + t.List[data_references_capnp.TensorKey], + None, + ], + ) -> None: + """ + Assigns a result to the supplied response. + + :param response: Response being built + :param result: Result to be assigned + :raises ValueError: if building fails + """ + try: + if result: + first_result = result[0] + display_name = first_result.schema.node.displayName # type: ignore + result_class_name = display_name.split(":")[-1] + if result_class_name == "TensorDescriptor": + response.result.descriptors = result # type: ignore + elif result_class_name == "TensorKey": + response.result.keys = result # type: ignore + else: + raise ValueError("""Invalid custom attribute class name. + Expected 'TensorDescriptor' or 'TensorKey'.""") + except Exception as e: + raise ValueError("Error assigning result to response.") from e + + @staticmethod + def _assign_custom_response_attributes( + response: response_capnp.Response, + custom_attrs: t.Union[ + response_attributes_capnp.TorchResponseAttributes, + response_attributes_capnp.TensorFlowResponseAttributes, + None, + ], + ) -> None: + """ + Assigns custom attributes to the supplied response. + + :param response: Response being built + :param custom_attrs: Custom attributes to be assigned + :raises ValueError: if building fails + """ + try: + if custom_attrs is None: + response.customAttributes.none = custom_attrs + else: + custom_attribute_class_name = ( + custom_attrs.schema.node.displayName.split(":")[-1] # type: ignore + ) + if custom_attribute_class_name == "TorchResponseAttributes": + response.customAttributes.torch = custom_attrs # type: ignore + elif custom_attribute_class_name == "TensorFlowResponseAttributes": + response.customAttributes.tf = custom_attrs # type: ignore + else: + raise ValueError("""Invalid custom attribute class name. + Expected 'TensorFlowResponseAttributes' or + 'TorchResponseAttributes'.""") + except Exception as e: + raise ValueError("Error assigning custom attributes to response.") from e + + @staticmethod + def build_response( + status: "response_capnp.Status", + message: str, + result: t.Union[ + t.List[tensor_capnp.TensorDescriptor], + t.List[data_references_capnp.TensorKey], + None, + ], + custom_attributes: t.Union[ + response_attributes_capnp.TorchResponseAttributes, + response_attributes_capnp.TensorFlowResponseAttributes, + None, + ], + ) -> response_capnp.ResponseBuilder: + """ + Builds the response message. + + :param status: Status to be assigned to response + :param message: Message to be assigned to response + :param result: Result to be assigned to response + :param custom_attributes: Custom attributes to be assigned to response + """ + response = response_capnp.Response.new_message() + MessageHandler._assign_status(response, status) + MessageHandler._assign_message(response, message) + MessageHandler._assign_result(response, result) + MessageHandler._assign_custom_response_attributes(response, custom_attributes) + return response + + @staticmethod + def serialize_response(response: response_capnp.ResponseBuilder) -> bytes: + """ + Serializes a built response message. 
+ """ + return response.to_bytes() + + @staticmethod + def deserialize_response(response_bytes: bytes) -> response_capnp.Response: + """ + Deserializes a serialized response message. + """ + bytes_message = response_capnp.Response.from_bytes( + response_bytes, traversal_limit_in_words=2**63 + ) + + with bytes_message as message: + return message diff --git a/smartsim/_core/mli/mli_schemas/data/data_references.capnp b/smartsim/_core/mli/mli_schemas/data/data_references.capnp new file mode 100644 index 000000000..699abe5d2 --- /dev/null +++ b/smartsim/_core/mli/mli_schemas/data/data_references.capnp @@ -0,0 +1,37 @@ +# BSD 2-Clause License + +# Copyright (c) 2021-2024, Hewlett Packard Enterprise +# All rights reserved. + +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: + +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. + +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. + +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +@0x8ca69fd1aacb6668; + +struct ModelKey { + key @0 :Text; + featureStoreDescriptor @1 :Text; +} + +struct TensorKey { + key @0 :Text; + featureStoreDescriptor @1 :Text; +} diff --git a/smartsim/_core/mli/mli_schemas/data/data_references_capnp.py b/smartsim/_core/mli/mli_schemas/data/data_references_capnp.py new file mode 100644 index 000000000..099d10c43 --- /dev/null +++ b/smartsim/_core/mli/mli_schemas/data/data_references_capnp.py @@ -0,0 +1,41 @@ +# BSD 2-Clause License + +# Copyright (c) 2021-2024, Hewlett Packard Enterprise +# All rights reserved. + +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: + +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. + +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. + +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +"""This is an automatically generated stub for `data_references.capnp`.""" + +import os + +import capnp # type: ignore + +capnp.remove_import_hook() +here = os.path.dirname(os.path.abspath(__file__)) +module_file = os.path.abspath(os.path.join(here, "data_references.capnp")) +ModelKey = capnp.load(module_file).ModelKey +ModelKeyBuilder = ModelKey +ModelKeyReader = ModelKey +TensorKey = capnp.load(module_file).TensorKey +TensorKeyBuilder = TensorKey +TensorKeyReader = TensorKey diff --git a/smartsim/_core/mli/mli_schemas/data/data_references_capnp.pyi b/smartsim/_core/mli/mli_schemas/data/data_references_capnp.pyi new file mode 100644 index 000000000..bcf53e0a0 --- /dev/null +++ b/smartsim/_core/mli/mli_schemas/data/data_references_capnp.pyi @@ -0,0 +1,107 @@ +# BSD 2-Clause License + +# Copyright (c) 2021-2024, Hewlett Packard Enterprise +# All rights reserved. + +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: + +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. + +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. + +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +"""This is an automatically generated stub for `data_references.capnp`.""" + +# mypy: ignore-errors + +from __future__ import annotations + +from contextlib import contextmanager +from io import BufferedWriter +from typing import Iterator + +class ModelKey: + key: str + featureStoreDescriptor: str + @staticmethod + @contextmanager + def from_bytes( + data: bytes, + traversal_limit_in_words: int | None = ..., + nesting_limit: int | None = ..., + ) -> Iterator[ModelKeyReader]: ... + @staticmethod + def from_bytes_packed( + data: bytes, + traversal_limit_in_words: int | None = ..., + nesting_limit: int | None = ..., + ) -> ModelKeyReader: ... + @staticmethod + def new_message() -> ModelKeyBuilder: ... + def to_dict(self) -> dict: ... + +class ModelKeyReader(ModelKey): + def as_builder(self) -> ModelKeyBuilder: ... 
+ +class ModelKeyBuilder(ModelKey): + @staticmethod + def from_dict(dictionary: dict) -> ModelKeyBuilder: ... + def copy(self) -> ModelKeyBuilder: ... + def to_bytes(self) -> bytes: ... + def to_bytes_packed(self) -> bytes: ... + def to_segments(self) -> list[bytes]: ... + def as_reader(self) -> ModelKeyReader: ... + @staticmethod + def write(file: BufferedWriter) -> None: ... + @staticmethod + def write_packed(file: BufferedWriter) -> None: ... + +class TensorKey: + key: str + featureStoreDescriptor: str + @staticmethod + @contextmanager + def from_bytes( + data: bytes, + traversal_limit_in_words: int | None = ..., + nesting_limit: int | None = ..., + ) -> Iterator[TensorKeyReader]: ... + @staticmethod + def from_bytes_packed( + data: bytes, + traversal_limit_in_words: int | None = ..., + nesting_limit: int | None = ..., + ) -> TensorKeyReader: ... + @staticmethod + def new_message() -> TensorKeyBuilder: ... + def to_dict(self) -> dict: ... + +class TensorKeyReader(TensorKey): + def as_builder(self) -> TensorKeyBuilder: ... + +class TensorKeyBuilder(TensorKey): + @staticmethod + def from_dict(dictionary: dict) -> TensorKeyBuilder: ... + def copy(self) -> TensorKeyBuilder: ... + def to_bytes(self) -> bytes: ... + def to_bytes_packed(self) -> bytes: ... + def to_segments(self) -> list[bytes]: ... + def as_reader(self) -> TensorKeyReader: ... + @staticmethod + def write(file: BufferedWriter) -> None: ... + @staticmethod + def write_packed(file: BufferedWriter) -> None: ... diff --git a/smartsim/_core/mli/mli_schemas/model/__init__.py b/smartsim/_core/mli/mli_schemas/model/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/smartsim/_core/mli/mli_schemas/model/model.capnp b/smartsim/_core/mli/mli_schemas/model/model.capnp new file mode 100644 index 000000000..fc9ed7366 --- /dev/null +++ b/smartsim/_core/mli/mli_schemas/model/model.capnp @@ -0,0 +1,33 @@ +# BSD 2-Clause License + +# Copyright (c) 2021-2024, Hewlett Packard Enterprise +# All rights reserved. + +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: + +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. + +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. + +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +@0xaefb9301e14ba4bd; + +struct Model { + data @0 :Data; + name @1 :Text; + version @2 :Text; +} diff --git a/smartsim/_core/mli/mli_schemas/model/model_capnp.py b/smartsim/_core/mli/mli_schemas/model/model_capnp.py new file mode 100644 index 000000000..be2c276c2 --- /dev/null +++ b/smartsim/_core/mli/mli_schemas/model/model_capnp.py @@ -0,0 +1,38 @@ +# BSD 2-Clause License + +# Copyright (c) 2021-2024, Hewlett Packard Enterprise +# All rights reserved. + +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: + +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. + +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. + +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +"""This is an automatically generated stub for `model.capnp`.""" + +import os + +import capnp # type: ignore + +capnp.remove_import_hook() +here = os.path.dirname(os.path.abspath(__file__)) +module_file = os.path.abspath(os.path.join(here, "model.capnp")) +Model = capnp.load(module_file).Model +ModelBuilder = Model +ModelReader = Model diff --git a/smartsim/_core/mli/mli_schemas/model/model_capnp.pyi b/smartsim/_core/mli/mli_schemas/model/model_capnp.pyi new file mode 100644 index 000000000..6ca53a357 --- /dev/null +++ b/smartsim/_core/mli/mli_schemas/model/model_capnp.pyi @@ -0,0 +1,72 @@ +# BSD 2-Clause License + +# Copyright (c) 2021-2024, Hewlett Packard Enterprise +# All rights reserved. + +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: + +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. + +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. + +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +"""This is an automatically generated stub for `model.capnp`.""" + +# mypy: ignore-errors + +from __future__ import annotations + +from contextlib import contextmanager +from io import BufferedWriter +from typing import Iterator + +class Model: + data: bytes + name: str + version: str + @staticmethod + @contextmanager + def from_bytes( + data: bytes, + traversal_limit_in_words: int | None = ..., + nesting_limit: int | None = ..., + ) -> Iterator[ModelReader]: ... + @staticmethod + def from_bytes_packed( + data: bytes, + traversal_limit_in_words: int | None = ..., + nesting_limit: int | None = ..., + ) -> ModelReader: ... + @staticmethod + def new_message() -> ModelBuilder: ... + def to_dict(self) -> dict: ... + +class ModelReader(Model): + def as_builder(self) -> ModelBuilder: ... + +class ModelBuilder(Model): + @staticmethod + def from_dict(dictionary: dict) -> ModelBuilder: ... + def copy(self) -> ModelBuilder: ... + def to_bytes(self) -> bytes: ... + def to_bytes_packed(self) -> bytes: ... + def to_segments(self) -> list[bytes]: ... + def as_reader(self) -> ModelReader: ... + @staticmethod + def write(file: BufferedWriter) -> None: ... + @staticmethod + def write_packed(file: BufferedWriter) -> None: ... diff --git a/smartsim/_core/mli/mli_schemas/request/request.capnp b/smartsim/_core/mli/mli_schemas/request/request.capnp new file mode 100644 index 000000000..4be1cfa21 --- /dev/null +++ b/smartsim/_core/mli/mli_schemas/request/request.capnp @@ -0,0 +1,55 @@ +# BSD 2-Clause License + +# Copyright (c) 2021-2024, Hewlett Packard Enterprise +# All rights reserved. + +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: + +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. + +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. + +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +@0xa27f0152c7bb299e; + +using Tensors = import "../tensor/tensor.capnp"; +using RequestAttributes = import "request_attributes/request_attributes.capnp"; +using DataRef = import "../data/data_references.capnp"; +using Models = import "../model/model.capnp"; + +struct ChannelDescriptor { + descriptor @0 :Data; +} + +struct Request { + replyChannel @0 :ChannelDescriptor; + model :union { + key @1 :DataRef.ModelKey; + data @2 :Models.Model; + } + input :union { + keys @3 :List(DataRef.TensorKey); + descriptors @4 :List(Tensors.TensorDescriptor); + } + output @5 :List(DataRef.TensorKey); + outputDescriptors @6 :List(Tensors.OutputDescriptor); + customAttributes :union { + torch @7 :RequestAttributes.TorchRequestAttributes; + tf @8 :RequestAttributes.TensorFlowRequestAttributes; + none @9 :Void; + } +} diff --git a/smartsim/_core/mli/mli_schemas/request/request_attributes/request_attributes.capnp b/smartsim/_core/mli/mli_schemas/request/request_attributes/request_attributes.capnp new file mode 100644 index 000000000..f0a319f0a --- /dev/null +++ b/smartsim/_core/mli/mli_schemas/request/request_attributes/request_attributes.capnp @@ -0,0 +1,49 @@ +# BSD 2-Clause License + +# Copyright (c) 2021-2024, Hewlett Packard Enterprise +# All rights reserved. + +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: + +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. + +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. + +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +@0xdd14d8ba5c06743f; + +enum TorchTensorType { + nested @0; # ragged + sparse @1; + tensor @2; # "normal" tensor +} + +enum TFTensorType { + ragged @0; + sparse @1; + variable @2; + constant @3; +} + +struct TorchRequestAttributes { + tensorType @0 :TorchTensorType; +} + +struct TensorFlowRequestAttributes { + name @0 :Text; + tensorType @1 :TFTensorType; +} diff --git a/smartsim/_core/mli/mli_schemas/request/request_attributes/request_attributes_capnp.py b/smartsim/_core/mli/mli_schemas/request/request_attributes/request_attributes_capnp.py new file mode 100644 index 000000000..8969f3845 --- /dev/null +++ b/smartsim/_core/mli/mli_schemas/request/request_attributes/request_attributes_capnp.py @@ -0,0 +1,41 @@ +# BSD 2-Clause License + +# Copyright (c) 2021-2024, Hewlett Packard Enterprise +# All rights reserved. + +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: + +# 1. 
Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. + +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. + +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +"""This is an automatically generated stub for `request_attributes.capnp`.""" + +import os + +import capnp # type: ignore + +capnp.remove_import_hook() +here = os.path.dirname(os.path.abspath(__file__)) +module_file = os.path.abspath(os.path.join(here, "request_attributes.capnp")) +TorchRequestAttributes = capnp.load(module_file).TorchRequestAttributes +TorchRequestAttributesBuilder = TorchRequestAttributes +TorchRequestAttributesReader = TorchRequestAttributes +TensorFlowRequestAttributes = capnp.load(module_file).TensorFlowRequestAttributes +TensorFlowRequestAttributesBuilder = TensorFlowRequestAttributes +TensorFlowRequestAttributesReader = TensorFlowRequestAttributes diff --git a/smartsim/_core/mli/mli_schemas/request/request_attributes/request_attributes_capnp.pyi b/smartsim/_core/mli/mli_schemas/request/request_attributes/request_attributes_capnp.pyi new file mode 100644 index 000000000..c474de4b4 --- /dev/null +++ b/smartsim/_core/mli/mli_schemas/request/request_attributes/request_attributes_capnp.pyi @@ -0,0 +1,109 @@ +# BSD 2-Clause License + +# Copyright (c) 2021-2024, Hewlett Packard Enterprise +# All rights reserved. + +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: + +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. + +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. + +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +"""This is an automatically generated stub for `request_attributes.capnp`.""" + +# mypy: ignore-errors + +from __future__ import annotations + +from contextlib import contextmanager +from io import BufferedWriter +from typing import Iterator, Literal + +TorchTensorType = Literal["nested", "sparse", "tensor"] +TFTensorType = Literal["ragged", "sparse", "variable", "constant"] + +class TorchRequestAttributes: + tensorType: TorchTensorType + @staticmethod + @contextmanager + def from_bytes( + data: bytes, + traversal_limit_in_words: int | None = ..., + nesting_limit: int | None = ..., + ) -> Iterator[TorchRequestAttributesReader]: ... + @staticmethod + def from_bytes_packed( + data: bytes, + traversal_limit_in_words: int | None = ..., + nesting_limit: int | None = ..., + ) -> TorchRequestAttributesReader: ... + @staticmethod + def new_message() -> TorchRequestAttributesBuilder: ... + def to_dict(self) -> dict: ... + +class TorchRequestAttributesReader(TorchRequestAttributes): + def as_builder(self) -> TorchRequestAttributesBuilder: ... + +class TorchRequestAttributesBuilder(TorchRequestAttributes): + @staticmethod + def from_dict(dictionary: dict) -> TorchRequestAttributesBuilder: ... + def copy(self) -> TorchRequestAttributesBuilder: ... + def to_bytes(self) -> bytes: ... + def to_bytes_packed(self) -> bytes: ... + def to_segments(self) -> list[bytes]: ... + def as_reader(self) -> TorchRequestAttributesReader: ... + @staticmethod + def write(file: BufferedWriter) -> None: ... + @staticmethod + def write_packed(file: BufferedWriter) -> None: ... + +class TensorFlowRequestAttributes: + name: str + tensorType: TFTensorType + @staticmethod + @contextmanager + def from_bytes( + data: bytes, + traversal_limit_in_words: int | None = ..., + nesting_limit: int | None = ..., + ) -> Iterator[TensorFlowRequestAttributesReader]: ... + @staticmethod + def from_bytes_packed( + data: bytes, + traversal_limit_in_words: int | None = ..., + nesting_limit: int | None = ..., + ) -> TensorFlowRequestAttributesReader: ... + @staticmethod + def new_message() -> TensorFlowRequestAttributesBuilder: ... + def to_dict(self) -> dict: ... + +class TensorFlowRequestAttributesReader(TensorFlowRequestAttributes): + def as_builder(self) -> TensorFlowRequestAttributesBuilder: ... + +class TensorFlowRequestAttributesBuilder(TensorFlowRequestAttributes): + @staticmethod + def from_dict(dictionary: dict) -> TensorFlowRequestAttributesBuilder: ... + def copy(self) -> TensorFlowRequestAttributesBuilder: ... + def to_bytes(self) -> bytes: ... + def to_bytes_packed(self) -> bytes: ... + def to_segments(self) -> list[bytes]: ... + def as_reader(self) -> TensorFlowRequestAttributesReader: ... + @staticmethod + def write(file: BufferedWriter) -> None: ... + @staticmethod + def write_packed(file: BufferedWriter) -> None: ... 
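For orientation (this is illustrative and not part of the patch), the generated `*_capnp.py` stubs above are thin wrappers around `capnp.load`, so the request-attribute messages can be built and round-tripped directly with pycapnp. The import path below assumes the package layout introduced in this changeset, and the chosen `tensorType` value is arbitrary.

```python
# Illustrative sketch only: builds and round-trips a TorchRequestAttributes
# message via the generated stubs above (import path assumes this package layout).
from smartsim._core.mli.mli_schemas.request.request_attributes import (
    request_attributes_capnp,
)

attrs = request_attributes_capnp.TorchRequestAttributes.new_message()
attrs.tensorType = "sparse"  # one of "nested", "sparse", "tensor"

payload = attrs.to_bytes()

# from_bytes is exposed as a context manager by the generated stubs.
with request_attributes_capnp.TorchRequestAttributes.from_bytes(payload) as reader:
    assert reader.tensorType == "sparse"
```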
diff --git a/smartsim/_core/mli/mli_schemas/request/request_capnp.py b/smartsim/_core/mli/mli_schemas/request/request_capnp.py new file mode 100644 index 000000000..90b8ce194 --- /dev/null +++ b/smartsim/_core/mli/mli_schemas/request/request_capnp.py @@ -0,0 +1,41 @@ +# BSD 2-Clause License + +# Copyright (c) 2021-2024, Hewlett Packard Enterprise +# All rights reserved. + +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: + +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. + +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. + +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +"""This is an automatically generated stub for `request.capnp`.""" + +import os + +import capnp # type: ignore + +capnp.remove_import_hook() +here = os.path.dirname(os.path.abspath(__file__)) +module_file = os.path.abspath(os.path.join(here, "request.capnp")) +ChannelDescriptor = capnp.load(module_file).ChannelDescriptor +ChannelDescriptorBuilder = ChannelDescriptor +ChannelDescriptorReader = ChannelDescriptor +Request = capnp.load(module_file).Request +RequestBuilder = Request +RequestReader = Request diff --git a/smartsim/_core/mli/mli_schemas/request/request_capnp.pyi b/smartsim/_core/mli/mli_schemas/request/request_capnp.pyi new file mode 100644 index 000000000..a4ad631f9 --- /dev/null +++ b/smartsim/_core/mli/mli_schemas/request/request_capnp.pyi @@ -0,0 +1,319 @@ +# BSD 2-Clause License + +# Copyright (c) 2021-2024, Hewlett Packard Enterprise +# All rights reserved. + +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: + +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. + +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. + +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +"""This is an automatically generated stub for `request.capnp`.""" + +# mypy: ignore-errors + +from __future__ import annotations + +from contextlib import contextmanager +from io import BufferedWriter +from typing import Iterator, Literal, Sequence, overload + +from ..data.data_references_capnp import ( + ModelKey, + ModelKeyBuilder, + ModelKeyReader, + TensorKey, + TensorKeyBuilder, + TensorKeyReader, +) +from ..model.model_capnp import Model, ModelBuilder, ModelReader +from ..tensor.tensor_capnp import ( + OutputDescriptor, + OutputDescriptorBuilder, + OutputDescriptorReader, + TensorDescriptor, + TensorDescriptorBuilder, + TensorDescriptorReader, +) +from .request_attributes.request_attributes_capnp import ( + TensorFlowRequestAttributes, + TensorFlowRequestAttributesBuilder, + TensorFlowRequestAttributesReader, + TorchRequestAttributes, + TorchRequestAttributesBuilder, + TorchRequestAttributesReader, +) + +class ChannelDescriptor: + descriptor: bytes + @staticmethod + @contextmanager + def from_bytes( + data: bytes, + traversal_limit_in_words: int | None = ..., + nesting_limit: int | None = ..., + ) -> Iterator[ChannelDescriptorReader]: ... + @staticmethod + def from_bytes_packed( + data: bytes, + traversal_limit_in_words: int | None = ..., + nesting_limit: int | None = ..., + ) -> ChannelDescriptorReader: ... + @staticmethod + def new_message() -> ChannelDescriptorBuilder: ... + def to_dict(self) -> dict: ... + +class ChannelDescriptorReader(ChannelDescriptor): + def as_builder(self) -> ChannelDescriptorBuilder: ... + +class ChannelDescriptorBuilder(ChannelDescriptor): + @staticmethod + def from_dict(dictionary: dict) -> ChannelDescriptorBuilder: ... + def copy(self) -> ChannelDescriptorBuilder: ... + def to_bytes(self) -> bytes: ... + def to_bytes_packed(self) -> bytes: ... + def to_segments(self) -> list[bytes]: ... + def as_reader(self) -> ChannelDescriptorReader: ... + @staticmethod + def write(file: BufferedWriter) -> None: ... + @staticmethod + def write_packed(file: BufferedWriter) -> None: ... + +class Request: + class Model: + key: ModelKey | ModelKeyBuilder | ModelKeyReader + data: Model | ModelBuilder | ModelReader + def which(self) -> Literal["key", "data"]: ... + @overload + def init(self, name: Literal["key"]) -> ModelKey: ... + @overload + def init(self, name: Literal["data"]) -> Model: ... + @staticmethod + @contextmanager + def from_bytes( + data: bytes, + traversal_limit_in_words: int | None = ..., + nesting_limit: int | None = ..., + ) -> Iterator[Request.ModelReader]: ... + @staticmethod + def from_bytes_packed( + data: bytes, + traversal_limit_in_words: int | None = ..., + nesting_limit: int | None = ..., + ) -> Request.ModelReader: ... + @staticmethod + def new_message() -> Request.ModelBuilder: ... + def to_dict(self) -> dict: ... + + class ModelReader(Request.Model): + key: ModelKeyReader + data: ModelReader + def as_builder(self) -> Request.ModelBuilder: ... 
+ + class ModelBuilder(Request.Model): + key: ModelKey | ModelKeyBuilder | ModelKeyReader + data: Model | ModelBuilder | ModelReader + @staticmethod + def from_dict(dictionary: dict) -> Request.ModelBuilder: ... + def copy(self) -> Request.ModelBuilder: ... + def to_bytes(self) -> bytes: ... + def to_bytes_packed(self) -> bytes: ... + def to_segments(self) -> list[bytes]: ... + def as_reader(self) -> Request.ModelReader: ... + @staticmethod + def write(file: BufferedWriter) -> None: ... + @staticmethod + def write_packed(file: BufferedWriter) -> None: ... + + class Input: + keys: Sequence[TensorKey | TensorKeyBuilder | TensorKeyReader] + descriptors: Sequence[ + TensorDescriptor | TensorDescriptorBuilder | TensorDescriptorReader + ] + def which(self) -> Literal["keys", "descriptors"]: ... + @staticmethod + @contextmanager + def from_bytes( + data: bytes, + traversal_limit_in_words: int | None = ..., + nesting_limit: int | None = ..., + ) -> Iterator[Request.InputReader]: ... + @staticmethod + def from_bytes_packed( + data: bytes, + traversal_limit_in_words: int | None = ..., + nesting_limit: int | None = ..., + ) -> Request.InputReader: ... + @staticmethod + def new_message() -> Request.InputBuilder: ... + def to_dict(self) -> dict: ... + + class InputReader(Request.Input): + keys: Sequence[TensorKeyReader] + descriptors: Sequence[TensorDescriptorReader] + def as_builder(self) -> Request.InputBuilder: ... + + class InputBuilder(Request.Input): + keys: Sequence[TensorKey | TensorKeyBuilder | TensorKeyReader] + descriptors: Sequence[ + TensorDescriptor | TensorDescriptorBuilder | TensorDescriptorReader + ] + @staticmethod + def from_dict(dictionary: dict) -> Request.InputBuilder: ... + def copy(self) -> Request.InputBuilder: ... + def to_bytes(self) -> bytes: ... + def to_bytes_packed(self) -> bytes: ... + def to_segments(self) -> list[bytes]: ... + def as_reader(self) -> Request.InputReader: ... + @staticmethod + def write(file: BufferedWriter) -> None: ... + @staticmethod + def write_packed(file: BufferedWriter) -> None: ... + + class CustomAttributes: + torch: ( + TorchRequestAttributes + | TorchRequestAttributesBuilder + | TorchRequestAttributesReader + ) + tf: ( + TensorFlowRequestAttributes + | TensorFlowRequestAttributesBuilder + | TensorFlowRequestAttributesReader + ) + none: None + def which(self) -> Literal["torch", "tf", "none"]: ... + @overload + def init(self, name: Literal["torch"]) -> TorchRequestAttributes: ... + @overload + def init(self, name: Literal["tf"]) -> TensorFlowRequestAttributes: ... + @staticmethod + @contextmanager + def from_bytes( + data: bytes, + traversal_limit_in_words: int | None = ..., + nesting_limit: int | None = ..., + ) -> Iterator[Request.CustomAttributesReader]: ... + @staticmethod + def from_bytes_packed( + data: bytes, + traversal_limit_in_words: int | None = ..., + nesting_limit: int | None = ..., + ) -> Request.CustomAttributesReader: ... + @staticmethod + def new_message() -> Request.CustomAttributesBuilder: ... + def to_dict(self) -> dict: ... + + class CustomAttributesReader(Request.CustomAttributes): + torch: TorchRequestAttributesReader + tf: TensorFlowRequestAttributesReader + def as_builder(self) -> Request.CustomAttributesBuilder: ... 
+ + class CustomAttributesBuilder(Request.CustomAttributes): + torch: ( + TorchRequestAttributes + | TorchRequestAttributesBuilder + | TorchRequestAttributesReader + ) + tf: ( + TensorFlowRequestAttributes + | TensorFlowRequestAttributesBuilder + | TensorFlowRequestAttributesReader + ) + @staticmethod + def from_dict(dictionary: dict) -> Request.CustomAttributesBuilder: ... + def copy(self) -> Request.CustomAttributesBuilder: ... + def to_bytes(self) -> bytes: ... + def to_bytes_packed(self) -> bytes: ... + def to_segments(self) -> list[bytes]: ... + def as_reader(self) -> Request.CustomAttributesReader: ... + @staticmethod + def write(file: BufferedWriter) -> None: ... + @staticmethod + def write_packed(file: BufferedWriter) -> None: ... + replyChannel: ChannelDescriptor | ChannelDescriptorBuilder | ChannelDescriptorReader + model: Request.Model | Request.ModelBuilder | Request.ModelReader + input: Request.Input | Request.InputBuilder | Request.InputReader + output: Sequence[TensorKey | TensorKeyBuilder | TensorKeyReader] + outputDescriptors: Sequence[ + OutputDescriptor | OutputDescriptorBuilder | OutputDescriptorReader + ] + customAttributes: ( + Request.CustomAttributes + | Request.CustomAttributesBuilder + | Request.CustomAttributesReader + ) + @overload + def init(self, name: Literal["replyChannel"]) -> ChannelDescriptor: ... + @overload + def init(self, name: Literal["model"]) -> Model: ... + @overload + def init(self, name: Literal["input"]) -> Input: ... + @overload + def init(self, name: Literal["customAttributes"]) -> CustomAttributes: ... + @staticmethod + @contextmanager + def from_bytes( + data: bytes, + traversal_limit_in_words: int | None = ..., + nesting_limit: int | None = ..., + ) -> Iterator[RequestReader]: ... + @staticmethod + def from_bytes_packed( + data: bytes, + traversal_limit_in_words: int | None = ..., + nesting_limit: int | None = ..., + ) -> RequestReader: ... + @staticmethod + def new_message() -> RequestBuilder: ... + def to_dict(self) -> dict: ... + +class RequestReader(Request): + replyChannel: ChannelDescriptorReader + model: Request.ModelReader + input: Request.InputReader + output: Sequence[TensorKeyReader] + outputDescriptors: Sequence[OutputDescriptorReader] + customAttributes: Request.CustomAttributesReader + def as_builder(self) -> RequestBuilder: ... + +class RequestBuilder(Request): + replyChannel: ChannelDescriptor | ChannelDescriptorBuilder | ChannelDescriptorReader + model: Request.Model | Request.ModelBuilder | Request.ModelReader + input: Request.Input | Request.InputBuilder | Request.InputReader + output: Sequence[TensorKey | TensorKeyBuilder | TensorKeyReader] + outputDescriptors: Sequence[ + OutputDescriptor | OutputDescriptorBuilder | OutputDescriptorReader + ] + customAttributes: ( + Request.CustomAttributes + | Request.CustomAttributesBuilder + | Request.CustomAttributesReader + ) + @staticmethod + def from_dict(dictionary: dict) -> RequestBuilder: ... + def copy(self) -> RequestBuilder: ... + def to_bytes(self) -> bytes: ... + def to_bytes_packed(self) -> bytes: ... + def to_segments(self) -> list[bytes]: ... + def as_reader(self) -> RequestReader: ... + @staticmethod + def write(file: BufferedWriter) -> None: ... + @staticmethod + def write_packed(file: BufferedWriter) -> None: ... 
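As a companion to the stubs above, the sketch below (again illustrative, not part of the patch) shows how the unions declared in `request.capnp` behave when a `Request` is assembled with pycapnp: initializing `model.key` or assigning `input.descriptors` selects that branch, which is what `which()` reports on the reader side. Import paths and the field values are assumptions made for the example.

```python
# Illustrative sketch only: assembles a Request against the schema above.
from smartsim._core.mli.mli_schemas.request import request_capnp
from smartsim._core.mli.mli_schemas.tensor import tensor_capnp

request = request_capnp.Request.new_message()

# The reply channel carries opaque bytes identifying where to send the response.
reply_channel = request.init("replyChannel")
reply_channel.descriptor = b"example-channel-descriptor"

# Model union: reference a stored model by key (the alternative is inline data).
model_key = request.model.init("key")
model_key.key = "resnet50"
model_key.featureStoreDescriptor = "example-feature-store"

# Input union: attach tensor descriptors for data sent alongside the request.
descriptor = tensor_capnp.TensorDescriptor.new_message()
descriptor.dimensions = [1, 3, 224, 224]
descriptor.order = "c"
descriptor.dataType = "float32"
request.input.descriptors = [descriptor]

# Setting one branch of a union marks it as the active one.
assert request.model.which() == "key"
assert request.input.which() == "descriptors"
```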
diff --git a/smartsim/_core/mli/mli_schemas/response/response.capnp b/smartsim/_core/mli/mli_schemas/response/response.capnp new file mode 100644 index 000000000..7194524cd --- /dev/null +++ b/smartsim/_core/mli/mli_schemas/response/response.capnp @@ -0,0 +1,52 @@ +# BSD 2-Clause License + +# Copyright (c) 2021-2024, Hewlett Packard Enterprise +# All rights reserved. + +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: + +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. + +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. + +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +@0xa05dcb4444780705; + +using Tensors = import "../tensor/tensor.capnp"; +using ResponseAttributes = import "response_attributes/response_attributes.capnp"; +using DataRef = import "../data/data_references.capnp"; + +enum Status { + complete @0; + fail @1; + timeout @2; + running @3; +} + +struct Response { + status @0 :Status; + message @1 :Text; + result :union { + keys @2 :List(DataRef.TensorKey); + descriptors @3 :List(Tensors.TensorDescriptor); + } + customAttributes :union { + torch @4 :ResponseAttributes.TorchResponseAttributes; + tf @5 :ResponseAttributes.TensorFlowResponseAttributes; + none @6 :Void; + } +} diff --git a/smartsim/_core/mli/mli_schemas/response/response_attributes/response_attributes.capnp b/smartsim/_core/mli/mli_schemas/response/response_attributes/response_attributes.capnp new file mode 100644 index 000000000..b4dcf18e8 --- /dev/null +++ b/smartsim/_core/mli/mli_schemas/response/response_attributes/response_attributes.capnp @@ -0,0 +1,33 @@ +# BSD 2-Clause License + +# Copyright (c) 2021-2024, Hewlett Packard Enterprise +# All rights reserved. + +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: + +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. + +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. + +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +@0xee59c60fccbb1bf9; + +struct TorchResponseAttributes { +} + +struct TensorFlowResponseAttributes { +} diff --git a/smartsim/_core/mli/mli_schemas/response/response_attributes/response_attributes_capnp.py b/smartsim/_core/mli/mli_schemas/response/response_attributes/response_attributes_capnp.py new file mode 100644 index 000000000..4839334d5 --- /dev/null +++ b/smartsim/_core/mli/mli_schemas/response/response_attributes/response_attributes_capnp.py @@ -0,0 +1,41 @@ +# BSD 2-Clause License + +# Copyright (c) 2021-2024, Hewlett Packard Enterprise +# All rights reserved. + +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: + +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. + +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. + +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +"""This is an automatically generated stub for `response_attributes.capnp`.""" + +import os + +import capnp # type: ignore + +capnp.remove_import_hook() +here = os.path.dirname(os.path.abspath(__file__)) +module_file = os.path.abspath(os.path.join(here, "response_attributes.capnp")) +TorchResponseAttributes = capnp.load(module_file).TorchResponseAttributes +TorchResponseAttributesBuilder = TorchResponseAttributes +TorchResponseAttributesReader = TorchResponseAttributes +TensorFlowResponseAttributes = capnp.load(module_file).TensorFlowResponseAttributes +TensorFlowResponseAttributesBuilder = TensorFlowResponseAttributes +TensorFlowResponseAttributesReader = TensorFlowResponseAttributes diff --git a/smartsim/_core/mli/mli_schemas/response/response_attributes/response_attributes_capnp.pyi b/smartsim/_core/mli/mli_schemas/response/response_attributes/response_attributes_capnp.pyi new file mode 100644 index 000000000..f40688d74 --- /dev/null +++ b/smartsim/_core/mli/mli_schemas/response/response_attributes/response_attributes_capnp.pyi @@ -0,0 +1,103 @@ +# BSD 2-Clause License + +# Copyright (c) 2021-2024, Hewlett Packard Enterprise +# All rights reserved. + +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: + +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. + +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. + +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +"""This is an automatically generated stub for `response_attributes.capnp`.""" + +# mypy: ignore-errors + +from __future__ import annotations + +from contextlib import contextmanager +from io import BufferedWriter +from typing import Iterator + +class TorchResponseAttributes: + @staticmethod + @contextmanager + def from_bytes( + data: bytes, + traversal_limit_in_words: int | None = ..., + nesting_limit: int | None = ..., + ) -> Iterator[TorchResponseAttributesReader]: ... + @staticmethod + def from_bytes_packed( + data: bytes, + traversal_limit_in_words: int | None = ..., + nesting_limit: int | None = ..., + ) -> TorchResponseAttributesReader: ... + @staticmethod + def new_message() -> TorchResponseAttributesBuilder: ... + def to_dict(self) -> dict: ... + +class TorchResponseAttributesReader(TorchResponseAttributes): + def as_builder(self) -> TorchResponseAttributesBuilder: ... + +class TorchResponseAttributesBuilder(TorchResponseAttributes): + @staticmethod + def from_dict(dictionary: dict) -> TorchResponseAttributesBuilder: ... 
+ def copy(self) -> TorchResponseAttributesBuilder: ... + def to_bytes(self) -> bytes: ... + def to_bytes_packed(self) -> bytes: ... + def to_segments(self) -> list[bytes]: ... + def as_reader(self) -> TorchResponseAttributesReader: ... + @staticmethod + def write(file: BufferedWriter) -> None: ... + @staticmethod + def write_packed(file: BufferedWriter) -> None: ... + +class TensorFlowResponseAttributes: + @staticmethod + @contextmanager + def from_bytes( + data: bytes, + traversal_limit_in_words: int | None = ..., + nesting_limit: int | None = ..., + ) -> Iterator[TensorFlowResponseAttributesReader]: ... + @staticmethod + def from_bytes_packed( + data: bytes, + traversal_limit_in_words: int | None = ..., + nesting_limit: int | None = ..., + ) -> TensorFlowResponseAttributesReader: ... + @staticmethod + def new_message() -> TensorFlowResponseAttributesBuilder: ... + def to_dict(self) -> dict: ... + +class TensorFlowResponseAttributesReader(TensorFlowResponseAttributes): + def as_builder(self) -> TensorFlowResponseAttributesBuilder: ... + +class TensorFlowResponseAttributesBuilder(TensorFlowResponseAttributes): + @staticmethod + def from_dict(dictionary: dict) -> TensorFlowResponseAttributesBuilder: ... + def copy(self) -> TensorFlowResponseAttributesBuilder: ... + def to_bytes(self) -> bytes: ... + def to_bytes_packed(self) -> bytes: ... + def to_segments(self) -> list[bytes]: ... + def as_reader(self) -> TensorFlowResponseAttributesReader: ... + @staticmethod + def write(file: BufferedWriter) -> None: ... + @staticmethod + def write_packed(file: BufferedWriter) -> None: ... diff --git a/smartsim/_core/mli/mli_schemas/response/response_capnp.py b/smartsim/_core/mli/mli_schemas/response/response_capnp.py new file mode 100644 index 000000000..eaa345104 --- /dev/null +++ b/smartsim/_core/mli/mli_schemas/response/response_capnp.py @@ -0,0 +1,38 @@ +# BSD 2-Clause License + +# Copyright (c) 2021-2024, Hewlett Packard Enterprise +# All rights reserved. + +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: + +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. + +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. + +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +"""This is an automatically generated stub for `response.capnp`.""" + +import os + +import capnp # type: ignore + +capnp.remove_import_hook() +here = os.path.dirname(os.path.abspath(__file__)) +module_file = os.path.abspath(os.path.join(here, "response.capnp")) +Response = capnp.load(module_file).Response +ResponseBuilder = Response +ResponseReader = Response diff --git a/smartsim/_core/mli/mli_schemas/response/response_capnp.pyi b/smartsim/_core/mli/mli_schemas/response/response_capnp.pyi new file mode 100644 index 000000000..6b4c50fd0 --- /dev/null +++ b/smartsim/_core/mli/mli_schemas/response/response_capnp.pyi @@ -0,0 +1,212 @@ +# BSD 2-Clause License + +# Copyright (c) 2021-2024, Hewlett Packard Enterprise +# All rights reserved. + +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: + +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. + +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. + +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +"""This is an automatically generated stub for `response.capnp`.""" + +# mypy: ignore-errors + +from __future__ import annotations + +from contextlib import contextmanager +from io import BufferedWriter +from typing import Iterator, Literal, Sequence, overload + +from ..data.data_references_capnp import TensorKey, TensorKeyBuilder, TensorKeyReader +from ..tensor.tensor_capnp import ( + TensorDescriptor, + TensorDescriptorBuilder, + TensorDescriptorReader, +) +from .response_attributes.response_attributes_capnp import ( + TensorFlowResponseAttributes, + TensorFlowResponseAttributesBuilder, + TensorFlowResponseAttributesReader, + TorchResponseAttributes, + TorchResponseAttributesBuilder, + TorchResponseAttributesReader, +) + +Status = Literal["complete", "fail", "timeout", "running"] + +class Response: + class Result: + keys: Sequence[TensorKey | TensorKeyBuilder | TensorKeyReader] + descriptors: Sequence[ + TensorDescriptor | TensorDescriptorBuilder | TensorDescriptorReader + ] + def which(self) -> Literal["keys", "descriptors"]: ... + @staticmethod + @contextmanager + def from_bytes( + data: bytes, + traversal_limit_in_words: int | None = ..., + nesting_limit: int | None = ..., + ) -> Iterator[Response.ResultReader]: ... + @staticmethod + def from_bytes_packed( + data: bytes, + traversal_limit_in_words: int | None = ..., + nesting_limit: int | None = ..., + ) -> Response.ResultReader: ... + @staticmethod + def new_message() -> Response.ResultBuilder: ... + def to_dict(self) -> dict: ... 
+ + class ResultReader(Response.Result): + keys: Sequence[TensorKeyReader] + descriptors: Sequence[TensorDescriptorReader] + def as_builder(self) -> Response.ResultBuilder: ... + + class ResultBuilder(Response.Result): + keys: Sequence[TensorKey | TensorKeyBuilder | TensorKeyReader] + descriptors: Sequence[ + TensorDescriptor | TensorDescriptorBuilder | TensorDescriptorReader + ] + @staticmethod + def from_dict(dictionary: dict) -> Response.ResultBuilder: ... + def copy(self) -> Response.ResultBuilder: ... + def to_bytes(self) -> bytes: ... + def to_bytes_packed(self) -> bytes: ... + def to_segments(self) -> list[bytes]: ... + def as_reader(self) -> Response.ResultReader: ... + @staticmethod + def write(file: BufferedWriter) -> None: ... + @staticmethod + def write_packed(file: BufferedWriter) -> None: ... + + class CustomAttributes: + torch: ( + TorchResponseAttributes + | TorchResponseAttributesBuilder + | TorchResponseAttributesReader + ) + tf: ( + TensorFlowResponseAttributes + | TensorFlowResponseAttributesBuilder + | TensorFlowResponseAttributesReader + ) + none: None + def which(self) -> Literal["torch", "tf", "none"]: ... + @overload + def init(self, name: Literal["torch"]) -> TorchResponseAttributes: ... + @overload + def init(self, name: Literal["tf"]) -> TensorFlowResponseAttributes: ... + @staticmethod + @contextmanager + def from_bytes( + data: bytes, + traversal_limit_in_words: int | None = ..., + nesting_limit: int | None = ..., + ) -> Iterator[Response.CustomAttributesReader]: ... + @staticmethod + def from_bytes_packed( + data: bytes, + traversal_limit_in_words: int | None = ..., + nesting_limit: int | None = ..., + ) -> Response.CustomAttributesReader: ... + @staticmethod + def new_message() -> Response.CustomAttributesBuilder: ... + def to_dict(self) -> dict: ... + + class CustomAttributesReader(Response.CustomAttributes): + torch: TorchResponseAttributesReader + tf: TensorFlowResponseAttributesReader + def as_builder(self) -> Response.CustomAttributesBuilder: ... + + class CustomAttributesBuilder(Response.CustomAttributes): + torch: ( + TorchResponseAttributes + | TorchResponseAttributesBuilder + | TorchResponseAttributesReader + ) + tf: ( + TensorFlowResponseAttributes + | TensorFlowResponseAttributesBuilder + | TensorFlowResponseAttributesReader + ) + @staticmethod + def from_dict(dictionary: dict) -> Response.CustomAttributesBuilder: ... + def copy(self) -> Response.CustomAttributesBuilder: ... + def to_bytes(self) -> bytes: ... + def to_bytes_packed(self) -> bytes: ... + def to_segments(self) -> list[bytes]: ... + def as_reader(self) -> Response.CustomAttributesReader: ... + @staticmethod + def write(file: BufferedWriter) -> None: ... + @staticmethod + def write_packed(file: BufferedWriter) -> None: ... + status: Status + message: str + result: Response.Result | Response.ResultBuilder | Response.ResultReader + customAttributes: ( + Response.CustomAttributes + | Response.CustomAttributesBuilder + | Response.CustomAttributesReader + ) + @overload + def init(self, name: Literal["result"]) -> Result: ... + @overload + def init(self, name: Literal["customAttributes"]) -> CustomAttributes: ... + @staticmethod + @contextmanager + def from_bytes( + data: bytes, + traversal_limit_in_words: int | None = ..., + nesting_limit: int | None = ..., + ) -> Iterator[ResponseReader]: ... + @staticmethod + def from_bytes_packed( + data: bytes, + traversal_limit_in_words: int | None = ..., + nesting_limit: int | None = ..., + ) -> ResponseReader: ... 
+ @staticmethod + def new_message() -> ResponseBuilder: ... + def to_dict(self) -> dict: ... + +class ResponseReader(Response): + result: Response.ResultReader + customAttributes: Response.CustomAttributesReader + def as_builder(self) -> ResponseBuilder: ... + +class ResponseBuilder(Response): + result: Response.Result | Response.ResultBuilder | Response.ResultReader + customAttributes: ( + Response.CustomAttributes + | Response.CustomAttributesBuilder + | Response.CustomAttributesReader + ) + @staticmethod + def from_dict(dictionary: dict) -> ResponseBuilder: ... + def copy(self) -> ResponseBuilder: ... + def to_bytes(self) -> bytes: ... + def to_bytes_packed(self) -> bytes: ... + def to_segments(self) -> list[bytes]: ... + def as_reader(self) -> ResponseReader: ... + @staticmethod + def write(file: BufferedWriter) -> None: ... + @staticmethod + def write_packed(file: BufferedWriter) -> None: ... diff --git a/smartsim/_core/mli/mli_schemas/tensor/tensor.capnp b/smartsim/_core/mli/mli_schemas/tensor/tensor.capnp new file mode 100644 index 000000000..4b2218b16 --- /dev/null +++ b/smartsim/_core/mli/mli_schemas/tensor/tensor.capnp @@ -0,0 +1,75 @@ +# BSD 2-Clause License + +# Copyright (c) 2021-2024, Hewlett Packard Enterprise +# All rights reserved. + +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: + +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. + +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. + +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +@0x9a0aeb2e04838fb1; + +using DataRef = import "../data/data_references.capnp"; + +enum Order { + c @0; # row major (contiguous layout) + f @1; # column major (fortran contiguous layout) +} + +enum NumericalType { + int8 @0; + int16 @1; + int32 @2; + int64 @3; + uInt8 @4; + uInt16 @5; + uInt32 @6; + uInt64 @7; + float32 @8; + float64 @9; +} + +enum ReturnNumericalType { + int8 @0; + int16 @1; + int32 @2; + int64 @3; + uInt8 @4; + uInt16 @5; + uInt32 @6; + uInt64 @7; + float32 @8; + float64 @9; + none @10; + auto @11; +} + +struct TensorDescriptor { + dimensions @0 :List(Int32); + order @1 :Order; + dataType @2 :NumericalType; +} + +struct OutputDescriptor { + order @0 :Order; + optionalKeys @1 :List(DataRef.TensorKey); + optionalDimension @2 :List(Int32); + optionalDatatype @3 :ReturnNumericalType; +} diff --git a/smartsim/_core/mli/mli_schemas/tensor/tensor_capnp.py b/smartsim/_core/mli/mli_schemas/tensor/tensor_capnp.py new file mode 100644 index 000000000..8c9d6c902 --- /dev/null +++ b/smartsim/_core/mli/mli_schemas/tensor/tensor_capnp.py @@ -0,0 +1,41 @@ +# BSD 2-Clause License + +# Copyright (c) 2021-2024, Hewlett Packard Enterprise +# All rights reserved. + +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: + +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. + +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. + +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +"""This is an automatically generated stub for `tensor.capnp`.""" + +import os + +import capnp # type: ignore + +capnp.remove_import_hook() +here = os.path.dirname(os.path.abspath(__file__)) +module_file = os.path.abspath(os.path.join(here, "tensor.capnp")) +TensorDescriptor = capnp.load(module_file).TensorDescriptor +TensorDescriptorBuilder = TensorDescriptor +TensorDescriptorReader = TensorDescriptor +OutputDescriptor = capnp.load(module_file).OutputDescriptor +OutputDescriptorBuilder = OutputDescriptor +OutputDescriptorReader = OutputDescriptor diff --git a/smartsim/_core/mli/mli_schemas/tensor/tensor_capnp.pyi b/smartsim/_core/mli/mli_schemas/tensor/tensor_capnp.pyi new file mode 100644 index 000000000..b55f26b45 --- /dev/null +++ b/smartsim/_core/mli/mli_schemas/tensor/tensor_capnp.pyi @@ -0,0 +1,142 @@ +# BSD 2-Clause License + +# Copyright (c) 2021-2024, Hewlett Packard Enterprise +# All rights reserved. 
+ +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: + +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. + +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. + +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +"""This is an automatically generated stub for `tensor.capnp`.""" + +# mypy: ignore-errors + +from __future__ import annotations + +from contextlib import contextmanager +from io import BufferedWriter +from typing import Iterator, Literal, Sequence + +from ..data.data_references_capnp import TensorKey, TensorKeyBuilder, TensorKeyReader + +Order = Literal["c", "f"] +NumericalType = Literal[ + "int8", + "int16", + "int32", + "int64", + "uInt8", + "uInt16", + "uInt32", + "uInt64", + "float32", + "float64", +] +ReturnNumericalType = Literal[ + "int8", + "int16", + "int32", + "int64", + "uInt8", + "uInt16", + "uInt32", + "uInt64", + "float32", + "float64", + "none", + "auto", +] + +class TensorDescriptor: + dimensions: Sequence[int] + order: Order + dataType: NumericalType + @staticmethod + @contextmanager + def from_bytes( + data: bytes, + traversal_limit_in_words: int | None = ..., + nesting_limit: int | None = ..., + ) -> Iterator[TensorDescriptorReader]: ... + @staticmethod + def from_bytes_packed( + data: bytes, + traversal_limit_in_words: int | None = ..., + nesting_limit: int | None = ..., + ) -> TensorDescriptorReader: ... + @staticmethod + def new_message() -> TensorDescriptorBuilder: ... + def to_dict(self) -> dict: ... + +class TensorDescriptorReader(TensorDescriptor): + def as_builder(self) -> TensorDescriptorBuilder: ... + +class TensorDescriptorBuilder(TensorDescriptor): + @staticmethod + def from_dict(dictionary: dict) -> TensorDescriptorBuilder: ... + def copy(self) -> TensorDescriptorBuilder: ... + def to_bytes(self) -> bytes: ... + def to_bytes_packed(self) -> bytes: ... + def to_segments(self) -> list[bytes]: ... + def as_reader(self) -> TensorDescriptorReader: ... + @staticmethod + def write(file: BufferedWriter) -> None: ... + @staticmethod + def write_packed(file: BufferedWriter) -> None: ... + +class OutputDescriptor: + order: Order + optionalKeys: Sequence[TensorKey | TensorKeyBuilder | TensorKeyReader] + optionalDimension: Sequence[int] + optionalDatatype: ReturnNumericalType + @staticmethod + @contextmanager + def from_bytes( + data: bytes, + traversal_limit_in_words: int | None = ..., + nesting_limit: int | None = ..., + ) -> Iterator[OutputDescriptorReader]: ... 
+ @staticmethod + def from_bytes_packed( + data: bytes, + traversal_limit_in_words: int | None = ..., + nesting_limit: int | None = ..., + ) -> OutputDescriptorReader: ... + @staticmethod + def new_message() -> OutputDescriptorBuilder: ... + def to_dict(self) -> dict: ... + +class OutputDescriptorReader(OutputDescriptor): + optionalKeys: Sequence[TensorKeyReader] + def as_builder(self) -> OutputDescriptorBuilder: ... + +class OutputDescriptorBuilder(OutputDescriptor): + optionalKeys: Sequence[TensorKey | TensorKeyBuilder | TensorKeyReader] + @staticmethod + def from_dict(dictionary: dict) -> OutputDescriptorBuilder: ... + def copy(self) -> OutputDescriptorBuilder: ... + def to_bytes(self) -> bytes: ... + def to_bytes_packed(self) -> bytes: ... + def to_segments(self) -> list[bytes]: ... + def as_reader(self) -> OutputDescriptorReader: ... + @staticmethod + def write(file: BufferedWriter) -> None: ... + @staticmethod + def write_packed(file: BufferedWriter) -> None: ... diff --git a/smartsim/_core/utils/timings.py b/smartsim/_core/utils/timings.py new file mode 100644 index 000000000..a61a24322 --- /dev/null +++ b/smartsim/_core/utils/timings.py @@ -0,0 +1,143 @@ +# BSD 2-Clause License +# +# Copyright (c) 2021-2024, Hewlett Packard Enterprise +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
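Editor's note: `tensor.capnp` above defines `TensorDescriptor` (dimensions, memory order, data type) and `tensor_capnp.py` loads it with pycapnp. A minimal serialization sketch using only the fields shown in the schema, with illustrative dimension values:

```python
# Hedged sketch: build and re-read a TensorDescriptor via the stubs added above.
from smartsim._core.mli.mli_schemas.tensor.tensor_capnp import TensorDescriptor

desc = TensorDescriptor.new_message()   # TensorDescriptorBuilder
desc.dimensions = [2, 3]                # List(Int32)
desc.order = "c"                        # row-major, per the Order enum
desc.dataType = "float32"               # NumericalType enum value
payload = desc.to_bytes()

# readers are only valid inside the from_bytes() context manager
with TensorDescriptor.from_bytes(payload) as reader:
    assert list(reader.dimensions) == [2, 3]
    assert str(reader.dataType) == "float32"
```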
+ +import time +import typing as t +from collections import OrderedDict + +import numpy as np + +from ...log import get_logger + +logger = get_logger("PerfTimer") + + +class PerfTimer: + def __init__( + self, + filename: str = "timings", + prefix: str = "", + timing_on: bool = True, + debug: bool = False, + ): + self._start: t.Optional[float] = None + self._interm: t.Optional[float] = None + self._timings: OrderedDict[str, list[t.Union[float, int, str]]] = OrderedDict() + self._timing_on = timing_on + self._filename = filename + self._prefix = prefix + self._debug = debug + + def _add_label_to_timings(self, label: str) -> None: + if label not in self._timings: + self._timings[label] = [] + + @staticmethod + def _format_number(number: t.Union[float, int]) -> str: + return f"{number:0.4e}" + + def start_timings( + self, + first_label: t.Optional[str] = None, + first_value: t.Optional[t.Union[float, int]] = None, + ) -> None: + if self._timing_on: + if first_label is not None and first_value is not None: + mod_label = self._make_label(first_label) + value = self._format_number(first_value) + self._log(f"Started timing: {first_label}: {value}") + self._add_label_to_timings(mod_label) + self._timings[mod_label].append(value) + self._start = time.perf_counter() + self._interm = time.perf_counter() + + def end_timings(self) -> None: + if self._timing_on and self._start is not None: + mod_label = self._make_label("total_time") + self._add_label_to_timings(mod_label) + delta = self._format_number(time.perf_counter() - self._start) + self._timings[self._make_label("total_time")].append(delta) + self._log(f"Finished timing: {mod_label}: {delta}") + self._interm = None + + def _make_label(self, label: str) -> str: + return self._prefix + label + + def _get_delta(self) -> t.Union[float, int]: + if self._interm is None: + return 0 + return time.perf_counter() - self._interm + + def get_last(self, label: str) -> str: + mod_label = self._make_label(label) + if mod_label in self._timings: + value = self._timings[mod_label][-1] + if value: + return f"{label}: {value}" + + return "Not measured yet" + + def measure_time(self, label: str) -> None: + if self._timing_on and self._interm is not None: + mod_label = self._make_label(label) + self._add_label_to_timings(mod_label) + delta = self._format_number(self._get_delta()) + self._timings[mod_label].append(delta) + self._log(f"{mod_label}: {delta}") + self._interm = time.perf_counter() + + def _log(self, msg: str) -> None: + if self._debug: + logger.info(msg) + + @property + def max_length(self) -> int: + if len(self._timings) == 0: + return 0 + return max(len(value) for value in self._timings.values()) + + def print_timings(self, to_file: bool = False) -> None: + print(" ".join(self._timings.keys())) + try: + value_array = np.array(list(self._timings.values()), dtype=float) + except Exception as e: + logger.exception(e) + return + value_array = np.transpose(value_array) + if self._debug: + for i in range(value_array.shape[0]): + print(" ".join(self._format_number(value) for value in value_array[i])) + if to_file: + np.save(self._prefix + self._filename + ".npy", value_array) + + def set_active(self, active: bool = True) -> None: + """Set whether the timer will record time""" + self._timing_on = active + + @property + def is_active(self) -> bool: + """Returns true if the timer will record time""" + return self._timing_on diff --git a/smartsim/settings/dragonRunSettings.py b/smartsim/settings/dragonRunSettings.py index 69a91547e..15e585544 100644 --- 
a/smartsim/settings/dragonRunSettings.py +++ b/smartsim/settings/dragonRunSettings.py @@ -95,6 +95,26 @@ def set_node_feature(self, feature_list: t.Union[str, t.List[str]]) -> None: self.run_args["node-feature"] = ",".join(feature_list) + @override + def set_hostlist(self, host_list: t.Union[str, t.List[str]]) -> None: + """Specify the hostlist for this job + + :param host_list: hosts to launch on + :raises ValueError: if an empty host list is supplied + """ + if not host_list: + raise ValueError("empty hostlist provided") + + if isinstance(host_list, str): + host_list = host_list.replace(" ", "").split(",") + + # strip out all whitespace-only values + cleaned_list = [host.strip() for host in host_list if host and host.strip()] + if not len(cleaned_list) == len(host_list): + raise ValueError(f"invalid names found in hostlist: {host_list}") + + self.run_args["host-list"] = ",".join(cleaned_list) + def set_cpu_affinity(self, devices: t.List[int]) -> None: """Set the CPU affinity for this job diff --git a/tests/dragon/__init__.py b/tests/dragon/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/tests/dragon/featurestore.py b/tests/dragon/featurestore.py new file mode 100644 index 000000000..d06035fd7 --- /dev/null +++ b/tests/dragon/featurestore.py @@ -0,0 +1,156 @@ +# BSD 2-Clause License +# +# Copyright (c) 2021-2024, Hewlett Packard Enterprise +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
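Editor's note: `PerfTimer` (added in `smartsim/_core/utils/timings.py` above) is a small label-based wall-clock recorder used for instrumenting the inference pipeline. A hedged usage sketch, with hypothetical label names:

```python
# Hedged sketch: the labels ("batch_size", "fetch_model", "execute") are illustrative only.
from smartsim._core.utils.timings import PerfTimer

timer = PerfTimer(prefix="infer_", timing_on=True, debug=True)
timer.start_timings("batch_size", 8)   # optionally record a first label/value pair

# ... fetch the model ...
timer.measure_time("fetch_model")

# ... run the model ...
timer.measure_time("execute")

timer.end_timings()                    # records "<prefix>total_time"
print(timer.get_last("execute"))
timer.print_timings(to_file=True)      # also saves "<prefix>timings.npy"
```

Note also that `DragonRunSettings.set_hostlist` above accepts either a comma-separated string or a list of hostnames, and raises `ValueError` for an empty list or for empty/whitespace-only entries.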
+ +import pathlib +import typing as t + +import smartsim.error as sse +from smartsim._core.mli.infrastructure.storage.featurestore import FeatureStore +from smartsim.log import get_logger + +logger = get_logger(__name__) + + +class MemoryFeatureStore(FeatureStore): + """A feature store with values persisted only in local memory""" + + def __init__(self) -> None: + """Initialize the MemoryFeatureStore instance""" + self._storage: t.Dict[str, bytes] = {} + + def __getitem__(self, key: str) -> bytes: + """Retrieve an item using key + + :param key: Unique key of an item to retrieve from the feature store""" + if key not in self._storage: + raise sse.SmartSimError(f"{key} not found in feature store") + return self._storage[key] + + def __setitem__(self, key: str, value: bytes) -> None: + """Membership operator to test for a key existing within the feature store. + + :param key: Unique key of an item to retrieve from the feature store + :returns: `True` if the key is found, `False` otherwise""" + self._storage[key] = value + + def __contains__(self, key: str) -> bool: + """Membership operator to test for a key existing within the feature store. + + :param key: Unique key of an item to retrieve from the feature store + :returns: `True` if the key is found, `False` otherwise""" + return key in self._storage + + @property + def descriptor(self) -> str: + """Unique identifier enabling a client to connect to the feature store + + :returns: A descriptor encoded as a string""" + return "file-system-fs" + + +class FileSystemFeatureStore(FeatureStore): + """Alternative feature store implementation for testing. Stores all + data on the file system""" + + def __init__( + self, storage_dir: t.Optional[t.Union[pathlib.Path, str]] = None + ) -> None: + """Initialize the FileSystemFeatureStore instance + + :param storage_dir: (optional) root directory to store all data relative to""" + if isinstance(storage_dir, str): + storage_dir = pathlib.Path(storage_dir) + self._storage_dir = storage_dir + + def __getitem__(self, key: str) -> bytes: + """Retrieve an item using key + + :param key: Unique key of an item to retrieve from the feature store""" + path = self._key_path(key) + if not path.exists(): + raise sse.SmartSimError(f"{path} not found in feature store") + return path.read_bytes() + + def __setitem__(self, key: str, value: bytes) -> None: + """Assign a value using key + + :param key: Unique key of an item to set in the feature store + :param value: Value to persist in the feature store""" + path = self._key_path(key, create=True) + path.write_bytes(value) + + def __contains__(self, key: str) -> bool: + """Membership operator to test for a key existing within the feature store. + + :param key: Unique key of an item to retrieve from the feature store + :returns: `True` if the key is found, `False` otherwise""" + path = self._key_path(key) + return path.exists() + + def _key_path(self, key: str, create: bool = False) -> pathlib.Path: + """Given a key, return a path that is optionally combined with a base + directory used by the FileSystemFeatureStore. 
+ + :param key: Unique key of an item to retrieve from the feature store""" + value = pathlib.Path(key) + + if self._storage_dir: + value = self._storage_dir / key + + if create: + value.parent.mkdir(parents=True, exist_ok=True) + + return value + + @property + def descriptor(self) -> str: + """Unique identifier enabling a client to connect to the feature store + + :returns: A descriptor encoded as a string""" + if not self._storage_dir: + raise ValueError("No storage path configured") + return self._storage_dir.as_posix() + + @classmethod + def from_descriptor( + cls, + descriptor: str, + ) -> "FileSystemFeatureStore": + """A factory method that creates an instance from a descriptor string + + :param descriptor: The descriptor that uniquely identifies the resource + :returns: An attached FileSystemFeatureStore""" + try: + path = pathlib.Path(descriptor) + path.mkdir(parents=True, exist_ok=True) + if not path.is_dir(): + raise ValueError("FileSystemFeatureStore requires a directory path") + if not path.exists(): + path.mkdir(parents=True, exist_ok=True) + return FileSystemFeatureStore(path) + except: + logger.error(f"Error while creating FileSystemFeatureStore: {descriptor}") + raise diff --git a/tests/dragon/test_core_machine_learning_worker.py b/tests/dragon/test_core_machine_learning_worker.py new file mode 100644 index 000000000..231a97124 --- /dev/null +++ b/tests/dragon/test_core_machine_learning_worker.py @@ -0,0 +1,381 @@ +# BSD 2-Clause License +# +# Copyright (c) 2021-2024, Hewlett Packard Enterprise +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
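Editor's note: the two test-only feature stores above (`tests/dragon/featurestore.py`) share the mapping-style interface the worker code relies on, and `FileSystemFeatureStore` can be reattached from its descriptor (the storage directory). A minimal round-trip sketch, assuming it runs from inside the `tests/dragon` package (as the tests below do) and using a hypothetical `weights` key:

```python
# Hedged sketch: the relative import resolves inside tests/dragon; the key name is illustrative.
import pathlib

from .featurestore import FileSystemFeatureStore, MemoryFeatureStore


def _roundtrip(storage_dir: str) -> None:
    mem = MemoryFeatureStore()
    mem["weights"] = b"\x00\x01"
    assert "weights" in mem and mem["weights"] == b"\x00\x01"

    fs_store = FileSystemFeatureStore(pathlib.Path(storage_dir))
    fs_store["weights"] = b"\x00\x01"

    # the descriptor is just the storage directory, so another process can reattach
    clone = FileSystemFeatureStore.from_descriptor(fs_store.descriptor)
    assert clone["weights"] == b"\x00\x01"
```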
+ +import pathlib +import time + +import pytest + +dragon = pytest.importorskip("dragon") + +import torch + +import smartsim.error as sse +from smartsim._core.mli.infrastructure.storage.featurestore import FeatureStoreKey +from smartsim._core.mli.infrastructure.worker.worker import ( + InferenceRequest, + MachineLearningWorkerCore, + RequestBatch, + TransformInputResult, + TransformOutputResult, +) +from smartsim._core.utils import installed_redisai_backends + +from .featurestore import FileSystemFeatureStore, MemoryFeatureStore + +# The tests in this file belong to the dragon group +pytestmark = pytest.mark.dragon + +# retrieved from pytest fixtures +is_dragon = ( + pytest.test_launcher == "dragon" if hasattr(pytest, "test_launcher") else False +) +torch_available = "torch" in installed_redisai_backends() + + +@pytest.fixture +def persist_torch_model(test_dir: str) -> pathlib.Path: + ts_start = time.time_ns() + print("Starting model file creation...") + test_path = pathlib.Path(test_dir) + model_path = test_path / "basic.pt" + + model = torch.nn.Linear(2, 1) + torch.save(model, model_path) + ts_end = time.time_ns() + + ts_elapsed = (ts_end - ts_start) / 1000000000 + print(f"Model file creation took {ts_elapsed} seconds") + return model_path + + +@pytest.fixture +def persist_torch_tensor(test_dir: str) -> pathlib.Path: + ts_start = time.time_ns() + print("Starting model file creation...") + test_path = pathlib.Path(test_dir) + file_path = test_path / "tensor.pt" + + tensor = torch.randn((100, 100, 2)) + torch.save(tensor, file_path) + ts_end = time.time_ns() + + ts_elapsed = (ts_end - ts_start) / 1000000000 + print(f"Tensor file creation took {ts_elapsed} seconds") + return file_path + + +@pytest.mark.skipif(not torch_available, reason="Torch backend is not installed") +def test_fetch_model_disk(persist_torch_model: pathlib.Path, test_dir: str) -> None: + """Verify that the ML worker successfully retrieves a model + when given a valid (file system) key""" + worker = MachineLearningWorkerCore + key = str(persist_torch_model) + feature_store = FileSystemFeatureStore(test_dir) + fsd = feature_store.descriptor + feature_store[str(persist_torch_model)] = persist_torch_model.read_bytes() + + model_key = FeatureStoreKey(key=key, descriptor=fsd) + request = InferenceRequest(model_key=model_key) + batch = RequestBatch([request], None, model_key) + + fetch_result = worker.fetch_model(batch, {fsd: feature_store}) + assert fetch_result.model_bytes + assert fetch_result.model_bytes == persist_torch_model.read_bytes() + + +def test_fetch_model_disk_missing() -> None: + """Verify that the ML worker fails to retrieves a model + when given an invalid (file system) key""" + worker = MachineLearningWorkerCore + feature_store = MemoryFeatureStore() + fsd = feature_store.descriptor + + key = "/path/that/doesnt/exist" + + model_key = FeatureStoreKey(key=key, descriptor=fsd) + request = InferenceRequest(model_key=model_key) + batch = RequestBatch([request], None, model_key) + + with pytest.raises(sse.SmartSimError) as ex: + worker.fetch_model(batch, {fsd: feature_store}) + + # ensure the error message includes key-identifying information + assert key in ex.value.args[0] + + +@pytest.mark.skipif(not torch_available, reason="Torch backend is not installed") +def test_fetch_model_feature_store(persist_torch_model: pathlib.Path) -> None: + """Verify that the ML worker successfully retrieves a model + when given a valid (file system) key""" + worker = MachineLearningWorkerCore + + # create a key to retrieve from the 
feature store + key = "test-model" + + # put model bytes into the feature store + feature_store = MemoryFeatureStore() + fsd = feature_store.descriptor + feature_store[key] = persist_torch_model.read_bytes() + + model_key = FeatureStoreKey(key=key, descriptor=feature_store.descriptor) + request = InferenceRequest(model_key=model_key) + batch = RequestBatch([request], None, model_key) + + fetch_result = worker.fetch_model(batch, {fsd: feature_store}) + assert fetch_result.model_bytes + assert fetch_result.model_bytes == persist_torch_model.read_bytes() + + +def test_fetch_model_feature_store_missing() -> None: + """Verify that the ML worker fails to retrieves a model + when given an invalid (feature store) key""" + worker = MachineLearningWorkerCore + + key = "some-key" + feature_store = MemoryFeatureStore() + fsd = feature_store.descriptor + + model_key = FeatureStoreKey(key=key, descriptor=feature_store.descriptor) + request = InferenceRequest(model_key=model_key) + batch = RequestBatch([request], None, model_key) + + # todo: consider that raising this exception shows impl. replace... + with pytest.raises(sse.SmartSimError) as ex: + worker.fetch_model(batch, {fsd: feature_store}) + + # ensure the error message includes key-identifying information + assert key in ex.value.args[0] + + +@pytest.mark.skipif(not torch_available, reason="Torch backend is not installed") +def test_fetch_model_memory(persist_torch_model: pathlib.Path) -> None: + """Verify that the ML worker successfully retrieves a model + when given a valid (file system) key""" + worker = MachineLearningWorkerCore + + key = "test-model" + feature_store = MemoryFeatureStore() + fsd = feature_store.descriptor + feature_store[key] = persist_torch_model.read_bytes() + + model_key = FeatureStoreKey(key=key, descriptor=feature_store.descriptor) + request = InferenceRequest(model_key=model_key) + batch = RequestBatch([request], None, model_key) + + fetch_result = worker.fetch_model(batch, {fsd: feature_store}) + assert fetch_result.model_bytes + assert fetch_result.model_bytes == persist_torch_model.read_bytes() + + +@pytest.mark.skipif(not torch_available, reason="Torch backend is not installed") +def test_fetch_input_disk(persist_torch_tensor: pathlib.Path) -> None: + """Verify that the ML worker successfully retrieves a tensor/input + when given a valid (file system) key""" + tensor_name = str(persist_torch_tensor) + + feature_store = MemoryFeatureStore() + fsd = feature_store.descriptor + request = InferenceRequest( + input_keys=[FeatureStoreKey(key=tensor_name, descriptor=fsd)] + ) + + model_key = FeatureStoreKey(key="test-model", descriptor=fsd) + batch = RequestBatch([request], None, model_key) + + worker = MachineLearningWorkerCore + + feature_store[tensor_name] = persist_torch_tensor.read_bytes() + + fetch_result = worker.fetch_inputs(batch, {fsd: feature_store}) + assert fetch_result[0].inputs is not None + + +def test_fetch_input_disk_missing() -> None: + """Verify that the ML worker fails to retrieves a tensor/input + when given an invalid (file system) key""" + worker = MachineLearningWorkerCore + + feature_store = MemoryFeatureStore() + fsd = feature_store.descriptor + key = "/path/that/doesnt/exist" + + request = InferenceRequest(input_keys=[FeatureStoreKey(key=key, descriptor=fsd)]) + + model_key = FeatureStoreKey(key="test-model", descriptor=fsd) + batch = RequestBatch([request], None, model_key) + + with pytest.raises(sse.SmartSimError) as ex: + worker.fetch_inputs(batch, {fsd: feature_store}) + + # ensure the error 
message includes key-identifying information + assert key[0] in ex.value.args[0] + + +@pytest.mark.skipif(not torch_available, reason="Torch backend is not installed") +def test_fetch_input_feature_store(persist_torch_tensor: pathlib.Path) -> None: + """Verify that the ML worker successfully retrieves a tensor/input + when given a valid (feature store) key""" + worker = MachineLearningWorkerCore + + tensor_name = "test-tensor" + feature_store = MemoryFeatureStore() + fsd = feature_store.descriptor + + request = InferenceRequest( + input_keys=[FeatureStoreKey(key=tensor_name, descriptor=fsd)] + ) + + # put model bytes into the feature store + feature_store[tensor_name] = persist_torch_tensor.read_bytes() + + model_key = FeatureStoreKey(key="test-model", descriptor=fsd) + batch = RequestBatch([request], None, model_key) + + fetch_result = worker.fetch_inputs(batch, {fsd: feature_store}) + assert fetch_result[0].inputs + assert ( + list(fetch_result[0].inputs)[0][:10] == persist_torch_tensor.read_bytes()[:10] + ) + + +@pytest.mark.skipif(not torch_available, reason="Torch backend is not installed") +def test_fetch_multi_input_feature_store(persist_torch_tensor: pathlib.Path) -> None: + """Verify that the ML worker successfully retrieves multiple tensor/input + when given a valid collection of (feature store) keys""" + worker = MachineLearningWorkerCore + + tensor_name = "test-tensor" + feature_store = MemoryFeatureStore() + fsd = feature_store.descriptor + + # put model bytes into the feature store + body1 = persist_torch_tensor.read_bytes() + feature_store[tensor_name + "1"] = body1 + + body2 = b"abcdefghijklmnopqrstuvwxyz" + feature_store[tensor_name + "2"] = body2 + + body3 = b"mnopqrstuvwxyzabcdefghijkl" + feature_store[tensor_name + "3"] = body3 + + request = InferenceRequest( + input_keys=[ + FeatureStoreKey(key=tensor_name + "1", descriptor=fsd), + FeatureStoreKey(key=tensor_name + "2", descriptor=fsd), + FeatureStoreKey(key=tensor_name + "3", descriptor=fsd), + ] + ) + + model_key = FeatureStoreKey(key="test-model", descriptor=fsd) + batch = RequestBatch([request], None, model_key) + + fetch_result = worker.fetch_inputs(batch, {fsd: feature_store}) + + raw_bytes = list(fetch_result[0].inputs) + assert raw_bytes + assert raw_bytes[0][:10] == persist_torch_tensor.read_bytes()[:10] + assert raw_bytes[1][:10] == body2[:10] + assert raw_bytes[2][:10] == body3[:10] + + +def test_fetch_input_feature_store_missing() -> None: + """Verify that the ML worker fails to retrieves a tensor/input + when given an invalid (feature store) key""" + worker = MachineLearningWorkerCore + + key = "bad-key" + feature_store = MemoryFeatureStore() + fsd = feature_store.descriptor + request = InferenceRequest(input_keys=[FeatureStoreKey(key=key, descriptor=fsd)]) + + model_key = FeatureStoreKey(key="test-model", descriptor=fsd) + batch = RequestBatch([request], None, model_key) + + with pytest.raises(sse.SmartSimError) as ex: + worker.fetch_inputs(batch, {fsd: feature_store}) + + # ensure the error message includes key-identifying information + assert key in ex.value.args[0] + + +@pytest.mark.skipif(not torch_available, reason="Torch backend is not installed") +def test_fetch_input_memory(persist_torch_tensor: pathlib.Path) -> None: + """Verify that the ML worker successfully retrieves a tensor/input + when given a valid (file system) key""" + worker = MachineLearningWorkerCore + feature_store = MemoryFeatureStore() + fsd = feature_store.descriptor + + key = "test-model" + feature_store[key] = 
persist_torch_tensor.read_bytes() + request = InferenceRequest(input_keys=[FeatureStoreKey(key=key, descriptor=fsd)]) + + model_key = FeatureStoreKey(key="test-model", descriptor=fsd) + batch = RequestBatch([request], None, model_key) + + fetch_result = worker.fetch_inputs(batch, {fsd: feature_store}) + assert fetch_result[0].inputs is not None + + +def test_place_outputs() -> None: + """Verify outputs are shared using the feature store""" + worker = MachineLearningWorkerCore + + key_name = "test-model" + feature_store = MemoryFeatureStore() + fsd = feature_store.descriptor + + # create a key to retrieve from the feature store + keys = [ + FeatureStoreKey(key=key_name + "1", descriptor=fsd), + FeatureStoreKey(key=key_name + "2", descriptor=fsd), + FeatureStoreKey(key=key_name + "3", descriptor=fsd), + ] + data = [b"abcdef", b"ghijkl", b"mnopqr"] + + for fsk, v in zip(keys, data): + feature_store[fsk.key] = v + + request = InferenceRequest(output_keys=keys) + transform_result = TransformOutputResult(data, [1], "c", "float32") + + worker.place_output(request, transform_result, {fsd: feature_store}) + + for i in range(3): + assert feature_store[keys[i].key] == data[i] + + +@pytest.mark.parametrize( + "key, descriptor", + [ + pytest.param("", "desc", id="invalid key"), + pytest.param("key", "", id="invalid descriptor"), + ], +) +def test_invalid_featurestorekey(key, descriptor) -> None: + with pytest.raises(ValueError): + fsk = FeatureStoreKey(key, descriptor) diff --git a/tests/dragon/test_device_manager.py b/tests/dragon/test_device_manager.py new file mode 100644 index 000000000..8edeb60fb --- /dev/null +++ b/tests/dragon/test_device_manager.py @@ -0,0 +1,185 @@ +# BSD 2-Clause License +# +# Copyright (c) 2021-2024, Hewlett Packard Enterprise +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +import typing as t + +import pytest + +dragon = pytest.importorskip("dragon") + +from smartsim._core.mli.infrastructure.control.devicemanager import ( + DeviceManager, + WorkerDevice, +) +from smartsim._core.mli.infrastructure.storage.featurestore import ( + FeatureStore, + FeatureStoreKey, +) +from smartsim._core.mli.infrastructure.worker.worker import ( + ExecuteResult, + FetchInputResult, + FetchModelResult, + InferenceRequest, + LoadModelResult, + MachineLearningWorkerBase, + RequestBatch, + TransformInputResult, + TransformOutputResult, +) + +# The tests in this file belong to the dragon group +pytestmark = pytest.mark.dragon + + +class MockWorker(MachineLearningWorkerBase): + @staticmethod + def fetch_model( + batch: RequestBatch, feature_stores: t.Dict[str, FeatureStore] + ) -> FetchModelResult: + if batch.has_raw_model: + return FetchModelResult(batch.raw_model) + return FetchModelResult(b"fetched_model") + + @staticmethod + def load_model( + batch: RequestBatch, fetch_result: FetchModelResult, device: str + ) -> LoadModelResult: + return LoadModelResult(fetch_result.model_bytes) + + @staticmethod + def transform_input( + batch: RequestBatch, + fetch_results: list[FetchInputResult], + mem_pool: "MemoryPool", + ) -> TransformInputResult: + return TransformInputResult(b"result", [slice(0, 1)], [[1, 2]], ["float32"]) + + @staticmethod + def execute( + batch: RequestBatch, + load_result: LoadModelResult, + transform_result: TransformInputResult, + device: str, + ) -> ExecuteResult: + return ExecuteResult(b"result", [slice(0, 1)]) + + @staticmethod + def transform_output( + batch: RequestBatch, execute_result: ExecuteResult + ) -> t.List[TransformOutputResult]: + return [TransformOutputResult(b"result", None, "c", "float32")] + + +def test_worker_device(): + worker_device = WorkerDevice("gpu:0") + assert worker_device.name == "gpu:0" + + model_key = "my_model_key" + model = b"the model" + + worker_device.add_model(model_key, model) + + assert model_key in worker_device + assert worker_device.get_model(model_key) == model + worker_device.remove_model(model_key) + + assert model_key not in worker_device + + +def test_device_manager_model_in_request(): + + worker_device = WorkerDevice("gpu:0") + device_manager = DeviceManager(worker_device) + + worker = MockWorker() + + tensor_key = FeatureStoreKey(key="key", descriptor="desc") + output_key = FeatureStoreKey(key="key", descriptor="desc") + model_key = FeatureStoreKey(key="model key", descriptor="desc") + + request = InferenceRequest( + model_key=model_key, + callback=None, + raw_inputs=None, + input_keys=[tensor_key], + input_meta=None, + output_keys=[output_key], + raw_model=b"raw model", + batch_size=0, + ) + + request_batch = RequestBatch( + [request], + TransformInputResult(b"transformed", [slice(0, 1)], [[1, 2]], ["float32"]), + model_id=model_key, + ) + + with device_manager.get_device( + worker=worker, batch=request_batch, feature_stores={} + ) as returned_device: + + assert returned_device == worker_device + assert worker_device.get_model(model_key.key) == b"raw model" + + assert model_key.key not in worker_device + + +def test_device_manager_model_key(): + + worker_device = WorkerDevice("gpu:0") + device_manager = DeviceManager(worker_device) + + worker = MockWorker() + + tensor_key = FeatureStoreKey(key="key", descriptor="desc") + output_key = FeatureStoreKey(key="key", descriptor="desc") + model_key = FeatureStoreKey(key="model key", descriptor="desc") + + request = InferenceRequest( + model_key=model_key, + callback=None, + 
raw_inputs=None, + input_keys=[tensor_key], + input_meta=None, + output_keys=[output_key], + raw_model=None, + batch_size=0, + ) + + request_batch = RequestBatch( + [request], + TransformInputResult(b"transformed", [slice(0, 1)], [[1, 2]], ["float32"]), + model_id=model_key, + ) + + with device_manager.get_device( + worker=worker, batch=request_batch, feature_stores={} + ) as returned_device: + + assert returned_device == worker_device + assert worker_device.get_model(model_key.key) == b"fetched_model" + + assert model_key.key in worker_device diff --git a/tests/dragon/test_environment_loader.py b/tests/dragon/test_environment_loader.py new file mode 100644 index 000000000..8f2716488 --- /dev/null +++ b/tests/dragon/test_environment_loader.py @@ -0,0 +1,134 @@ +# BSD 2-Clause License +# +# Copyright (c) 2021-2024, Hewlett Packard Enterprise +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +import pytest + +dragon = pytest.importorskip("dragon") + +import dragon.utils as du +from dragon.channels import Channel +from dragon.data.ddict.ddict import DDict +from dragon.fli import DragonFLIError, FLInterface + +from smartsim._core.mli.comm.channel.dragonchannel import DragonCommChannel +from smartsim._core.mli.comm.channel.dragonfli import DragonFLIChannel +from smartsim._core.mli.infrastructure.environmentloader import EnvironmentConfigLoader +from smartsim._core.mli.infrastructure.storage.dragonfeaturestore import ( + DragonFeatureStore, +) + +# The tests in this file belong to the dragon group +pytestmark = pytest.mark.dragon + + +@pytest.mark.parametrize( + "content", + [ + pytest.param(b"a"), + pytest.param(b"new byte string"), + ], +) +def test_environment_loader_attach_fli(content: bytes, monkeypatch: pytest.MonkeyPatch): + """A descriptor can be stored, loaded, and reattached""" + chan = Channel.make_process_local() + queue = FLInterface(main_ch=chan) + monkeypatch.setenv( + "_SMARTSIM_REQUEST_QUEUE", du.B64.bytes_to_str(queue.serialize()) + ) + + config = EnvironmentConfigLoader( + featurestore_factory=DragonFeatureStore.from_descriptor, + callback_factory=DragonCommChannel.from_descriptor, + queue_factory=DragonFLIChannel.from_descriptor, + ) + config_queue = config.get_queue() + + _ = config_queue.send(content) + + old_recv = queue.recvh() + result, _ = old_recv.recv_bytes() + assert result == content + + +def test_environment_loader_serialize_fli(monkeypatch: pytest.MonkeyPatch): + """The serialized descriptors of a loaded and unloaded + queue are the same""" + chan = Channel.make_process_local() + queue = FLInterface(main_ch=chan) + monkeypatch.setenv( + "_SMARTSIM_REQUEST_QUEUE", du.B64.bytes_to_str(queue.serialize()) + ) + + config = EnvironmentConfigLoader( + featurestore_factory=DragonFeatureStore.from_descriptor, + callback_factory=DragonCommChannel.from_descriptor, + queue_factory=DragonFLIChannel.from_descriptor, + ) + config_queue = config.get_queue() + assert config_queue._fli.serialize() == queue.serialize() + + +def test_environment_loader_flifails(monkeypatch: pytest.MonkeyPatch): + """An incorrect serialized descriptor will fails to attach""" + monkeypatch.setenv("_SMARTSIM_REQUEST_QUEUE", "randomstring") + config = EnvironmentConfigLoader( + featurestore_factory=DragonFeatureStore.from_descriptor, + callback_factory=None, + queue_factory=DragonFLIChannel.from_descriptor, + ) + + with pytest.raises(DragonFLIError): + config.get_queue() + + +def test_environment_loader_backbone_load_dfs(monkeypatch: pytest.MonkeyPatch): + """Verify the dragon feature store is loaded correctly by the + EnvironmentConfigLoader to demonstrate featurestore_factory correctness""" + feature_store = DragonFeatureStore(DDict()) + monkeypatch.setenv("_SMARTSIM_INFRA_BACKBONE", feature_store.descriptor) + + config = EnvironmentConfigLoader( + featurestore_factory=DragonFeatureStore.from_descriptor, + callback_factory=None, + queue_factory=None, + ) + + print(f"calling config.get_backbone: `{feature_store.descriptor}`") + + backbone = config.get_backbone() + assert backbone is not None + + +def test_environment_variables_not_set(): + """EnvironmentConfigLoader getters return None when environment + variables are not set""" + config = EnvironmentConfigLoader( + featurestore_factory=DragonFeatureStore.from_descriptor, + callback_factory=DragonCommChannel.from_descriptor, + queue_factory=DragonCommChannel.from_descriptor, + ) + assert config.get_backbone() is None + assert 
config.get_queue() is None diff --git a/tests/dragon/test_error_handling.py b/tests/dragon/test_error_handling.py new file mode 100644 index 000000000..b20424866 --- /dev/null +++ b/tests/dragon/test_error_handling.py @@ -0,0 +1,479 @@ +# BSD 2-Clause License +# +# Copyright (c) 2021-2024, Hewlett Packard Enterprise +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +from unittest.mock import MagicMock + +import pytest + +dragon = pytest.importorskip("dragon") + +import multiprocessing as mp + +import dragon.utils as du +from dragon.channels import Channel +from dragon.data.ddict.ddict import DDict +from dragon.fli import FLInterface +from dragon.mpbridge.queues import DragonQueue + +from smartsim._core.mli.comm.channel.dragonfli import DragonFLIChannel +from smartsim._core.mli.infrastructure.control.devicemanager import WorkerDevice +from smartsim._core.mli.infrastructure.control.requestdispatcher import ( + RequestDispatcher, +) +from smartsim._core.mli.infrastructure.control.workermanager import ( + WorkerManager, + exception_handler, +) +from smartsim._core.mli.infrastructure.environmentloader import EnvironmentConfigLoader +from smartsim._core.mli.infrastructure.storage.dragonfeaturestore import ( + DragonFeatureStore, +) +from smartsim._core.mli.infrastructure.storage.featurestore import ( + FeatureStore, + FeatureStoreKey, +) +from smartsim._core.mli.infrastructure.worker.worker import ( + ExecuteResult, + FetchInputResult, + FetchModelResult, + InferenceReply, + InferenceRequest, + LoadModelResult, + RequestBatch, + TransformInputResult, + TransformOutputResult, +) +from smartsim._core.mli.message_handler import MessageHandler + +from .utils.channel import FileSystemCommChannel +from .utils.worker import IntegratedTorchWorker + +# The tests in this file belong to the dragon group +pytestmark = pytest.mark.dragon + + +@pytest.fixture +def backbone_descriptor() -> str: + # create a shared backbone featurestore + feature_store = DragonFeatureStore(DDict()) + return feature_store.descriptor + + +@pytest.fixture +def app_feature_store() -> FeatureStore: + # create a standalone feature store to mimic a user application putting + # data into an application-owned resource (app should not access backbone) + app_fs = DragonFeatureStore(DDict()) 
+ return app_fs + + +@pytest.fixture +def setup_worker_manager_model_bytes( + test_dir, + monkeypatch: pytest.MonkeyPatch, + backbone_descriptor: str, + app_feature_store: FeatureStore, +): + integrated_worker_type = IntegratedTorchWorker + + chan = Channel.make_process_local() + queue = FLInterface(main_ch=chan) + monkeypatch.setenv( + "_SMARTSIM_REQUEST_QUEUE", du.B64.bytes_to_str(queue.serialize()) + ) + # Put backbone descriptor into env var for the `EnvironmentConfigLoader` + monkeypatch.setenv("_SMARTSIM_INFRA_BACKBONE", backbone_descriptor) + + config_loader = EnvironmentConfigLoader( + featurestore_factory=DragonFeatureStore.from_descriptor, + callback_factory=FileSystemCommChannel.from_descriptor, + queue_factory=DragonFLIChannel.from_descriptor, + ) + + dispatcher_task_queue = mp.Queue(maxsize=0) + + worker_manager = WorkerManager( + config_loader=config_loader, + worker_type=integrated_worker_type, + dispatcher_queue=dispatcher_task_queue, + as_service=False, + cooldown=3, + ) + + tensor_key = FeatureStoreKey(key="key", descriptor=app_feature_store.descriptor) + output_key = FeatureStoreKey(key="key", descriptor=app_feature_store.descriptor) + + request = InferenceRequest( + model_key=None, + callback=None, + raw_inputs=None, + input_keys=[tensor_key], + input_meta=None, + output_keys=[output_key], + raw_model=b"model", + batch_size=0, + ) + + model_id = FeatureStoreKey(key="key", descriptor=app_feature_store.descriptor) + + request_batch = RequestBatch( + [request], + TransformInputResult(b"transformed", [slice(0, 1)], [[1, 2]], ["float32"]), + model_id=model_id, + ) + + dispatcher_task_queue.put(request_batch) + return worker_manager, integrated_worker_type + + +@pytest.fixture +def setup_worker_manager_model_key( + test_dir: str, + monkeypatch: pytest.MonkeyPatch, + backbone_descriptor: str, + app_feature_store: FeatureStore, +): + integrated_worker_type = IntegratedTorchWorker + + chan = Channel.make_process_local() + queue = FLInterface(main_ch=chan) + monkeypatch.setenv( + "_SMARTSIM_REQUEST_QUEUE", du.B64.bytes_to_str(queue.serialize()) + ) + # Put backbone descriptor into env var for the `EnvironmentConfigLoader` + monkeypatch.setenv("_SMARTSIM_INFRA_BACKBONE", backbone_descriptor) + + config_loader = EnvironmentConfigLoader( + featurestore_factory=DragonFeatureStore.from_descriptor, + callback_factory=FileSystemCommChannel.from_descriptor, + queue_factory=DragonFLIChannel.from_descriptor, + ) + + dispatcher_task_queue = mp.Queue(maxsize=0) + + worker_manager = WorkerManager( + config_loader=config_loader, + worker_type=integrated_worker_type, + dispatcher_queue=dispatcher_task_queue, + as_service=False, + cooldown=3, + ) + + tensor_key = FeatureStoreKey(key="key", descriptor=app_feature_store.descriptor) + output_key = FeatureStoreKey(key="key", descriptor=app_feature_store.descriptor) + model_id = FeatureStoreKey(key="model key", descriptor=app_feature_store.descriptor) + + request = InferenceRequest( + model_key=model_id, + callback=None, + raw_inputs=None, + input_keys=[tensor_key], + input_meta=None, + output_keys=[output_key], + raw_model=b"model", + batch_size=0, + ) + request_batch = RequestBatch( + [request], + TransformInputResult(b"transformed", [slice(0, 1)], [[1, 2]], ["float32"]), + model_id=model_id, + ) + + dispatcher_task_queue.put(request_batch) + return worker_manager, integrated_worker_type + + +@pytest.fixture +def setup_request_dispatcher_model_bytes( + test_dir, + monkeypatch: pytest.MonkeyPatch, + backbone_descriptor: str, + app_feature_store: 
FeatureStore, +): + integrated_worker_type = IntegratedTorchWorker + + chan = Channel.make_process_local() + queue = FLInterface(main_ch=chan) + monkeypatch.setenv( + "_SMARTSIM_REQUEST_QUEUE", du.B64.bytes_to_str(queue.serialize()) + ) + # Put backbone descriptor into env var for the `EnvironmentConfigLoader` + monkeypatch.setenv("_SMARTSIM_INFRA_BACKBONE", backbone_descriptor) + + config_loader = EnvironmentConfigLoader( + featurestore_factory=DragonFeatureStore.from_descriptor, + callback_factory=FileSystemCommChannel.from_descriptor, + queue_factory=DragonFLIChannel.from_descriptor, + ) + + request_dispatcher = RequestDispatcher( + batch_timeout=0, + batch_size=0, + config_loader=config_loader, + worker_type=integrated_worker_type, + ) + request_dispatcher._on_start() + + tensor_key = MessageHandler.build_tensor_key("key", app_feature_store.descriptor) + output_key = MessageHandler.build_tensor_key("key", app_feature_store.descriptor) + model = MessageHandler.build_model(b"model", "model name", "v 0.0.1") + request = MessageHandler.build_request( + test_dir, model, [tensor_key], [output_key], [], None + ) + ser_request = MessageHandler.serialize_request(request) + + request_dispatcher._incoming_channel.send(ser_request) + + return request_dispatcher, integrated_worker_type + + +@pytest.fixture +def setup_request_dispatcher_model_key( + test_dir, + monkeypatch: pytest.MonkeyPatch, + backbone_descriptor: str, + app_feature_store: FeatureStore, +): + integrated_worker_type = IntegratedTorchWorker + + chan = Channel.make_process_local() + queue = FLInterface(main_ch=chan) + monkeypatch.setenv( + "_SMARTSIM_REQUEST_QUEUE", du.B64.bytes_to_str(queue.serialize()) + ) + # Put backbone descriptor into env var for the `EnvironmentConfigLoader` + monkeypatch.setenv("_SMARTSIM_INFRA_BACKBONE", backbone_descriptor) + + config_loader = EnvironmentConfigLoader( + featurestore_factory=DragonFeatureStore.from_descriptor, + callback_factory=FileSystemCommChannel.from_descriptor, + queue_factory=DragonFLIChannel.from_descriptor, + ) + + request_dispatcher = RequestDispatcher( + batch_timeout=0, + batch_size=0, + config_loader=config_loader, + worker_type=integrated_worker_type, + ) + request_dispatcher._on_start() + + tensor_key = MessageHandler.build_tensor_key("key", app_feature_store.descriptor) + output_key = MessageHandler.build_tensor_key("key", app_feature_store.descriptor) + model_key = MessageHandler.build_model_key( + key="model key", feature_store_descriptor=app_feature_store.descriptor + ) + request = MessageHandler.build_request( + test_dir, model_key, [tensor_key], [output_key], [], None + ) + ser_request = MessageHandler.serialize_request(request) + + request_dispatcher._incoming_channel.send(ser_request) + + return request_dispatcher, integrated_worker_type + + +def mock_pipeline_stage(monkeypatch: pytest.MonkeyPatch, integrated_worker, stage): + def mock_stage(*args, **kwargs): + raise ValueError(f"Simulated error in {stage}") + + monkeypatch.setattr(integrated_worker, stage, mock_stage) + mock_reply_fn = MagicMock() + monkeypatch.setattr( + "smartsim._core.mli.infrastructure.control.error_handling.build_failure_reply", + mock_reply_fn, + ) + + def mock_exception_handler(exc, reply_channel, failure_message): + return exception_handler(exc, None, failure_message) + + monkeypatch.setattr( + "smartsim._core.mli.infrastructure.control.workermanager.exception_handler", + mock_exception_handler, + ) + + return mock_reply_fn + + +@pytest.mark.parametrize( + "setup_worker_manager", + [ + 
pytest.param("setup_worker_manager_model_bytes"), + pytest.param("setup_worker_manager_model_key"), + ], +) +@pytest.mark.parametrize( + "stage, error_message", + [ + pytest.param( + "fetch_model", + "Error loading model on device or getting device.", + id="fetch model", + ), + pytest.param( + "load_model", + "Error loading model on device or getting device.", + id="load model", + ), + pytest.param("execute", "Failed while executing.", id="execute"), + pytest.param( + "transform_output", + "Failed while transforming the output.", + id="transform output", + ), + pytest.param( + "place_output", "Failed while placing the output.", id="place output" + ), + ], +) +def test_wm_pipeline_stage_errors_handled( + request, + setup_worker_manager, + monkeypatch: pytest.MonkeyPatch, + stage: str, + error_message: str, +): + """Ensures that the worker manager does not crash after a failure in various pipeline stages""" + worker_manager, integrated_worker_type = request.getfixturevalue( + setup_worker_manager + ) + integrated_worker = worker_manager._worker + + worker_manager._on_start() + device = worker_manager._device_manager._device + mock_reply_fn = mock_pipeline_stage(monkeypatch, integrated_worker, stage) + + if stage not in ["fetch_model"]: + monkeypatch.setattr( + integrated_worker, + "fetch_model", + MagicMock(return_value=FetchModelResult(b"result_bytes")), + ) + if stage not in ["fetch_model", "load_model"]: + monkeypatch.setattr( + integrated_worker, + "load_model", + MagicMock(return_value=LoadModelResult(b"result_bytes")), + ) + monkeypatch.setattr( + device, + "get_model", + MagicMock(return_value=b"result_bytes"), + ) + if stage not in [ + "fetch_model", + "execute", + ]: + monkeypatch.setattr( + integrated_worker, + "execute", + MagicMock(return_value=ExecuteResult(b"result_bytes", [slice(0, 1)])), + ) + if stage not in [ + "fetch_model", + "execute", + "transform_output", + ]: + monkeypatch.setattr( + integrated_worker, + "transform_output", + MagicMock( + return_value=[TransformOutputResult(b"result", [], "c", "float32")] + ), + ) + + worker_manager._on_iteration() + + mock_reply_fn.assert_called_once() + mock_reply_fn.assert_called_with("fail", error_message) + + +@pytest.mark.parametrize( + "setup_request_dispatcher", + [ + pytest.param("setup_request_dispatcher_model_bytes"), + pytest.param("setup_request_dispatcher_model_key"), + ], +) +@pytest.mark.parametrize( + "stage, error_message", + [ + pytest.param( + "fetch_inputs", + "Error fetching input.", + id="fetch input", + ), + pytest.param( + "transform_input", + "Error Transforming input.", + id="transform input", + ), + ], +) +def test_dispatcher_pipeline_stage_errors_handled( + request, + setup_request_dispatcher, + monkeypatch: pytest.MonkeyPatch, + stage: str, + error_message: str, +): + """Ensures that the request dispatcher does not crash after a failure in various pipeline stages""" + request_dispatcher, integrated_worker_type = request.getfixturevalue( + setup_request_dispatcher + ) + integrated_worker = request_dispatcher._worker + + mock_reply_fn = mock_pipeline_stage(monkeypatch, integrated_worker, stage) + + if stage not in ["fetch_inputs"]: + monkeypatch.setattr( + integrated_worker, + "fetch_inputs", + MagicMock(return_value=[FetchInputResult(result=[b"result"], meta=None)]), + ) + + request_dispatcher._on_iteration() + + mock_reply_fn.assert_called_once() + mock_reply_fn.assert_called_with("fail", error_message) + + +def test_exception_handling_helper(monkeypatch: pytest.MonkeyPatch): + """Ensures that the worker 
manager does not crash after a failure in the + execute pipeline stage""" + reply = InferenceReply() + + mock_reply_fn = MagicMock() + monkeypatch.setattr( + "smartsim._core.mli.infrastructure.control.error_handling.build_failure_reply", + mock_reply_fn, + ) + + test_exception = ValueError("Test ValueError") + exception_handler(test_exception, None, "Failure while fetching the model.") + + mock_reply_fn.assert_called_once() + mock_reply_fn.assert_called_with("fail", "Failure while fetching the model.") diff --git a/tests/dragon/test_reply_building.py b/tests/dragon/test_reply_building.py new file mode 100644 index 000000000..5f179bbae --- /dev/null +++ b/tests/dragon/test_reply_building.py @@ -0,0 +1,62 @@ +# BSD 2-Clause License +# +# Copyright (c) 2021-2024, Hewlett Packard Enterprise +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
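+#
+# These tests exercise build_failure_reply directly: a response is built for
+# each valid Status value, and a ValueError is raised for an invalid status.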
+ +import typing as t + +import pytest + +dragon = pytest.importorskip("dragon") + +from smartsim._core.mli.infrastructure.control.workermanager import build_failure_reply +from smartsim._core.mli.infrastructure.worker.worker import InferenceReply + +if t.TYPE_CHECKING: + from smartsim._core.mli.mli_schemas.response.response_capnp import Status + +# The tests in this file belong to the dragon group +pytestmark = pytest.mark.dragon + + +@pytest.mark.parametrize( + "status, message", + [ + pytest.param("timeout", "Worker timed out", id="timeout"), + pytest.param("fail", "Failed while executing", id="fail"), + ], +) +def test_build_failure_reply(status: "Status", message: str): + "Ensures failure replies can be built successfully" + response = build_failure_reply(status, message) + assert response.status == status + assert response.message == message + + +def test_build_failure_reply_fails(): + "Ensures ValueError is raised if a Status Enum is not used" + with pytest.raises(ValueError) as ex: + response = build_failure_reply("not a status enum", "message") + + assert "Error assigning status to response" in ex.value.args[0] diff --git a/tests/dragon/test_request_dispatcher.py b/tests/dragon/test_request_dispatcher.py new file mode 100644 index 000000000..c8d97dd7e --- /dev/null +++ b/tests/dragon/test_request_dispatcher.py @@ -0,0 +1,331 @@ +# BSD 2-Clause License +# +# Copyright (c) 2021-2024, Hewlett Packard Enterprise +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
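+#
+# End-to-end test for the RequestDispatcher: a mock message pump feeds
+# serialized inference requests through a dragon FLI queue; the test verifies
+# that requests are batched per model, that the batched tensors round-trip
+# through the dispatcher memory pool, and that a queue marked disposable is
+# removed on the next iteration.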
+ +import gc +import io +import logging +import pathlib +import socket +import time +import typing as t +from queue import Empty + +import numpy as np +import pytest + +torch = pytest.importorskip("torch") +dragon = pytest.importorskip("dragon") + +import base64 +import multiprocessing as mp + +try: + mp.set_start_method("dragon") +except Exception: + pass + +import os + +import dragon.channels as dch +import dragon.infrastructure.policy as dragon_policy +import dragon.infrastructure.process_desc as dragon_process_desc +import dragon.native.process as dragon_process +from dragon import fli +from dragon.channels import Channel +from dragon.data.ddict.ddict import DDict +from dragon.managed_memory import MemoryAlloc, MemoryPool +from dragon.mpbridge.queues import DragonQueue + +from smartsim._core.entrypoints.service import Service +from smartsim._core.mli.comm.channel.channel import CommChannelBase +from smartsim._core.mli.comm.channel.dragonchannel import DragonCommChannel +from smartsim._core.mli.comm.channel.dragonfli import DragonFLIChannel +from smartsim._core.mli.infrastructure.control.requestdispatcher import ( + RequestBatch, + RequestDispatcher, +) +from smartsim._core.mli.infrastructure.control.workermanager import ( + EnvironmentConfigLoader, +) +from smartsim._core.mli.infrastructure.storage.dragonfeaturestore import ( + DragonFeatureStore, +) +from smartsim._core.mli.infrastructure.storage.featurestore import FeatureStore +from smartsim._core.mli.infrastructure.worker.torch_worker import TorchWorker +from smartsim._core.mli.message_handler import MessageHandler +from smartsim.log import get_logger + +from .featurestore import FileSystemFeatureStore +from .utils.channel import FileSystemCommChannel + +logger = get_logger(__name__) +# The tests in this file belong to the dragon group +pytestmark = pytest.mark.dragon + + +def persist_model_file(model_path: pathlib.Path) -> pathlib.Path: + """Create a simple torch model and persist to disk for + testing purposes. 
+ + TODO: remove once unit tests are in place""" + # test_path = pathlib.Path(work_dir) + if not model_path.parent.exists(): + model_path.parent.mkdir(parents=True, exist_ok=True) + + model_path.unlink(missing_ok=True) + + model = torch.nn.Linear(2, 1) + torch.save(model, model_path) + + return model_path + + +def mock_messages( + request_dispatcher_queue: DragonFLIChannel, + feature_store: FeatureStore, + feature_store_root_dir: pathlib.Path, + comm_channel_root_dir: pathlib.Path, +) -> None: + """Mock event producer for triggering the inference pipeline""" + feature_store_root_dir.mkdir(parents=True, exist_ok=True) + comm_channel_root_dir.mkdir(parents=True, exist_ok=True) + + model_path = persist_model_file(feature_store_root_dir.parent / "model_original.pt") + model_bytes = model_path.read_bytes() + model_key = str(feature_store_root_dir / "model_fs.pt") + + feature_store[model_key] = model_bytes + + for iteration_number in range(2): + + channel_key = Channel.make_process_local().serialize() + callback_channel = DragonCommChannel(channel_key) + + input_path = feature_store_root_dir / f"{iteration_number}/input.pt" + output_path = feature_store_root_dir / f"{iteration_number}/output.pt" + + input_key = str(input_path) + output_key = str(output_path) + + tensor = ( + (iteration_number + 1) * torch.ones((1, 2), dtype=torch.float32) + ).numpy() + fsd = feature_store.descriptor + + tensor_desc = MessageHandler.build_tensor_descriptor( + "c", "float32", list(tensor.shape) + ) + + message_tensor_output_key = MessageHandler.build_tensor_key(output_key, fsd) + message_tensor_input_key = MessageHandler.build_tensor_key(input_key, fsd) + message_model_key = MessageHandler.build_model_key(model_key, fsd) + + request = MessageHandler.build_request( + reply_channel=base64.b64encode(callback_channel.descriptor).decode("utf-8"), + model=message_model_key, + inputs=[tensor_desc], + outputs=[message_tensor_output_key], + output_descriptors=[], + custom_attributes=None, + ) + request_bytes = MessageHandler.serialize_request(request) + with request_dispatcher_queue._fli.sendh( + timeout=None, stream_channel=request_dispatcher_queue._channel + ) as sendh: + sendh.send_bytes(request_bytes) + sendh.send_bytes(tensor.tobytes()) + time.sleep(1) + + +@pytest.fixture +def prepare_environment(test_dir: str) -> pathlib.Path: + """Cleanup prior outputs to run demo repeatedly""" + path = pathlib.Path(f"{test_dir}/workermanager.log") + logging.basicConfig(filename=path.absolute(), level=logging.DEBUG) + return path + + +def service_as_dragon_proc( + service: Service, cpu_affinity: list[int], gpu_affinity: list[int] +) -> dragon_process.Process: + + options = dragon_process_desc.ProcessOptions(make_inf_channels=True) + local_policy = dragon_policy.Policy( + placement=dragon_policy.Policy.Placement.HOST_NAME, + host_name=socket.gethostname(), + cpu_affinity=cpu_affinity, + gpu_affinity=gpu_affinity, + ) + return dragon_process.Process( + target=service.execute, + args=[], + cwd=os.getcwd(), + policy=local_policy, + options=options, + stderr=dragon_process.Popen.STDOUT, + stdout=dragon_process.Popen.STDOUT, + ) + + +def test_request_dispatcher(prepare_environment: pathlib.Path) -> None: + """Test the request dispatcher batching and queueing system + + This also includes setting a queue to disposable, checking that it is no + longer referenced by the dispatcher. 
+ """ + + test_path = prepare_environment + fs_path = test_path / "feature_store" + comm_path = test_path / "comm_store" + + to_worker_channel = dch.Channel.make_process_local() + to_worker_fli = fli.FLInterface(main_ch=to_worker_channel, manager_ch=None) + to_worker_fli_serialized = to_worker_fli.serialize() + + # NOTE: env vars should be set prior to instantiating EnvironmentConfigLoader + # or test environment may be unable to send messages w/queue + descriptor = base64.b64encode(to_worker_fli_serialized).decode("utf-8") + os.environ["_SMARTSIM_REQUEST_QUEUE"] = descriptor + + ddict = DDict(1, 2, 4 * 1024**2) + dragon_fs = DragonFeatureStore(ddict) + + config_loader = EnvironmentConfigLoader( + featurestore_factory=DragonFeatureStore.from_descriptor, + callback_factory=DragonCommChannel.from_descriptor, + queue_factory=DragonFLIChannel.from_descriptor, + ) + integrated_worker_type = TorchWorker + + request_dispatcher = RequestDispatcher( + batch_timeout=0, + batch_size=2, + config_loader=config_loader, + worker_type=integrated_worker_type, + mem_pool_size=2 * 1024**2, + ) + + worker_queue = config_loader.get_queue() + if worker_queue is None: + logger.warn( + "FLI input queue not loaded correctly from config_loader: " + f"{config_loader._queue_descriptor}" + ) + + request_dispatcher._on_start() + + for _ in range(2): + batch: t.Optional[RequestBatch] = None + mem_allocs = [] + tensors = [] + fs_path = test_path / f"feature_store" + comm_path = test_path / f"comm_store" + model_key = str(fs_path / "model_fs.pt") + + # create a mock client application to populate the request queue + msg_pump = mp.Process( + target=mock_messages, + args=( + worker_queue, + dragon_fs, + fs_path, + comm_path, + ), + ) + + msg_pump.start() + + time.sleep(1) + + for attempts in range(15): + try: + request_dispatcher._on_iteration() + batch = request_dispatcher.task_queue.get(timeout=1) + break + except Empty: + continue + except Exception as exc: + raise exc + + try: + assert batch is not None + assert batch.has_valid_requests + + transform_result = batch.inputs + for transformed, dims, dtype in zip( + transform_result.transformed, + transform_result.dims, + transform_result.dtypes, + ): + mem_alloc = MemoryAlloc.attach(transformed) + mem_allocs.append(mem_alloc) + itemsize = np.empty((1), dtype=dtype).itemsize + tensors.append( + torch.from_numpy( + np.frombuffer( + mem_alloc.get_memview()[0 : np.prod(dims) * itemsize], + dtype=dtype, + ).reshape(dims) + ) + ) + + assert len(batch.requests) == 2 + assert batch.model_id.key == model_key + assert model_key in request_dispatcher._queues + assert model_key in request_dispatcher._active_queues + assert len(request_dispatcher._queues[model_key]) == 1 + assert request_dispatcher._queues[model_key][0].empty() + assert request_dispatcher._queues[model_key][0].model_id.key == model_key + assert len(tensors) == 1 + assert tensors[0].shape == torch.Size([2, 2]) + + for tensor in tensors: + for sample_idx in range(tensor.shape[0]): + tensor_in = tensor[sample_idx] + tensor_out = (sample_idx + 1) * torch.ones( + (2,), dtype=torch.float32 + ) + assert torch.equal(tensor_in, tensor_out) + + except Exception as exc: + raise exc + finally: + for mem_alloc in mem_allocs: + mem_alloc.free() + + msg_pump.kill() + + request_dispatcher._active_queues[model_key].make_disposable() + assert request_dispatcher._active_queues[model_key].can_be_removed + + request_dispatcher._on_iteration() + + assert model_key not in request_dispatcher._active_queues + assert model_key not in 
request_dispatcher._queues + + # Try to remove the dispatcher and free the memory + del request_dispatcher + gc.collect() diff --git a/tests/dragon/test_torch_worker.py b/tests/dragon/test_torch_worker.py new file mode 100644 index 000000000..88e800240 --- /dev/null +++ b/tests/dragon/test_torch_worker.py @@ -0,0 +1,221 @@ +# BSD 2-Clause License +# +# Copyright (c) 2021-2024, Hewlett Packard Enterprise +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import io +import typing as t + +import numpy as np +import pytest +import torch + +dragon = pytest.importorskip("dragon") +import dragon.globalservices.pool as dragon_gs_pool +from dragon.managed_memory import MemoryAlloc, MemoryPool +from torch import nn +from torch.nn import functional as F + +from smartsim._core.mli.infrastructure.storage.featurestore import FeatureStoreKey +from smartsim._core.mli.infrastructure.worker.torch_worker import TorchWorker +from smartsim._core.mli.infrastructure.worker.worker import ( + ExecuteResult, + FetchInputResult, + FetchModelResult, + InferenceRequest, + LoadModelResult, + RequestBatch, + TransformInputResult, +) +from smartsim._core.mli.message_handler import MessageHandler +from smartsim.log import get_logger + +logger = get_logger(__name__) +# The tests in this file belong to the dragon group +pytestmark = pytest.mark.dragon + + +# simple MNIST in PyTorch +class Net(nn.Module): + def __init__(self): + super(Net, self).__init__() + self.conv1 = nn.Conv2d(1, 32, 3, 1) + self.conv2 = nn.Conv2d(32, 64, 3, 1) + self.dropout1 = nn.Dropout(0.25) + self.dropout2 = nn.Dropout(0.5) + self.fc1 = nn.Linear(9216, 128) + self.fc2 = nn.Linear(128, 10) + + def forward(self, x, y): + x = self.conv1(x) + x = F.relu(x) + x = self.conv2(x) + x = F.relu(x) + x = F.max_pool2d(x, 2) + x = self.dropout1(x) + x = torch.flatten(x, 1) + x = self.fc1(x) + x = F.relu(x) + x = self.dropout2(x) + x = self.fc2(x) + output = F.log_softmax(x, dim=1) + return output + + +torch_device = {"cpu": "cpu", "gpu": "cuda"} + + +def get_batch() -> torch.Tensor: + return torch.rand(20, 1, 28, 28) + + +def create_torch_model(): + n = Net() + example_forward_input = get_batch() + module = torch.jit.trace(n, [example_forward_input, example_forward_input]) + model_buffer = io.BytesIO() + 
torch.jit.save(module, model_buffer) + return model_buffer.getvalue() + + +def get_request() -> InferenceRequest: + + tensors = [get_batch() for _ in range(2)] + tensor_numpy = [tensor.numpy() for tensor in tensors] + serialized_tensors_descriptors = [ + MessageHandler.build_tensor_descriptor("c", "float32", list(tensor.shape)) + for tensor in tensors + ] + + return InferenceRequest( + model_key=FeatureStoreKey(key="model", descriptor="xyz"), + callback=None, + raw_inputs=tensor_numpy, + input_keys=None, + input_meta=serialized_tensors_descriptors, + output_keys=None, + raw_model=create_torch_model(), + batch_size=0, + ) + + +def get_request_batch_from_request( + request: InferenceRequest, inputs: t.Optional[TransformInputResult] = None +) -> RequestBatch: + + return RequestBatch([request], inputs, request.model_key) + + +sample_request: InferenceRequest = get_request() +sample_request_batch: RequestBatch = get_request_batch_from_request(sample_request) +worker = TorchWorker() + + +def test_load_model(mlutils) -> None: + fetch_model_result = FetchModelResult(sample_request.raw_model) + load_model_result = worker.load_model( + sample_request_batch, fetch_model_result, mlutils.get_test_device().lower() + ) + + assert load_model_result.model( + get_batch().to(torch_device[mlutils.get_test_device().lower()]), + get_batch().to(torch_device[mlutils.get_test_device().lower()]), + ).shape == torch.Size((20, 10)) + + +def test_transform_input(mlutils) -> None: + fetch_input_result = FetchInputResult( + sample_request.raw_inputs, sample_request.input_meta + ) + + mem_pool = MemoryPool.attach(dragon_gs_pool.create(1024**2).sdesc) + + transform_input_result = worker.transform_input( + sample_request_batch, [fetch_input_result], mem_pool + ) + + batch = get_batch().numpy() + assert transform_input_result.slices[0] == slice(0, batch.shape[0]) + + for tensor_index in range(2): + assert torch.Size(transform_input_result.dims[tensor_index]) == batch.shape + assert transform_input_result.dtypes[tensor_index] == str(batch.dtype) + mem_alloc = MemoryAlloc.attach(transform_input_result.transformed[tensor_index]) + itemsize = batch.itemsize + tensor = torch.from_numpy( + np.frombuffer( + mem_alloc.get_memview()[ + 0 : np.prod(transform_input_result.dims[tensor_index]) * itemsize + ], + dtype=transform_input_result.dtypes[tensor_index], + ).reshape(transform_input_result.dims[tensor_index]) + ) + + assert torch.equal( + tensor, torch.from_numpy(sample_request.raw_inputs[tensor_index]) + ) + + mem_pool.destroy() + + +def test_execute(mlutils) -> None: + load_model_result = LoadModelResult( + Net().to(torch_device[mlutils.get_test_device().lower()]) + ) + fetch_input_result = FetchInputResult( + sample_request.raw_inputs, sample_request.input_meta + ) + + request_batch = get_request_batch_from_request(sample_request, fetch_input_result) + + mem_pool = MemoryPool.attach(dragon_gs_pool.create(1024**2).sdesc) + + transform_result = worker.transform_input( + request_batch, [fetch_input_result], mem_pool + ) + + execute_result = worker.execute( + request_batch, + load_model_result, + transform_result, + mlutils.get_test_device().lower(), + ) + + assert all( + result.shape == torch.Size((20, 10)) for result in execute_result.predictions + ) + + mem_pool.destroy() + + +def test_transform_output(mlutils): + tensors = [torch.rand((20, 10)) for _ in range(2)] + execute_result = ExecuteResult(tensors, [slice(0, 20)]) + + transformed_output = worker.transform_output(sample_request_batch, execute_result) + + assert 
transformed_output[0].outputs == [item.numpy().tobytes() for item in tensors] + assert transformed_output[0].shape == None + assert transformed_output[0].order == "c" + assert transformed_output[0].dtype == "float32" diff --git a/tests/dragon/test_worker_manager.py b/tests/dragon/test_worker_manager.py new file mode 100644 index 000000000..a33416425 --- /dev/null +++ b/tests/dragon/test_worker_manager.py @@ -0,0 +1,218 @@ +# BSD 2-Clause License +# +# Copyright (c) 2021-2024, Hewlett Packard Enterprise +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import io +import logging +import pathlib +import time + +import pytest + +torch = pytest.importorskip("torch") +dragon = pytest.importorskip("dragon") + +import base64 +import multiprocessing as mp + +try: + mp.set_start_method("dragon") +except Exception: + pass + +import os + +import dragon.channels as dch +from dragon import fli +from dragon.mpbridge.queues import DragonQueue + +from smartsim._core.mli.comm.channel.channel import CommChannelBase +from smartsim._core.mli.comm.channel.dragonfli import DragonFLIChannel +from smartsim._core.mli.infrastructure.control.workermanager import ( + EnvironmentConfigLoader, + WorkerManager, +) +from smartsim._core.mli.infrastructure.storage.dragonfeaturestore import ( + DragonFeatureStore, +) +from smartsim._core.mli.infrastructure.storage.featurestore import FeatureStore +from smartsim._core.mli.infrastructure.worker.torch_worker import TorchWorker +from smartsim._core.mli.message_handler import MessageHandler +from smartsim.log import get_logger + +from .featurestore import FileSystemFeatureStore +from .utils.channel import FileSystemCommChannel + +logger = get_logger(__name__) +# The tests in this file belong to the dragon group +pytestmark = pytest.mark.dragon + + +def persist_model_file(model_path: pathlib.Path) -> pathlib.Path: + """Create a simple torch model and persist to disk for + testing purposes. 
+ + TODO: remove once unit tests are in place""" + # test_path = pathlib.Path(work_dir) + if not model_path.parent.exists(): + model_path.parent.mkdir(parents=True, exist_ok=True) + + model_path.unlink(missing_ok=True) + # model_path = test_path / "basic.pt" + + model = torch.nn.Linear(2, 1) + torch.save(model, model_path) + + return model_path + + +def mock_messages( + worker_manager_queue: CommChannelBase, + feature_store: FeatureStore, + feature_store_root_dir: pathlib.Path, + comm_channel_root_dir: pathlib.Path, +) -> None: + """Mock event producer for triggering the inference pipeline""" + feature_store_root_dir.mkdir(parents=True, exist_ok=True) + comm_channel_root_dir.mkdir(parents=True, exist_ok=True) + + model_path = persist_model_file(feature_store_root_dir.parent / "model_original.pt") + model_bytes = model_path.read_bytes() + model_key = str(feature_store_root_dir / "model_fs.pt") + + feature_store[model_key] = model_bytes + + iteration_number = 0 + + while True: + iteration_number += 1 + time.sleep(1) + # 1. for demo, ignore upstream and just put stuff into downstream + # 2. for demo, only one downstream but we'd normally have to filter + # msg content and send to the correct downstream (worker) queue + # timestamp = time.time_ns() + # mock_channel = test_path / f"brainstorm-{timestamp}.txt" + # mock_channel.touch() + + # thread - just look for key (wait for keys) + # call checkpoint, try to get non-persistent key, it blocks + # working set size > 1 has side-effects + # only incurs cost when working set size has been exceeded + + channel_key = comm_channel_root_dir / f"{iteration_number}/channel.txt" + callback_channel = FileSystemCommChannel(pathlib.Path(channel_key)) + + input_path = feature_store_root_dir / f"{iteration_number}/input.pt" + output_path = feature_store_root_dir / f"{iteration_number}/output.pt" + + input_key = str(input_path) + output_key = str(output_path) + + buffer = io.BytesIO() + tensor = torch.randn((1, 2), dtype=torch.float32) + torch.save(tensor, buffer) + feature_store[input_key] = buffer.getvalue() + fsd = feature_store.descriptor + + message_tensor_output_key = MessageHandler.build_tensor_key(output_key, fsd) + message_tensor_input_key = MessageHandler.build_tensor_key(input_key, fsd) + message_model_key = MessageHandler.build_model_key(model_key, fsd) + + request = MessageHandler.build_request( + reply_channel=callback_channel.descriptor, + model=message_model_key, + inputs=[message_tensor_input_key], + outputs=[message_tensor_output_key], + output_descriptors=[], + custom_attributes=None, + ) + request_bytes = MessageHandler.serialize_request(request) + worker_manager_queue.send(request_bytes) + + +@pytest.fixture +def prepare_environment(test_dir: str) -> pathlib.Path: + """Cleanup prior outputs to run demo repeatedly""" + path = pathlib.Path(f"{test_dir}/workermanager.log") + logging.basicConfig(filename=path.absolute(), level=logging.DEBUG) + return path + + +def test_worker_manager(prepare_environment: pathlib.Path) -> None: + """Test the worker manager""" + + test_path = prepare_environment + fs_path = test_path / "feature_store" + comm_path = test_path / "comm_store" + + to_worker_channel = dch.Channel.make_process_local() + to_worker_fli = fli.FLInterface(main_ch=to_worker_channel, manager_ch=None) + to_worker_fli_serialized = to_worker_fli.serialize() + + # NOTE: env vars should be set prior to instantiating EnvironmentConfigLoader + # or test environment may be unable to send messages w/queue + descriptor = 
base64.b64encode(to_worker_fli_serialized).decode("utf-8") + os.environ["_SMARTSIM_REQUEST_QUEUE"] = descriptor + + config_loader = EnvironmentConfigLoader( + featurestore_factory=DragonFeatureStore.from_descriptor, + callback_factory=FileSystemCommChannel.from_descriptor, + queue_factory=DragonFLIChannel.from_descriptor, + ) + integrated_worker_type = TorchWorker + + worker_manager = WorkerManager( + config_loader, + integrated_worker_type, + as_service=True, + cooldown=5, + device="cpu", + dispatcher_queue=mp.Queue(maxsize=0), + ) + + worker_queue = config_loader.get_queue() + if worker_queue is None: + logger.warn( + f"FLI input queue not loaded correctly from config_loader: {config_loader._queue_descriptor}" + ) + + # create a mock client application to populate the request queue + msg_pump = mp.Process( + target=mock_messages, + args=( + worker_queue, + FileSystemFeatureStore(fs_path), + fs_path, + comm_path, + ), + ) + msg_pump.start() + + # create a process to execute commands + process = mp.Process(target=worker_manager.execute) + process.start() + process.join(timeout=5) + process.kill() + msg_pump.kill() diff --git a/tests/dragon/utils/__init__.py b/tests/dragon/utils/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/tests/dragon/utils/channel.py b/tests/dragon/utils/channel.py new file mode 100644 index 000000000..08b659c07 --- /dev/null +++ b/tests/dragon/utils/channel.py @@ -0,0 +1,94 @@ +# BSD 2-Clause License +# +# Copyright (c) 2021-2024, Hewlett Packard Enterprise +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
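+#
+# FileSystemCommChannel is a test double for CommChannelBase: messages are
+# exchanged through a file on disk so the comm-channel interface can be
+# exercised without standing up a dragon channel.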
+
+import pathlib
+import threading
+import typing as t
+
+from smartsim._core.mli.comm.channel.channel import CommChannelBase
+from smartsim.log import get_logger
+
+logger = get_logger(__name__)
+
+
+class FileSystemCommChannel(CommChannelBase):
+    """Passes messages by writing to a file"""
+
+    def __init__(self, key: t.Union[bytes, pathlib.Path]) -> None:
+        """Initialize the FileSystemCommChannel instance
+
+        :param key: a path to the root directory of the feature store"""
+        self._lock = threading.RLock()
+
+        if not isinstance(key, bytes):
+            super().__init__(key.as_posix().encode("utf-8"))
+            self._file_path = key
+        else:
+            super().__init__(key)
+            self._file_path = pathlib.Path(key.decode("utf-8"))
+
+        if not self._file_path.parent.exists():
+            self._file_path.parent.mkdir(parents=True)
+
+        self._file_path.touch()
+
+    def send(self, value: bytes) -> None:
+        """Send a message through the underlying communication channel
+
+        :param value: The value to send"""
+        logger.debug(
+            f"Channel {self.descriptor.decode('utf-8')} sending message to {self._file_path}"
+        )
+        with self._lock:
+            self._file_path.write_bytes(value)
+
+    def recv(self) -> bytes:
+        """Receive a message through the underlying communication channel
+
+        :returns: the received message"""
+        with self._lock:
+            if self._file_path.exists():
+                incoming = self._file_path.read_bytes()
+                self._file_path.unlink()
+            return incoming
+
+    @classmethod
+    def from_descriptor(
+        cls,
+        descriptor: t.Union[str, bytes],
+    ) -> "FileSystemCommChannel":
+        """A factory method that creates an instance from a descriptor string
+
+        :param descriptor: The descriptor that uniquely identifies the resource
+        :returns: An attached FileSystemCommChannel"""
+        try:
+            if isinstance(descriptor, str):
+                path = pathlib.Path(descriptor)
+            else:
+                path = pathlib.Path(descriptor.decode("utf-8"))
+            return FileSystemCommChannel(path)
+        except:
+            print(f"failed to create FS comm channel: {descriptor}")
diff --git a/tests/dragon/utils/worker.py b/tests/dragon/utils/worker.py
new file mode 100644
index 000000000..0582cae56
--- /dev/null
+++ b/tests/dragon/utils/worker.py
@@ -0,0 +1,104 @@
+# BSD 2-Clause License
+#
+# Copyright (c) 2021-2024, Hewlett Packard Enterprise
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# 1. Redistributions of source code must retain the above copyright notice, this
+#    list of conditions and the following disclaimer.
+#
+# 2. Redistributions in binary form must reproduce the above copyright notice,
+#    this list of conditions and the following disclaimer in the documentation
+#    and/or other materials provided with the distribution.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
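+#
+# IntegratedTorchWorker is a minimal PyTorch worker used by these tests; each
+# stage (load_model, transform_input, execute, transform_output) operates on a
+# single InferenceRequest rather than a RequestBatch.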
+ +import io +import typing as t + +import torch + +import smartsim._core.mli.infrastructure.worker.worker as mliw +import smartsim.error as sse +from smartsim.log import get_logger + +logger = get_logger(__name__) + + +class IntegratedTorchWorker(mliw.MachineLearningWorkerBase): + """A minimum implementation of a worker that executes a PyTorch model""" + + # @staticmethod + # def deserialize(request: InferenceRequest) -> t.List[t.Any]: + # # request.input_meta + # # request.raw_inputs + # return request + + @staticmethod + def load_model( + request: mliw.InferenceRequest, fetch_result: mliw.FetchModelResult, device: str + ) -> mliw.LoadModelResult: + model_bytes = fetch_result.model_bytes or request.raw_model + if not model_bytes: + raise ValueError("Unable to load model without reference object") + + model: torch.nn.Module = torch.load(io.BytesIO(model_bytes)) + result = mliw.LoadModelResult(model) + return result + + @staticmethod + def transform_input( + request: mliw.InferenceRequest, + fetch_result: mliw.FetchInputResult, + device: str, + ) -> mliw.TransformInputResult: + # extra metadata for assembly can be found in request.input_meta + raw_inputs = request.raw_inputs or fetch_result.inputs + + result: t.List[torch.Tensor] = [] + # should this happen here? + # consider - fortran to c data layout + # is there an intermediate representation before really doing torch.load? + if raw_inputs: + result = [torch.load(io.BytesIO(item)) for item in raw_inputs] + + return mliw.TransformInputResult(result) + + @staticmethod + def execute( + request: mliw.InferenceRequest, + load_result: mliw.LoadModelResult, + transform_result: mliw.TransformInputResult, + ) -> mliw.ExecuteResult: + if not load_result.model: + raise sse.SmartSimError("Model must be loaded to execute") + + model = load_result.model + results = [model(tensor) for tensor in transform_result.transformed] + + execute_result = mliw.ExecuteResult(results) + return execute_result + + @staticmethod + def transform_output( + request: mliw.InferenceRequest, + execute_result: mliw.ExecuteResult, + result_device: str, + ) -> mliw.TransformOutputResult: + # send the original tensors... + execute_result.predictions = [t.detach() for t in execute_result.predictions] + # todo: solve sending all tensor metadata that coincisdes with each prediction + return mliw.TransformOutputResult( + execute_result.predictions, [1], "c", "float32" + ) diff --git a/tests/mli/__init__.py b/tests/mli/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/tests/mli/channel.py b/tests/mli/channel.py new file mode 100644 index 000000000..226e8683d --- /dev/null +++ b/tests/mli/channel.py @@ -0,0 +1,91 @@ +# BSD 2-Clause License +# +# Copyright (c) 2021-2024, Hewlett Packard Enterprise +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. 
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+import pathlib
+import threading
+import typing as t
+
+from smartsim._core.mli.comm.channel.channel import CommChannelBase
+from smartsim.log import get_logger
+
+logger = get_logger(__name__)
+
+
+class FileSystemCommChannel(CommChannelBase):
+    """Passes messages by writing to a file"""
+
+    def __init__(self, key: t.Union[bytes, pathlib.Path]) -> None:
+        """Initialize the FileSystemCommChannel instance
+
+        :param key: a path to the root directory of the feature store"""
+        self._lock = threading.RLock()
+        if not isinstance(key, bytes):
+            super().__init__(key.as_posix().encode("utf-8"))
+            self._file_path = key
+        else:
+            super().__init__(key)
+            self._file_path = pathlib.Path(key.decode("utf-8"))
+
+        if not self._file_path.parent.exists():
+            self._file_path.parent.mkdir(parents=True)
+
+        self._file_path.touch()
+
+    def send(self, value: bytes) -> None:
+        """Send a message through the underlying communication channel
+
+        :param value: The value to send"""
+        logger.debug(
+            f"Channel {self.descriptor.decode('utf-8')} sending message to {self._file_path}"
+        )
+        with self._lock:
+            self._file_path.write_bytes(value)
+
+    def recv(self) -> bytes:
+        """Receive a message through the underlying communication channel
+
+        :returns: the received message"""
+        with self._lock:
+            if self._file_path.exists():
+                incoming = self._file_path.read_bytes()
+                self._file_path.unlink()
+            return incoming
+
+    @classmethod
+    def from_descriptor(
+        cls,
+        descriptor: str,
+    ) -> "FileSystemCommChannel":
+        """A factory method that creates an instance from a descriptor string
+
+        :param descriptor: The descriptor that uniquely identifies the resource
+        :returns: An attached FileSystemCommChannel"""
+        try:
+            path = pathlib.Path(descriptor)
+            return FileSystemCommChannel(path)
+        except:
+            print(f"failed to create fs comm channel: {descriptor}")
+            raise
diff --git a/tests/mli/featurestore.py b/tests/mli/featurestore.py
new file mode 100644
index 000000000..de748ae6e
--- /dev/null
+++ b/tests/mli/featurestore.py
@@ -0,0 +1,155 @@
+# BSD 2-Clause License
+#
+# Copyright (c) 2021-2024, Hewlett Packard Enterprise
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# 1. Redistributions of source code must retain the above copyright notice, this
+#    list of conditions and the following disclaimer.
+#
+# 2. Redistributions in binary form must reproduce the above copyright notice,
+#    this list of conditions and the following disclaimer in the documentation
+#    and/or other materials provided with the distribution.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+import pathlib
+import typing as t
+
+import smartsim.error as sse
+from smartsim._core.mli.infrastructure.storage.featurestore import FeatureStore
+from smartsim.log import get_logger
+
+logger = get_logger(__name__)
+
+
+class MemoryFeatureStore(FeatureStore):
+    """A feature store with values persisted only in local memory"""
+
+    def __init__(self) -> None:
+        """Initialize the MemoryFeatureStore instance"""
+        self._storage: t.Dict[str, bytes] = {}
+
+    def __getitem__(self, key: str) -> bytes:
+        """Retrieve an item using key
+
+        :param key: Unique key of an item to retrieve from the feature store"""
+        if key not in self._storage:
+            raise sse.SmartSimError(f"{key} not found in feature store")
+        return self._storage[key]
+
+    def __setitem__(self, key: str, value: bytes) -> None:
+        """Assign a value using key
+
+        :param key: Unique key of an item to set in the feature store
+        :param value: Value to persist in the feature store"""
+        self._storage[key] = value
+
+    def __contains__(self, key: str) -> bool:
+        """Membership operator to test for a key existing within the feature store.
+        Return `True` if the key is found, `False` otherwise
+        :param key: Unique key of an item to retrieve from the feature store"""
+        return key in self._storage
+
+    @property
+    def descriptor(self) -> str:
+        """Unique identifier enabling a client to connect to the feature store
+
+        :returns: A descriptor encoded as a string"""
+        return "in-memory-fs"
+
+
+class FileSystemFeatureStore(FeatureStore):
+    """Alternative feature store implementation for testing. Stores all
+    data on the file system"""
+
+    def __init__(
+        self, storage_dir: t.Optional[t.Union[pathlib.Path, str]] = None
+    ) -> None:
+        """Initialize the FileSystemFeatureStore instance
+
+        :param storage_dir: (optional) root directory to store all data relative to"""
+        if isinstance(storage_dir, str):
+            storage_dir = pathlib.Path(storage_dir)
+        self._storage_dir = storage_dir
+
+    def __getitem__(self, key: str) -> bytes:
+        """Retrieve an item using key
+
+        :param key: Unique key of an item to retrieve from the feature store"""
+        path = self._key_path(key)
+        if not path.exists():
+            raise sse.SmartSimError(f"{path} not found in feature store")
+        return path.read_bytes()
+
+    def __setitem__(self, key: str, value: bytes) -> None:
+        """Assign a value using key
+
+        :param key: Unique key of an item to set in the feature store
+        :param value: Value to persist in the feature store"""
+        path = self._key_path(key, create=True)
+        path.write_bytes(value)
+
+    def __contains__(self, key: str) -> bool:
+        """Membership operator to test for a key existing within the feature store.
+ + :param key: Unique key of an item to retrieve from the feature store + :returns: `True` if the key is found, `False` otherwise""" + path = self._key_path(key) + return path.exists() + + def _key_path(self, key: str, create: bool = False) -> pathlib.Path: + """Given a key, return a path that is optionally combined with a base + directory used by the FileSystemFeatureStore. + + :param key: Unique key of an item to retrieve from the feature store""" + value = pathlib.Path(key) + + if self._storage_dir: + value = self._storage_dir / key + + if create: + value.parent.mkdir(parents=True, exist_ok=True) + + return value + + @property + def descriptor(self) -> str: + """Unique identifier enabling a client to connect to the feature store + + :returns: A descriptor encoded as a string""" + if not self._storage_dir: + raise ValueError("No storage path configured") + return self._storage_dir.as_posix() + + @classmethod + def from_descriptor( + cls, + descriptor: str, + ) -> "FileSystemFeatureStore": + """A factory method that creates an instance from a descriptor string + + :param descriptor: The descriptor that uniquely identifies the resource + :returns: An attached FileSystemFeatureStore""" + try: + path = pathlib.Path(descriptor) + path.mkdir(parents=True, exist_ok=True) + if not path.is_dir(): + raise ValueError("FileSystemFeatureStore requires a directory path") + if not path.exists(): + path.mkdir(parents=True, exist_ok=True) + return FileSystemFeatureStore(path) + except: + logger.error(f"Error while creating FileSystemFeatureStore: {descriptor}") + raise diff --git a/tests/mli/test_default_torch_worker.py b/tests/mli/test_default_torch_worker.py new file mode 100644 index 000000000..b2ec6c3dc --- /dev/null +++ b/tests/mli/test_default_torch_worker.py @@ -0,0 +1,206 @@ +# # BSD 2-Clause License +# # +# # Copyright (c) 2021-2024, Hewlett Packard Enterprise +# # All rights reserved. +# # +# # Redistribution and use in source and binary forms, with or without +# # modification, are permitted provided that the following conditions are met: +# # +# # 1. Redistributions of source code must retain the above copyright notice, this +# # list of conditions and the following disclaimer. +# # +# # 2. Redistributions in binary form must reproduce the above copyright notice, +# # this list of conditions and the following disclaimer in the documentation +# # and/or other materials provided with the distribution. +# # +# # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# # AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# # DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# # FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# # DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# # SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# # CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
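+# NOTE: every test in this module is currently commented out; pytest collects
+# nothing from this file.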
+ +# import io +# import pathlib +# import typing as t + +# import pytest +# import torch + +# from smartsim._core.mli.infrastructure.worker.integratedtorchworker import ( +# IntegratedTorchWorker, +# ) +# import smartsim.error as sse +# from smartsim._core.mli.infrastructure import MemoryFeatureStore +# from smartsim._core.mli.infrastructure.worker.worker import ( +# ExecuteResult, +# FetchInputResult, +# FetchModelResult, +# InferenceRequest, +# TransformInputResult, +# LoadModelResult, +# ) +# from smartsim._core.utils import installed_redisai_backends + +# # The tests in this file belong to the group_a group +# pytestmark = pytest.mark.group_b + +# # retrieved from pytest fixtures +# is_dragon = pytest.test_launcher == "dragon" +# torch_available = "torch" in installed_redisai_backends() + + +# @pytest.fixture +# def persist_torch_model(test_dir: str) -> pathlib.Path: +# test_path = pathlib.Path(test_dir) +# model_path = test_path / "basic.pt" + +# model = torch.nn.Linear(2, 1) +# torch.save(model, model_path) + +# return model_path + + +# # def test_deserialize() -> None: +# # """Verify that serialized requests are properly deserialized to +# # and converted to the internal representation used by ML workers""" +# # worker = SampleTorchWorker +# # buffer = io.BytesIO() + +# # exp_model_key = "model-key" +# # msg = InferenceRequest(model_key=exp_model_key) +# # pickle.dump(msg, buffer) + +# # deserialized: InferenceRequest = worker.deserialize(buffer.getvalue()) + +# # assert deserialized.model_key == exp_model_key +# # # assert deserialized.backend == exp_backend + + +# @pytest.mark.skipif(not torch_available, reason="Torch backend is not installed") +# def test_load_model_from_disk(persist_torch_model: pathlib.Path) -> None: +# """Verify that a model can be loaded using a FileSystemFeatureStore""" +# worker = IntegratedTorchWorker +# request = InferenceRequest(raw_model=persist_torch_model.read_bytes()) + +# fetch_result = FetchModelResult(persist_torch_model.read_bytes()) +# load_result = worker.load_model(request, fetch_result) + +# input = torch.randn(2) +# pred = load_result.model(input) + +# assert pred + + +# @pytest.mark.skipif(not torch_available, reason="Torch backend is not installed") +# def test_transform_input() -> None: +# """Verify that the default input transform operation is a no-op copy""" +# rows, cols = 1, 4 +# num_values = 7 +# tensors = [torch.randn((rows, cols)) for _ in range(num_values)] + +# request = InferenceRequest() + +# inputs: t.List[bytes] = [] +# for tensor in tensors: +# buffer = io.BytesIO() +# torch.save(tensor, buffer) +# inputs.append(buffer.getvalue()) + +# fetch_result = FetchInputResult(inputs) +# worker = IntegratedTorchWorker +# result = worker.transform_input(request, fetch_result) +# transformed: t.Collection[torch.Tensor] = result.transformed + +# assert len(transformed) == num_values + +# for output, expected in zip(transformed, tensors): +# assert output.shape == expected.shape +# assert output.equal(expected) + +# transformed = list(transformed) + +# original: torch.Tensor = tensors[0] +# assert transformed[0].equal(original) + +# # verify a copy was made +# transformed[0] = 2 * transformed[0] +# assert transformed[0].equal(2 * original) + + +# @pytest.mark.skipif(not torch_available, reason="Torch backend is not installed") +# def test_execute_model(persist_torch_model: pathlib.Path) -> None: +# """Verify that a model executes corrrectly via the worker""" + +# # put model bytes into memory +# model_name = "test-key" +# feature_store = 
MemoryFeatureStore() +# feature_store[model_name] = persist_torch_model.read_bytes() + +# worker = IntegratedTorchWorker +# request = InferenceRequest(model_key=model_name) +# fetch_result = FetchModelResult(persist_torch_model.read_bytes()) +# load_result = worker.load_model(request, fetch_result) + +# value = torch.randn(2) +# transform_result = TransformInputResult([value]) + +# execute_result = worker.execute(request, load_result, transform_result) + +# assert execute_result.predictions is not None + + +# @pytest.mark.skipif(not torch_available, reason="Torch backend is not installed") +# def test_execute_missing_model(persist_torch_model: pathlib.Path) -> None: +# """Verify that a executing a model with an invalid key fails cleanly""" + +# # use key that references an un-set model value +# model_name = "test-key" +# feature_store = MemoryFeatureStore() +# feature_store[model_name] = persist_torch_model.read_bytes() + +# worker = IntegratedTorchWorker +# request = InferenceRequest(input_keys=[model_name]) + +# load_result = LoadModelResult(None) +# transform_result = TransformInputResult( +# [torch.randn(2), torch.randn(2), torch.randn(2)] +# ) + +# with pytest.raises(sse.SmartSimError) as ex: +# worker.execute(request, load_result, transform_result) + +# assert "Model must be loaded" in ex.value.args[0] + + +# @pytest.mark.skipif(not torch_available, reason="Torch backend is not installed") +# def test_transform_output() -> None: +# """Verify that the default output transform operation is a no-op copy""" +# rows, cols = 1, 4 +# num_values = 7 +# inputs = [torch.randn((rows, cols)) for _ in range(num_values)] +# exp_outputs = [torch.Tensor(tensor) for tensor in inputs] + +# worker = SampleTorchWorker +# request = InferenceRequest() +# exec_result = ExecuteResult(inputs) + +# result = worker.transform_output(request, exec_result) + +# assert len(result.outputs) == num_values + +# for output, expected in zip(result.outputs, exp_outputs): +# assert output.shape == expected.shape +# assert output.equal(expected) + +# transformed = list(result.outputs) + +# # verify a copy was made +# original: torch.Tensor = inputs[0] +# transformed[0] = 2 * transformed[0] + +# assert transformed[0].equal(2 * original) diff --git a/tests/mli/test_integrated_torch_worker.py b/tests/mli/test_integrated_torch_worker.py new file mode 100644 index 000000000..60f1f0c6b --- /dev/null +++ b/tests/mli/test_integrated_torch_worker.py @@ -0,0 +1,275 @@ +# BSD 2-Clause License +# +# Copyright (c) 2021-2024, Hewlett Packard Enterprise +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import pathlib +import typing as t + +import pytest +import torch + +# import smartsim.error as sse +# from smartsim._core.mli.infrastructure.control import workermanager as mli +# from smartsim._core.mli.message_handler import MessageHandler +from smartsim._core.utils import installed_redisai_backends + +# The tests in this file belong to the group_b group +pytestmark = pytest.mark.group_b + +# retrieved from pytest fixtures +is_dragon = pytest.test_launcher == "dragon" +torch_available = "torch" in installed_redisai_backends() + + +@pytest.fixture +def persist_torch_model(test_dir: str) -> pathlib.Path: + test_path = pathlib.Path(test_dir) + model_path = test_path / "basic.pt" + + model = torch.nn.Linear(2, 1) + torch.save(model, model_path) + + return model_path + + +# todo: move deserialization tests into suite for worker manager where serialization occurs + + +# @pytest.mark.skipif(not torch_available, reason="Torch backend is not installed") +# def test_deserialize_direct_request(persist_torch_model: pathlib.Path) -> None: +# """Verify that a direct requestis deserialized properly""" +# worker = mli.IntegratedTorchWorker +# # feature_store = mli.MemoryFeatureStore() + +# model_bytes = persist_torch_model.read_bytes() +# input_tensor = torch.randn(2) + +# expected_callback_channel = b"faux_channel_descriptor_bytes" +# callback_channel = mli.DragonCommChannel.find(expected_callback_channel) + +# message_tensor_input = MessageHandler.build_tensor( +# input_tensor, "c", "float32", [2] +# ) + +# request = MessageHandler.build_request( +# reply_channel=callback_channel.descriptor, +# model=model_bytes, +# inputs=[message_tensor_input], +# outputs=[], +# custom_attributes=None, +# ) + +# msg_bytes = MessageHandler.serialize_request(request) + +# inference_request = worker.deserialize(msg_bytes) +# assert inference_request.callback._descriptor == expected_callback_channel + + +# @pytest.mark.skipif(not torch_available, reason="Torch backend is not installed") +# def test_deserialize_indirect_request(persist_torch_model: pathlib.Path) -> None: +# """Verify that an indirect request is deserialized correctly""" +# worker = mli.IntegratedTorchWorker +# # feature_store = mli.MemoryFeatureStore() + +# model_key = "persisted-model" +# # model_bytes = persist_torch_model.read_bytes() +# # feature_store[model_key] = model_bytes + +# input_key = f"demo-input" +# # input_tensor = torch.randn(2) +# # feature_store[input_key] = input_tensor + +# expected_callback_channel = b"faux_channel_descriptor_bytes" +# callback_channel = mli.DragonCommChannel.find(expected_callback_channel) + +# output_key = f"demo-output" + +# message_tensor_output_key = MessageHandler.build_tensor_key(output_key) +# message_tensor_input_key = MessageHandler.build_tensor_key(input_key) +# message_model_key = MessageHandler.build_model_key(model_key) + +# request = MessageHandler.build_request( +# reply_channel=callback_channel.descriptor, +# model=message_model_key, +# inputs=[message_tensor_input_key], 
+# outputs=[message_tensor_output_key], +# custom_attributes=None, +# ) + +# msg_bytes = MessageHandler.serialize_request(request) + +# inference_request = worker.deserialize(msg_bytes) +# assert inference_request.callback._descriptor == expected_callback_channel + + +# @pytest.mark.skipif(not torch_available, reason="Torch backend is not installed") +# def test_deserialize_mixed_mode_indirect_inputs( +# persist_torch_model: pathlib.Path, +# ) -> None: +# """Verify that a mixed mode (combining direct and indirect inputs, models, outputs) +# with indirect inputs is deserialized correctly""" +# worker = mli.IntegratedTorchWorker +# # feature_store = mli.MemoryFeatureStore() + +# # model_key = "persisted-model" +# model_bytes = persist_torch_model.read_bytes() +# # feature_store[model_key] = model_bytes + +# input_key = f"demo-input" +# # input_tensor = torch.randn(2) +# # feature_store[input_key] = input_tensor + +# expected_callback_channel = b"faux_channel_descriptor_bytes" +# callback_channel = mli.DragonCommChannel.find(expected_callback_channel) + +# output_key = f"demo-output" + +# message_tensor_output_key = MessageHandler.build_tensor_key(output_key) +# message_tensor_input_key = MessageHandler.build_tensor_key(input_key) +# # message_model_key = MessageHandler.build_model_key(model_key) + +# request = MessageHandler.build_request( +# reply_channel=callback_channel.descriptor, +# model=model_bytes, +# inputs=[message_tensor_input_key], +# # outputs=[message_tensor_output_key], +# outputs=[], +# custom_attributes=None, +# ) + +# msg_bytes = MessageHandler.serialize_request(request) + +# inference_request = worker.deserialize(msg_bytes) +# assert inference_request.callback._descriptor == expected_callback_channel + + +# @pytest.mark.skipif(not torch_available, reason="Torch backend is not installed") +# def test_deserialize_mixed_mode_indirect_outputs( +# persist_torch_model: pathlib.Path, +# ) -> None: +# """Verify that a mixed mode (combining direct and indirect inputs, models, outputs) +# with indirect outputs is deserialized correctly""" +# worker = mli.IntegratedTorchWorker +# # feature_store = mli.MemoryFeatureStore() + +# # model_key = "persisted-model" +# model_bytes = persist_torch_model.read_bytes() +# # feature_store[model_key] = model_bytes + +# input_key = f"demo-input" +# input_tensor = torch.randn(2) +# # feature_store[input_key] = input_tensor + +# expected_callback_channel = b"faux_channel_descriptor_bytes" +# callback_channel = mli.DragonCommChannel.find(expected_callback_channel) + +# output_key = f"demo-output" + +# message_tensor_output_key = MessageHandler.build_tensor_key(output_key) +# # message_tensor_input_key = MessageHandler.build_tensor_key(input_key) +# # message_model_key = MessageHandler.build_model_key(model_key) +# message_tensor_input = MessageHandler.build_tensor( +# input_tensor, "c", "float32", [2] +# ) + +# request = MessageHandler.build_request( +# reply_channel=callback_channel.descriptor, +# model=model_bytes, +# inputs=[message_tensor_input], +# # outputs=[message_tensor_output_key], +# outputs=[message_tensor_output_key], +# custom_attributes=None, +# ) + +# msg_bytes = MessageHandler.serialize_request(request) + +# inference_request = worker.deserialize(msg_bytes) +# assert inference_request.callback._descriptor == expected_callback_channel + + +# @pytest.mark.skipif(not torch_available, reason="Torch backend is not installed") +# def test_deserialize_mixed_mode_indirect_model( +# persist_torch_model: pathlib.Path, +# ) -> None: +# """Verify 
that a mixed mode (combining direct and indirect inputs, models, outputs) +# with indirect outputs is deserialized correctly""" +# worker = mli.IntegratedTorchWorker +# # feature_store = mli.MemoryFeatureStore() + +# model_key = "persisted-model" +# # model_bytes = persist_torch_model.read_bytes() +# # feature_store[model_key] = model_bytes + +# # input_key = f"demo-input" +# input_tensor = torch.randn(2) +# # feature_store[input_key] = input_tensor + +# expected_callback_channel = b"faux_channel_descriptor_bytes" +# callback_channel = mli.DragonCommChannel.find(expected_callback_channel) + +# output_key = f"demo-output" + +# # message_tensor_output_key = MessageHandler.build_tensor_key(output_key) +# # message_tensor_input_key = MessageHandler.build_tensor_key(input_key) +# message_model_key = MessageHandler.build_model_key(model_key) +# message_tensor_input = MessageHandler.build_tensor( +# input_tensor, "c", "float32", [2] +# ) + +# request = MessageHandler.build_request( +# reply_channel=callback_channel.descriptor, +# model=message_model_key, +# inputs=[message_tensor_input], +# # outputs=[message_tensor_output_key], +# outputs=[], +# custom_attributes=None, +# ) + +# msg_bytes = MessageHandler.serialize_request(request) + +# inference_request = worker.deserialize(msg_bytes) +# assert inference_request.callback._descriptor == expected_callback_channel + + +# @pytest.mark.skipif(not torch_available, reason="Torch backend is not installed") +# def test_serialize(test_dir: str, persist_torch_model: pathlib.Path) -> None: +# """Verify that the worker correctly executes reply serialization""" +# worker = mli.IntegratedTorchWorker + +# reply = mli.InferenceReply() +# reply.output_keys = ["foo", "bar"] + +# # use the worker implementation of reply serialization to get bytes for +# # use on the callback channel +# reply_bytes = worker.serialize_reply(reply) +# assert reply_bytes is not None + +# # deserialize to verity the mapping in the worker.serialize_reply was correct +# actual_reply = MessageHandler.deserialize_response(reply_bytes) + +# actual_tensor_keys = [tk.key for tk in actual_reply.result.keys] +# assert set(actual_tensor_keys) == set(reply.output_keys) +# assert actual_reply.status == 200 +# assert actual_reply.statusMessage == "success" diff --git a/tests/mli/test_service.py b/tests/mli/test_service.py new file mode 100644 index 000000000..617738f94 --- /dev/null +++ b/tests/mli/test_service.py @@ -0,0 +1,205 @@ +# BSD 2-Clause License +# +# Copyright (c) 2021-2024, Hewlett Packard Enterprise +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+import datetime
+import multiprocessing as mp
+import pathlib
+import typing as t
+
+import pytest
+import torch
+
+import smartsim.error as sse
+from smartsim._core.entrypoints.service import Service
+
+# The tests in this file belong to the group_a group
+pytestmark = pytest.mark.group_a
+
+
+class SimpleService(Service):
+    """Mock implementation of a service that counts method invocations
+    using the base class event hooks."""
+
+    def __init__(
+        self,
+        log: t.List[str],
+        quit_after: int = 0,
+        as_service: bool = False,
+        cooldown: int = 0,
+        loop_delay: int = 0,
+    ) -> None:
+        super().__init__(as_service, cooldown, loop_delay)
+        self._log = log
+        self._quit_after = quit_after
+        self.num_iterations = 0
+        self.num_starts = 0
+        self.num_shutdowns = 0
+        self.num_cooldowns = 0
+        self.num_can_shutdown = 0
+        self.num_delays = 0
+
+    def _on_iteration(self) -> None:
+        self.num_iterations += 1
+
+    def _on_start(self) -> None:
+        self.num_starts += 1
+
+    def _on_shutdown(self) -> None:
+        self.num_shutdowns += 1
+
+    def _on_cooldown_elapsed(self) -> None:
+        self.num_cooldowns += 1
+
+    def _on_delay(self) -> None:
+        self.num_delays += 1
+
+    def _can_shutdown(self) -> bool:
+        self.num_can_shutdown += 1
+        if self._quit_after == 0:
+            return True
+
+        return self.num_iterations >= self._quit_after
+
+
+def test_service_init() -> None:
+    """Verify expected default values after Service initialization"""
+    activity_log: t.List[str] = []
+    service = SimpleService(activity_log)
+
+    assert service._as_service is False
+    assert service._cooldown == 0
+    assert service._loop_delay == 0
+
+
+def test_service_run_once() -> None:
+    """Verify the service completes after a single call to _on_iteration"""
+    activity_log: t.List[str] = []
+    service = SimpleService(activity_log)
+
+    service.execute()
+
+    assert service.num_iterations == 1
+    assert service.num_starts == 1
+    assert service.num_cooldowns == 0  # it never exceeds a cooldown period
+    assert service.num_can_shutdown == 0  # it automatically exits in run once
+    assert service.num_shutdowns == 1
+
+
+@pytest.mark.parametrize(
+    "num_iterations",
+    [
+        pytest.param(0, id="Immediate Shutdown"),
+        pytest.param(1, id="1x"),
+        pytest.param(2, id="2x"),
+        pytest.param(4, id="4x"),
+        pytest.param(8, id="8x"),
+        pytest.param(16, id="16x"),
+        pytest.param(32, id="32x"),
+    ],
+)
+def test_service_run_until_can_shutdown(num_iterations: int) -> None:
+    """Verify the service completes after a dynamic number of iterations
+    based on the return value of `_can_shutdown`"""
+    activity_log: t.List[str] = []
+
+    service = SimpleService(activity_log, quit_after=num_iterations, as_service=True)
+
+    service.execute()
+
+    if num_iterations == 0:
+        # no matter what, it should always execute the _on_iteration method
+        assert service.num_iterations == 1
+    else:
+        assert service.num_iterations == num_iterations
+
+    assert service.num_starts == 1
+    assert service.num_shutdowns == 1
+
+
+@pytest.mark.parametrize(
+    "cooldown",
+    [
+        pytest.param(1, id="1s"),
+        pytest.param(3, id="3s"),
+        pytest.param(5, id="5s"),
+    ],
+)
+def test_service_cooldown(cooldown: int) -> None:
+    """Verify that the cooldown period is respected"""
+    activity_log: t.List[str] = []
+
+    service = SimpleService(
+        activity_log,
+        quit_after=1,
+        as_service=True,
+        cooldown=cooldown,
+        loop_delay=0,
+    )
+
+    ts0 = datetime.datetime.now()
+    service.execute()
+    ts1 = datetime.datetime.now()
+
+    fudge_factor = 1.1  # allow a little bit of wiggle room for the loop
+    duration_in_seconds = (ts1 - ts0).total_seconds()
+
+    assert duration_in_seconds <= cooldown * fudge_factor
+    assert service.num_cooldowns == 1
+    assert service.num_shutdowns == 1
+
+
+@pytest.mark.parametrize(
+    "delay, num_iterations",
+    [
+        pytest.param(1, 3, id="1s delay, 3x"),
+        pytest.param(3, 2, id="3s delay, 2x"),
+        pytest.param(5, 1, id="5s delay, 1x"),
+    ],
+)
+def test_service_delay(delay: int, num_iterations: int) -> None:
+    """Verify that a delay is correctly added between iterations"""
+    activity_log: t.List[str] = []
+
+    service = SimpleService(
+        activity_log,
+        quit_after=num_iterations,
+        as_service=True,
+        cooldown=0,
+        loop_delay=delay,
+    )
+
+    ts0 = datetime.datetime.now()
+    service.execute()
+    ts1 = datetime.datetime.now()
+
+    # upper bound: one delay per iteration, plus one extra delay period of slack
+    expected_duration = (num_iterations + 1) * delay
+    duration_in_seconds = (ts1 - ts0).total_seconds()
+
+    assert duration_in_seconds <= expected_duration
+    assert service.num_cooldowns == 0
+    assert service.num_shutdowns == 1
diff --git a/tests/mli/worker.py b/tests/mli/worker.py
new file mode 100644
index 000000000..0582cae56
--- /dev/null
+++ b/tests/mli/worker.py
@@ -0,0 +1,104 @@
+# BSD 2-Clause License
+#
+# Copyright (c) 2021-2024, Hewlett Packard Enterprise
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# 1. Redistributions of source code must retain the above copyright notice, this
+# list of conditions and the following disclaimer.
+#
+# 2. Redistributions in binary form must reproduce the above copyright notice,
+# this list of conditions and the following disclaimer in the documentation
+# and/or other materials provided with the distribution.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ +import io +import typing as t + +import torch + +import smartsim._core.mli.infrastructure.worker.worker as mliw +import smartsim.error as sse +from smartsim.log import get_logger + +logger = get_logger(__name__) + + +class IntegratedTorchWorker(mliw.MachineLearningWorkerBase): + """A minimum implementation of a worker that executes a PyTorch model""" + + # @staticmethod + # def deserialize(request: InferenceRequest) -> t.List[t.Any]: + # # request.input_meta + # # request.raw_inputs + # return request + + @staticmethod + def load_model( + request: mliw.InferenceRequest, fetch_result: mliw.FetchModelResult, device: str + ) -> mliw.LoadModelResult: + model_bytes = fetch_result.model_bytes or request.raw_model + if not model_bytes: + raise ValueError("Unable to load model without reference object") + + model: torch.nn.Module = torch.load(io.BytesIO(model_bytes)) + result = mliw.LoadModelResult(model) + return result + + @staticmethod + def transform_input( + request: mliw.InferenceRequest, + fetch_result: mliw.FetchInputResult, + device: str, + ) -> mliw.TransformInputResult: + # extra metadata for assembly can be found in request.input_meta + raw_inputs = request.raw_inputs or fetch_result.inputs + + result: t.List[torch.Tensor] = [] + # should this happen here? + # consider - fortran to c data layout + # is there an intermediate representation before really doing torch.load? + if raw_inputs: + result = [torch.load(io.BytesIO(item)) for item in raw_inputs] + + return mliw.TransformInputResult(result) + + @staticmethod + def execute( + request: mliw.InferenceRequest, + load_result: mliw.LoadModelResult, + transform_result: mliw.TransformInputResult, + ) -> mliw.ExecuteResult: + if not load_result.model: + raise sse.SmartSimError("Model must be loaded to execute") + + model = load_result.model + results = [model(tensor) for tensor in transform_result.transformed] + + execute_result = mliw.ExecuteResult(results) + return execute_result + + @staticmethod + def transform_output( + request: mliw.InferenceRequest, + execute_result: mliw.ExecuteResult, + result_device: str, + ) -> mliw.TransformOutputResult: + # send the original tensors... 
+ execute_result.predictions = [t.detach() for t in execute_result.predictions] + # todo: solve sending all tensor metadata that coincisdes with each prediction + return mliw.TransformOutputResult( + execute_result.predictions, [1], "c", "float32" + ) diff --git a/tests/test_dragon_installer.py b/tests/test_dragon_installer.py index b23a1a7ef..4bf589ad4 100644 --- a/tests/test_dragon_installer.py +++ b/tests/test_dragon_installer.py @@ -44,6 +44,7 @@ retrieve_asset, retrieve_asset_info, ) +from smartsim._core._install.builder import WebTGZ from smartsim.error.errors import SmartSimCLIActionCancelled # The tests in this file belong to the group_a group @@ -58,14 +59,25 @@ def test_archive(test_dir: str, archive_path: pathlib.Path) -> pathlib.Path: """Fixture for returning a simple tarfile to test on""" num_files = 10 + + archive_name = archive_path.name + archive_name = archive_name.replace(".tar.gz", "") + with tarfile.TarFile.open(archive_path, mode="w:gz") as tar: - mock_whl = pathlib.Path(test_dir) / "mock.whl" + mock_whl = pathlib.Path(test_dir) / archive_name / f"{archive_name}.whl" + mock_whl.parent.mkdir(parents=True, exist_ok=True) mock_whl.touch() + tar.add(mock_whl) + for i in range(num_files): - content = pathlib.Path(test_dir) / f"{i:04}.txt" + content = pathlib.Path(test_dir) / archive_name / f"{i:04}.txt" content.write_text(f"i am file {i}\n") tar.add(content) + content.unlink() + + mock_whl.unlink() + return archive_path @@ -118,6 +130,7 @@ def test_assets(monkeypatch: pytest.MonkeyPatch) -> t.Dict[str, GitReleaseAsset] _git_attr(value=f"http://foo/{archive_name}"), ) monkeypatch.setattr(asset, "_name", _git_attr(value=archive_name)) + monkeypatch.setattr(asset, "_id", _git_attr(value=123)) assets.append(asset) return assets @@ -149,11 +162,22 @@ def test_retrieve_cached( test_archive: pathlib.Path, monkeypatch: pytest.MonkeyPatch, ) -> None: - """Verify that a previously retrieved asset archive is re-used""" - with tarfile.TarFile.open(test_archive) as tar: - tar.extractall(test_dir) + """Verify that a previously retrieved asset archive is re-used and the + release asset retrieval is not attempted""" - ts1 = test_archive.parent.stat().st_ctime + asset_id = 123 + + def mock_webtgz_extract(self_, target_) -> None: + mock_extraction_dir = pathlib.Path(target_) + with tarfile.TarFile.open(test_archive) as tar: + tar.extractall(mock_extraction_dir) + + # we'll use the mock extract to create the files that would normally be downloaded + expected_output_dir = test_archive.parent / str(asset_id) + mock_webtgz_extract(None, expected_output_dir) + + # get modification time of directory holding the "downloaded" archive + ts1 = expected_output_dir.stat().st_ctime requester = Requester( auth=None, @@ -174,16 +198,76 @@ def test_retrieve_cached( # ensure mocked asset has values that we use... 
monkeypatch.setattr(asset, "_browser_download_url", _git_attr(value="http://foo")) monkeypatch.setattr(asset, "_name", _git_attr(value=mock_archive_name)) + monkeypatch.setattr(asset, "_id", _git_attr(value=asset_id)) + # show that retrieving an asset w/a different ID results in ignoring + # other wheels from prior downloads in the parent directory of the asset asset_path = retrieve_asset(test_archive.parent, asset) ts2 = asset_path.stat().st_ctime + # NOTE: the file should be written to a subdir based on the asset ID assert ( - asset_path == test_archive.parent - ) # show that the expected path matches the output path + asset_path == expected_output_dir + ) # shows that the expected path matches the output path assert ts1 == ts2 # show that the file wasn't changed... +def test_retrieve_updated( + test_archive: pathlib.Path, + monkeypatch: pytest.MonkeyPatch, +) -> None: + """Verify that a previously retrieved asset archive is not re-used if a new + version is found""" + + old_asset_id = 100 + asset_id = 123 + + def mock_webtgz_extract(self_, target_) -> None: + mock_extraction_dir = pathlib.Path(target_) + with tarfile.TarFile.open(test_archive) as tar: + tar.extractall(mock_extraction_dir) + + # we'll use the mock extract to create the files that would normally be downloaded + expected_output_dir = test_archive.parent / str(asset_id) + old_output_dir = test_archive.parent / str(old_asset_id) + mock_webtgz_extract(None, old_output_dir) + + requester = Requester( + auth=None, + base_url="https://github.com", + user_agent="mozilla", + per_page=10, + verify=False, + timeout=1, + retry=1, + pool_size=1, + ) + headers = {"mock-header": "mock-value"} + attributes = {"mock-attr": "mock-attr-value"} + completed = True + + asset = GitReleaseAsset(requester, headers, attributes, completed) + + # ensure mocked asset has values that we use... + monkeypatch.setattr(asset, "_browser_download_url", _git_attr(value="http://foo")) + monkeypatch.setattr(asset, "_name", _git_attr(value=mock_archive_name)) + monkeypatch.setattr(asset, "_id", _git_attr(value=asset_id)) + monkeypatch.setattr( + WebTGZ, + "extract", + lambda s_, t_: mock_webtgz_extract(s_, expected_output_dir), + ) # mock the retrieval of the updated archive + + # tell it to retrieve. 
it should return the path to the new download, not the old one + asset_path = retrieve_asset(test_archive.parent, asset) + + # sanity check we don't have the same paths + assert old_output_dir != expected_output_dir + + # verify the "cached" copy wasn't used + assert asset_path == expected_output_dir + + @pytest.mark.parametrize( "dragon_pin,pyv,is_found,is_crayex", [ diff --git a/tests/test_dragon_run_policy.py b/tests/test_dragon_run_policy.py index 1d8d069fa..c94ae375b 100644 --- a/tests/test_dragon_run_policy.py +++ b/tests/test_dragon_run_policy.py @@ -143,7 +143,6 @@ def test_create_run_policy_run_request_no_run_policy() -> None: assert policy.device == Policy.Device.DEFAULT assert set(policy.cpu_affinity) == set() assert policy.gpu_affinity == [] - assert policy.affinity == Policy.Affinity.DEFAULT @pytest.mark.skipif(not dragon_loaded, reason="Test is only for Dragon WLM systems") @@ -167,7 +166,6 @@ def test_create_run_policy_run_request_default_run_policy() -> None: assert set(policy.cpu_affinity) == set() assert set(policy.gpu_affinity) == set() - assert policy.affinity == Policy.Affinity.DEFAULT @pytest.mark.skipif(not dragon_loaded, reason="Test is only for Dragon WLM systems") @@ -192,7 +190,6 @@ def test_create_run_policy_run_request_cpu_affinity_no_device() -> None: assert set(policy.cpu_affinity) == affinity assert policy.gpu_affinity == [] - assert policy.affinity == Policy.Affinity.SPECIFIC @pytest.mark.skipif(not dragon_loaded, reason="Test is only for Dragon WLM systems") @@ -216,7 +213,6 @@ def test_create_run_policy_run_request_cpu_affinity() -> None: assert set(policy.cpu_affinity) == affinity assert policy.gpu_affinity == [] - assert policy.affinity == Policy.Affinity.SPECIFIC @pytest.mark.skipif(not dragon_loaded, reason="Test is only for Dragon WLM systems") @@ -240,7 +236,6 @@ def test_create_run_policy_run_request_gpu_affinity() -> None: assert policy.cpu_affinity == [] assert set(policy.gpu_affinity) == set(affinity) - assert policy.affinity == Policy.Affinity.SPECIFIC @pytest.mark.skipif(not dragon_loaded, reason="Test is only for Dragon WLM systems") diff --git a/tests/test_dragon_run_request.py b/tests/test_dragon_run_request.py index 7514deab1..62ac572eb 100644 --- a/tests/test_dragon_run_request.py +++ b/tests/test_dragon_run_request.py @@ -30,18 +30,14 @@ import time from unittest.mock import MagicMock +import pydantic.error_wrappers import pytest -from pydantic import ValidationError + +from smartsim._core.launcher.dragon.pqueue import NodePrioritizer # The tests in this file belong to the group_b group pytestmark = pytest.mark.group_b - -try: - import dragon - - dragon_loaded = True -except: - dragon_loaded = False +dragon = pytest.importorskip("dragon") from smartsim._core.config import CONFIG from smartsim._core.schemas.dragonRequests import * @@ -56,38 +52,6 @@ ) -class NodeMock(MagicMock): - def __init__( - self, name: t.Optional[str] = None, num_gpus: int = 2, num_cpus: int = 8 - ) -> None: - super().__init__() - self._mock_id = name - NodeMock._num_gpus = num_gpus - NodeMock._num_cpus = num_cpus - - @property - def hostname(self) -> str: - if self._mock_id: - return self._mock_id - return create_short_id_str() - - @property - def num_cpus(self) -> str: - return NodeMock._num_cpus - - @property - def num_gpus(self) -> str: - return NodeMock._num_gpus - - def _set_id(self, value: str) -> None: - self._mock_id = value - - def gpus(self, parent: t.Any = None) -> t.List[str]: - if self._num_gpus: - return [f"{self.hostname}-gpu{i}" for i in 
range(NodeMock._num_gpus)] - return [] - - class GroupStateMock(MagicMock): def Running(self) -> MagicMock: running = MagicMock(**{"__str__.return_value": "Running"}) @@ -102,59 +66,59 @@ class ProcessGroupMock(MagicMock): puids = [121, 122] -def node_mock() -> NodeMock: - return NodeMock() - - def get_mock_backend( - monkeypatch: pytest.MonkeyPatch, num_gpus: int = 2 + monkeypatch: pytest.MonkeyPatch, num_cpus: int, num_gpus: int ) -> "DragonBackend": - + # create all the necessary namespaces as raw magic mocks + monkeypatch.setitem(sys.modules, "dragon.data.ddict.ddict", MagicMock()) + monkeypatch.setitem(sys.modules, "dragon.native.machine", MagicMock()) + monkeypatch.setitem(sys.modules, "dragon.native.group_state", MagicMock()) + monkeypatch.setitem(sys.modules, "dragon.native.process_group", MagicMock()) + monkeypatch.setitem(sys.modules, "dragon.native.process", MagicMock()) + monkeypatch.setitem(sys.modules, "dragon.infrastructure.connection", MagicMock()) + monkeypatch.setitem(sys.modules, "dragon.infrastructure.policy", MagicMock()) + monkeypatch.setitem(sys.modules, "dragon.infrastructure.process_desc", MagicMock()) + monkeypatch.setitem(sys.modules, "dragon.data.ddict.ddict", MagicMock()) + + node_list = ["node1", "node2", "node3"] + system_mock = MagicMock(return_value=MagicMock(nodes=node_list)) + node_mock = lambda x: MagicMock(hostname=x, num_cpus=num_cpus, num_gpus=num_gpus) + process_group_mock = MagicMock(return_value=ProcessGroupMock()) process_mock = MagicMock(returncode=0) - process_group_mock = MagicMock(**{"Process.return_value": ProcessGroupMock()}) - process_module_mock = MagicMock() - process_module_mock.Process = process_mock - node_mock = NodeMock(num_gpus=num_gpus) - system_mock = MagicMock(nodes=["node1", "node2", "node3"]) + policy_mock = MagicMock(return_value=MagicMock()) + group_state_mock = GroupStateMock() + + # customize members that must perform specific actions within the namespaces monkeypatch.setitem( sys.modules, "dragon", MagicMock( **{ - "native.machine.Node.return_value": node_mock, - "native.machine.System.return_value": system_mock, - "native.group_state": GroupStateMock(), - "native.process_group.ProcessGroup.return_value": ProcessGroupMock(), + "native.machine.Node": node_mock, + "native.machine.System": system_mock, + "native.group_state": group_state_mock, + "native.process_group.ProcessGroup": process_group_mock, + "native.process_group.Process": process_mock, + "native.process.Process": process_mock, + "infrastructure.policy.Policy": policy_mock, } ), ) - monkeypatch.setitem( - sys.modules, - "dragon.infrastructure.connection", - MagicMock(), - ) - monkeypatch.setitem( - sys.modules, - "dragon.infrastructure.policy", - MagicMock(**{"Policy.return_value": MagicMock()}), - ) - monkeypatch.setitem(sys.modules, "dragon.native.process", process_module_mock) - monkeypatch.setitem(sys.modules, "dragon.native.process_group", process_group_mock) - monkeypatch.setitem(sys.modules, "dragon.native.group_state", GroupStateMock()) - monkeypatch.setitem( - sys.modules, - "dragon.native.machine", - MagicMock( - **{"System.return_value": system_mock, "Node.return_value": node_mock} - ), - ) from smartsim._core.launcher.dragon.dragonBackend import DragonBackend dragon_backend = DragonBackend(pid=99999) - monkeypatch.setattr( - dragon_backend, "_free_hosts", collections.deque(dragon_backend._hosts) + + # NOTE: we're manually updating these values due to issue w/mocking namespaces + dragon_backend._prioritizer = NodePrioritizer( + [ + 
MagicMock(num_cpus=num_cpus, num_gpus=num_gpus, hostname=node) + for node in node_list + ], + dragon_backend._queue_lock, ) + dragon_backend._cpus = [num_cpus] * len(node_list) + dragon_backend._gpus = [num_gpus] * len(node_list) return dragon_backend @@ -212,16 +176,14 @@ def set_mock_group_infos( } monkeypatch.setattr(dragon_backend, "_group_infos", group_infos) - monkeypatch.setattr(dragon_backend, "_free_hosts", collections.deque(hosts[1:3])) - monkeypatch.setattr(dragon_backend, "_allocated_hosts", {hosts[0]: "abc123-1"}) + monkeypatch.setattr(dragon_backend, "_allocated_hosts", {hosts[0]: {"abc123-1"}}) monkeypatch.setattr(dragon_backend, "_running_steps", ["abc123-1"]) return group_infos -@pytest.mark.skipif(not dragon_loaded, reason="Test is only for Dragon WLM systems") def test_handshake_request(monkeypatch: pytest.MonkeyPatch) -> None: - dragon_backend = get_mock_backend(monkeypatch) + dragon_backend = get_mock_backend(monkeypatch, num_cpus=8, num_gpus=0) handshake_req = DragonHandshakeRequest() handshake_resp = dragon_backend.process_request(handshake_req) @@ -230,9 +192,8 @@ def test_handshake_request(monkeypatch: pytest.MonkeyPatch) -> None: assert handshake_resp.dragon_pid == 99999 -@pytest.mark.skipif(not dragon_loaded, reason="Test is only for Dragon WLM systems") def test_run_request(monkeypatch: pytest.MonkeyPatch) -> None: - dragon_backend = get_mock_backend(monkeypatch) + dragon_backend = get_mock_backend(monkeypatch, num_cpus=8, num_gpus=0) run_req = DragonRunRequest( exe="sleep", exe_args=["5"], @@ -259,9 +220,9 @@ def test_run_request(monkeypatch: pytest.MonkeyPatch) -> None: assert dragon_backend._running_steps == [step_id] assert len(dragon_backend._queued_steps) == 0 - assert len(dragon_backend._free_hosts) == 1 - assert dragon_backend._allocated_hosts[dragon_backend.hosts[0]] == step_id - assert dragon_backend._allocated_hosts[dragon_backend.hosts[1]] == step_id + assert len(dragon_backend.free_hosts) == 1 + assert step_id in dragon_backend._allocated_hosts[dragon_backend.hosts[0]] + assert step_id in dragon_backend._allocated_hosts[dragon_backend.hosts[1]] monkeypatch.setattr( dragon_backend._group_infos[step_id].process_group, "status", "Running" @@ -271,9 +232,9 @@ def test_run_request(monkeypatch: pytest.MonkeyPatch) -> None: assert dragon_backend._running_steps == [step_id] assert len(dragon_backend._queued_steps) == 0 - assert len(dragon_backend._free_hosts) == 1 - assert dragon_backend._allocated_hosts[dragon_backend.hosts[0]] == step_id - assert dragon_backend._allocated_hosts[dragon_backend.hosts[1]] == step_id + assert len(dragon_backend.free_hosts) == 1 + assert step_id in dragon_backend._allocated_hosts[dragon_backend.hosts[0]] + assert step_id in dragon_backend._allocated_hosts[dragon_backend.hosts[1]] dragon_backend._group_infos[step_id].status = SmartSimStatus.STATUS_CANCELLED @@ -281,9 +242,8 @@ def test_run_request(monkeypatch: pytest.MonkeyPatch) -> None: assert not dragon_backend._running_steps -@pytest.mark.skipif(not dragon_loaded, reason="Test is only for Dragon WLM systems") def test_deny_run_request(monkeypatch: pytest.MonkeyPatch) -> None: - dragon_backend = get_mock_backend(monkeypatch) + dragon_backend = get_mock_backend(monkeypatch, num_cpus=8, num_gpus=0) dragon_backend._shutdown_requested = True @@ -309,7 +269,7 @@ def test_deny_run_request(monkeypatch: pytest.MonkeyPatch) -> None: def test_run_request_with_empty_policy(monkeypatch: pytest.MonkeyPatch) -> None: """Verify that a policy is applied to a run request""" - dragon_backend 
= get_mock_backend(monkeypatch) + dragon_backend = get_mock_backend(monkeypatch, num_cpus=8, num_gpus=0) run_req = DragonRunRequest( exe="sleep", exe_args=["5"], @@ -325,10 +285,9 @@ def test_run_request_with_empty_policy(monkeypatch: pytest.MonkeyPatch) -> None: assert run_req.policy is None -@pytest.mark.skipif(not dragon_loaded, reason="Test is only for Dragon WLM systems") def test_run_request_with_policy(monkeypatch: pytest.MonkeyPatch) -> None: """Verify that a policy is applied to a run request""" - dragon_backend = get_mock_backend(monkeypatch) + dragon_backend = get_mock_backend(monkeypatch, num_cpus=8, num_gpus=0) run_req = DragonRunRequest( exe="sleep", exe_args=["5"], @@ -356,9 +315,9 @@ def test_run_request_with_policy(monkeypatch: pytest.MonkeyPatch) -> None: assert dragon_backend._running_steps == [step_id] assert len(dragon_backend._queued_steps) == 0 - assert len(dragon_backend._free_hosts) == 1 - assert dragon_backend._allocated_hosts[dragon_backend.hosts[0]] == step_id - assert dragon_backend._allocated_hosts[dragon_backend.hosts[1]] == step_id + assert len(dragon_backend._prioritizer.unassigned()) == 1 + assert step_id in dragon_backend._allocated_hosts[dragon_backend.hosts[0]] + assert step_id in dragon_backend._allocated_hosts[dragon_backend.hosts[1]] monkeypatch.setattr( dragon_backend._group_infos[step_id].process_group, "status", "Running" @@ -368,9 +327,9 @@ def test_run_request_with_policy(monkeypatch: pytest.MonkeyPatch) -> None: assert dragon_backend._running_steps == [step_id] assert len(dragon_backend._queued_steps) == 0 - assert len(dragon_backend._free_hosts) == 1 - assert dragon_backend._allocated_hosts[dragon_backend.hosts[0]] == step_id - assert dragon_backend._allocated_hosts[dragon_backend.hosts[1]] == step_id + assert len(dragon_backend._prioritizer.unassigned()) == 1 + assert step_id in dragon_backend._allocated_hosts[dragon_backend.hosts[0]] + assert step_id in dragon_backend._allocated_hosts[dragon_backend.hosts[1]] dragon_backend._group_infos[step_id].status = SmartSimStatus.STATUS_CANCELLED @@ -378,9 +337,8 @@ def test_run_request_with_policy(monkeypatch: pytest.MonkeyPatch) -> None: assert not dragon_backend._running_steps -@pytest.mark.skipif(not dragon_loaded, reason="Test is only for Dragon WLM systems") def test_udpate_status_request(monkeypatch: pytest.MonkeyPatch) -> None: - dragon_backend = get_mock_backend(monkeypatch) + dragon_backend = get_mock_backend(monkeypatch, num_cpus=8, num_gpus=0) group_infos = set_mock_group_infos(monkeypatch, dragon_backend) @@ -395,9 +353,8 @@ def test_udpate_status_request(monkeypatch: pytest.MonkeyPatch) -> None: } -@pytest.mark.skipif(not dragon_loaded, reason="Test is only for Dragon WLM systems") def test_stop_request(monkeypatch: pytest.MonkeyPatch) -> None: - dragon_backend = get_mock_backend(monkeypatch) + dragon_backend = get_mock_backend(monkeypatch, num_cpus=8, num_gpus=0) group_infos = set_mock_group_infos(monkeypatch, dragon_backend) running_steps = [ @@ -424,10 +381,9 @@ def test_stop_request(monkeypatch: pytest.MonkeyPatch) -> None: ) assert len(dragon_backend._allocated_hosts) == 0 - assert len(dragon_backend._free_hosts) == 3 + assert len(dragon_backend._prioritizer.unassigned()) == 3 -@pytest.mark.skipif(not dragon_loaded, reason="Test is only for Dragon WLM systems") @pytest.mark.parametrize( "immediate, kill_jobs, frontend_shutdown", [ @@ -446,7 +402,7 @@ def test_shutdown_request( frontend_shutdown: bool, ) -> None: monkeypatch.setenv("SMARTSIM_FLAG_TELEMETRY", "0") - dragon_backend = 
get_mock_backend(monkeypatch) + dragon_backend = get_mock_backend(monkeypatch, num_cpus=8, num_gpus=0) monkeypatch.setattr(dragon_backend, "_cooldown_period", 1) set_mock_group_infos(monkeypatch, dragon_backend) @@ -486,11 +442,10 @@ def test_shutdown_request( assert dragon_backend._has_cooled_down == kill_jobs -@pytest.mark.skipif(not dragon_loaded, reason="Test is only for Dragon WLM systems") @pytest.mark.parametrize("telemetry_flag", ["0", "1"]) def test_cooldown_is_set(monkeypatch: pytest.MonkeyPatch, telemetry_flag: str) -> None: monkeypatch.setenv("SMARTSIM_FLAG_TELEMETRY", telemetry_flag) - dragon_backend = get_mock_backend(monkeypatch) + dragon_backend = get_mock_backend(monkeypatch, num_cpus=8, num_gpus=0) expected_cooldown = ( 2 * CONFIG.telemetry_frequency + 5 if int(telemetry_flag) > 0 else 5 @@ -502,19 +457,17 @@ def test_cooldown_is_set(monkeypatch: pytest.MonkeyPatch, telemetry_flag: str) - assert dragon_backend.cooldown_period == expected_cooldown -@pytest.mark.skipif(not dragon_loaded, reason="Test is only for Dragon WLM systems") def test_heartbeat_and_time(monkeypatch: pytest.MonkeyPatch) -> None: - dragon_backend = get_mock_backend(monkeypatch) + dragon_backend = get_mock_backend(monkeypatch, num_cpus=8, num_gpus=0) first_heartbeat = dragon_backend.last_heartbeat assert dragon_backend.current_time > first_heartbeat dragon_backend._heartbeat() assert dragon_backend.last_heartbeat > first_heartbeat -@pytest.mark.skipif(not dragon_loaded, reason="Test is only for Dragon WLM systems") @pytest.mark.parametrize("num_nodes", [1, 3, 100]) def test_can_honor(monkeypatch: pytest.MonkeyPatch, num_nodes: int) -> None: - dragon_backend = get_mock_backend(monkeypatch) + dragon_backend = get_mock_backend(monkeypatch, num_cpus=8, num_gpus=0) run_req = DragonRunRequest( exe="sleep", exe_args=["5"], @@ -527,18 +480,42 @@ def test_can_honor(monkeypatch: pytest.MonkeyPatch, num_nodes: int) -> None: pmi_enabled=False, ) - assert dragon_backend._can_honor(run_req)[0] == ( - num_nodes <= len(dragon_backend._hosts) - ) + can_honor, error_msg = dragon_backend._can_honor(run_req) + + nodes_in_range = num_nodes <= len(dragon_backend._hosts) + assert can_honor == nodes_in_range + assert error_msg is None if nodes_in_range else error_msg is not None + + +@pytest.mark.parametrize("num_nodes", [-10, -1, 0]) +def test_can_honor_invalid_num_nodes( + monkeypatch: pytest.MonkeyPatch, num_nodes: int +) -> None: + """Verify that requests for invalid numbers of nodes (negative, zero) are rejected""" + dragon_backend = get_mock_backend(monkeypatch, num_cpus=8, num_gpus=0) + + with pytest.raises(pydantic.error_wrappers.ValidationError) as ex: + DragonRunRequest( + exe="sleep", + exe_args=["5"], + path="/a/fake/path", + nodes=num_nodes, + tasks=1, + tasks_per_node=1, + env={}, + current_env={}, + pmi_enabled=False, + ) -@pytest.mark.skipif(not dragon_loaded, reason="Test is only for Dragon WLM systems") @pytest.mark.parametrize("affinity", [[0], [0, 1], list(range(8))]) def test_can_honor_cpu_affinity( monkeypatch: pytest.MonkeyPatch, affinity: t.List[int] ) -> None: """Verify that valid CPU affinities are accepted""" - dragon_backend = get_mock_backend(monkeypatch) + num_cpus, num_gpus = 8, 0 + dragon_backend = get_mock_backend(monkeypatch, num_cpus=num_cpus, num_gpus=num_gpus) + run_req = DragonRunRequest( exe="sleep", exe_args=["5"], @@ -555,11 +532,10 @@ def test_can_honor_cpu_affinity( assert dragon_backend._can_honor(run_req)[0] -@pytest.mark.skipif(not dragon_loaded, reason="Test is only for Dragon WLM 
systems") def test_can_honor_cpu_affinity_out_of_range(monkeypatch: pytest.MonkeyPatch) -> None: """Verify that invalid CPU affinities are NOT accepted NOTE: negative values are captured by the Pydantic schema""" - dragon_backend = get_mock_backend(monkeypatch) + dragon_backend = get_mock_backend(monkeypatch, num_cpus=8, num_gpus=0) run_req = DragonRunRequest( exe="sleep", exe_args=["5"], @@ -576,13 +552,15 @@ def test_can_honor_cpu_affinity_out_of_range(monkeypatch: pytest.MonkeyPatch) -> assert not dragon_backend._can_honor(run_req)[0] -@pytest.mark.skipif(not dragon_loaded, reason="Test is only for Dragon WLM systems") @pytest.mark.parametrize("affinity", [[0], [0, 1]]) def test_can_honor_gpu_affinity( monkeypatch: pytest.MonkeyPatch, affinity: t.List[int] ) -> None: """Verify that valid GPU affinities are accepted""" - dragon_backend = get_mock_backend(monkeypatch) + + num_cpus, num_gpus = 8, 2 + dragon_backend = get_mock_backend(monkeypatch, num_cpus=num_cpus, num_gpus=num_gpus) + run_req = DragonRunRequest( exe="sleep", exe_args=["5"], @@ -599,11 +577,10 @@ def test_can_honor_gpu_affinity( assert dragon_backend._can_honor(run_req)[0] -@pytest.mark.skipif(not dragon_loaded, reason="Test is only for Dragon WLM systems") def test_can_honor_gpu_affinity_out_of_range(monkeypatch: pytest.MonkeyPatch) -> None: """Verify that invalid GPU affinities are NOT accepted NOTE: negative values are captured by the Pydantic schema""" - dragon_backend = get_mock_backend(monkeypatch) + dragon_backend = get_mock_backend(monkeypatch, num_cpus=8, num_gpus=0) run_req = DragonRunRequest( exe="sleep", exe_args=["5"], @@ -620,46 +597,45 @@ def test_can_honor_gpu_affinity_out_of_range(monkeypatch: pytest.MonkeyPatch) -> assert not dragon_backend._can_honor(run_req)[0] -@pytest.mark.skipif(not dragon_loaded, reason="Test is only for Dragon WLM systems") def test_can_honor_gpu_device_not_available(monkeypatch: pytest.MonkeyPatch) -> None: """Verify that a request for a GPU if none exists is not accepted""" # create a mock node class that always reports no GPUs available - dragon_backend = get_mock_backend(monkeypatch, num_gpus=0) - - run_req = DragonRunRequest( - exe="sleep", - exe_args=["5"], - path="/a/fake/path", - nodes=2, - tasks=1, - tasks_per_node=1, - env={}, - current_env={}, - pmi_enabled=False, - # specify GPU device w/no affinity - policy=DragonRunPolicy(gpu_affinity=[0]), - ) - - assert not dragon_backend._can_honor(run_req)[0] + with monkeypatch.context() as ctx: + dragon_backend = get_mock_backend(ctx, num_cpus=8, num_gpus=0) + + run_req = DragonRunRequest( + exe="sleep", + exe_args=["5"], + path="/a/fake/path", + nodes=2, + tasks=1, + tasks_per_node=1, + env={}, + current_env={}, + pmi_enabled=False, + # specify GPU device w/no affinity + policy=DragonRunPolicy(gpu_affinity=[0]), + ) + can_honor, _ = dragon_backend._can_honor(run_req) + assert not can_honor -@pytest.mark.skipif(not dragon_loaded, reason="Test is only for Dragon WLM systems") def test_get_id(monkeypatch: pytest.MonkeyPatch) -> None: - dragon_backend = get_mock_backend(monkeypatch) + dragon_backend = get_mock_backend(monkeypatch, num_cpus=8, num_gpus=0) step_id = next(dragon_backend._step_ids) assert step_id.endswith("0") assert step_id != next(dragon_backend._step_ids) -@pytest.mark.skipif(not dragon_loaded, reason="Test is only for Dragon WLM systems") def test_view(monkeypatch: pytest.MonkeyPatch) -> None: - dragon_backend = get_mock_backend(monkeypatch) + dragon_backend = get_mock_backend(monkeypatch, num_cpus=8, num_gpus=0) 
    set_mock_group_infos(monkeypatch, dragon_backend)

    hosts = dragon_backend.hosts
+    dragon_backend._prioritizer.increment(hosts[0])

-    expected_message = textwrap.dedent(f"""\
+    expected_msg = textwrap.dedent(f"""\
        Dragon server backend update
        | Host | Status |
        |--------|----------|
@@ -667,7 +643,7 @@ def test_view(monkeypatch: pytest.MonkeyPatch) -> None:
        | {hosts[1]} | Free |
        | {hosts[2]} | Free |
        | Step | Status | Hosts | Return codes | Num procs |
-        |----------|--------------|-------------|----------------|-------------|
+        |----------|--------------|-----------------|----------------|-------------|
        | abc123-1 | Running | {hosts[0]} | | 1 |
        | del999-2 | Cancelled | {hosts[1]} | -9 | 1 |
        | c101vz-3 | Completed | {hosts[1]},{hosts[2]} | 0 | 2 |
@@ -676,6 +652,110 @@ def test_view(monkeypatch: pytest.MonkeyPatch) -> None:
    # get rid of white space to make the comparison easier
    actual_msg = dragon_backend.status_message.replace(" ", "")
-    expected_message = expected_message.replace(" ", "")
+    expected_msg = expected_msg.replace(" ", "")
+
+    # ignore dashes in separators (hostname changes may cause column expansion)
+    while actual_msg.find("--") > -1:
+        actual_msg = actual_msg.replace("--", "-")
+    while expected_msg.find("--") > -1:
+        expected_msg = expected_msg.replace("--", "-")
+
+    assert actual_msg == expected_msg
+
+
+def test_can_honor_hosts_unavailable_hosts(monkeypatch: pytest.MonkeyPatch) -> None:
+    """Verify that requesting nodes with invalid names causes the available-node
+    check to fail when the number of valid named nodes falls below num_nodes"""
+    dragon_backend = get_mock_backend(monkeypatch, num_cpus=8, num_gpus=0)
+
+    # let's supply 2 invalid and 1 valid hostname
+    actual_hosts = list(dragon_backend._hosts)
+    actual_hosts[0] = f"x{actual_hosts[0]}"
+    actual_hosts[1] = f"x{actual_hosts[1]}"
+
+    host_list = ",".join(actual_hosts)
+
+    run_req = DragonRunRequest(
+        exe="sleep",
+        exe_args=["5"],
+        path="/a/fake/path",
+        nodes=2,  # <----- requesting 2 of 3 available nodes
+        hostlist=host_list,  # <--- only one valid name available
+        tasks=1,
+        tasks_per_node=1,
+        env={},
+        current_env={},
+        pmi_enabled=False,
+        policy=DragonRunPolicy(),
+    )
+
+    can_honor, error_msg = dragon_backend._can_honor(run_req)
+
+    # confirm the failure is indicated
+    assert not can_honor
+    # confirm the failure message identifies the shortage of valid named hosts as the cause
+    assert "named hosts" in error_msg
+
+
+def test_can_honor_hosts_unavailable_hosts_ok(monkeypatch: pytest.MonkeyPatch) -> None:
+    """Verify that invalid host names reduce the pool of available nodes but the
+    request is still honored when enough valid named nodes are supplied"""
+    dragon_backend = get_mock_backend(monkeypatch, num_cpus=8, num_gpus=0)
+
+    # let's supply 2 valid and 1 invalid hostname
+    actual_hosts = list(dragon_backend._hosts)
+    actual_hosts[0] = f"x{actual_hosts[0]}"
+
+    host_list = ",".join(actual_hosts)
+
+    run_req = DragonRunRequest(
+        exe="sleep",
+        exe_args=["5"],
+        path="/a/fake/path",
+        nodes=2,  # <----- requesting 2 of 3 available nodes
+        hostlist=host_list,  # <--- two valid names are available
+        tasks=1,
+        tasks_per_node=1,
+        env={},
+        current_env={},
+        pmi_enabled=False,
+        policy=DragonRunPolicy(),
+    )
+
+    can_honor, error_msg = dragon_backend._can_honor(run_req)
+
+    # confirm the request is honored
+    assert can_honor, error_msg
+    # confirm no failure message is produced
+    assert error_msg is None, error_msg
+
+
+def test_can_honor_hosts_1_hosts_requested(monkeypatch: pytest.MonkeyPatch) -> None:
+    """Verify that a request for a single node is honored even when the supplied
+    host list contains an invalid name"""
+    dragon_backend = get_mock_backend(monkeypatch, num_cpus=8, num_gpus=0)
+
+    # let's supply 2 valid and 1 invalid hostname
+    actual_hosts = list(dragon_backend._hosts)
+    actual_hosts[0] = f"x{actual_hosts[0]}"
+
+    host_list = ",".join(actual_hosts)
+
+    run_req = DragonRunRequest(
+        exe="sleep",
+        exe_args=["5"],
+        path="/a/fake/path",
+        nodes=1,  # <----- requesting 1 of 3 available nodes
+        hostlist=host_list,  # <--- two valid names are available
+        tasks=1,
+        tasks_per_node=1,
+        env={},
+        current_env={},
+        pmi_enabled=False,
+        policy=DragonRunPolicy(),
+    )
+
+    can_honor, error_msg = dragon_backend._can_honor(run_req)
-    assert actual_msg == expected_message
+    # confirm the request is honored
+    assert can_honor, error_msg
diff --git a/tests/test_dragon_runsettings.py b/tests/test_dragon_runsettings.py
index 34e8510e8..8c7600c74 100644
--- a/tests/test_dragon_runsettings.py
+++ b/tests/test_dragon_runsettings.py
@@ -96,3 +96,122 @@ def test_dragon_runsettings_gpu_affinity():
    # ensure the value is not changed when we extend the list
    rs.run_args["gpu-affinity"] = "7,8,9"
    assert rs.run_args["gpu-affinity"] != ",".join(str(val) for val in exp_value)
+
+
+def test_dragon_runsettings_hostlist_null():
+    """Verify that passing a null hostlist is treated as a failure"""
+    rs = DragonRunSettings(exe="sleep", exe_args=["1"])
+
+    # baseline check that no host list exists
+    stored_list = rs.run_args.get("host-list", None)
+    assert stored_list is None
+
+    with pytest.raises(ValueError) as ex:
+        rs.set_hostlist(None)
+
+    assert "empty hostlist" in ex.value.args[0]
+
+
+def test_dragon_runsettings_hostlist_empty():
+    """Verify that passing an empty hostlist is treated as a failure"""
+    rs = DragonRunSettings(exe="sleep", exe_args=["1"])
+
+    # baseline check that no host list exists
+    stored_list = rs.run_args.get("host-list", None)
+    assert stored_list is None
+
+    with pytest.raises(ValueError) as ex:
+        rs.set_hostlist([])
+
+    assert "empty hostlist" in ex.value.args[0]
+
+
+@pytest.mark.parametrize("hostlist_csv", [" ", " , , , ", ",", ",,,"])
+def test_dragon_runsettings_hostlist_whitespace_handling(hostlist_csv: str):
+    """Verify that passing a hostlist with empty-string host names is treated as a failure"""
+    rs = DragonRunSettings(exe="sleep", exe_args=["1"])
+
+    # baseline check that no host list exists
+    stored_list = rs.run_args.get("host-list", None)
+    assert stored_list is None
+
+    # empty string as hostname in list
+    with pytest.raises(ValueError) as ex:
+        rs.set_hostlist(hostlist_csv)
+
+    assert "invalid names" in ex.value.args[0]
+
+
+@pytest.mark.parametrize(
+    "hostlist_csv", [[" "], [" ", "", " ", " "], ["", " "], ["", "", "", ""]]
+)
+def test_dragon_runsettings_hostlist_whitespace_handling_list(hostlist_csv: str):
+    """Verify that passing a hostlist with empty-string host names contained in a list
+    is treated as a failure"""
+    rs = DragonRunSettings(exe="sleep", exe_args=["1"])
+
+    # baseline check that no host list exists
+    stored_list = rs.run_args.get("host-list", None)
+    assert stored_list is None
+
+    # empty string as hostname in list
+    with pytest.raises(ValueError) as ex:
+        rs.set_hostlist(hostlist_csv)
+
+    assert "invalid names" in ex.value.args[0]
+
+
+def test_dragon_runsettings_hostlist_as_csv():
+    """Verify that a hostlist is stored properly when passing in a CSV string"""
+    rs = DragonRunSettings(exe="sleep", exe_args=["1"])
+
+    # baseline check that no host list exists
+    stored_list = rs.run_args.get("host-list", None)
+    assert stored_list is None
+
+    hostnames = ["host0", "host1", "host2", "host3", "host4"]
+
+    # set the host list with ideal comma separated values
+    input0 = ",".join(hostnames)
+
+    # set the host list with a string of comma separated values
+    # including extra whitespace
+    input1 = ", ".join(hostnames)
+
+    for hosts_input in [input0, input1]:
+        rs.set_hostlist(hosts_input)
+
+        stored_list = rs.run_args.get("host-list", None)
+        assert stored_list
+
+        # confirm that all values from the original list are retrieved
+        split_stored_list = stored_list.split(",")
+        assert set(hostnames) == set(split_stored_list)
diff --git a/tests/test_dragon_step.py b/tests/test_dragon_step.py
index 19f408e0b..f933fb7bc 100644
--- a/tests/test_dragon_step.py
+++ b/tests/test_dragon_step.py
@@ -73,12 +73,18 @@ def dragon_batch_step(test_dir: str) -> DragonBatchStep:
    cpu_affinities = [[], [0, 1, 2], [], [3, 4, 5, 6]]
    gpu_affinities = [[], [], [0, 1, 2], [3, 4, 5, 6]]
+    # specify 3 hostnames to select from but require only 2 nodes
+    num_nodes = 2
+    hostnames = ["host1", "host2", "host3"]
+
    # assign some unique affinities to each run setting instance
    for index, rs in enumerate(settings):
        if gpu_affinities[index]:
            rs.set_node_feature("gpu")
        rs.set_cpu_affinity(cpu_affinities[index])
        rs.set_gpu_affinity(gpu_affinities[index])
+        rs.set_hostlist(hostnames)
+        rs.set_nodes(num_nodes)

    steps = list(
        DragonStep(name_, test_dir, rs_) for name_, rs_ in zip(names, settings)
@@ -374,6 +380,11 @@ def test_dragon_batch_step_write_request_file(
    cpu_affinities = [[], [0, 1, 2], [], [3, 4, 5, 6]]
    gpu_affinities = [[], [], [0, 1, 2], [3, 4, 5, 6]]
+    hostnames = ["host1", "host2", "host3"]
+    num_nodes = 2
+
+    # parse requests file path from the launch command
+    # e.g.
dragon python launch_cmd = dragon_batch_step.get_launch_cmd() requests_file = get_request_path_from_batch_script(launch_cmd) @@ -392,3 +403,5 @@ def test_dragon_batch_step_write_request_file( assert run_request assert run_request.policy.cpu_affinity == cpu_affinities[index] assert run_request.policy.gpu_affinity == gpu_affinities[index] + assert run_request.nodes == num_nodes + assert run_request.hostlist == ",".join(hostnames) diff --git a/tests/test_message_handler/__init__.py b/tests/test_message_handler/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/tests/test_message_handler/test_build_model.py b/tests/test_message_handler/test_build_model.py new file mode 100644 index 000000000..56c1c8764 --- /dev/null +++ b/tests/test_message_handler/test_build_model.py @@ -0,0 +1,72 @@ +# BSD 2-Clause License +# +# Copyright (c) 2021-2024, Hewlett Packard Enterprise +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +import pytest + +from smartsim._core.mli.message_handler import MessageHandler + +# The tests in this file belong to the group_a group +pytestmark = pytest.mark.group_a + +handler = MessageHandler() + + +def test_build_model_successful(): + expected_data = b"model data" + expected_name = "model name" + expected_version = "v0.0.1" + model = handler.build_model(expected_data, expected_name, expected_version) + assert model.data == expected_data + assert model.name == expected_name + assert model.version == expected_version + + +@pytest.mark.parametrize( + "data, name, version", + [ + pytest.param( + 100, + "model name", + "v0.0.1", + id="bad data type", + ), + pytest.param( + b"model data", + 1, + "v0.0.1", + id="bad name type", + ), + pytest.param( + b"model data", + "model name", + 0.1, + id="bad version type", + ), + ], +) +def test_build_model_unsuccessful(data, name, version): + with pytest.raises(ValueError): + model = handler.build_model(data, name, version) diff --git a/tests/test_message_handler/test_build_model_key.py b/tests/test_message_handler/test_build_model_key.py new file mode 100644 index 000000000..c09c787fc --- /dev/null +++ b/tests/test_message_handler/test_build_model_key.py @@ -0,0 +1,47 @@ +# BSD 2-Clause License +# +# Copyright (c) 2021-2024, Hewlett Packard Enterprise +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +import pytest + +from smartsim._core.mli.message_handler import MessageHandler + +# The tests in this file belong to the group_a group +pytestmark = pytest.mark.group_a + +handler = MessageHandler() + + +def test_build_model_key_successful(): + fsd = "mock-feature-store-descriptor" + model_key = handler.build_model_key("tensor_key", fsd) + assert model_key.key == "tensor_key" + assert model_key.featureStoreDescriptor == fsd + + +def test_build_model_key_unsuccessful(): + with pytest.raises(ValueError): + fsd = "mock-feature-store-descriptor" + model_key = handler.build_model_key(100, fsd) diff --git a/tests/test_message_handler/test_build_request_attributes.py b/tests/test_message_handler/test_build_request_attributes.py new file mode 100644 index 000000000..5b1e09b0a --- /dev/null +++ b/tests/test_message_handler/test_build_request_attributes.py @@ -0,0 +1,55 @@ +# BSD 2-Clause License +# +# Copyright (c) 2021-2024, Hewlett Packard Enterprise +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
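The request-attribute tests below cover the Torch and TensorFlow flavors; a short sketch of both builders as they are called there (unsupported tensor types such as "invalid!" raise ValueError per the tests):

    from smartsim._core.mli.message_handler import MessageHandler

    torch_attrs = MessageHandler.build_torch_request_attributes("sparse")
    assert torch_attrs.tensorType == "sparse"

    tf_attrs = MessageHandler.build_tf_request_attributes(
        name="tfcnn", tensor_type="sparse"
    )
    assert tf_attrs.name == "tfcnn"
    assert tf_attrs.tensorType == "sparse"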
+ +import pytest + +from smartsim._core.mli.message_handler import MessageHandler + +# The tests in this file belong to the group_a group +pytestmark = pytest.mark.group_a + +handler = MessageHandler() + + +def test_build_torch_request_attributes_successful(): + attribute = handler.build_torch_request_attributes("sparse") + assert attribute.tensorType == "sparse" + + +def test_build_torch_request_attributes_unsuccessful(): + with pytest.raises(ValueError): + attribute = handler.build_torch_request_attributes("invalid!") + + +def test_build_tf_request_attributes_successful(): + attribute = handler.build_tf_request_attributes(name="tfcnn", tensor_type="sparse") + assert attribute.tensorType == "sparse" + assert attribute.name == "tfcnn" + + +def test_build_tf_request_attributes_unsuccessful(): + with pytest.raises(ValueError): + attribute = handler.build_tf_request_attributes("tf_fail", "invalid!") diff --git a/tests/test_message_handler/test_build_tensor_desc.py b/tests/test_message_handler/test_build_tensor_desc.py new file mode 100644 index 000000000..45126fb16 --- /dev/null +++ b/tests/test_message_handler/test_build_tensor_desc.py @@ -0,0 +1,90 @@ +# BSD 2-Clause License +# +# Copyright (c) 2021-2024, Hewlett Packard Enterprise +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
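The tensor-descriptor tests below pass arguments positionally as (order, data type, dimensions); a minimal sketch of a valid call and the fields it populates, mirroring the assertions in those tests:

    from smartsim._core.mli.message_handler import MessageHandler

    desc = MessageHandler.build_tensor_descriptor("c", "int8", [3, 2, 5])
    assert desc.order == "c"
    assert desc.dataType == "int8"
    for built_dim, expected_dim in zip(desc.dimensions, [3, 2, 5]):
        assert built_dim == expected_dim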
+ +import pytest + +from smartsim._core.mli.message_handler import MessageHandler + +# The tests in this file belong to the group_a group +pytestmark = pytest.mark.group_a + +handler = MessageHandler() + + +@pytest.mark.parametrize( + "dtype, order, dimension", + [ + pytest.param( + "int8", + "c", + [3, 2, 5], + id="small torch tensor", + ), + pytest.param( + "int64", + "c", + [1040, 1040, 3], + id="medium torch tensor", + ), + ], +) +def test_build_tensor_descriptor_successful(dtype, order, dimension): + built_tensor_descriptor = handler.build_tensor_descriptor(order, dtype, dimension) + assert built_tensor_descriptor is not None + assert built_tensor_descriptor.order == order + assert built_tensor_descriptor.dataType == dtype + for i, j in zip(built_tensor_descriptor.dimensions, dimension): + assert i == j + + +@pytest.mark.parametrize( + "dtype, order, dimension", + [ + pytest.param( + "bad_order", + "int8", + [3, 2, 5], + id="bad order type", + ), + pytest.param( + "f", + "bad_num_type", + [3, 2, 5], + id="bad numerical type", + ), + pytest.param( + "f", + "int8", + "bad shape type", + id="bad shape type", + ), + ], +) +def test_build_tensor_descriptor_unsuccessful(dtype, order, dimension): + with pytest.raises(ValueError): + built_tensor_descriptor = handler.build_tensor_descriptor( + order, dtype, dimension + ) diff --git a/tests/test_message_handler/test_build_tensor_key.py b/tests/test_message_handler/test_build_tensor_key.py new file mode 100644 index 000000000..6a28b80c4 --- /dev/null +++ b/tests/test_message_handler/test_build_tensor_key.py @@ -0,0 +1,46 @@ +# BSD 2-Clause License +# +# Copyright (c) 2021-2024, Hewlett Packard Enterprise +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +import pytest + +from smartsim._core.mli.message_handler import MessageHandler + +# The tests in this file belong to the group_a group +pytestmark = pytest.mark.group_a + +handler = MessageHandler() + + +def test_build_tensor_key_successful(): + fsd = "mock-feature-store-descriptor" + tensor_key = handler.build_tensor_key("tensor_key", fsd) + assert tensor_key.key == "tensor_key" + + +def test_build_tensor_key_unsuccessful(): + with pytest.raises(ValueError): + fsd = "mock-feature-store-descriptor" + tensor_key = handler.build_tensor_key(100, fsd) diff --git a/tests/test_message_handler/test_output_descriptor.py b/tests/test_message_handler/test_output_descriptor.py new file mode 100644 index 000000000..beb9a4765 --- /dev/null +++ b/tests/test_message_handler/test_output_descriptor.py @@ -0,0 +1,78 @@ +# BSD 2-Clause License +# +# Copyright (c) 2021-2024, Hewlett Packard Enterprise +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
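The output-descriptor tests below show that the key list and dimensions may be empty; a small sketch of the builder and the optional* fields it fills in, reusing the tests' mock descriptor string:

    from smartsim._core.mli.message_handler import MessageHandler

    fsd = "mock-feature-store-descriptor"
    out_key = MessageHandler.build_tensor_key("key", fsd)
    out_desc = MessageHandler.build_output_tensor_descriptor(
        "c", [out_key], "int8", [1, 2, 3, 4]
    )
    assert out_desc.order == "c"
    assert out_desc.optionalDatatype == "int8"
    assert len(out_desc.optionalKeys) == 1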
+ +import pytest + +from smartsim._core.mli.message_handler import MessageHandler + +# The tests in this file belong to the group_a group +pytestmark = pytest.mark.group_a + +handler = MessageHandler() + +fsd = "mock-feature-store-descriptor" +tensor_key = handler.build_tensor_key("key", fsd) + + +@pytest.mark.parametrize( + "order, keys, dtype, dimension", + [ + pytest.param("c", [tensor_key], "int8", [1, 2, 3, 4], id="all specified"), + pytest.param( + "c", [tensor_key, tensor_key], "none", [1, 2, 3, 4], id="none dtype" + ), + pytest.param("c", [tensor_key], "int8", [], id="empty dimensions"), + pytest.param("c", [], "int8", [1, 2, 3, 4], id="empty keys"), + ], +) +def test_build_output_tensor_descriptor_successful(dtype, keys, order, dimension): + built_descriptor = handler.build_output_tensor_descriptor( + order, keys, dtype, dimension + ) + assert built_descriptor is not None + assert built_descriptor.order == order + assert len(built_descriptor.optionalKeys) == len(keys) + assert built_descriptor.optionalDatatype == dtype + for i, j in zip(built_descriptor.optionalDimension, dimension): + assert i == j + + +@pytest.mark.parametrize( + "order, keys, dtype, dimension", + [ + pytest.param("bad_order", [], "int8", [3, 2, 5], id="bad order type"), + pytest.param( + "f", [tensor_key], "bad_num_type", [3, 2, 5], id="bad numerical type" + ), + pytest.param("f", [tensor_key], "int8", "bad shape type", id="bad shape type"), + pytest.param("f", ["tensor_key"], "int8", [3, 2, 5], id="bad key type"), + ], +) +def test_build_output_tensor_descriptor_unsuccessful(order, keys, dtype, dimension): + with pytest.raises(ValueError): + built_tensor = handler.build_output_tensor_descriptor( + order, keys, dtype, dimension + ) diff --git a/tests/test_message_handler/test_request.py b/tests/test_message_handler/test_request.py new file mode 100644 index 000000000..ea9b04d64 --- /dev/null +++ b/tests/test_message_handler/test_request.py @@ -0,0 +1,436 @@ +# BSD 2-Clause License +# +# Copyright (c) 2021-2024, Hewlett Packard Enterprise +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
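The request tests below distinguish two request shapes: "indirect" requests whose inputs and outputs are referenced by feature-store keys, and "direct" requests that carry tensor descriptors inline. A condensed, hedged sketch of both forms plus the byte-level round trip, using only calls that appear in the tests (identifiers are illustrative):

    from smartsim._core.mli.message_handler import MessageHandler

    fsd = "mock-feature-store-descriptor"
    model = MessageHandler.build_model(b"model data", "model_name", "v0.0.1")
    attrs = MessageHandler.build_torch_request_attributes("sparse")
    out_desc = MessageHandler.build_output_tensor_descriptor("f", [], "auto", [])

    # indirect: inputs/outputs referenced by feature-store keys
    indirect = MessageHandler.build_request(
        b"reply",
        model,
        [MessageHandler.build_tensor_key("in0", fsd)],
        [MessageHandler.build_tensor_key("out0", fsd)],
        [out_desc],
        attrs,
    )
    assert indirect.input.which() == "keys"

    # direct: inputs carried as inline tensor descriptors, no output keys
    direct = MessageHandler.build_request(
        b"reply",
        model,
        [MessageHandler.build_tensor_descriptor("c", "int8", [1])],
        [],
        [out_desc],
        attrs,
    )
    assert direct.input.which() == "descriptors"

    # requests round-trip through the serializers unchanged
    wire = MessageHandler.serialize_request(direct)
    assert MessageHandler.deserialize_request(wire).to_dict() == direct.to_dict()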
+ +import pytest + +from smartsim._core.mli.message_handler import MessageHandler + +# The tests in this file belong to the group_a group +pytestmark = pytest.mark.group_a + +fsd = "mock-feature-store-descriptor" + +model_key = MessageHandler.build_model_key("model_key", fsd) +model = MessageHandler.build_model(b"model data", "model_name", "v0.0.1") + +input_key1 = MessageHandler.build_tensor_key("input_key1", fsd) +input_key2 = MessageHandler.build_tensor_key("input_key2", fsd) + +output_key1 = MessageHandler.build_tensor_key("output_key1", fsd) +output_key2 = MessageHandler.build_tensor_key("output_key2", fsd) + +output_descriptor1 = MessageHandler.build_output_tensor_descriptor( + "c", [output_key1, output_key2], "int64", [] +) +output_descriptor2 = MessageHandler.build_output_tensor_descriptor("f", [], "auto", []) +output_descriptor3 = MessageHandler.build_output_tensor_descriptor( + "c", [output_key1], "none", [1, 2, 3] +) +torch_attributes = MessageHandler.build_torch_request_attributes("sparse") +tf_attributes = MessageHandler.build_tf_request_attributes( + name="tf", tensor_type="sparse" +) + +tensor_1 = MessageHandler.build_tensor_descriptor("c", "int8", [1]) +tensor_2 = MessageHandler.build_tensor_descriptor("c", "int64", [3, 2]) +tensor_3 = MessageHandler.build_tensor_descriptor("f", "int8", [1]) +tensor_4 = MessageHandler.build_tensor_descriptor("f", "int64", [3, 2]) + + +tf_indirect_request = MessageHandler.build_request( + b"reply", + model, + [input_key1, input_key2], + [output_key1, output_key2], + [output_descriptor1, output_descriptor2, output_descriptor3], + tf_attributes, +) + +tf_direct_request = MessageHandler.build_request( + b"reply", + model, + [tensor_3, tensor_4], + [], + [output_descriptor1, output_descriptor2], + tf_attributes, +) + +torch_indirect_request = MessageHandler.build_request( + b"reply", + model, + [input_key1, input_key2], + [output_key1, output_key2], + [output_descriptor1, output_descriptor2, output_descriptor3], + torch_attributes, +) + +torch_direct_request = MessageHandler.build_request( + b"reply", + model, + [tensor_1, tensor_2], + [], + [output_descriptor1, output_descriptor2], + torch_attributes, +) + + +@pytest.mark.parametrize( + "reply_channel, model, input, output, output_descriptors, custom_attributes", + [ + pytest.param( + b"reply channel", + model_key, + [input_key1, input_key2], + [output_key1, output_key2], + [output_descriptor1], + torch_attributes, + ), + pytest.param( + b"another reply channel", + model, + [input_key1], + [output_key2], + [output_descriptor1], + tf_attributes, + ), + pytest.param( + b"another reply channel", + model, + [input_key1], + [output_key2], + [output_descriptor1], + torch_attributes, + ), + pytest.param( + b"reply channel", + model_key, + [input_key1], + [output_key1], + [output_descriptor1], + None, + ), + ], +) +def test_build_request_indirect_successful( + reply_channel, model, input, output, output_descriptors, custom_attributes +): + built_request = MessageHandler.build_request( + reply_channel, + model, + input, + output, + output_descriptors, + custom_attributes, + ) + assert built_request is not None + assert built_request.replyChannel.descriptor == reply_channel + if built_request.model.which() == "key": + assert built_request.model.key.key == model.key + else: + assert built_request.model.data.data == model.data + assert built_request.model.data.name == model.name + assert built_request.model.data.version == model.version + assert built_request.input.which() == "keys" + assert 
built_request.input.keys[0].key == input[0].key + assert len(built_request.input.keys) == len(input) + assert len(built_request.output) == len(output) + for i, j in zip(built_request.outputDescriptors, output_descriptors): + assert i.order == j.order + if built_request.customAttributes.which() == "tf": + assert ( + built_request.customAttributes.tf.tensorType == custom_attributes.tensorType + ) + elif built_request.customAttributes.which() == "torch": + assert ( + built_request.customAttributes.torch.tensorType + == custom_attributes.tensorType + ) + else: + assert built_request.customAttributes.none == custom_attributes + + +@pytest.mark.parametrize( + "reply_channel, model, input, output, output_descriptors, custom_attributes", + [ + pytest.param( + [], + model_key, + [input_key1, input_key2], + [output_key1, output_key2], + [output_descriptor1], + tf_attributes, + id="bad channel", + ), + pytest.param( + b"reply channel", + "bad model", + [input_key1], + [output_key2], + [output_descriptor1], + torch_attributes, + id="bad model", + ), + pytest.param( + b"reply channel", + model_key, + ["input_key1", "input_key2"], + [output_key1, output_key2], + [output_descriptor1], + tf_attributes, + id="bad inputs", + ), + pytest.param( + b"reply channel", + model_key, + [model_key], + [output_key1, output_key2], + [output_descriptor1], + torch_attributes, + id="bad input schema type", + ), + pytest.param( + b"reply channel", + model_key, + [input_key1], + ["output_key1", "output_key2"], + [output_descriptor1], + tf_attributes, + id="bad outputs", + ), + pytest.param( + b"reply channel", + model_key, + [input_key1], + [model_key], + [output_descriptor1], + tf_attributes, + id="bad output schema type", + ), + pytest.param( + b"reply channel", + model_key, + [input_key1], + [output_key1, output_key2], + [output_descriptor1], + "bad attributes", + id="bad custom attributes", + ), + pytest.param( + b"reply channel", + model_key, + [input_key1], + [output_key1, output_key2], + [output_descriptor1], + model_key, + id="bad custom attributes schema type", + ), + pytest.param( + b"reply channel", + model_key, + [input_key1], + [output_key1, output_key2], + "bad descriptors", + torch_attributes, + id="bad output descriptors", + ), + ], +) +def test_build_request_indirect_unsuccessful( + reply_channel, model, input, output, output_descriptors, custom_attributes +): + with pytest.raises(ValueError): + built_request = MessageHandler.build_request( + reply_channel, + model, + input, + output, + output_descriptors, + custom_attributes, + ) + + +@pytest.mark.parametrize( + "reply_channel, model, input, output, output_descriptors, custom_attributes", + [ + pytest.param( + b"reply channel", + model_key, + [tensor_1, tensor_2], + [], + [output_descriptor2], + torch_attributes, + ), + pytest.param( + b"another reply channel", + model, + [tensor_1], + [], + [output_descriptor3], + tf_attributes, + ), + pytest.param( + b"another reply channel", + model, + [tensor_2], + [], + [output_descriptor1], + tf_attributes, + ), + pytest.param( + b"another reply channel", + model, + [tensor_1], + [], + [output_descriptor1], + None, + ), + ], +) +def test_build_request_direct_successful( + reply_channel, model, input, output, output_descriptors, custom_attributes +): + built_request = MessageHandler.build_request( + reply_channel, + model, + input, + output, + output_descriptors, + custom_attributes, + ) + assert built_request is not None + assert built_request.replyChannel.descriptor == reply_channel + if built_request.model.which() 
== "key": + assert built_request.model.key.key == model.key + else: + assert built_request.model.data.data == model.data + assert built_request.model.data.name == model.name + assert built_request.model.data.version == model.version + assert built_request.input.which() == "descriptors" + assert len(built_request.input.descriptors) == len(input) + assert len(built_request.output) == len(output) + for i, j in zip(built_request.outputDescriptors, output_descriptors): + assert i.order == j.order + if built_request.customAttributes.which() == "tf": + assert ( + built_request.customAttributes.tf.tensorType == custom_attributes.tensorType + ) + elif built_request.customAttributes.which() == "torch": + assert ( + built_request.customAttributes.torch.tensorType + == custom_attributes.tensorType + ) + else: + assert built_request.customAttributes.none == custom_attributes + + +@pytest.mark.parametrize( + "reply_channel, model, input, output, output_descriptors, custom_attributes", + [ + pytest.param( + [], + model_key, + [tensor_3, tensor_4], + [], + [output_descriptor2], + tf_attributes, + id="bad channel", + ), + pytest.param( + b"reply channel", + "bad model", + [tensor_4], + [], + [output_descriptor2], + tf_attributes, + id="bad model", + ), + pytest.param( + b"reply channel", + model_key, + ["input_key1", "input_key2"], + [], + [output_descriptor2], + torch_attributes, + id="bad inputs", + ), + pytest.param( + b"reply channel", + model_key, + [], + ["output_key1", "output_key2"], + [output_descriptor2], + tf_attributes, + id="bad outputs", + ), + pytest.param( + b"reply channel", + model_key, + [tensor_4], + [], + [output_descriptor2], + "bad attributes", + id="bad custom attributes", + ), + pytest.param( + b"reply_channel", + model_key, + [tensor_3, tensor_4], + [], + ["output_descriptor2"], + torch_attributes, + id="bad output descriptors", + ), + ], +) +def test_build_request_direct_unsuccessful( + reply_channel, model, input, output, output_descriptors, custom_attributes +): + with pytest.raises(ValueError): + built_request = MessageHandler.build_request( + reply_channel, + model, + input, + output, + output_descriptors, + custom_attributes, + ) + + +@pytest.mark.parametrize( + "req", + [ + pytest.param(tf_indirect_request, id="tf indirect"), + pytest.param(tf_direct_request, id="tf direct"), + pytest.param(torch_indirect_request, id="indirect"), + pytest.param(torch_direct_request, id="direct"), + ], +) +def test_serialize_request_successful(req): + serialized = MessageHandler.serialize_request(req) + assert type(serialized) == bytes + + deserialized = MessageHandler.deserialize_request(serialized) + assert deserialized.to_dict() == req.to_dict() diff --git a/tests/test_message_handler/test_response.py b/tests/test_message_handler/test_response.py new file mode 100644 index 000000000..d6894eb5c --- /dev/null +++ b/tests/test_message_handler/test_response.py @@ -0,0 +1,178 @@ +# BSD 2-Clause License +# +# Copyright (c) 2021-2024, Hewlett Packard Enterprise +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. 
+# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import pytest + +from smartsim._core.mli.message_handler import MessageHandler + +# The tests in this file belong to the group_a group +pytestmark = pytest.mark.group_a + +fsd = "mock-feature-store-descriptor" + +result_key1 = MessageHandler.build_tensor_key("result_key1", fsd) +result_key2 = MessageHandler.build_tensor_key("result_key2", fsd) + +torch_attributes = MessageHandler.build_torch_response_attributes() +tf_attributes = MessageHandler.build_tf_response_attributes() + +tensor1 = MessageHandler.build_tensor_descriptor("c", "int8", [1]) +tensor2 = MessageHandler.build_tensor_descriptor("c", "int64", [3, 2]) + + +tf_indirect_response = MessageHandler.build_response( + "complete", + "Success!", + [result_key1, result_key2], + tf_attributes, +) + +tf_direct_response = MessageHandler.build_response( + "complete", + "Success again!", + [tensor2, tensor1], + tf_attributes, +) + +torch_indirect_response = MessageHandler.build_response( + "complete", + "Success!", + [result_key1, result_key2], + torch_attributes, +) + +torch_direct_response = MessageHandler.build_response( + "complete", + "Success again!", + [tensor1, tensor2], + torch_attributes, +) + + +@pytest.mark.parametrize( + "status, status_message, result, custom_attribute", + [ + pytest.param( + 200, + "Yay, it worked!", + [tensor1, tensor2], + None, + id="tensor descriptor list", + ), + pytest.param( + 200, + "Yay, it worked!", + [result_key1, result_key2], + tf_attributes, + id="tensor key list", + ), + ], +) +def test_build_response_successful(status, status_message, result, custom_attribute): + response = MessageHandler.build_response( + status=status, + message=status_message, + result=result, + custom_attributes=custom_attribute, + ) + assert response is not None + assert response.status == status + assert response.message == status_message + if response.result.which() == "keys": + assert response.result.keys[0].to_dict() == result[0].to_dict() + else: + assert response.result.descriptors[0].to_dict() == result[0].to_dict() + + +@pytest.mark.parametrize( + "status, status_message, result, custom_attribute", + [ + pytest.param( + "bad status", + "Yay, it worked!", + [tensor1, tensor2], + None, + id="bad status", + ), + pytest.param( + "complete", + 200, + [tensor2], + torch_attributes, + id="bad status message", + ), + pytest.param( + "complete", + "Yay, it worked!", + ["result_key1", "result_key2"], + tf_attributes, + id="bad result", + ), + pytest.param( + "complete", + "Yay, it worked!", + [tf_attributes], + tf_attributes, + id="bad result type", + ), + pytest.param( + "complete", + "Yay, it worked!", + [tensor2, tensor1], + "custom attributes", + id="bad custom attributes", + ), + pytest.param( + "complete", + "Yay, it worked!", + [tensor2, 
tensor1], + result_key1, + id="bad custom attributes type", + ), + ], +) +def test_build_response_unsuccessful(status, status_message, result, custom_attribute): + with pytest.raises(ValueError): + response = MessageHandler.build_response( + status, status_message, result, custom_attribute + ) + + +@pytest.mark.parametrize( + "response", + [ + pytest.param(torch_indirect_response, id="indirect"), + pytest.param(torch_direct_response, id="direct"), + pytest.param(tf_indirect_response, id="tf indirect"), + pytest.param(tf_direct_response, id="tf direct"), + ], +) +def test_serialize_response(response): + serialized = MessageHandler.serialize_response(response) + assert type(serialized) == bytes + + deserialized = MessageHandler.deserialize_response(serialized) + assert deserialized.to_dict() == response.to_dict() diff --git a/tests/test_node_prioritizer.py b/tests/test_node_prioritizer.py new file mode 100644 index 000000000..905c0ecc9 --- /dev/null +++ b/tests/test_node_prioritizer.py @@ -0,0 +1,553 @@ +# BSD 2-Clause License +# +# Copyright (c) 2021-2024, Hewlett Packard Enterprise +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
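The node prioritizer tests below drive the following lifecycle; this is a rough sketch based on the behaviors those tests assert, with a stand-in Node class playing the role of the tests' MockNode helper:

    import threading

    from smartsim._core.launcher.dragon.pqueue import NodePrioritizer, PrioritizerFilter

    class Node:  # stand-in: any object exposing hostname/num_cpus/num_gpus
        def __init__(self, hostname: str, num_cpus: int, num_gpus: int) -> None:
            self.hostname = hostname
            self.num_cpus = num_cpus
            self.num_gpus = num_gpus

    nodes = [Node(f"cpu-node-{i}", 4, 0) for i in range(4)]
    p = NodePrioritizer(nodes, threading.RLock())

    # `next` hands back the tracking record for an open node and bumps its ref count
    first = p.next(PrioritizerFilter.CPU)
    assert first.num_refs == 1

    # `increment`/`decrement` reserve and release a node directly by hostname
    p.increment(nodes[1].hostname)
    p.decrement(nodes[1].hostname)

    # `next_n` reserves several open nodes at once (or returns [] if it cannot)
    batch = p.next_n(2, PrioritizerFilter.CPU)
    assert len(batch) == 2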
+import random +import threading +import typing as t + +import pytest + +from smartsim._core.launcher.dragon.pqueue import NodePrioritizer, PrioritizerFilter +from smartsim.error.errors import SmartSimError +from smartsim.log import get_logger + +# The tests in this file belong to the group_b group +pytestmark = pytest.mark.group_b + + +logger = get_logger(__name__) + + +class MockNode: + def __init__(self, hostname: str, num_cpus: int, num_gpus: int) -> None: + self.hostname = hostname + self.num_cpus = num_cpus + self.num_gpus = num_gpus + + +def mock_node_hosts( + num_cpu_nodes: int, num_gpu_nodes: int +) -> t.Tuple[t.List[MockNode], t.List[MockNode]]: + cpu_hosts = [f"cpu-node-{i}" for i in range(num_cpu_nodes)] + gpu_hosts = [f"gpu-node-{i}" for i in range(num_gpu_nodes)] + + return cpu_hosts, gpu_hosts + + +def mock_node_builder(num_cpu_nodes: int, num_gpu_nodes: int) -> t.List[MockNode]: + nodes = [] + cpu_hosts, gpu_hosts = mock_node_hosts(num_cpu_nodes, num_gpu_nodes) + + nodes.extend(MockNode(hostname, 4, 0) for hostname in cpu_hosts) + nodes.extend(MockNode(hostname, 4, 4) for hostname in gpu_hosts) + + return nodes + + +def test_node_prioritizer_init_null() -> None: + """Verify that the priorizer reports failures to send a valid node set + if a null value is passed""" + lock = threading.RLock() + with pytest.raises(SmartSimError) as ex: + NodePrioritizer(None, lock) + + assert "Missing" in ex.value.args[0] + + +def test_node_prioritizer_init_empty() -> None: + """Verify that the priorizer reports failures to send a valid node set + if an empty list is passed""" + lock = threading.RLock() + with pytest.raises(SmartSimError) as ex: + NodePrioritizer([], lock) + + assert "Missing" in ex.value.args[0] + + +@pytest.mark.parametrize( + "num_cpu_nodes,num_gpu_nodes", [(1, 1), (2, 1), (1, 2), (8, 4), (1000, 200)] +) +def test_node_prioritizer_init_ok(num_cpu_nodes: int, num_gpu_nodes: int) -> None: + """Verify that initialization with a valid node list results in the + appropriate cpu & gpu ref counts, and complete ref map""" + nodes = mock_node_builder(num_cpu_nodes, num_gpu_nodes) + + # perform prioritizer initialization + lock = threading.RLock() + p = NodePrioritizer(nodes, lock) + + # get a copy of all the expected host names + cpu_hosts, gpu_hosts = mock_node_hosts(num_cpu_nodes, num_gpu_nodes) + all_hosts = cpu_hosts + gpu_hosts + assert len(all_hosts) == num_cpu_nodes + num_gpu_nodes + + # verify tracking data is initialized correctly for all nodes + for hostname in all_hosts: + # show that the ref map is tracking the node + assert hostname in p._nodes + + tracking_info = p.get_tracking_info(hostname) + + # show that the node is created w/zero ref counts + assert tracking_info.num_refs == 0 + + # show that the node is created and marked as not dirty (unchanged) + # assert tracking_info.is_dirty == False + + # iterate through known cpu node keys and verify prioritizer initialization + for hostname in cpu_hosts: + # show that the device ref counters are appropriately assigned + cpu_ref = next((n for n in p._cpu_refs if n.hostname == hostname), None) + assert cpu_ref, "CPU-only node not found in cpu ref set" + + gpu_ref = next((n for n in p._gpu_refs if n.hostname == hostname), None) + assert not gpu_ref, "CPU-only node should not be found in gpu ref set" + + # iterate through known GPU node keys and verify prioritizer initialization + for hostname in gpu_hosts: + # show that the device ref counters are appropriately assigned + gpu_ref = next((n for n in p._gpu_refs if n.hostname 
== hostname), None) + assert gpu_ref, "GPU-only node not found in gpu ref set" + + cpu_ref = next((n for n in p._cpu_refs if n.hostname == hostname), None) + assert not cpu_ref, "GPU-only node should not be found in cpu ref set" + + # verify we have all hosts in the ref map + assert set(p._nodes.keys()) == set(all_hosts) + + # verify we have no extra hosts in ref map + assert len(p._nodes.keys()) == len(set(all_hosts)) + + +def test_node_prioritizer_direct_increment() -> None: + """Verify that performing the increment operation causes the expected + side effect on the intended records""" + + num_cpu_nodes, num_gpu_nodes = 32, 8 + cpu_hosts, gpu_hosts = mock_node_hosts(num_cpu_nodes, num_gpu_nodes) + nodes = mock_node_builder(num_cpu_nodes, num_gpu_nodes) + + exclude_index = 2 + exclude_host0 = cpu_hosts[exclude_index] + exclude_host1 = gpu_hosts[exclude_index] + exclusions = [exclude_host0, exclude_host1] + + lock = threading.RLock() + p = NodePrioritizer(nodes, lock) + + # let's increment each element in a predictable way and verify + for node in nodes: + if node.hostname in exclusions: + # expect 1 cpu and 1 gpu node at zero and not incremented + continue + + if node.num_gpus == 0: + num_increments = random.randint(0, num_cpu_nodes - 1) + else: + num_increments = random.randint(0, num_gpu_nodes - 1) + + # increment this node some random number of times + for _ in range(num_increments): + p.increment(node.hostname) + + # ... and verify the correct incrementing is applied + tracking_info = p.get_tracking_info(node.hostname) + assert tracking_info.num_refs == num_increments + + # verify the excluded cpu node was never changed + tracking_info0 = p.get_tracking_info(exclude_host0) + assert tracking_info0.num_refs == 0 + + # verify the excluded gpu node was never changed + tracking_info1 = p.get_tracking_info(exclude_host1) + assert tracking_info1.num_refs == 0 + + +def test_node_prioritizer_indirect_increment() -> None: + """Verify that performing the increment operation indirectly affects + each available node until we run out of nodes to return""" + + num_cpu_nodes, num_gpu_nodes = 8, 0 + cpu_hosts, gpu_hosts = mock_node_hosts(num_cpu_nodes, num_gpu_nodes) + nodes = mock_node_builder(num_cpu_nodes, num_gpu_nodes) + + lock = threading.RLock() + p = NodePrioritizer(nodes, lock) + + # verify starting state + for node in p._nodes.values(): + tracking_info = p.get_tracking_info(node.hostname) + + assert node.num_refs == 0 # <--- ref count starts at zero + assert tracking_info.num_refs == 0 # <--- ref count starts at zero + + # perform indirect + for node in p._nodes.values(): + tracking_info = p.get_tracking_info(node.hostname) + + # apply `next` operation and verify tracking info reflects new ref + node = p.next(PrioritizerFilter.CPU) + tracking_info = p.get_tracking_info(node.hostname) + + # verify side-effects + assert tracking_info.num_refs > 0 # <--- ref count should now be > 0 + + # we expect it to give back only "clean" nodes from next* + assert tracking_info.is_dirty == False # NOTE: this is "hidden" by protocol + + # every node should be incremented now. 
prioritizer shouldn't have anything to give + tracking_info = p.next(PrioritizerFilter.CPU) + assert tracking_info is None # <--- get_next shouldn't have any nodes to give + + +def test_node_prioritizer_indirect_decrement_availability() -> None: + """Verify that a node who is decremented (dirty) is made assignable + on a subsequent request""" + + num_cpu_nodes, num_gpu_nodes = 1, 0 + cpu_hosts, gpu_hosts = mock_node_hosts(num_cpu_nodes, num_gpu_nodes) + nodes = mock_node_builder(num_cpu_nodes, num_gpu_nodes) + + lock = threading.RLock() + p = NodePrioritizer(nodes, lock) + + # increment our only node... + p.increment(cpu_hosts[0]) + + tracking_info = p.next() + assert tracking_info is None, "No nodes should be assignable" + + # perform a decrement... + p.decrement(cpu_hosts[0]) + + # ... and confirm that the node is available again + tracking_info = p.next() + assert tracking_info is not None, "A node should be assignable" + + +def test_node_prioritizer_multi_increment() -> None: + """Verify that retrieving multiple nodes via `next_n` API correctly + increments reference counts and returns appropriate results""" + + num_cpu_nodes, num_gpu_nodes = 8, 0 + cpu_hosts, gpu_hosts = mock_node_hosts(num_cpu_nodes, num_gpu_nodes) + nodes = mock_node_builder(num_cpu_nodes, num_gpu_nodes) + + lock = threading.RLock() + p = NodePrioritizer(nodes, lock) + + # Mark some nodes as dirty to verify retrieval + p.increment(cpu_hosts[0]) + assert p.get_tracking_info(cpu_hosts[0]).num_refs > 0 + + p.increment(cpu_hosts[2]) + assert p.get_tracking_info(cpu_hosts[2]).num_refs > 0 + + p.increment(cpu_hosts[4]) + assert p.get_tracking_info(cpu_hosts[4]).num_refs > 0 + + # use next_n w/the minimum allowed value + all_tracking_info = p.next_n(1, PrioritizerFilter.CPU) # <---- next_n(1) + + # confirm the number requested is honored + assert len(all_tracking_info) == 1 + # ensure no unavailable node is returned + assert all_tracking_info[0].hostname not in [ + cpu_hosts[0], + cpu_hosts[2], + cpu_hosts[4], + ] + + # use next_n w/value that exceeds available number of open nodes + # 3 direct increments in setup, 1 out of next_n(1), 4 left + all_tracking_info = p.next_n(5, PrioritizerFilter.CPU) + + # confirm that no nodes are returned, even though 4 out of 5 requested are available + assert len(all_tracking_info) == 0 + + +def test_node_prioritizer_multi_increment_validate_n() -> None: + """Verify that retrieving multiple nodes via `next_n` API correctly + reports failures when the request size is above pool size""" + + num_cpu_nodes, num_gpu_nodes = 8, 0 + cpu_hosts, gpu_hosts = mock_node_hosts(num_cpu_nodes, num_gpu_nodes) + nodes = mock_node_builder(num_cpu_nodes, num_gpu_nodes) + + lock = threading.RLock() + p = NodePrioritizer(nodes, lock) + + # we have 8 total cpu nodes available... 
request too many nodes + all_tracking_info = p.next_n(9, PrioritizerFilter.CPU) + assert len(all_tracking_info) == 0 + + all_tracking_info = p.next_n(num_cpu_nodes * 1000, PrioritizerFilter.CPU) + assert len(all_tracking_info) == 0 + + +def test_node_prioritizer_indirect_direct_interleaved_increments() -> None: + """Verify that interleaving indirect and direct increments results in + expected ref counts""" + + num_cpu_nodes, num_gpu_nodes = 8, 4 + cpu_hosts, gpu_hosts = mock_node_hosts(num_cpu_nodes, num_gpu_nodes) + nodes = mock_node_builder(num_cpu_nodes, num_gpu_nodes) + + lock = threading.RLock() + p = NodePrioritizer(nodes, lock) + + # perform some set of non-popped increments + p.increment(gpu_hosts[1]) + p.increment(gpu_hosts[3]) + p.increment(gpu_hosts[3]) + + # increment 0th item 1x + p.increment(cpu_hosts[0]) + + # increment 3th item 2x + p.increment(cpu_hosts[3]) + p.increment(cpu_hosts[3]) + + # increment last item 3x + p.increment(cpu_hosts[7]) + p.increment(cpu_hosts[7]) + p.increment(cpu_hosts[7]) + + tracking_info = p.get_tracking_info(gpu_hosts[1]) + assert tracking_info.num_refs == 1 + + tracking_info = p.get_tracking_info(gpu_hosts[3]) + assert tracking_info.num_refs == 2 + + nodes = [n for n in p._nodes.values() if n.num_refs == 0 and n.num_gpus == 0] + + # we should skip the 0-th item in the heap due to direct increment + tracking_info = p.next(PrioritizerFilter.CPU) + assert tracking_info.num_refs == 1 + # confirm we get a cpu node + assert "cpu-node" in tracking_info.hostname + + # this should pull the next item right out + tracking_info = p.next(PrioritizerFilter.CPU) + assert tracking_info.num_refs == 1 + assert "cpu-node" in tracking_info.hostname + + # ensure we pull from gpu nodes and the 0th item is returned + tracking_info = p.next(PrioritizerFilter.GPU) + assert tracking_info.num_refs == 1 + assert "gpu-node" in tracking_info.hostname + + # we should step over the 3-th node on this iteration + tracking_info = p.next(PrioritizerFilter.CPU) + assert tracking_info.num_refs == 1 + assert "cpu-node" in tracking_info.hostname + + # and ensure that heap also steps over a direct increment + tracking_info = p.next(PrioritizerFilter.GPU) + assert tracking_info.num_refs == 1 + assert "gpu-node" in tracking_info.hostname + + # and another GPU request should return nothing + tracking_info = p.next(PrioritizerFilter.GPU) + assert tracking_info is None + + +def test_node_prioritizer_decrement_floor() -> None: + """Verify that repeatedly decrementing ref counts does not + allow negative ref counts""" + + num_cpu_nodes, num_gpu_nodes = 8, 4 + cpu_hosts, gpu_hosts = mock_node_hosts(num_cpu_nodes, num_gpu_nodes) + nodes = mock_node_builder(num_cpu_nodes, num_gpu_nodes) + + lock = threading.RLock() + p = NodePrioritizer(nodes, lock) + + # try a ton of decrements on all the items in the prioritizer + for _ in range(len(nodes) * 100): + index = random.randint(0, num_cpu_nodes - 1) + p.decrement(cpu_hosts[index]) + + index = random.randint(0, num_gpu_nodes - 1) + p.decrement(gpu_hosts[index]) + + for node in nodes: + tracking_info = p.get_tracking_info(node.hostname) + assert tracking_info.num_refs == 0 + + +@pytest.mark.parametrize("num_requested", [1, 2, 3]) +def test_node_prioritizer_multi_increment_subheap(num_requested: int) -> None: + """Verify that retrieving multiple nodes via `next_n` API correctly + increments reference counts and returns appropriate results + when requesting an in-bounds number of nodes""" + + num_cpu_nodes, num_gpu_nodes = 8, 0 + cpu_hosts, gpu_hosts = 
mock_node_hosts(num_cpu_nodes, num_gpu_nodes) + nodes = mock_node_builder(num_cpu_nodes, num_gpu_nodes) + + lock = threading.RLock() + p = NodePrioritizer(nodes, lock) + + # Mark some nodes as dirty to verify retrieval + p.increment(cpu_hosts[0]) + p.increment(cpu_hosts[2]) + p.increment(cpu_hosts[4]) + + hostnames = [cpu_hosts[0], cpu_hosts[1], cpu_hosts[2], cpu_hosts[3], cpu_hosts[5]] + + # request n == {num_requested} nodes from set of 3 available + all_tracking_info = p.next_n( + num_requested, + hosts=hostnames, + ) # <---- w/0,2,4 assigned, only 1,3,5 from hostnames can work + + # all parameterizations should result in a matching output size + assert len(all_tracking_info) == num_requested + + +def test_node_prioritizer_multi_increment_subheap_assigned() -> None: + """Verify that retrieving multiple nodes via `next_n` API does + not return anything when the number requested cannot be satisfied + by the given subheap due to prior assignment""" + + num_cpu_nodes, num_gpu_nodes = 8, 0 + cpu_hosts, gpu_hosts = mock_node_hosts(num_cpu_nodes, num_gpu_nodes) + nodes = mock_node_builder(num_cpu_nodes, num_gpu_nodes) + + lock = threading.RLock() + p = NodePrioritizer(nodes, lock) + + # Mark some nodes as dirty to verify retrieval + p.increment(cpu_hosts[0]) + p.increment(cpu_hosts[2]) + + hostnames = [ + cpu_hosts[0], + "x" + cpu_hosts[2], + ] # <--- we can't get 2 from 1 valid node name + + # request n == {num_requested} nodes from set of 3 available + num_requested = 2 + all_tracking_info = p.next_n(num_requested, hosts=hostnames) + + # w/0,2 assigned, nothing can be returned + assert len(all_tracking_info) == 0 + + +def test_node_prioritizer_empty_subheap_next_w_no_hosts() -> None: + """Verify that retrieving multiple nodes via `next_n` API does + with an empty host list uses the entire available host list""" + + num_cpu_nodes, num_gpu_nodes = 8, 0 + cpu_hosts, gpu_hosts = mock_node_hosts(num_cpu_nodes, num_gpu_nodes) + nodes = mock_node_builder(num_cpu_nodes, num_gpu_nodes) + + lock = threading.RLock() + p = NodePrioritizer(nodes, lock) + + # Mark some nodes as dirty to verify retrieval + p.increment(cpu_hosts[0]) + p.increment(cpu_hosts[2]) + + hostnames = [] + + # request n == {num_requested} nodes from set of 3 available + num_requested = 1 + node = p.next(hosts=hostnames) + assert node + + # assert "No hostnames provided" == ex.value.args[0] + + +def test_node_prioritizer_empty_subheap_next_n_w_hosts() -> None: + """Verify that retrieving multiple nodes via `next_n` API does + not blow up with an empty host list""" + + num_cpu_nodes, num_gpu_nodes = 8, 0 + cpu_hosts, gpu_hosts = mock_node_hosts(num_cpu_nodes, num_gpu_nodes) + nodes = mock_node_builder(num_cpu_nodes, num_gpu_nodes) + + lock = threading.RLock() + p = NodePrioritizer(nodes, lock) + + # Mark some nodes as dirty to verify retrieval + p.increment(cpu_hosts[0]) + p.increment(cpu_hosts[2]) + + hostnames = [] + + # request n == {num_requested} nodes from set of 3 available + num_requested = 1 + node = p.next_n(num_requested, hosts=hostnames) + assert node is not None + + +@pytest.mark.parametrize("num_requested", [-100, -1, 0]) +def test_node_prioritizer_empty_subheap_next_n(num_requested: int) -> None: + """Verify that retrieving a node via `next_n` API does + not allow a request with num_items < 1""" + + num_cpu_nodes, num_gpu_nodes = 8, 0 + cpu_hosts, gpu_hosts = mock_node_hosts(num_cpu_nodes, num_gpu_nodes) + nodes = mock_node_builder(num_cpu_nodes, num_gpu_nodes) + + lock = threading.RLock() + p = NodePrioritizer(nodes, 
lock)
+
+    # Mark some nodes as dirty to verify retrieval
+    p.increment(cpu_hosts[0])
+    p.increment(cpu_hosts[2])
+
+    # requesting fewer than one node is rejected outright
+    with pytest.raises(ValueError) as ex:
+        p.next_n(num_requested)
+
+    assert "Number of items requested" in ex.value.args[0]
+
+
+@pytest.mark.parametrize("num_requested", [-100, -1, 0])
+def test_node_prioritizer_subheap_next_n_invalid_n(num_requested: int) -> None:
+    """Verify that retrieving multiple nodes via `next_n` API with an
+    explicit host subset does not allow a request with num_items < 1"""
+
+    num_cpu_nodes, num_gpu_nodes = 8, 0
+    cpu_hosts, gpu_hosts = mock_node_hosts(num_cpu_nodes, num_gpu_nodes)
+    nodes = mock_node_builder(num_cpu_nodes, num_gpu_nodes)
+
+    lock = threading.RLock()
+    p = NodePrioritizer(nodes, lock)
+
+    # Mark some nodes as dirty to verify retrieval
+    p.increment(cpu_hosts[0])
+    p.increment(cpu_hosts[2])
+
+    hostnames = [cpu_hosts[0], cpu_hosts[2]]
+
+    # requesting fewer than one node from a host subset is also rejected
+    with pytest.raises(ValueError) as ex:
+        p.next_n(num_requested, hosts=hostnames)
+
+    assert "Number of items requested" in ex.value.args[0]