From c6a145acad6da4aefe11afba194a6aa4df0534d5 Mon Sep 17 00:00:00 2001 From: Chris McBride Date: Tue, 17 Sep 2024 19:41:19 -0500 Subject: [PATCH 01/40] Squash event integration --- doc/changelog.md | 2 +- ex/high_throughput_inference/mock_app.py | 109 +-- .../standalone_worker_manager.py | 31 +- .../_core/launcher/dragon/dragonBackend.py | 151 +++- smartsim/_core/mli/comm/channel/channel.py | 24 +- .../_core/mli/comm/channel/dragon_channel.py | 23 +- smartsim/_core/mli/comm/channel/dragon_fli.py | 44 +- .../infrastructure/control/error_handling.py | 2 +- .../mli/infrastructure/environment_loader.py | 2 +- .../storage/backbone_feature_store.py | 251 ++++++- .../storage/dragon_feature_store.py | 8 +- .../infrastructure/storage/feature_store.py | 30 +- .../_core/mli/infrastructure/worker/worker.py | 41 +- smartsim/_core/mli/message_handler.py | 24 +- .../mli_schemas/data/data_references.capnp | 4 +- .../data/data_references_capnp.pyi | 4 +- .../mli/mli_schemas/request/request.capnp | 2 +- .../mli/mli_schemas/request/request_capnp.pyi | 2 +- smartsim/log.py | 7 +- smartsim/protoclient.py | 285 +++++++ tests/dragon/test_dragon_backend.py | 174 +++++ tests/dragon/test_environment_loader.py | 2 +- tests/dragon/test_error_handling.py | 80 +- tests/dragon/test_featurestore.py | 338 +++++++++ tests/dragon/test_featurestore_base.py | 96 ++- tests/dragon/test_featurestore_integration.py | 3 +- tests/dragon/test_protoclient.py | 231 ++++++ tests/dragon/test_request_dispatcher.py | 81 +- tests/dragon/test_worker_manager.py | 557 ++++++++------ tests/dragon/utils/channel.py | 18 +- tests/mli/channel.py | 18 +- tests/mli/test_integrated_torch_worker.py | 24 +- tests/test_featurestore.py | 711 ++++++++++++++++++ .../test_build_model_key.py | 10 +- .../test_output_descriptor.py | 2 +- tests/test_message_handler/test_request.py | 38 +- tests/test_message_handler/test_response.py | 4 +- 37 files changed, 2874 insertions(+), 559 deletions(-) create mode 100644 smartsim/protoclient.py create mode 100644 tests/dragon/test_dragon_backend.py create mode 100644 tests/dragon/test_featurestore.py create mode 100644 tests/dragon/test_protoclient.py create mode 100644 tests/test_featurestore.py diff --git a/doc/changelog.md b/doc/changelog.md index 7d08c9376..b0e326d1f 100644 --- a/doc/changelog.md +++ b/doc/changelog.md @@ -13,12 +13,12 @@ Jump to: Description +- Implement asynchronous notifications for shared data - Quick bug fix in _validate - Add helper methods to MLI classes - Update error handling for consistency - Parameterize installation of dragon package with `smart build` - Update docstrings -- Implement asynchronous notifications for shared data - Filenames conform to snake case - Update SmartSim environment variables using new naming convention - Refactor `exception_handler` diff --git a/ex/high_throughput_inference/mock_app.py b/ex/high_throughput_inference/mock_app.py index dcc52296e..31195c7e6 100644 --- a/ex/high_throughput_inference/mock_app.py +++ b/ex/high_throughput_inference/mock_app.py @@ -37,18 +37,10 @@ import argparse import io -import numpy -import os -import time + import torch -from mpi4py import MPI -from smartsim._core.mli.infrastructure.storage.dragon_feature_store import ( - DragonFeatureStore, -) -from smartsim._core.mli.message_handler import MessageHandler from smartsim.log import get_logger -from smartsim._core.utils.timings import PerfTimer torch.set_num_interop_threads(16) torch.set_num_threads(1) @@ -56,79 +48,15 @@ logger = get_logger("App") logger.info("Started app") 
-CHECK_RESULTS_AND_MAKE_ALL_SLOWER = False +from collections import OrderedDict -class ProtoClient: - def __init__(self, timing_on: bool): - comm = MPI.COMM_WORLD - rank = comm.Get_rank() - connect_to_infrastructure() - ddict_str = os.environ["_SMARTSIM_INFRA_BACKBONE"] - self._ddict = DDict.attach(ddict_str) - self._backbone_descriptor = DragonFeatureStore(self._ddict).descriptor - to_worker_fli_str = None - while to_worker_fli_str is None: - try: - to_worker_fli_str = self._ddict["to_worker_fli"] - self._to_worker_fli = fli.FLInterface.attach(to_worker_fli_str) - except KeyError: - time.sleep(1) - self._from_worker_ch = Channel.make_process_local() - self._from_worker_ch_serialized = self._from_worker_ch.serialize() - self._to_worker_ch = Channel.make_process_local() - - self.perf_timer: PerfTimer = PerfTimer(debug=False, timing_on=timing_on, prefix=f"a{rank}_") - - def run_model(self, model: bytes | str, batch: torch.Tensor): - tensors = [batch.numpy()] - self.perf_timer.start_timings("batch_size", batch.shape[0]) - built_tensor_desc = MessageHandler.build_tensor_descriptor( - "c", "float32", list(batch.shape) - ) - self.perf_timer.measure_time("build_tensor_descriptor") - if isinstance(model, str): - model_arg = MessageHandler.build_model_key(model, self._backbone_descriptor) - else: - model_arg = MessageHandler.build_model(model, "resnet-50", "1.0") - request = MessageHandler.build_request( - reply_channel=self._from_worker_ch_serialized, - model=model_arg, - inputs=[built_tensor_desc], - outputs=[], - output_descriptors=[], - custom_attributes=None, - ) - self.perf_timer.measure_time("build_request") - request_bytes = MessageHandler.serialize_request(request) - self.perf_timer.measure_time("serialize_request") - with self._to_worker_fli.sendh(timeout=None, stream_channel=self._to_worker_ch) as to_sendh: - to_sendh.send_bytes(request_bytes) - self.perf_timer.measure_time("send_request") - for tensor in tensors: - to_sendh.send_bytes(tensor.tobytes()) #TODO NOT FAST ENOUGH!!! - self.perf_timer.measure_time("send_tensors") - with self._from_worker_ch.recvh(timeout=None) as from_recvh: - resp = from_recvh.recv_bytes(timeout=None) - self.perf_timer.measure_time("receive_response") - response = MessageHandler.deserialize_response(resp) - self.perf_timer.measure_time("deserialize_response") - # list of data blobs? recv depending on the len(response.result.descriptors)? 
- data_blob: bytes = from_recvh.recv_bytes(timeout=None) - self.perf_timer.measure_time("receive_tensor") - result = torch.from_numpy( - numpy.frombuffer( - data_blob, - dtype=str(response.result.descriptors[0].dataType), - ) - ) - self.perf_timer.measure_time("deserialize_tensor") +from smartsim.log import get_logger, log_to_file +from smartsim.protoclient import ProtoClient - self.perf_timer.end_timings() - return result +logger = get_logger("App", "DEBUG") - def set_model(self, key: str, model: bytes): - self._ddict[key] = model +CHECK_RESULTS_AND_MAKE_ALL_SLOWER = False class ResNetWrapper: @@ -151,6 +79,7 @@ def model(self): def name(self): return self._name + if __name__ == "__main__": parser = argparse.ArgumentParser("Mock application") @@ -160,30 +89,38 @@ def name(self): resnet = ResNetWrapper("resnet50", f"resnet50.{args.device}.pt") - client = ProtoClient(timing_on=True) - client.set_model(resnet.name, resnet.model) + client = ProtoClient(timing_on=True, wait_timeout=0) + # client.set_model(resnet.name, resnet.model) if CHECK_RESULTS_AND_MAKE_ALL_SLOWER: # TODO: adapt to non-Nvidia devices torch_device = args.device.replace("gpu", "cuda") - pt_model = torch.jit.load(io.BytesIO(initial_bytes=(resnet.model))).to(torch_device) + pt_model = torch.jit.load(io.BytesIO(initial_bytes=(resnet.model))).to( + torch_device + ) TOTAL_ITERATIONS = 100 - for log2_bsize in range(args.log_max_batchsize+1): + for log2_bsize in range(args.log_max_batchsize + 1): b_size: int = 2**log2_bsize logger.info(f"Batch size: {b_size}") - for iteration_number in range(TOTAL_ITERATIONS + int(b_size==1)): + for iteration_number in range(TOTAL_ITERATIONS + int(b_size == 1)): logger.info(f"Iteration: {iteration_number}") sample_batch = resnet.get_batch(b_size) remote_result = client.run_model(resnet.name, sample_batch) logger.info(client.perf_timer.get_last("total_time")) if CHECK_RESULTS_AND_MAKE_ALL_SLOWER: local_res = pt_model(sample_batch.to(torch_device)) - err_norm = torch.linalg.vector_norm(torch.flatten(remote_result).to(torch_device)-torch.flatten(local_res), ord=1).cpu() + err_norm = torch.linalg.vector_norm( + torch.flatten(remote_result).to(torch_device) + - torch.flatten(local_res), + ord=1, + ).cpu() res_norm = torch.linalg.vector_norm(remote_result, ord=1).item() local_res_norm = torch.linalg.vector_norm(local_res, ord=1).item() - logger.info(f"Avg norm of error {err_norm.item()/b_size} compared to result norm of {res_norm/b_size}:{local_res_norm/b_size}") + logger.info( + f"Avg norm of error {err_norm.item()/b_size} compared to result norm of {res_norm/b_size}:{local_res_norm/b_size}" + ) torch.cuda.synchronize() - client.perf_timer.print_timings(to_file=True) \ No newline at end of file + client.perf_timer.print_timings(to_file=True) diff --git a/ex/high_throughput_inference/standalone_worker_manager.py b/ex/high_throughput_inference/standalone_worker_manager.py index feb1af1ae..e34df0ccd 100644 --- a/ex/high_throughput_inference/standalone_worker_manager.py +++ b/ex/high_throughput_inference/standalone_worker_manager.py @@ -37,6 +37,7 @@ from dragon.globalservices.api_setup import connect_to_infrastructure from dragon.managed_memory import MemoryPool from dragon.utils import b64decode, b64encode + # pylint enable=import-error # isort: off @@ -45,6 +46,7 @@ import argparse import base64 import multiprocessing as mp +import optparse import os import pickle import socket @@ -53,26 +55,24 @@ import typing as t import cloudpickle -import optparse -import os from smartsim._core.entrypoints.service 
import Service from smartsim._core.mli.comm.channel.channel import CommChannelBase from smartsim._core.mli.comm.channel.dragon_channel import DragonCommChannel from smartsim._core.mli.comm.channel.dragon_fli import DragonFLIChannel -from smartsim._core.mli.infrastructure.storage.dragon_feature_store import ( - DragonFeatureStore, -) from smartsim._core.mli.infrastructure.control.request_dispatcher import ( RequestDispatcher, ) from smartsim._core.mli.infrastructure.control.worker_manager import WorkerManager from smartsim._core.mli.infrastructure.environment_loader import EnvironmentConfigLoader +from smartsim._core.mli.infrastructure.storage.backbone_feature_store import ( + BackboneFeatureStore, +) from smartsim._core.mli.infrastructure.storage.dragon_feature_store import ( DragonFeatureStore, ) +from smartsim._core.mli.infrastructure.storage.feature_store import ReservedKeys from smartsim._core.mli.infrastructure.worker.worker import MachineLearningWorkerBase - from smartsim.log import get_logger logger = get_logger("Worker Manager Entry Point") @@ -85,7 +85,6 @@ logger.info(f"CPUS: {os.cpu_count()}") - def service_as_dragon_proc( service: Service, cpu_affinity: list[int], gpu_affinity: list[int] ) -> dragon_process.Process: @@ -108,8 +107,6 @@ def service_as_dragon_proc( ) - - if __name__ == "__main__": parser = argparse.ArgumentParser("Worker Manager") parser.add_argument( @@ -144,26 +141,24 @@ def service_as_dragon_proc( connect_to_infrastructure() ddict_str = os.environ["_SMARTSIM_INFRA_BACKBONE"] - ddict = DDict.attach(ddict_str) + + backbone = BackboneFeatureStore.from_descriptor(ddict_str) to_worker_channel = Channel.make_process_local() to_worker_fli = fli.FLInterface(main_ch=to_worker_channel, manager_ch=None) - to_worker_fli_serialized = to_worker_fli.serialize() - ddict["to_worker_fli"] = to_worker_fli_serialized + to_worker_fli_comm_channel = DragonFLIChannel(to_worker_fli, True) + + backbone.worker_queue = to_worker_fli_comm_channel.descriptor arg_worker_type = cloudpickle.loads( base64.b64decode(args.worker_class.encode("ascii")) ) - dfs = DragonFeatureStore(ddict) - comm_channel = DragonFLIChannel(to_worker_fli_serialized) - - descriptor = base64.b64encode(to_worker_fli_serialized).decode("utf-8") - os.environ["_SMARTSIM_REQUEST_QUEUE"] = descriptor + os.environ["_SMARTSIM_REQUEST_QUEUE"] = to_worker_fli_comm_channel.descriptor config_loader = EnvironmentConfigLoader( featurestore_factory=DragonFeatureStore.from_descriptor, - callback_factory=DragonCommChannel, + callback_factory=DragonCommChannel.from_descriptor, queue_factory=DragonFLIChannel.from_descriptor, ) diff --git a/smartsim/_core/launcher/dragon/dragonBackend.py b/smartsim/_core/launcher/dragon/dragonBackend.py index 7526af14a..0f8121ab5 100644 --- a/smartsim/_core/launcher/dragon/dragonBackend.py +++ b/smartsim/_core/launcher/dragon/dragonBackend.py @@ -26,6 +26,7 @@ import collections import functools import itertools +import multiprocessing as mp import time import typing as t from dataclasses import dataclass, field @@ -34,18 +35,27 @@ from tabulate import tabulate -# pylint: disable=import-error +# pylint: disable=import-error,C0302,R0915,R6301 # isort: off import dragon.data.ddict.ddict as dragon_ddict import dragon.infrastructure.connection as dragon_connection import dragon.infrastructure.policy as dragon_policy import dragon.infrastructure.process_desc as dragon_process_desc -import dragon.native.group_state as dragon_group_state + +# import dragon.native.group_state as dragon_group_state import 
dragon.native.process as dragon_process
import dragon.native.process_group as dragon_process_group
import dragon.native.machine as dragon_machine

from smartsim._core.launcher.dragon.pqueue import NodePrioritizer, PrioritizerFilter
+from smartsim._core.mli.comm.channel.dragon_channel import DragonCommChannel
+from smartsim._core.mli.infrastructure.storage.backbone_feature_store import (
+    BackboneFeatureStore,
+    EventBase,
+    # EventBroadcaster,
+    EventCategory,
+    EventConsumer,
+)

# pylint: enable=import-error
# isort: on
@@ -72,8 +82,8 @@

class DragonStatus(str, Enum):
-    ERROR = str(dragon_group_state.Error())
-    RUNNING = str(dragon_group_state.Running())
+    ERROR = "Error"  # str(dragon_group_state.Error())
+    RUNNING = "Running"  # str(dragon_group_state.Running())

    def __str__(self) -> str:
        return self.value
@@ -187,8 +197,15 @@ def __init__(self, pid: int) -> None:
            else 5
        )
        """Time in seconds needed to server to complete shutdown"""
-        self._infra_ddict: t.Optional[dragon_ddict.DDict] = None
-
+        self._backbone: t.Optional[BackboneFeatureStore] = None
+        """The backbone feature store"""
+        self._event_consumer: t.Optional[EventConsumer] = None
+        """A listener registered to listen for new consumers and update the shared
+        consumer registrations list"""
+        self._event_consumer_process: t.Optional[mp.Process] = None
+        """The process executing the event consumer's `listen` method"""
+
        self._nodes: t.List["dragon_machine.Node"] = []
        """Node capability information for hosts in the allocation"""
        self._hosts: t.List[str] = []
@@ -539,21 +556,113 @@ def _stop_steps(self) -> None:
                self._group_infos[step_id].status = SmartSimStatus.STATUS_CANCELLED
                self._group_infos[step_id].return_codes = [-9]

-    @property
-    def infra_ddict(self) -> str:
-        """Create a Dragon distributed dictionary and return its
-        serialized descriptor
+    def _create_backbone(self) -> BackboneFeatureStore:
+        """
+        Create a BackboneFeatureStore if one does not exist.
+
+        :returns: The backbone feature store instance
        """
-        if self._infra_ddict is None:
-            logger.info("Creating DDict")
-            self._infra_ddict = dragon_ddict.DDict(
+        if self._backbone is None:
+            logger.info("Creating backbone storage DDict")
+            backbone_storage = dragon_ddict.DDict(
                n_nodes=len(self._hosts), total_mem=len(self._hosts) * 1024**3
            )  # todo: parametrize
-            logger.info("Created DDict")
-            self._infra_ddict["creation"] = str(time.time())
-            logger.info(self._infra_ddict["creation"])
+            logger.info("Created backbone storage DDict")
+            self._backbone = BackboneFeatureStore(
+                backbone_storage, allow_reserved_writes=True
+            )
+            logger.info(self._backbone.creation_date)
+
+        return self._backbone
+
+    def _on_consumer_created(self, event: EventBase) -> None:
+        """Event handler invoked when a consumer-created event is received.
+
+        :param event: The received event"""
+        logger.warning(f"Unhandled event received: {event}")
+
+    def _bootstrap_event_listeners(
+        self, backbone: BackboneFeatureStore, consumer: EventConsumer
+    ) -> None:
+        """Update the list of notification channels registered in the backbone.
+
+        :param backbone: The backbone feature store to update
+        :param consumer: The consumer whose descriptor is added to the
+        notification list"""
+        # Copy the consumer list so a backend restart doesn't clear registrations
+        notify_descriptors = list(backbone.notification_channels)

+        # Update directly to avoid SEND/ACK pattern
+        notify_descriptors.append(consumer.descriptor)
+        # consumer.register()  # this will loop infinitely waiting for itself

+        backbone.notification_channels = notify_descriptors

+    def _create_eventing(self, backbone: BackboneFeatureStore) -> EventConsumer:
+        """
+        Create an event publisher and event consumer for communicating with
+        other MLI resources.

+        :param backbone: The backbone feature store used by the MLI backend. NOTE:
+        passing backbone as a parameter to ensure the backbone is initialized before
+        attempting to connect any eventing clients.
+        :returns: The newly created EventConsumer instance
+        """
+        # todo: create the backend EventBroadcaster here once a publisher is
+        # needed; ensure it uses DragonCommChannel.from_descriptor, not from_local

+        if self._event_consumer is None:
+            logger.info("Creating event consumer")
+            event_channel = DragonCommChannel.from_local()
+            consumer = EventConsumer(
+                event_channel,
+                backbone,
+                [EventCategory.CONSUMER_CREATED],
+                name="BackendConsumerRegistrar",
+                event_handler=self._on_consumer_created,
+            )

+            # todo: decide whether to publish consumer.descriptor via
+            # backbone.backend_channel or drop that extra channel entirely
+            # self._bootstrap_event_listeners(backbone, consumer)
+            self._event_consumer = consumer

+            # todo: launch the consumer via a dragon ProcessGroup/ProcessTemplate
+            # (passing **self._backbone.get_env()) instead of multiprocessing;
+            # see `_start_eventing_listeners`
+ # grp_consumer = dragon_process_group.ProcessGroup( + # restart=False, pmi_enabled=False + # ) + # self._event_consumer_process = dragon_process.ProcessTemplate( + # target=self._event_consumer.listen, + # # args=request.exe_args, + # # cwd=request.path, + # env={ + # # **request.current_env, + # # **request.env, + # **self._backbone.get_env(), + # }, + # stdout=dragon_process.Popen.PIPE, + # stderr=dragon_process.Popen.PIPE, + # # policy=local_policy, + # options=options, + # ) + # grp_consumer.add(self._event_consumer_process) + # # self._event_consumer_process = + # mp.Process(target=self._event_consumer.listen) + # # self._event_consumer_process.start() + # grp_consumer.init() + # grp_consumer.start() + + logger.info("Created event consumer") + + return self._event_consumer + + def _start_eventing_listeners(self) -> None: + if self._event_consumer: + self._event_consumer_process = mp.Process( + target=self._event_consumer.listen + ) + self._event_consumer_process.start() @staticmethod def create_run_policy( @@ -596,6 +705,9 @@ def create_run_policy( def _start_steps(self) -> None: self._heartbeat() + backbone = self._create_backbone() + self._create_eventing(backbone) + with self._queue_lock: started = [] for step_id, request in self._queued_steps.items(): @@ -622,7 +734,7 @@ def _start_steps(self) -> None: env={ **request.current_env, **request.env, - "_SMARTSIM_INFRA_BACKBONE": self.infra_ddict, + **backbone.get_env(), }, stdout=dragon_process.Popen.PIPE, stderr=dragon_process.Popen.PIPE, @@ -778,6 +890,9 @@ def _should_print_status(self) -> bool: def _update(self) -> None: """Trigger all update queries and update local state database""" + backbone = self._create_backbone() + self._create_eventing(backbone) + self._stop_steps() self._start_steps() self._refresh_statuses() diff --git a/smartsim/_core/mli/comm/channel/channel.py b/smartsim/_core/mli/comm/channel/channel.py index 9a12e4c8d..90d81cb9b 100644 --- a/smartsim/_core/mli/comm/channel/channel.py +++ b/smartsim/_core/mli/comm/channel/channel.py @@ -26,6 +26,7 @@ import base64 import typing as t +import uuid from abc import ABC, abstractmethod from smartsim.log import get_logger @@ -36,12 +37,19 @@ class CommChannelBase(ABC): """Base class for abstracting a message passing mechanism""" - def __init__(self, descriptor: t.Union[str, bytes]) -> None: + def __init__( + self, + descriptor: str, + name: t.Optional[str] = None, + ) -> None: """Initialize the CommChannel instance. :param descriptor: Channel descriptor """ self._descriptor = descriptor + """An opaque identifier used to connect to an underlying communication channel""" + self._name = name or str(uuid.uuid4()) + """A user-friendly identifier for channel-related logging""" @abstractmethod def send(self, value: bytes, timeout: float = 0) -> None: @@ -61,11 +69,19 @@ def recv(self, timeout: float = 0) -> t.List[bytes]: """ @property - def descriptor(self) -> bytes: + def descriptor(self) -> str: """Return the channel descriptor for the underlying dragon channel. 
diff --git a/smartsim/_core/mli/comm/channel/channel.py b/smartsim/_core/mli/comm/channel/channel.py
index 9a12e4c8d..90d81cb9b 100644
--- a/smartsim/_core/mli/comm/channel/channel.py
+++ b/smartsim/_core/mli/comm/channel/channel.py
@@ -26,6 +26,7 @@

import base64
import typing as t
+import uuid
from abc import ABC, abstractmethod

from smartsim.log import get_logger
@@ -36,12 +37,19 @@
class CommChannelBase(ABC):
    """Base class for abstracting a message passing mechanism"""

-    def __init__(self, descriptor: t.Union[str, bytes]) -> None:
+    def __init__(
+        self,
+        descriptor: str,
+        name: t.Optional[str] = None,
+    ) -> None:
        """Initialize the CommChannel instance.

        :param descriptor: Channel descriptor
+        :param name: A user-friendly name used in channel-related logging
        """
        self._descriptor = descriptor
+        """An opaque identifier used to connect to an underlying communication channel"""
+        self._name = name or str(uuid.uuid4())
+        """A user-friendly identifier for channel-related logging"""

    @abstractmethod
    def send(self, value: bytes, timeout: float = 0) -> None:
@@ -61,11 +69,19 @@ def recv(self, timeout: float = 0) -> t.List[bytes]:
        """

    @property
-    def descriptor(self) -> bytes:
+    def descriptor(self) -> str:
        """Return the channel descriptor for the underlying dragon channel.

-        :returns: Byte encoded channel descriptor
+        :returns: String-encoded channel descriptor
        """
-        if isinstance(self._descriptor, str):
-            return base64.b64decode(self._descriptor.encode("utf-8"))
        return self._descriptor
+
+    @property
+    def decoded_descriptor(self) -> bytes:
+        """Return the descriptor decoded from a string into bytes"""
+        return base64.b64decode(self._descriptor.encode("utf-8"))
+
+    def __str__(self) -> str:
+        """Build a string representation of the channel useful for printing"""
+        classname = type(self).__name__
+        return f"{classname}('{self._name}', '{self._descriptor}')"
diff --git a/smartsim/_core/mli/comm/channel/dragon_channel.py b/smartsim/_core/mli/comm/channel/dragon_channel.py
index 1363c0d67..a22ebe952 100644
--- a/smartsim/_core/mli/comm/channel/dragon_channel.py
+++ b/smartsim/_core/mli/comm/channel/dragon_channel.py
@@ -130,15 +130,17 @@ def recv(self, timeout: float = 0.001) -> t.List[bytes]:
        with self._channel.recvh(timeout=timeout) as recvh:
            messages: t.List[bytes] = []

+            # todo: consider that this could (under load) never exit. do we need
+            # to configure a maximum number to pull at once?
            try:
                message_bytes = recvh.recv_bytes(timeout=timeout)
                messages.append(message_bytes)
-                logger.debug(f"DragonCommChannel {self.descriptor!r} received message")
+                logger.debug(f"DragonCommChannel {self.descriptor} received message")
            except dch.ChannelEmpty:
                # emptied the queue, ok to swallow this ex
-                logger.debug(f"DragonCommChannel exhausted: {self.descriptor!r}")
+                logger.debug(f"DragonCommChannel exhausted: {self.descriptor}")
            except dch.ChannelRecvTimeout as ex:
-                logger.debug(f"Timeout exceeded on channel.recv: {self.descriptor!r}")
+                logger.debug(f"Timeout exceeded on channel.recv: {self.descriptor}")

            return messages

@@ -169,8 +171,7 @@ def from_descriptor(
        :param descriptor: The descriptor that uniquely identifies the resource.
        Output from `descriptor_string` is correctly encoded.
        :returns: An attached DragonCommChannel
-        :raises SmartSimError: If creation of comm channel fails
-        """
+        :raises SmartSimError: If creation of comm channel fails"""
        try:
            utf8_descriptor: t.Union[str, bytes] = descriptor
            if isinstance(descriptor, str):
@@ -186,3 +187,15 @@
            raise SmartSimError(
                f"Failed to create dragon comm channel: {descriptor!r}"
            ) from ex
+
+    @classmethod
+    def from_local(cls, _descriptor: t.Optional[str] = None) -> "DragonCommChannel":
+        """A factory method that creates a local channel instance
+
+        :returns: An attached DragonCommChannel"""
+        try:
+            channel = dch.Channel.make_process_local()
+            return DragonCommChannel(channel)
+        except Exception:
+            logger.error("Failed to create local dragon comm channel", exc_info=True)
+            raise
diff --git a/smartsim/_core/mli/comm/channel/dragon_fli.py b/smartsim/_core/mli/comm/channel/dragon_fli.py
index 84d809c8a..325f6b779 100644
--- a/smartsim/_core/mli/comm/channel/dragon_fli.py
+++ b/smartsim/_core/mli/comm/channel/dragon_fli.py
@@ -50,7 +50,7 @@ class DragonFLIChannel(cch.CommChannelBase):

    def __init__(
        self,
-        fli_desc: bytes,
+        fli_: fli.FLInterface,
        sender_supplied: bool = True,
        buffer_size: int = 0,
    ) -> None:
@@ -60,9 +60,11 @@
        :param sender_supplied: Flag indicating if the FLI uses sender-supplied streams
        :param buffer_size: Maximum number of sent messages that can be buffered
        """
-        super().__init__(fli_desc)
-        self._fli: "fli" = fli.FLInterface.attach(fli_desc)
-        self._channel: t.Optional["dch"] = (
+        descriptor = base64.b64encode(fli_.serialize()).decode("utf-8")
+        super().__init__(descriptor)
+
+        self._fli = fli_
+        self._channel: t.Optional["dch.Channel"] = (
            create_local(buffer_size) if sender_supplied else None
        )

@@ -107,6 +109,33 @@ def recv(self, timeout: float = 0.001) -> t.List[bytes]:
                ) from e
        return messages

+    @classmethod
+    def _string_descriptor_to_fli(cls, descriptor: str) -> "fli.FLInterface":
+        """Helper method to convert a string-safe, encoded descriptor back
+        into its original byte format and attach to the underlying FLI"""
+        descriptor_ = base64.b64decode(descriptor.encode("utf-8"))
+        return fli.FLInterface.attach(descriptor_)
+
+    @classmethod
+    def from_sender_supplied_descriptor(
+        cls,
+        descriptor: str,
+    ) -> "DragonFLIChannel":
+        """A factory method that creates an instance from a descriptor string
+
+        :param descriptor: the descriptor of the main FLI channel to attach
+        :returns: An attached DragonFLIChannel"""
+        try:
+            return DragonFLIChannel(
+                fli_=cls._string_descriptor_to_fli(descriptor),
+                sender_supplied=True,
+            )
+        except Exception:
+            logger.error(
+                f"Error while creating sender supplied DragonFLIChannel: {descriptor}"
+            )
+            raise
+
    @classmethod
    def from_descriptor(
        cls,
@@ -118,10 +147,13 @@
        :returns: An attached DragonFLIChannel
        :raises SmartSimError: If creation of DragonFLIChannel fails
        """
+        if not descriptor:
+            raise ValueError("Invalid descriptor provided")
+
        try:
            return DragonFLIChannel(
-                fli_desc=base64.b64decode(descriptor),
-                sender_supplied=True,
+                fli_=cls._string_descriptor_to_fli(descriptor),
+                sender_supplied=False,
            )
        except Exception as e:
            raise SmartSimError(
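# A minimal round-trip sketch for the descriptor handling above (illustrative
# only; assumes a running dragon runtime and an existing `fli_` FLInterface):
#
#     channel = DragonFLIChannel(fli_, sender_supplied=True)
#     descriptor = channel.descriptor  # base64 string, safe to store in the backbone
#     attached = DragonFLIChannel.from_sender_supplied_descriptor(descriptor)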
diff --git a/smartsim/_core/mli/infrastructure/control/error_handling.py b/smartsim/_core/mli/infrastructure/control/error_handling.py
index 8961cac54..a75f533a3 100644
--- a/smartsim/_core/mli/infrastructure/control/error_handling.py
+++ b/smartsim/_core/mli/infrastructure/control/error_handling.py
@@ -48,7 +48,7 @@ def build_failure_reply(status: "Status", message: str) -> ResponseBuilder:
    return MessageHandler.build_response(
        status=status,
        message=message,
-        result=[],
+        result=None,
        custom_attributes=None,
    )
diff --git a/smartsim/_core/mli/infrastructure/environment_loader.py b/smartsim/_core/mli/infrastructure/environment_loader.py
index 02043fbd8..e67cc469a 100644
--- a/smartsim/_core/mli/infrastructure/environment_loader.py
+++ b/smartsim/_core/mli/infrastructure/environment_loader.py
@@ -42,7 +42,7 @@ class EnvironmentConfigLoader:
    def __init__(
        self,
        featurestore_factory: t.Callable[[str], FeatureStore],
-        callback_factory: t.Callable[[bytes], CommChannelBase],
+        callback_factory: t.Callable[[str], CommChannelBase],
        queue_factory: t.Callable[[str], CommChannelBase],
    ) -> None:
        """Initialize the config loader instance with the factories necessary for
diff --git a/smartsim/_core/mli/infrastructure/storage/backbone_feature_store.py b/smartsim/_core/mli/infrastructure/storage/backbone_feature_store.py
index b6655bded..0db41f77a 100644
--- a/smartsim/_core/mli/infrastructure/storage/backbone_feature_store.py
+++ b/smartsim/_core/mli/infrastructure/storage/backbone_feature_store.py
@@ -24,7 +24,9 @@
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

+import base64
import enum
+import itertools
import pickle
import time
import typing as t
@@ -39,6 +41,7 @@
# isort: on

from smartsim._core.mli.comm.channel.channel import CommChannelBase
+from smartsim._core.mli.comm.channel.dragon_channel import DragonCommChannel
from smartsim._core.mli.infrastructure.storage.dragon_feature_store import (
    DragonFeatureStore,
)
@@ -48,6 +51,14 @@
logger = get_logger(__name__)

+def byte_descriptor_to_string(descriptor: bytes) -> str:
+    """Encode a byte descriptor into a string-safe base64 form"""
+    return base64.b64encode(descriptor).decode("utf-8")
+
+
+def string_descriptor_to_byte(descriptor: str) -> bytes:
+    """Decode a base64 string descriptor back into its byte form"""
+    return base64.b64decode(descriptor.encode("utf-8"))
+
+
# todo: did i create an arms race where a developer just grabs the backbone
# and passes it wherever they need a FeatureStore?
class BackboneFeatureStore(DragonFeatureStore):
    """A DragonFeatureStore wrapper with utility methods for accessing the MLI
    information stored in the MLI backbone feature store."""

    MLI_NOTIFY_CONSUMERS = "_SMARTSIM_MLI_NOTIFY_CONSUMERS"
+    MLI_BACKEND_CONSUMER = "_SMARTSIM_MLI_BACKEND_CONSUMER"
+    MLI_WORKER_QUEUE = "to_worker_fli"
+    MLI_BACKBONE = "_SMARTSIM_INFRA_BACKBONE"
+    _CREATED_ON = "creation"

    def __init__(
-        self, storage: "dragon_ddict.DDict", allow_reserved_writes: bool = False
+        self,
+        storage: "dragon_ddict.DDict",
+        allow_reserved_writes: bool = False,
    ) -> None:
-        """Initialize the DragonFeatureStore instance.
+        """Initialize the BackboneFeatureStore instance.
@@ -68,6 +85,17 @@
        super().__init__(storage)
        self._enable_reserved_writes = allow_reserved_writes
+        self._wait_timeout = 0.0
+        """Maximum time (in seconds) to wait when retrieving multiple keys"""

+        if self._CREATED_ON not in self:
+            self._record_creation_date()
+
+    @property
+    def wait_timeout(self) -> float:
+        """The timeout (in seconds) applied when waiting for multiple keys
+
+        :returns: The current timeout value"""
+        return self._wait_timeout
+
+    @wait_timeout.setter
+    def wait_timeout(self, value: float) -> None:
+        """Set the timeout applied when waiting for multiple keys
+
+        :param value: The new timeout value, in seconds"""
+        self._wait_timeout = value
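    # Typical round trip (sketch): the backend exports the backbone descriptor
    # through `get_env()` and a launched process re-attaches with write access:
    #
    #     env = backbone.get_env()  # {"_SMARTSIM_INFRA_BACKBONE": <descriptor>}
    #     attached = BackboneFeatureStore.from_writable_descriptor(
    #         env[BackboneFeatureStore.MLI_BACKBONE]
    #     )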
    @property
    def notification_channels(self) -> t.Sequence[str]:
        """Retrieve descriptors for all registered MLI notification channels.
@@ -87,6 +115,135 @@ def notification_channels(self, values: t.Sequence[str]) -> None:
        """
        self[self.MLI_NOTIFY_CONSUMERS] = ",".join([str(value) for value in values])

+    @property
+    def backend_channel(self) -> t.Optional[str]:
+        """Retrieve the channel descriptor exposed by the MLI backend for events
+
+        :returns: a stringified channel descriptor"""
+        if self.MLI_BACKEND_CONSUMER in self:
+            return str(self[self.MLI_BACKEND_CONSUMER])
+        return None
+
+    @backend_channel.setter
+    def backend_channel(self, value: str) -> None:
+        """Set the channel exposed by the MLI backend for events
+
+        :param value: a stringified channel descriptor"""
+        self[self.MLI_BACKEND_CONSUMER] = value
+
+    @property
+    def worker_queue(self) -> t.Optional[str]:
+        """Retrieve the channel descriptor exposed by the MLI
+        backend to send work to an MLI worker manager instance
+
+        :returns: a stringified channel descriptor"""
+        if self.MLI_WORKER_QUEUE in self:
+            return str(self[self.MLI_WORKER_QUEUE])
+        return None
+
+    @worker_queue.setter
+    def worker_queue(self, value: str) -> None:
+        """Set the channel descriptor exposed by the MLI
+        backend to send work to an MLI worker manager instance
+
+        :param value: a stringified channel descriptor"""
+        self[self.MLI_WORKER_QUEUE] = value
+
+    @property
+    def creation_date(self) -> str:
+        """Return the creation date for the backbone feature store"""
+        return str(self[self._CREATED_ON])
+
+    def _record_creation_date(self) -> None:
+        """Write the creation timestamp to the feature store"""
+        if self._CREATED_ON not in self:
+            if not self._allow_reserved_writes:
+                logger.warning(
+                    "Recorded creation from a write-protected backbone instance"
+                )
+            self[self._CREATED_ON] = str(time.time())
+
+    @classmethod
+    def from_writable_descriptor(
+        cls,
+        descriptor: str,
+    ) -> "BackboneFeatureStore":
+        """A factory method that creates an instance from a descriptor string
+
+        :param descriptor: The descriptor that uniquely identifies the resource
+        :returns: An attached BackboneFeatureStore
+        :raises SmartSimError: if attachment to the backbone feature store fails"""
+        try:
+            return BackboneFeatureStore(dragon_ddict.DDict.attach(descriptor), True)
+        except Exception as ex:
+            logger.error(f"Error creating dragon feature store: {descriptor}")
+            raise SmartSimError(
+                f"Error creating dragon feature store: {descriptor}"
+            ) from ex

    @staticmethod
    def _check_wait_timeout(
        start_time: float, timeout: float, indicators: t.Dict[str, bool]
    ) -> None:
        """Perform timeout verification

        :param start_time: the start time to use for elapsed calculation
        :param timeout: the timeout (in seconds)
        :param indicators: latest retrieval status for requested keys
        :raises SmartSimError: if the timeout has elapsed"""
        elapsed = time.time() - start_time
        if timeout and elapsed > timeout:
            raise SmartSimError(
                f"Timeout retrieving all keys from backbone: {indicators}"
            )

    def wait_for(
        self, keys: t.List[str], timeout: float = 0
    ) -> t.Dict[str, t.Union[str, bytes, None]]:
        """Perform a blocking wait until all specified keys have been found
        in the backbone

        :param keys: The required collection of keys to retrieve
        :param timeout: The maximum wait time in seconds. Overrides class level setting
        :returns: A mapping of each requested key to its retrieved value
        :raises SmartSimError: if the timeout elapses before all keys are found
        """

        to_check = list(keys)
        was_found = [False for _ in to_check]  # todo: add a test ensuring duplicate keys are handled
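        # Poll the backbone with a cycling backoff (0.1s up to 8s) rather than a
        # fixed interval: early retries stay responsive while later retries avoid
        # hammering the dictionary when keys are slow to appear.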
        values: t.List[t.Union[str, bytes, None]] = [None for _ in to_check]

        backoff: t.List[float] = [0.1, 0.5, 1, 2, 4, 8]
        backoff_iter = itertools.cycle(backoff)
        start_time = time.time()

        while not all(was_found):
            delay = next(backoff_iter)

            for index, key in enumerate(to_check):
                if was_found[index]:
                    continue

                try:
                    values[index] = self[key]
                    was_found[index] = True
                except KeyError:
                    if delay == backoff[-1]:
                        logger.debug(f"Re-attempting `{key}` retrieval in {delay}s")

            if all(was_found):
                continue

            self._check_wait_timeout(
                start_time, timeout, dict(zip(to_check, was_found))
            )

            time.sleep(delay)

        return dict(zip(keys, values))

    def get_env(self) -> t.Dict[str, str]:
        """Returns a dictionary populated with environment variables necessary to
        connect a process to the existing backbone instance."""
        return {self.MLI_BACKBONE: self.descriptor}


class EventCategory(str, enum.Enum):
    """Predefined event types raised by SmartSim backend."""
@@ -126,21 +283,26 @@
class OnCreateConsumer(EventBase):

    descriptor: str
    """Descriptor of the comm channel exposed by the consumer"""
+    filters: t.List[EventCategory]
+    """The collection of filters indicating messages of interest to this consumer"""

-    def __init__(self, descriptor: str) -> None:
+    def __init__(self, descriptor: str, filters: t.Sequence[EventCategory]) -> None:
        """Initialize the OnCreateConsumer event.

        :param descriptor: Descriptor of the comm channel exposed by the consumer
+        :param filters: Collection of filters indicating messages of interest
        """
        super().__init__(EventCategory.CONSUMER_CREATED, str(uuid.uuid4()))
        self.descriptor = descriptor
+        self.filters = list(filters)

    def __str__(self) -> str:
        """Convert the event to a string.

        :returns: A string representation of this instance
        """
-        return f"{str(super())}|{self.descriptor}"
+        _filters = ",".join(self.filters)
+        return f"{str(super())}|{self.descriptor}|{_filters}"


class OnWriteFeatureStore(EventBase):
@@ -181,6 +343,36 @@ def send(self, event: EventBase, timeout: float = 0.001) -> int:
        """


+class EventSender:
+    """An event publisher that performs publishing of system events to a
+    single endpoint"""
+
+    def __init__(
+        self,
+        backbone: BackboneFeatureStore,
+        channel: t.Optional[CommChannelBase],
+    ) -> None:
+        """Initialize the EventSender instance.
+
+        :param backbone: The backbone feature store used for configuration
+        :param channel: The comm channel the sender publishes to"""
+        self._backbone = backbone
+        self._channel: t.Optional[CommChannelBase] = channel
+
+    def send(self, event: EventBase) -> int:
+        """Serialize and send an event to the configured endpoint.
+
+        :param event: The event to send
+        :returns: The number of events sent
+        :raises SmartSimError: if no channel is configured or the send fails"""
+        if self._channel is None:
+            raise SmartSimError("No channel to send on")
+        num_sent = 0
+
+        try:
+            event_bytes = bytes(event)
+            self._channel.send(event_bytes)
+            num_sent += 1
+        except Exception as ex:
+            raise SmartSimError(f"Failed broadcast to channel: {self._channel}") from ex
+
+        return num_sent
+
+
class EventBroadcaster:
    """Performs fan-out publishing of system events."""
@@ -353,6 +545,8 @@ def __init__(
        backbone: BackboneFeatureStore,
        filters: t.Optional[t.List[EventCategory]] = None,
        batch_timeout: t.Optional[float] = None,
+        name: t.Optional[str] = None,
+        event_handler: t.Optional[t.Callable[[EventBase], None]] = None,
    ) -> None:
        """Initialize the EventConsumer instance.
@@ -371,6 +565,15 @@ def __init__(
        self._backbone = backbone
        self._global_filters = filters or []
        self._global_timeout = batch_timeout or 1.0
+        self._name = name
+        """A user-friendly name for logging and registration"""
+        self._event_handler = event_handler
+        """An optional callback invoked for each received event"""
+
+    @property
+    def descriptor(self) -> str:
+        """The descriptor of the underlying comm channel where events are received
+
+        :returns: The comm channel descriptor"""
+        return self._comm_channel.descriptor

    def receive(
        self, filters: t.Optional[t.List[EventCategory]] = None, timeout: float = 0
@@ -417,3 +620,45 @@ def receive(
                break

        return messages
+
+    def register(self) -> t.Generator[bool, None, None]:
+        """Send an event to register this consumer as a listener.
+
+        Yields `False` while registration is pending and `True` once the
+        consumer's descriptor appears in the backbone registration list."""
+        awaiting_confirmation = True
+        descriptor = self._comm_channel.descriptor
+        backoffs = itertools.cycle((0.1, 0.5, 1.0, 2.0, 4.0, 8.0))
+        event = OnCreateConsumer(descriptor, self._global_filters)
+
+        # we're going to sit in this loop to wait for the backbone to get
+        # updated with the registration (to avoid SEND/ACK)
+        while awaiting_confirmation:
+            registered_channels = self._backbone.notification_channels
+            # todo: standardize on string descriptors across the channel ABC
+            if descriptor in registered_channels:
+                awaiting_confirmation = False
+
+            yield not awaiting_confirmation
+            time.sleep(next(backoffs))
+
+        # todo: optionally notify the backend directly by sending this event
+        # through an EventSender attached to `self._backbone.backend_channel`
+
+        # broadcast that this consumer is now ready to mingle
+        publisher = EventBroadcaster(self._backbone, DragonCommChannel.from_local)
+        publisher.send(event, timeout=0.1)
+
+    def listen(self) -> None:
+        """Receive events continuously, dispatching each one to the
+        registered event handler."""
+        logger.debug("Starting event listener")
+
+        while True:
+            logger.debug("Awaiting new message")
+            incoming_messages = self.receive()
+            for message in incoming_messages:
+                if self._event_handler:
+                    self._event_handler(message)
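# A minimal consumer wiring sketch (illustrative only; mirrors the backend usage
# in `DragonBackend._create_eventing` and assumes an attached backbone):
#
#     channel = DragonCommChannel.from_local()
#     consumer = EventConsumer(
#         channel,
#         backbone,
#         [EventCategory.CONSUMER_CREATED],
#         name="ExampleConsumer",
#         event_handler=lambda event: logger.info(f"received: {event}"),
#     )
#     for confirmed in consumer.register():  # yields until registration is visible
#         if confirmed:
#             break
#     consumer.listen()  # blocks, dispatching events to the handler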
diff --git a/smartsim/_core/mli/infrastructure/storage/dragon_feature_store.py b/smartsim/_core/mli/infrastructure/storage/dragon_feature_store.py
index d7b37ffe6..0256b1a51 100644
--- a/smartsim/_core/mli/infrastructure/storage/dragon_feature_store.py
+++ b/smartsim/_core/mli/infrastructure/storage/dragon_feature_store.py
@@ -46,13 +46,14 @@ def __init__(self, storage: "dragon_ddict.DDict") -> None:
        """Initialize the DragonFeatureStore instance.

        :param storage: A distributed dictionary to be used as the underlying
-        storage mechanism of the feature store
-        """
+        storage mechanism of the feature store"""
        if isinstance(storage, dragon_ddict.DDict):
            descriptor = str(storage.serialize())
        else:
            descriptor = "not-set"

+        # todo: follow up and ensure this descriptor is also encoded/decoded
+        # in a string-safe way here & in `from_descriptor`
        super().__init__(descriptor)
        self._storage: t.Dict[str, t.Union[str, bytes]] = storage

@@ -97,7 +98,8 @@ def from_descriptor(
        :raises SmartSimError: If attachment to DragonFeatureStore fails
        """
        try:
-            return DragonFeatureStore(dragon_ddict.DDict.attach(descriptor))
+            logger.debug(f"Attaching to FeatureStore with descriptor: {descriptor}")
+            return cls(dragon_ddict.DDict.attach(descriptor))
        except Exception as ex:
            logger.error(f"Error creating dragon feature store: {descriptor}")
            raise SmartSimError(
diff --git a/smartsim/_core/mli/infrastructure/storage/feature_store.py b/smartsim/_core/mli/infrastructure/storage/feature_store.py
index a55c52305..ac6cdaf31 100644
--- a/smartsim/_core/mli/infrastructure/storage/feature_store.py
+++ b/smartsim/_core/mli/infrastructure/storage/feature_store.py
@@ -43,6 +43,14 @@ class ReservedKeys(str, enum.Enum):
    """Storage location for the list of registered consumers that will receive
    events from an EventBroadcaster"""

+    MLI_BACKEND_CONSUMER = "_SMARTSIM_MLI_BACKEND_CONSUMER"
+    """Storage location for the channel used to send messages directly to
+    the MLI backend"""
+
+    MLI_WORKER_QUEUE = "to_worker_fli"  # todo: ensure this adheres to standard
+    """Storage location for the channel used to send work requests
+    to the available worker managers"""
+
    @classmethod
    def contains(cls, value: str) -> bool:
        """Convert a string representation into an enumeration member.
@@ -59,7 +67,27 @@

@dataclass(frozen=True)
-class FeatureStoreKey:
+class TensorKey:
    """A key,descriptor pair enabling retrieval of an item from a feature store."""

    key: str
    """The unique key of an item in a feature store"""
    descriptor: str
    """The unique identifier of the feature store containing the key"""

    def __post_init__(self) -> None:
        """Ensure the key and descriptor have at least one character.
+ + :raises ValueError: If key or descriptor are empty strings + """ + if len(self.key) < 1: + raise ValueError("Key must have at least one character.") + if len(self.descriptor) < 1: + raise ValueError("Descriptor must have at least one character.") + + +@dataclass(frozen=True) +class ModelKey: """A key,descriptor pair enabling retrieval of an item from a feature store.""" key: str diff --git a/smartsim/_core/mli/infrastructure/worker/worker.py b/smartsim/_core/mli/infrastructure/worker/worker.py index 530d25154..ac1a14866 100644 --- a/smartsim/_core/mli/infrastructure/worker/worker.py +++ b/smartsim/_core/mli/infrastructure/worker/worker.py @@ -39,17 +39,16 @@ from ...comm.channel.channel import CommChannelBase from ...message_handler import MessageHandler from ...mli_schemas.model.model_capnp import Model -from ..storage.feature_store import FeatureStore, FeatureStoreKey +from ..storage.feature_store import FeatureStore, ModelKey, TensorKey if t.TYPE_CHECKING: - from smartsim._core.mli.mli_schemas.data.data_references_capnp import TensorKey from smartsim._core.mli.mli_schemas.response.response_capnp import Status from smartsim._core.mli.mli_schemas.tensor.tensor_capnp import TensorDescriptor logger = get_logger(__name__) # Placeholder -ModelIdentifier = FeatureStoreKey +ModelIdentifier = ModelKey class InferenceRequest: @@ -57,12 +56,12 @@ class InferenceRequest: def __init__( self, - model_key: t.Optional[FeatureStoreKey] = None, + model_key: t.Optional[ModelKey] = None, callback: t.Optional[CommChannelBase] = None, raw_inputs: t.Optional[t.List[bytes]] = None, - input_keys: t.Optional[t.List[FeatureStoreKey]] = None, + input_keys: t.Optional[t.List[TensorKey]] = None, input_meta: t.Optional[t.List[t.Any]] = None, - output_keys: t.Optional[t.List[FeatureStoreKey]] = None, + output_keys: t.Optional[t.List[TensorKey]] = None, raw_model: t.Optional[Model] = None, batch_size: int = 0, ): @@ -153,7 +152,7 @@ class InferenceReply: def __init__( self, outputs: t.Optional[t.Collection[t.Any]] = None, - output_keys: t.Optional[t.Collection[FeatureStoreKey]] = None, + output_keys: t.Optional[t.Collection[TensorKey]] = None, status_enum: "Status" = "running", message: str = "In progress", ) -> None: @@ -166,7 +165,7 @@ def __init__( """ self.outputs: t.Collection[t.Any] = outputs or [] """List of output data""" - self.output_keys: t.Collection[t.Optional[FeatureStoreKey]] = output_keys or [] + self.output_keys: t.Collection[t.Optional[TensorKey]] = output_keys or [] """List of keys used for output data""" self.status_enum = status_enum """Status of the reply""" @@ -320,7 +319,7 @@ class RequestBatch: """List of InferenceRequests in the batch""" inputs: t.Optional[TransformInputResult] """Transformed batch of input tensors""" - model_id: ModelIdentifier + model_id: "ModelIdentifier" """Model (key, descriptor) tuple""" @property @@ -350,7 +349,7 @@ def raw_model(self) -> t.Optional[t.Any]: return None @property - def input_keys(self) -> t.List[FeatureStoreKey]: + def input_keys(self) -> t.List[TensorKey]: """All input keys available in this batch's requests. :returns: All input keys belonging to requests in this batch""" @@ -361,7 +360,7 @@ def input_keys(self) -> t.List[FeatureStoreKey]: return keys @property - def output_keys(self) -> t.List[FeatureStoreKey]: + def output_keys(self) -> t.List[TensorKey]: """All output keys available in this batch's requests. 
:returns: All output keys belonging to requests in this batch""" @@ -378,7 +377,7 @@ class MachineLearningWorkerCore: @staticmethod def deserialize_message( data_blob: bytes, - callback_factory: t.Callable[[bytes], CommChannelBase], + callback_factory: t.Callable[[str], CommChannelBase], ) -> InferenceRequest: """Deserialize a message from a byte stream into an InferenceRequest. @@ -388,27 +387,27 @@ def deserialize_message( :returns: The raw input message deserialized into an InferenceRequest """ request = MessageHandler.deserialize_request(data_blob) - model_key: t.Optional[FeatureStoreKey] = None + model_key: t.Optional[ModelKey] = None model_bytes: t.Optional[Model] = None if request.model.which() == "key": - model_key = FeatureStoreKey( + model_key = ModelKey( key=request.model.key.key, - descriptor=request.model.key.featureStoreDescriptor, + descriptor=request.model.key.descriptor, ) elif request.model.which() == "data": model_bytes = request.model.data callback_key = request.replyChannel.descriptor comm_channel = callback_factory(callback_key) - input_keys: t.Optional[t.List[FeatureStoreKey]] = None + input_keys: t.Optional[t.List[TensorKey]] = None input_bytes: t.Optional[t.List[bytes]] = None - output_keys: t.Optional[t.List[FeatureStoreKey]] = None + output_keys: t.Optional[t.List[TensorKey]] = None input_meta: t.Optional[t.List[TensorDescriptor]] = None if request.input.which() == "keys": input_keys = [ - FeatureStoreKey(key=value.key, descriptor=value.featureStoreDescriptor) + TensorKey(key=value.key, descriptor=value.descriptor) for value in request.input.keys ] elif request.input.which() == "descriptors": @@ -416,7 +415,7 @@ def deserialize_message( if request.output: output_keys = [ - FeatureStoreKey(key=value.key, descriptor=value.featureStoreDescriptor) + TensorKey(key=value.key, descriptor=value.descriptor) for value in request.output ] @@ -545,7 +544,7 @@ def place_output( request: InferenceRequest, transform_result: TransformOutputResult, feature_stores: t.Dict[str, FeatureStore], - ) -> t.Collection[t.Optional[FeatureStoreKey]]: + ) -> t.Collection[t.Optional[TensorKey]]: """Given a collection of data, make it available as a shared resource in the feature store. @@ -558,7 +557,7 @@ def place_output( if not feature_stores: raise ValueError("Feature store is required for output persistence") - keys: t.List[t.Optional[FeatureStoreKey]] = [] + keys: t.List[t.Optional[TensorKey]] = [] # need to decide how to get back to original sub-batch inputs so they can be # accurately placed, datum might need to include this. diff --git a/smartsim/_core/mli/message_handler.py b/smartsim/_core/mli/message_handler.py index 71def143a..d7324e4a4 100644 --- a/smartsim/_core/mli/message_handler.py +++ b/smartsim/_core/mli/message_handler.py @@ -73,7 +73,7 @@ def build_output_tensor_descriptor( order, data type, and dimensions. 
        :param order: Order of the tensor, such as row-major (c) or column-major (f)
-        :param keys: List of TensorKeys to apply transorm descriptor to
+        :param keys: List of TensorKey objects the transform descriptor applies to
        :param data_type: Transform data type of the tensor
        :param dimensions: Transform dimensions of the tensor
        :returns: The OutputDescriptor
@@ -92,14 +92,12 @@
        return description

    @staticmethod
-    def build_tensor_key(
-        key: str, feature_store_descriptor: str
-    ) -> data_references_capnp.TensorKey:
+    def build_tensor_key(key: str, descriptor: str) -> data_references_capnp.TensorKey:
        """
        Builds a new TensorKey message with the provided key.

        :param key: String to set the TensorKey
-        :param feature_store_descriptor: A descriptor identifying the feature store
+        :param descriptor: A descriptor identifying the feature store
        containing the key
        :returns: The TensorKey
        :raises ValueError: If building fails
        """
        try:
            tensor_key = data_references_capnp.TensorKey.new_message()
            tensor_key.key = key
-            tensor_key.featureStoreDescriptor = feature_store_descriptor
+            tensor_key.descriptor = descriptor
        except Exception as e:
            raise ValueError("Error building tensor key.") from e
        return tensor_key
@@ -133,14 +131,12 @@ def build_model(data: bytes, name: str, version: str) -> model_capnp.Model:
        return model

    @staticmethod
-    def build_model_key(
-        key: str, feature_store_descriptor: str
-    ) -> data_references_capnp.ModelKey:
+    def build_model_key(key: str, descriptor: str) -> data_references_capnp.ModelKey:
        """
        Builds a new ModelKey message with the provided key.

        :param key: String to set the ModelKey
-        :param feature_store_descriptor: A descriptor identifying the feature store
+        :param descriptor: A descriptor identifying the feature store
        containing the key
        :returns: The ModelKey
        :raises ValueError: If building fails
        """
        try:
            model_key = data_references_capnp.ModelKey.new_message()
            model_key.key = key
-            model_key.featureStoreDescriptor = feature_store_descriptor
+            model_key.descriptor = descriptor
        except Exception as e:
            raise ValueError("Error building model key.") from e
        return model_key

    @staticmethod
    def _assign_model(
@@ -242,7 +238,7 @@
    @staticmethod
    def _assign_reply_channel(
-        request: request_capnp.Request, reply_channel: bytes
+        request: request_capnp.Request, reply_channel: str
    ) -> None:
        """
        Assigns a reply channel to the supplied request.
@@ -360,7 +356,7 @@ def _assign_custom_request_attributes( @staticmethod def build_request( - reply_channel: bytes, + reply_channel: str, model: t.Union[data_references_capnp.ModelKey, model_capnp.Model], inputs: t.Union[ t.List[data_references_capnp.TensorKey], diff --git a/smartsim/_core/mli/mli_schemas/data/data_references.capnp b/smartsim/_core/mli/mli_schemas/data/data_references.capnp index 699abe5d2..65293be7b 100644 --- a/smartsim/_core/mli/mli_schemas/data/data_references.capnp +++ b/smartsim/_core/mli/mli_schemas/data/data_references.capnp @@ -28,10 +28,10 @@ struct ModelKey { key @0 :Text; - featureStoreDescriptor @1 :Text; + descriptor @1 :Text; } struct TensorKey { key @0 :Text; - featureStoreDescriptor @1 :Text; + descriptor @1 :Text; } diff --git a/smartsim/_core/mli/mli_schemas/data/data_references_capnp.pyi b/smartsim/_core/mli/mli_schemas/data/data_references_capnp.pyi index bcf53e0a0..a5e318a55 100644 --- a/smartsim/_core/mli/mli_schemas/data/data_references_capnp.pyi +++ b/smartsim/_core/mli/mli_schemas/data/data_references_capnp.pyi @@ -36,7 +36,7 @@ from typing import Iterator class ModelKey: key: str - featureStoreDescriptor: str + descriptor: str @staticmethod @contextmanager def from_bytes( @@ -72,7 +72,7 @@ class ModelKeyBuilder(ModelKey): class TensorKey: key: str - featureStoreDescriptor: str + descriptor: str @staticmethod @contextmanager def from_bytes( diff --git a/smartsim/_core/mli/mli_schemas/request/request.capnp b/smartsim/_core/mli/mli_schemas/request/request.capnp index 4be1cfa21..26d9542d9 100644 --- a/smartsim/_core/mli/mli_schemas/request/request.capnp +++ b/smartsim/_core/mli/mli_schemas/request/request.capnp @@ -32,7 +32,7 @@ using DataRef = import "../data/data_references.capnp"; using Models = import "../model/model.capnp"; struct ChannelDescriptor { - descriptor @0 :Data; + descriptor @0 :Text; } struct Request { diff --git a/smartsim/_core/mli/mli_schemas/request/request_capnp.pyi b/smartsim/_core/mli/mli_schemas/request/request_capnp.pyi index a4ad631f9..2aab80b1d 100644 --- a/smartsim/_core/mli/mli_schemas/request/request_capnp.pyi +++ b/smartsim/_core/mli/mli_schemas/request/request_capnp.pyi @@ -61,7 +61,7 @@ from .request_attributes.request_attributes_capnp import ( ) class ChannelDescriptor: - descriptor: bytes + descriptor: str @staticmethod @contextmanager def from_bytes( diff --git a/smartsim/log.py b/smartsim/log.py index 3d6c0860e..a28112efa 100644 --- a/smartsim/log.py +++ b/smartsim/log.py @@ -252,7 +252,9 @@ def filter(self, record: logging.LogRecord) -> bool: return record.levelno <= level_no -def log_to_file(filename: str, log_level: str = "debug") -> None: +def log_to_file( + filename: str, log_level: str = "debug", logger: t.Optional[logging.Logger] = None +) -> None: """Installs a second filestream handler to the root logger, allowing subsequent logging calls to be sent to filename. @@ -261,7 +263,8 @@ def log_to_file(filename: str, log_level: str = "debug") -> None: to allow the file to store more or less verbose logging information. """ - logger = logging.getLogger("SmartSim") + if logger is None: + logger = logging.getLogger("SmartSim") stream = open( # pylint: disable=consider-using-with filename, "w+", encoding="utf-8" ) diff --git a/smartsim/protoclient.py b/smartsim/protoclient.py new file mode 100644 index 000000000..bf195a756 --- /dev/null +++ b/smartsim/protoclient.py @@ -0,0 +1,285 @@ +# BSD 2-Clause License +# +# Copyright (c) 2021-2024, Hewlett Packard Enterprise +# All rights reserved. 
+# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +# isort: off +# pylint: disable=unused-import,import-error +import dragon +from dragon import fli +import dragon.channels +from dragon.globalservices.api_setup import connect_to_infrastructure + +# isort: on +# pylint: enable=unused-import,import-error + +import numbers +import os +import time +import typing as t +from collections import OrderedDict + +import numpy +import torch + +from smartsim._core.mli.comm.channel.dragon_channel import DragonCommChannel +from smartsim._core.mli.comm.channel.dragon_fli import DragonFLIChannel +from smartsim._core.mli.infrastructure.storage.backbone_feature_store import ( + BackboneFeatureStore, + EventBroadcaster, + EventProducer, + OnWriteFeatureStore, +) +from smartsim._core.mli.message_handler import MessageHandler +from smartsim._core.utils.timings import PerfTimer +from smartsim.error.errors import SmartSimError +from smartsim.log import get_logger + +# from mpi4py import MPI + + +_TimingDict = OrderedDict[str, list[str]] + + +logger = get_logger("App") +logger.info("Started app") +CHECK_RESULTS_AND_MAKE_ALL_SLOWER = False + + +class ProtoClient: + @staticmethod + def _attach_to_backbone(wait_timeout: float = 0) -> BackboneFeatureStore: + """Use the supplied environment variables to attach + to a pre-existing backbone featurestore. Requires the + environment to contain `_SMARTSIM_INFRA_BACKBONE` + environment variable + + :returns: the attached backbone featurestore""" + # todo: ensure this env var from config loader or constant + descriptor = os.environ.get(BackboneFeatureStore.MLI_BACKBONE, None) + if descriptor is None: + raise SmartSimError( + "Missing required backbone configuration in environment" + ) + + backbone = t.cast( + BackboneFeatureStore, BackboneFeatureStore.from_descriptor(descriptor) + ) + backbone.wait_timeout = wait_timeout + return backbone + + def _attach_to_worker_queue(self) -> DragonFLIChannel: + """Wait until the backbone contains the worker queue configuration, + then attach an FLI to the given worker queue""" + configuration = self._backbone.wait_for([BackboneFeatureStore.MLI_WORKER_QUEUE]) + # descriptor = configuration.get(BackboneFeatureStore.MLI_WORKER_QUEUE, None) + # NOTE: without wait_for, this MUST be in the backbone.... 
    @staticmethod
    def _create_worker_channels() -> t.Tuple[DragonCommChannel, DragonCommChannel]:
        """Create local channels used to receive results from, and stream
        tensors to, the worker queue

        :returns: a tuple of (from_worker, to_worker) channels"""
        _from_worker_ch = DragonCommChannel.from_local()
        _to_worker_ch = DragonCommChannel.from_local()

        return _from_worker_ch, _to_worker_ch

    def _create_broadcaster(self) -> EventProducer:
        """Create an event publisher that will broadcast updates to
        other MLI components.

        :returns: the event publisher instance"""
        broadcaster: EventProducer = EventBroadcaster(
            self._backbone, DragonCommChannel.from_descriptor
        )
        return broadcaster

    def __init__(self, timing_on: bool, wait_timeout: float = 0) -> None:
        """Initialize the client instance

        :param timing_on: Flag indicating if timing information should be
        written to file
        :param wait_timeout: Maximum wait time allowed to attach to the
        worker queue
        :raises SmartSimError: if unable to attach to a backbone featurestore"""
        # todo: restore MPI rank detection (mpi4py) once available
        # comm = MPI.COMM_WORLD
        # rank = comm.Get_rank()
        rank: int = 0
        self._queue_timeout = wait_timeout

        connect_to_infrastructure()
        self._backbone = self._attach_to_backbone(wait_timeout=wait_timeout)
        self._to_worker_fli = self._attach_to_worker_queue()

        channels = self._create_worker_channels()
        self._from_worker_ch = channels[0]
        self._to_worker_ch = channels[1]

        self._publisher = self._create_broadcaster()

        self.perf_timer: PerfTimer = PerfTimer(
            debug=False, timing_on=timing_on, prefix=f"a{rank}_"
        )
        self._start: t.Optional[float] = None
        self._interm: t.Optional[float] = None
        self._timings: _TimingDict = OrderedDict()
        self._timing_on = timing_on

    def _add_label_to_timings(self, label: str) -> None:
        if label not in self._timings:
            self._timings[label] = []

    @staticmethod
    def _format_number(number: t.Union[numbers.Number, float]) -> str:
        return f"{number:0.4e}"

    def start_timings(self, batch_size: numbers.Number) -> None:
        if self._timing_on:
            self._add_label_to_timings("batch_size")
            self._timings["batch_size"].append(self._format_number(batch_size))
            self._start = time.perf_counter()
            self._interm = time.perf_counter()

    def end_timings(self) -> None:
        if self._timing_on and self._start is not None:
            self._add_label_to_timings("total_time")
            self._timings["total_time"].append(
                self._format_number(time.perf_counter() - self._start)
            )
measure_time(self, label: str) -> None: + if self._timing_on and self._interm is not None: + self._add_label_to_timings(label) + self._timings[label].append( + self._format_number(time.perf_counter() - self._interm) + ) + self._interm = time.perf_counter() + + def print_timings(self, to_file: bool = False) -> None: + print(" ".join(self._timings.keys())) + + value_array = numpy.array(self._timings.values(), dtype=float) + value_array = numpy.transpose(value_array) + for i in range(value_array.shape[0]): + print(" ".join(self._format_number(value) for value in value_array[i])) + if to_file: + numpy.save("timings.npy", value_array) + numpy.savetxt("timings.txt", value_array) + + def run_model(self, model: t.Union[bytes, str], batch: torch.Tensor) -> t.Any: + tensors = [batch.numpy()] + self.perf_timer.start_timings("batch_size", batch.shape[0]) + built_tensor_desc = MessageHandler.build_tensor_descriptor( + "c", "float32", list(batch.shape) + ) + self.perf_timer.measure_time("build_tensor_descriptor") + if isinstance(model, str): + model_arg = MessageHandler.build_model_key(model, self._backbone.descriptor) + else: + model_arg = MessageHandler.build_model( + model, "resnet-50", "1.0" + ) # type: ignore + request = MessageHandler.build_request( + reply_channel=self._from_worker_ch.descriptor, + model=model_arg, + inputs=[built_tensor_desc], + outputs=[], + output_descriptors=[], + custom_attributes=None, + ) + self.perf_timer.measure_time("build_request") + request_bytes = MessageHandler.serialize_request(request) + self.perf_timer.measure_time("serialize_request") + + if self._to_worker_fli is None: + raise ValueError("No worker queue available.") + + # pylint: disable-next=protected-access + with self._to_worker_fli._channel.sendh( # type: ignore + timeout=None, + stream_channel=self._to_worker_ch.channel, + ) as to_sendh: + to_sendh.send_bytes(request_bytes) + self.perf_timer.measure_time("send_request") + for tensor in tensors: + to_sendh.send_bytes(tensor.tobytes()) # TODO NOT FAST ENOUGH!!! + # to_sendh.send_bytes(bytes(tensor.data)) + logger.info(f"Message size: {len(request_bytes)} bytes") + + self.perf_timer.measure_time("send_tensors") + with self._from_worker_ch.channel.recvh(timeout=None) as from_recvh: + resp = from_recvh.recv_bytes(timeout=None) + self.perf_timer.measure_time("receive_response") + response = MessageHandler.deserialize_response(resp) + self.perf_timer.measure_time("deserialize_response") + # list of data blobs? + # recv depending on the len(response.result.descriptors)? + data_blob: bytes = from_recvh.recv_bytes(timeout=None) + self.perf_timer.measure_time("receive_tensor") + result = torch.from_numpy( + numpy.frombuffer( + data_blob, + dtype=str(response.result.descriptors[0].dataType), + ) + ) + self.perf_timer.measure_time("deserialize_tensor") + + self.perf_timer.end_timings() + return result + + def set_model(self, key: str, model: bytes) -> None: + # todo: incorrect usage of backbone here to store + # user models? are we using the backbone if they do NOT + # have a feature store of their own? 
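+        # the raw model bytes land in the backbone under the user-supplied
+        # key; the OnWriteFeatureStore event below alerts registered listeners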
+ self._backbone[key] = model + + # notify components of a change in the data at this key + event = OnWriteFeatureStore(self._backbone.descriptor, key) + self._publisher.send(event) diff --git a/tests/dragon/test_dragon_backend.py b/tests/dragon/test_dragon_backend.py new file mode 100644 index 000000000..a4e61d430 --- /dev/null +++ b/tests/dragon/test_dragon_backend.py @@ -0,0 +1,174 @@ +# BSD 2-Clause License +# +# Copyright (c) 2021-2024, Hewlett Packard Enterprise +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+
+import unittest.mock as mock
+
+import pytest
+
+# from smartsim._core.launcher.dragon.dragonBackend import DragonBackend, NodePrioritizer
+# from smartsim._core.mli.infrastructure.storage.backbone_feature_store import EventSender, OnCreateConsumer
+
+# dragon = pytest.importorskip("dragon")
+
+# from smartsim._core.mli.comm.channel.dragon_channel import DragonCommChannel
+
+# The tests in this file belong to the dragon group
+pytestmark = pytest.mark.dragon
+
+
+def test_dragonbackend_listener_bootstrapping(monkeypatch: pytest.MonkeyPatch):
+    """Verify that an event listener is started.
+
+    TODO: placeholder until the backend eventing bootstrap is testable;
+    the intended flow is preserved in the comments below"""
+    # backend_channel = DragonCommChannel.from_local()
+    assert True
+
+    # with monkeypatch.context() as patcher:
+    #     patcher.setattr(NodePrioritizer, "__init__", lambda self, nodes, lock: None)
+    #     patcher.setattr(DragonBackend, "_initialize_hosts", lambda self: None)
+
+    #     backend = DragonBackend(pid=9999)
+    #     backend._create_backbone()
+
+    #     # create the consumer and start a listener process
+    #     backend_consumer = backend._create_eventing(backend._backbone)
+
+    #     # ensure the consumer that was created is retained
+    #     assert backend._event_consumer is not None
+    #     assert backend._event_consumer == backend_consumer
+
+    #     assert backend._backbone.notification_channels == [backend_consumer.descriptor]
+
+    #     # create components to publish events
+    #     sender = EventSender(backend._backbone, backend_channel)
+
+    #     # simulate a new consumer registration
+    #     new_consumer_channel = DragonCommChannel.from_local()
+    #     registration = OnCreateConsumer(new_consumer_channel.descriptor)
+    #     new_consumer_channel.send(bytes(registration), 0.1)
+
+    #     events = backend_consumer.receive()
+    #     assert len(events) == 1
diff --git a/tests/dragon/test_environment_loader.py b/tests/dragon/test_environment_loader.py
index e9bcc8dfd..b8c2af9c0 100644
--- a/tests/dragon/test_environment_loader.py
+++ b/tests/dragon/test_environment_loader.py
@@ -63,7 +63,7 @@ def test_environment_loader_attach_fli(content: bytes, monkeypatch: pytest.Monke
     config = EnvironmentConfigLoader(
         featurestore_factory=DragonFeatureStore.from_descriptor,
         callback_factory=DragonCommChannel.from_descriptor,
-        queue_factory=DragonFLIChannel.from_descriptor,
+        queue_factory=DragonFLIChannel.from_sender_supplied_descriptor,
     )
 
     config_queue = config.get_queue()
diff --git a/tests/dragon/test_error_handling.py b/tests/dragon/test_error_handling.py
index 618b00d87..0f3e38f93 100644
--- a/tests/dragon/test_error_handling.py
+++ b/tests/dragon/test_error_handling.py
@@ -24,6 +24,7 @@
 # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+import typing as t from unittest.mock import MagicMock import pytest @@ -38,6 +39,7 @@ from dragon.fli import FLInterface from dragon.mpbridge.queues import DragonQueue +from smartsim._core.mli.comm.channel.channel import CommChannelBase from smartsim._core.mli.comm.channel.dragon_fli import DragonFLIChannel from smartsim._core.mli.infrastructure.control.device_manager import WorkerDevice from smartsim._core.mli.infrastructure.control.request_dispatcher import ( @@ -62,11 +64,13 @@ InferenceReply, InferenceRequest, LoadModelResult, + MachineLearningWorkerBase, RequestBatch, TransformInputResult, TransformOutputResult, ) from smartsim._core.mli.message_handler import MessageHandler +from smartsim._core.mli.mli_schemas.response.response_capnp import ResponseBuilder from .utils.channel import FileSystemCommChannel from .utils.worker import IntegratedTorchWorker @@ -92,7 +96,7 @@ def app_feature_store() -> FeatureStore: @pytest.fixture def setup_worker_manager_model_bytes( - test_dir, + test_dir: str, monkeypatch: pytest.MonkeyPatch, backbone_descriptor: str, app_feature_store: FeatureStore, @@ -110,10 +114,10 @@ def setup_worker_manager_model_bytes( config_loader = EnvironmentConfigLoader( featurestore_factory=DragonFeatureStore.from_descriptor, callback_factory=FileSystemCommChannel.from_descriptor, - queue_factory=DragonFLIChannel.from_descriptor, + queue_factory=DragonFLIChannel.from_sender_supplied_descriptor, ) - dispatcher_task_queue = mp.Queue(maxsize=0) + dispatcher_task_queue: mp.Queue[RequestBatch] = mp.Queue(maxsize=0) worker_manager = WorkerManager( config_loader=config_loader, @@ -123,10 +127,14 @@ def setup_worker_manager_model_bytes( cooldown=3, ) - tensor_key = FeatureStoreKey(key="key", descriptor=app_feature_store.descriptor) - output_key = FeatureStoreKey(key="key", descriptor=app_feature_store.descriptor) + tensor_key = MessageHandler.build_feature_store_key( + "key", app_feature_store.descriptor + ) + output_key = MessageHandler.build_feature_store_key( + "key", app_feature_store.descriptor + ) - request = InferenceRequest( + inf_request = InferenceRequest( model_key=None, callback=None, raw_inputs=None, @@ -140,7 +148,7 @@ def setup_worker_manager_model_bytes( model_id = FeatureStoreKey(key="key", descriptor=app_feature_store.descriptor) request_batch = RequestBatch( - [request], + [inf_request], TransformInputResult(b"transformed", [slice(0, 1)], [[1, 2]], ["float32"]), model_id=model_id, ) @@ -169,10 +177,10 @@ def setup_worker_manager_model_key( config_loader = EnvironmentConfigLoader( featurestore_factory=DragonFeatureStore.from_descriptor, callback_factory=FileSystemCommChannel.from_descriptor, - queue_factory=DragonFLIChannel.from_descriptor, + queue_factory=DragonFLIChannel.from_sender_supplied_descriptor, ) - dispatcher_task_queue = mp.Queue(maxsize=0) + dispatcher_task_queue: mp.Queue[RequestBatch] = mp.Queue(maxsize=0) worker_manager = WorkerManager( config_loader=config_loader, @@ -208,7 +216,7 @@ def setup_worker_manager_model_key( @pytest.fixture def setup_request_dispatcher_model_bytes( - test_dir, + test_dir: str, monkeypatch: pytest.MonkeyPatch, backbone_descriptor: str, app_feature_store: FeatureStore, @@ -226,7 +234,7 @@ def setup_request_dispatcher_model_bytes( config_loader = EnvironmentConfigLoader( featurestore_factory=DragonFeatureStore.from_descriptor, callback_factory=FileSystemCommChannel.from_descriptor, - queue_factory=DragonFLIChannel.from_descriptor, + queue_factory=DragonFLIChannel.from_sender_supplied_descriptor, ) request_dispatcher = 
RequestDispatcher( @@ -237,8 +245,12 @@ def setup_request_dispatcher_model_bytes( ) request_dispatcher._on_start() - tensor_key = MessageHandler.build_tensor_key("key", app_feature_store.descriptor) - output_key = MessageHandler.build_tensor_key("key", app_feature_store.descriptor) + tensor_key = MessageHandler.build_feature_store_key( + "key", app_feature_store.descriptor + ) + output_key = MessageHandler.build_feature_store_key( + "key", app_feature_store.descriptor + ) model = MessageHandler.build_model(b"model", "model name", "v 0.0.1") request = MessageHandler.build_request( test_dir, model, [tensor_key], [output_key], [], None @@ -252,7 +264,7 @@ def setup_request_dispatcher_model_bytes( @pytest.fixture def setup_request_dispatcher_model_key( - test_dir, + test_dir: str, monkeypatch: pytest.MonkeyPatch, backbone_descriptor: str, app_feature_store: FeatureStore, @@ -270,7 +282,7 @@ def setup_request_dispatcher_model_key( config_loader = EnvironmentConfigLoader( featurestore_factory=DragonFeatureStore.from_descriptor, callback_factory=FileSystemCommChannel.from_descriptor, - queue_factory=DragonFLIChannel.from_descriptor, + queue_factory=DragonFLIChannel.from_sender_supplied_descriptor, ) request_dispatcher = RequestDispatcher( @@ -281,9 +293,13 @@ def setup_request_dispatcher_model_key( ) request_dispatcher._on_start() - tensor_key = MessageHandler.build_tensor_key("key", app_feature_store.descriptor) - output_key = MessageHandler.build_tensor_key("key", app_feature_store.descriptor) - model_key = MessageHandler.build_model_key( + tensor_key = MessageHandler.build_feature_store_key( + "key", app_feature_store.descriptor + ) + output_key = MessageHandler.build_feature_store_key( + "key", app_feature_store.descriptor + ) + model_key = MessageHandler.build_feature_store_key( key="model key", feature_store_descriptor=app_feature_store.descriptor ) request = MessageHandler.build_request( @@ -296,8 +312,12 @@ def setup_request_dispatcher_model_key( return request_dispatcher, integrated_worker_type -def mock_pipeline_stage(monkeypatch: pytest.MonkeyPatch, integrated_worker, stage): - def mock_stage(*args, **kwargs): +def mock_pipeline_stage( + monkeypatch: pytest.MonkeyPatch, + integrated_worker: MachineLearningWorkerBase, + stage: str, +) -> t.Callable[[t.Any], ResponseBuilder]: + def mock_stage(*args: t.Any, **kwargs: t.Any) -> None: raise ValueError(f"Simulated error in {stage}") monkeypatch.setattr(integrated_worker, stage, mock_stage) @@ -314,8 +334,10 @@ def mock_stage(*args, **kwargs): mock_reply_channel = MagicMock() mock_reply_channel.send = MagicMock() - def mock_exception_handler(exc, reply_channel, failure_message): - return exception_handler(exc, mock_reply_channel, failure_message) + def mock_exception_handler( + exc: Exception, reply_channel: CommChannelBase, failure_message: str + ) -> None: + exception_handler(exc, mock_reply_channel, failure_message) monkeypatch.setattr( "smartsim._core.mli.infrastructure.control.worker_manager.exception_handler", @@ -362,12 +384,12 @@ def mock_exception_handler(exc, reply_channel, failure_message): ], ) def test_wm_pipeline_stage_errors_handled( - request, - setup_worker_manager, + request: pytest.FixtureRequest, + setup_worker_manager: str, monkeypatch: pytest.MonkeyPatch, stage: str, error_message: str, -): +) -> None: """Ensures that the worker manager does not crash after a failure in various pipeline stages""" worker_manager, integrated_worker_type = request.getfixturevalue( setup_worker_manager @@ -446,12 +468,12 @@ def 
test_wm_pipeline_stage_errors_handled( ], ) def test_dispatcher_pipeline_stage_errors_handled( - request, - setup_request_dispatcher, + request: pytest.FixtureRequest, + setup_request_dispatcher: str, monkeypatch: pytest.MonkeyPatch, stage: str, error_message: str, -): +) -> None: """Ensures that the request dispatcher does not crash after a failure in various pipeline stages""" request_dispatcher, integrated_worker_type = request.getfixturevalue( setup_request_dispatcher @@ -473,7 +495,7 @@ def test_dispatcher_pipeline_stage_errors_handled( mock_reply_fn.assert_called_with("fail", error_message) -def test_exception_handling_helper(monkeypatch: pytest.MonkeyPatch): +def test_exception_handling_helper(monkeypatch: pytest.MonkeyPatch) -> None: """Ensures that the worker manager does not crash after a failure in the execute pipeline stage""" diff --git a/tests/dragon/test_featurestore.py b/tests/dragon/test_featurestore.py new file mode 100644 index 000000000..a2c8118ac --- /dev/null +++ b/tests/dragon/test_featurestore.py @@ -0,0 +1,338 @@ +# BSD 2-Clause License +# +# Copyright (c) 2021-2024, Hewlett Packard Enterprise +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
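+
+# A rough sketch of the pub/sub flow these tests exercise (assuming a dragon
+# environment; the names mirror the imports used below):
+#
+#   backbone = BackboneFeatureStore(ddict, allow_reserved_writes=True)
+#   consumer = EventConsumer(
+#       channel, backbone, filters=[EventCategory.FEATURE_STORE_WRITTEN]
+#   )
+#   publisher = EventBroadcaster(backbone, DragonCommChannel.from_descriptor)
+#   backbone.notification_channels = [channel.descriptor]
+#   publisher.send(OnWriteFeatureStore(backbone.descriptor, "key"))
+#   events = consumer.receive()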
+
+
+import multiprocessing as mp
+import random
+import time
+import typing as t
+import unittest.mock as mock
+import uuid
+
+import pytest
+
+dragon = pytest.importorskip("dragon")
+
+from smartsim._core.mli.comm.channel.dragon_channel import DragonCommChannel
+from smartsim._core.mli.comm.channel.dragon_fli import DragonFLIChannel
+from smartsim._core.mli.infrastructure.storage.backbone_feature_store import (
+    BackboneFeatureStore,
+    EventBroadcaster,
+    EventCategory,
+    EventConsumer,
+    OnCreateConsumer,
+    OnWriteFeatureStore,
+)
+from smartsim._core.mli.infrastructure.storage.backbone_feature_store import (
+    time as bbtime,
+)
+from smartsim._core.mli.infrastructure.storage.dragon_feature_store import dragon_ddict
+from smartsim.log import get_logger
+
+logger = get_logger(__name__)
+
+# isort: off
+from dragon import fli
+from dragon.channels import Channel
+
+# isort: on
+
+if t.TYPE_CHECKING:
+    import conftest
+
+
+# The tests in this file must run in a dragon environment
+pytestmark = pytest.mark.dragon
+WORK_QUEUE_KEY = "_SMARTSIM_REQUEST_QUEUE"
+
+
+@pytest.fixture
+def storage_for_dragon_fs() -> dragon_ddict.DDict:
+    return dragon_ddict.DDict(1, 2, total_mem=2 * 1024**3)
+
+
+@pytest.fixture
+def storage_for_dragon_fs_with_req_queue(
+    storage_for_dragon_fs: dragon_ddict.DDict,
+) -> dragon_ddict.DDict:
+    # create a valid FLI so any call to attach does not fail
+    channel_ = Channel.make_process_local()
+    fli_ = fli.FLInterface(main_ch=channel_, manager_ch=None)
+    comm_channel = DragonFLIChannel(fli_, True)
+
+    storage_for_dragon_fs[WORK_QUEUE_KEY] = comm_channel.descriptor
+    return storage_for_dragon_fs
+
+
+@pytest.fixture
+def storage_for_dragon_fs_with_mock_req_queue(
+    storage_for_dragon_fs: dragon_ddict.DDict,
+) -> dragon_ddict.DDict:
+    mock_descriptor = "12345"
+    storage_for_dragon_fs[WORK_QUEUE_KEY] = mock_descriptor
+    return storage_for_dragon_fs
+
+
+def test_eventconsumer_eventpublisher_integration(
+    storage_for_dragon_fs: t.Any, test_dir: str
+) -> None:
+    """Verify that the publisher and consumer integrate as expected when
+    multiple publishers and consumers are sending simultaneously. This
+    test closely tracks the test in tests/test_featurestore.py also named
+    test_eventconsumer_eventpublisher_integration but requires dragon entities
+
+    :param storage_for_dragon_fs: the dragon storage engine to use
+    :param test_dir: pytest fixture automatically generating unique working
+    directories for individual test outputs"""
+
+    mock_storage = storage_for_dragon_fs
+    backbone = BackboneFeatureStore(mock_storage, allow_reserved_writes=True)
+
+    # verify ability to write and read from ddict
+    backbone["test_dir"] = test_dir
+    assert backbone["test_dir"] == test_dir
+
+    wmgr_channel_ = Channel.make_process_local()
+    capp_channel_ = Channel.make_process_local()
+    back_channel_ = Channel.make_process_local()
+
+    wmgr_channel = DragonCommChannel(wmgr_channel_)
+    capp_channel = DragonCommChannel(capp_channel_)
+    back_channel = DragonCommChannel(back_channel_)
+
+    wmgr_consumer_descriptor = wmgr_channel.descriptor_string
+    capp_consumer_descriptor = capp_channel.descriptor_string
+    back_consumer_descriptor = back_channel.descriptor_string
+
+    # create some consumers to receive messages
+    wmgr_consumer = EventConsumer(
+        wmgr_channel,
+        backbone,
+        filters=[EventCategory.FEATURE_STORE_WRITTEN],
+    )
+    capp_consumer = EventConsumer(
+        capp_channel,
+        backbone,
+    )
+    back_consumer = EventConsumer(
+        back_channel,
+        backbone,
+        filters=[EventCategory.CONSUMER_CREATED],
+    )
+
+    # create some broadcasters to publish messages
+    mock_worker_mgr = EventBroadcaster(
+        backbone,
+        channel_factory=DragonCommChannel.from_descriptor,
+    )
+    mock_client_app = EventBroadcaster(
+        backbone,
+        channel_factory=DragonCommChannel.from_descriptor,
+    )
+
+    # register all of the consumers directly; automatic registration via
+    # OnCreateConsumer event processing is tested elsewhere
+    backbone.notification_channels = [
+        wmgr_consumer_descriptor,
+        capp_consumer_descriptor,
+        back_consumer_descriptor,
+    ]
+
+    # simulate worker manager sending a notification to backend that it's alive
+    event_1 = OnCreateConsumer(wmgr_consumer_descriptor, [])
+    mock_worker_mgr.send(event_1)
+
+    # simulate the app updating a model a few times
+    event_2 = OnWriteFeatureStore(backbone.descriptor, "key-1")
+    event_3 = OnWriteFeatureStore(backbone.descriptor, "key-2")
+    event_4 = OnWriteFeatureStore(backbone.descriptor, "key-1")
+
+    mock_client_app.send(event_2)
+    mock_client_app.send(event_3)
+    mock_client_app.send(event_4)
+
+    # the worker manager should only receive the feature store write events
+    wmgr_messages = wmgr_consumer.receive()
+    assert len(wmgr_messages) == 3
+
+    # the backend should only receive messages about consumer creation
+    back_messages = back_consumer.receive()
+    assert len(back_messages) == 1
+
+    # hypothetical app has no filters and will get all events
+    app_messages = capp_consumer.receive()
+    assert len(app_messages) == 4
+
+
+def test_backbone_wait_for_prepopulated(
+    storage_for_dragon_fs_with_req_queue: t.Any, monkeypatch: pytest.MonkeyPatch
+) -> None:
+    """Verify that asking the backbone to wait for a value succeeds
+    immediately and does not cause a wait to occur if the data exists
+
+    :param storage_for_dragon_fs_with_req_queue: the storage engine to use,
+    prepopulated with a valid worker queue descriptor"""
+    # set a very low timeout to confirm that the call returns without waiting
+    wait_timeout = 0.1
+    storage = storage_for_dragon_fs_with_req_queue
+
+    backbone = BackboneFeatureStore(storage)
+
+    with monkeypatch.context() as ctx:
+        # all keys should be found and the timeout should never be checked
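+        # `bbtime` is the `time` module as imported by the backbone feature
+        # store module, so patching its `sleep` intercepts the wait_for backoff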
+        ctx.setattr(bbtime, "sleep", mock.MagicMock())
+
+        values = backbone.wait_for([WORK_QUEUE_KEY], wait_timeout)
+
+        # confirm that wait_for with one key returns one value
+        assert len(values) == 1
+
+        # confirm that the descriptor is non-null w/some non-trivial value
+        assert len(values[WORK_QUEUE_KEY]) > 5
+
+        # confirm that no wait occurred
+        bbtime.sleep.assert_not_called()
+
+
+def set_value_after_delay(
+    descriptor: str, key: str, value: str, delay: float = 5
+) -> None:
+    """Helper method to persist a value into the backbone after a delay
+
+    :param descriptor: the backbone feature store descriptor to attach to
+    :param key: the key to write to
+    :param value: a value to write to the key
+    :param delay: the number of seconds to wait before writing"""
+    time.sleep(delay)
+
+    backbone = BackboneFeatureStore.from_descriptor(descriptor)
+    backbone[key] = value
+    logger.debug(f"set_value_after_delay wrote `{value}` to backbone[`{key}`]")
+
+
+@pytest.mark.parametrize("delay", [0, 1, 2, 4, 8])
+def test_backbone_wait_for_partial_prepopulated(
+    storage_for_dragon_fs_with_mock_req_queue: t.Any, delay: float
+) -> None:
+    """Verify that when data is not all in the backbone, the `wait_for` operation
+    continues to poll until it finds everything it needs
+
+    :param storage_for_dragon_fs_with_mock_req_queue: the storage engine to use,
+    prepopulated with a mock worker queue descriptor
+    :param delay: the number of seconds the second process will wait before
+    setting the target value in the backbone featurestore
+    """
+    # allow ample time for the delayed writer to complete before timing out
+    wait_timeout = 10
+    storage = storage_for_dragon_fs_with_mock_req_queue
+    backbone = BackboneFeatureStore(storage)
+
+    key, value = str(uuid.uuid4()), str(random.random() * 10)
+
+    logger.debug(f"Starting process to write {key} after {delay}s")
+    p = mp.Process(
+        target=set_value_after_delay, args=(backbone.descriptor, key, value, delay)
+    )
+    p.start()
+
+    p2 = mp.Process(
+        target=backbone.wait_for,
+        args=([WORK_QUEUE_KEY, key],),
+        kwargs={"timeout": wait_timeout},
+    )
+    p2.start()
+
+    p.join()
+    p2.join()
+
+    # both values should be written at this time
+    ret_vals = backbone.wait_for([WORK_QUEUE_KEY, key], 0.1)
+    # confirm that wait_for with two keys returns two values
+    assert len(ret_vals) == 2, "values should contain values for both awaited keys"
+
+    # confirm the pre-populated value has the correct output
+    assert ret_vals[WORK_QUEUE_KEY] == "12345"  # mock descriptor value from fixture
+
+    # confirm the population process completed and the awaited value is correct
+    assert ret_vals[key] == value, "awaited value does not match the written value"
+
+
+@pytest.mark.parametrize("num_keys", [0, 1, 3, 7, 11])
+def test_backbone_wait_for_multikey(
+    storage_for_dragon_fs_with_req_queue: t.Any,
+    num_keys: int,
+) -> None:
+    """Verify that asking the backbone to wait for multiple keys results
+    in that number of values being returned
+
+    :param storage_for_dragon_fs_with_req_queue: the storage engine to use,
+    prepopulated with a valid worker queue descriptor
+    :param num_keys: the number of extra keys to set & request in the backbone
+    """
+    # maximum delay allowed for setter processes
+    max_delay = 5
+    storage = storage_for_dragon_fs_with_req_queue
+    backbone = BackboneFeatureStore(storage)
+
+    extra_keys = [str(uuid.uuid4()) for _ in range(num_keys)]
+    extra_values = [str(uuid.uuid4()) for _ in range(num_keys)]
+    extras = dict(zip(extra_keys, extra_values))
+    delays = [random.random() * max_delay for _ in range(num_keys)]
+    processes = []
+
+    for key, value, delay in zip(extra_keys, extra_values, delays):
+        assert delay < max_delay, "write delay exceeds test timeout"
+        logger.debug(f"Delaying {key} write by {delay} seconds")
+        p = mp.Process(
+            target=set_value_after_delay, args=(backbone.descriptor, key, value, delay)
+        )
+        p.start()
+        processes.append(p)
+
+    p2 = mp.Process(
+        target=backbone.wait_for,
+        args=([*extra_keys],),
+        kwargs={"timeout": max_delay * 2},
+    )
+    p2.start()
+    for p in processes:
+        p.join(timeout=max_delay * 2)
+    p2.join(
+        timeout=max_delay * 2
+    )  # allow the waiter the same window as its wait_for timeout
+
+    # use a minimal wait to verify all values are already written
+    actual_values = backbone.wait_for([*extra_keys], timeout=0.01)
+
+    # confirm that wait_for returns all the expected values
+    assert len(actual_values) == num_keys
+
+    # confirm that the returned values match (e.g. are returned in the right order)
+    for k in extras:
+        assert extras[k] == actual_values[k]
diff --git a/tests/dragon/test_featurestore_base.py b/tests/dragon/test_featurestore_base.py
index 932e734c8..bb5dccad7 100644
--- a/tests/dragon/test_featurestore_base.py
+++ b/tests/dragon/test_featurestore_base.py
@@ -24,6 +24,7 @@
 # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 import pathlib
+import time
 import typing as t
 
 import pytest
@@ -54,6 +55,21 @@
 # The tests in this file belong to the dragon group
 pytestmark = pytest.mark.dragon
 
+WORK_QUEUE_KEY = "_SMARTSIM_REQUEST_QUEUE"
+RANDOMLY_SET_KEY = "_SOMETHING_ELSE"
+
+
+@pytest.fixture
+def storage_for_dragon_fs_with_req_queue() -> t.Dict[str, str]:
+    storage = {WORK_QUEUE_KEY: "12345", RANDOMLY_SET_KEY: "67890"}
+    return storage
+
+
+def boom(*args, **kwargs) -> None:
+    """Helper function that blows up when used to mock up
+    some other function"""
+    raise Exception(f"you shall not pass! {args}, {kwargs}")
+
 
 def test_event_uid() -> None:
     """Verify that all events include a unique identifier"""
@@ -62,7 +78,7 @@ def test_event_uid() -> None:
 
     # generate a bunch of events and keep track all the IDs
     for i in range(num_iters):
-        event_a = OnCreateConsumer(str(i))
+        event_a = OnCreateConsumer(str(i), filters=[])
         event_b = OnWriteFeatureStore(str(i), "key")
 
         uids.add(event_a.uid)
@@ -177,7 +193,7 @@ def test_eventpublisher_broadcast_no_factory(test_dir: str) -> None:
 
     # NOTE: we're not putting any consumers into the backbone here!
backbone = BackboneFeatureStore(mock_storage) - event = OnCreateConsumer(consumer_descriptor) + event = OnCreateConsumer(consumer_descriptor, filters=[]) publisher = EventBroadcaster(backbone) num_receivers = 0 @@ -185,7 +201,7 @@ def test_eventpublisher_broadcast_no_factory(test_dir: str) -> None: # publishing this event without any known consumers registered should succeed # but report that it didn't have anybody to send the event to consumer_descriptor = storage_path / f"test-consumer" - event = OnCreateConsumer(consumer_descriptor) + event = OnCreateConsumer(consumer_descriptor, filters=[]) num_receivers += publisher.send(event) @@ -215,7 +231,7 @@ def test_eventpublisher_broadcast_to_empty_consumer_list(test_dir: str) -> None: backbone = BackboneFeatureStore(mock_storage, allow_reserved_writes=True) backbone.notification_channels = [] - event = OnCreateConsumer(consumer_descriptor) + event = OnCreateConsumer(consumer_descriptor, filters=[]) publisher = EventBroadcaster( backbone, channel_factory=FileSystemCommChannel.from_descriptor ) @@ -247,7 +263,7 @@ def test_eventpublisher_broadcast_without_channel_factory(test_dir: str) -> None backbone = BackboneFeatureStore(mock_storage, allow_reserved_writes=True) backbone.notification_channels = [consumer_descriptor] - event = OnCreateConsumer(consumer_descriptor) + event = OnCreateConsumer(consumer_descriptor, filters=[]) publisher = EventBroadcaster( backbone, # channel_factory=FileSystemCommChannel.from_descriptor # <--- not supplied @@ -281,11 +297,11 @@ def test_eventpublisher_broadcast_empties_buffer(test_dir: str) -> None: # mock building up some buffered events num_buffered_events = 14 for i in range(num_buffered_events): - event = OnCreateConsumer(storage_path / f"test-consumer-{str(i)}") + event = OnCreateConsumer(storage_path / f"test-consumer-{str(i)}", []) publisher._event_buffer.append(bytes(event)) event0 = OnCreateConsumer( - storage_path / f"test-consumer-{str(num_buffered_events + 1)}" + storage_path / f"test-consumer-{str(num_buffered_events + 1)}", [] ) num_receivers = publisher.send(event0) @@ -332,13 +348,13 @@ def test_eventpublisher_broadcast_returns_total_sent( # mock building up some buffered events for i in range(num_buffered): - event = OnCreateConsumer(storage_path / f"test-consumer-{str(i)}") + event = OnCreateConsumer(storage_path / f"test-consumer-{str(i)}", []) publisher._event_buffer.append(bytes(event)) assert publisher.num_buffered == num_buffered # this event will trigger clearing anything already in buffer - event0 = OnCreateConsumer(storage_path / f"test-consumer-{num_buffered}") + event0 = OnCreateConsumer(storage_path / f"test-consumer-{num_buffered}", []) # num_receivers should contain a number that computes w/all consumers and all events num_receivers = publisher.send(event0) @@ -363,7 +379,7 @@ def test_eventpublisher_prune_unused_consumer(test_dir: str) -> None: backbone, channel_factory=FileSystemCommChannel.from_descriptor ) - event = OnCreateConsumer(consumer_descriptor) + event = OnCreateConsumer(consumer_descriptor, filters=[]) # the only registered cnosumer is in the event, expect no pruning backbone.notification_channels = (consumer_descriptor,) @@ -377,7 +393,7 @@ def test_eventpublisher_prune_unused_consumer(test_dir: str) -> None: # ... 
and remove the old descriptor from the backbone when it's looked up
     backbone.notification_channels = (consumer_descriptor2,)
 
-    event = OnCreateConsumer(consumer_descriptor2)
+    event = OnCreateConsumer(consumer_descriptor2, filters=[])
     publisher.send(event)
 
@@ -433,7 +449,7 @@ def test_eventpublisher_serialize_failure(
     )
 
     with monkeypatch.context() as patch:
-        event = OnCreateConsumer(target_descriptor)
+        event = OnCreateConsumer(target_descriptor, filters=[])
 
         # patch the __bytes__ implementation to cause pickling to fail during send
         patch.setattr(event, "__bytes__", lambda x: b"abc")
@@ -471,7 +487,7 @@ def boom(descriptor: str) -> None:
     publisher = EventBroadcaster(backbone, channel_factory=boom)
 
     with monkeypatch.context() as patch:
-        event = OnCreateConsumer(target_descriptor)
+        event = OnCreateConsumer(target_descriptor, filters=[])
 
         backbone.notification_channels = (target_descriptor,)
 
@@ -507,7 +523,7 @@ def boom(self) -> None:
         raise Exception("That was unexpected...")
 
     with monkeypatch.context() as patch:
-        event = OnCreateConsumer(target_descriptor)
+        event = OnCreateConsumer(target_descriptor, filters=[])
 
         # patch the _broadcast implementation to cause send to fail after
         # after the event has been pickled
@@ -538,7 +554,7 @@ def test_eventconsumer_receive(test_dir: str) -> None:
     backbone = BackboneFeatureStore(mock_storage)
     comm_channel = FileSystemCommChannel.from_descriptor(target_descriptor)
 
-    event = OnCreateConsumer(target_descriptor)
+    event = OnCreateConsumer(target_descriptor, filters=[])
 
     # simulate a sent event by writing directly to the input comm channel
     comm_channel.send(bytes(event))
@@ -574,7 +590,7 @@ def test_eventconsumer_receive_multi(test_dir: str, num_sent: int) -> None:
 
     # simulate multiple sent events by writing directly to the input comm channel
     for _ in range(num_sent):
-        event = OnCreateConsumer(target_descriptor)
+        event = OnCreateConsumer(target_descriptor, filters=[])
         comm_channel.send(bytes(event))
 
     consumer = EventConsumer(comm_channel, backbone)
@@ -628,9 +644,9 @@ def test_eventconsumer_eventpublisher_integration(test_dir: str) -> None:
     capp_channel = FileSystemCommChannel(storage_path / "test-capp")
     back_channel = FileSystemCommChannel(storage_path / "test-backend")
 
-    wmgr_consumer_descriptor = wmgr_channel.descriptor.decode("utf-8")
-    capp_consumer_descriptor = capp_channel.descriptor.decode("utf-8")
-    back_consumer_descriptor = back_channel.descriptor.decode("utf-8")
+    wmgr_consumer_descriptor = wmgr_channel.descriptor
+    capp_consumer_descriptor = capp_channel.descriptor
+    back_consumer_descriptor = back_channel.descriptor
 
     # create some consumers to receive messages
     wmgr_consumer = EventConsumer(
@@ -667,7 +683,7 @@ def test_eventconsumer_eventpublisher_integration(test_dir: str) -> None:
     ]
 
     # simulate worker manager sending a notification to backend that it's alive
-    event_1 = OnCreateConsumer(wmgr_consumer_descriptor)
+    event_1 = OnCreateConsumer(wmgr_consumer_descriptor, filters=[])
     mock_worker_mgr.send(event_1)
 
     # simulate the app updating a model a few times
@@ -721,3 +737,43 @@ def test_eventconsumer_batch_timeout(
     )
 
     assert "positive" in ex.value.args[0]
+
+
+@pytest.mark.parametrize(
+    "wait_timeout, exp_wait_max",
+    [
+        # aggregate the 1+1+1 into 3 on remaining parameters
+        pytest.param(1, 1 + 1 + 1, id="1s wait, 3 cycle steps"),
+        pytest.param(2, 3 + 2, id="2s wait, 4 cycle steps"),
+        pytest.param(4, 3 + 2 + 4, id="4s wait, 5 cycle steps"),
+        pytest.param(9, 3 + 2 + 4 + 8, id="9s wait, 6 cycle steps"),
+        # aggregate an entire cycle into 16
+        pytest.param(19.5, 16 + 3 + 2 + 4, id="20s wait, repeat cycle"),
+    ],
+)
+def test_backbone_wait_timeout(wait_timeout: float, exp_wait_max: float) -> None:
+    """Verify that attempts to wait for keys missing from the backbone time
+    out in an appropriate amount of time. Note: due to the backoff, we verify
+    the elapsed time is less than a full cycle of waits
+
+    :param wait_timeout: the timeout to pass through to `wait_for`
+    :param exp_wait_max: a ceiling for the expected time spent waiting
+    """
+
+    # NOTE: exp_wait_max maps to the cycled backoff of [.1, .5, 1, 2, 4, 8]
+    # with leeway added (by allowing 1s each for the 0.1 and 0.5 steps)
+    start_time = time.time()
+
+    storage: t.Dict[str, str] = {}
+    backbone = BackboneFeatureStore(storage)
+
+    with pytest.raises(SmartSimError) as ex:
+        backbone.wait_for(["does-not-exist"], wait_timeout)
+
+    end_time = time.time()
+    elapsed = end_time - start_time
+
+    # confirm that we waited at least the configured timeout
+    assert elapsed > wait_timeout, f"below configured timeout {wait_timeout}"
+
+    # confirm that the total wait time is aligned with the sleep cycle
+    assert elapsed < exp_wait_max, f"above expected max wait {exp_wait_max}"
diff --git a/tests/dragon/test_featurestore_integration.py b/tests/dragon/test_featurestore_integration.py
index 59801eebe..104acd914 100644
--- a/tests/dragon/test_featurestore_integration.py
+++ b/tests/dragon/test_featurestore_integration.py
@@ -35,7 +35,6 @@
     DragonCommChannel,
     create_local,
 )
-from smartsim._core.mli.comm.channel.dragon_fli import DragonFLIChannel
 from smartsim._core.mli.infrastructure.storage.backbone_feature_store import (
     BackboneFeatureStore,
     EventBroadcaster,
@@ -131,7 +130,7 @@ def test_eventconsumer_eventpublisher_integration(
     ]
 
     # simulate worker manager sending a notification to backend that it's alive
-    event_1 = OnCreateConsumer(wmgr_consumer_descriptor)
+    event_1 = OnCreateConsumer(wmgr_consumer_descriptor, filters=[])
     mock_worker_mgr.send(event_1)
 
     # simulate the app updating a model a few times
diff --git a/tests/dragon/test_protoclient.py b/tests/dragon/test_protoclient.py
new file mode 100644
index 000000000..590780154
--- /dev/null
+++ b/tests/dragon/test_protoclient.py
@@ -0,0 +1,231 @@
+# BSD 2-Clause License
+#
+# Copyright (c) 2021-2024, Hewlett Packard Enterprise
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# 1. Redistributions of source code must retain the above copyright notice, this
+#    list of conditions and the following disclaimer.
+#
+# 2. Redistributions in binary form must reproduce the above copyright notice,
+#    this list of conditions and the following disclaimer in the documentation
+#    and/or other materials provided with the distribution.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
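+
+# The client under test resolves all infrastructure from the environment; a
+# minimal sketch of the setup the fixtures below provide (`fli_channel`
+# stands in for the DragonFLIChannel created in `the_worker_queue`):
+#
+#   ddict = dragon_ddict.DDict(1, 2, 4 * 1024**2)
+#   backbone = BackboneFeatureStore(ddict, allow_reserved_writes=True)
+#   backbone[BackboneFeatureStore.MLI_WORKER_QUEUE] = fli_channel.descriptor
+#   os.environ["_SMARTSIM_INFRA_BACKBONE"] = backbone.descriptor
+#   client = ProtoClient(timing_on=False)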
+
+import pickle
+import time
+import typing as t
+
+import pytest
+
+dragon = pytest.importorskip("dragon")
+
+from smartsim._core.mli.comm.channel.dragon_fli import DragonFLIChannel
+from smartsim._core.mli.infrastructure.storage.backbone_feature_store import (
+    BackboneFeatureStore,
+    EventBroadcaster,
+    OnWriteFeatureStore,
+)
+from smartsim._core.mli.infrastructure.storage.dragon_feature_store import dragon_ddict
+from smartsim.error.errors import SmartSimError
+from smartsim.log import get_logger
+
+# isort: off
+from dragon import fli
+from dragon.channels import Channel
+
+from smartsim.protoclient import ProtoClient
+
+
+# The tests in this file belong to the dragon group
+pytestmark = pytest.mark.dragon
+WORK_QUEUE_KEY = "_SMARTSIM_REQUEST_QUEUE"
+logger = get_logger(__name__)
+
+
+@pytest.fixture
+def storage_for_dragon_fs() -> dragon_ddict.DDict:
+    return dragon_ddict.DDict(1, 2, 4 * 1024**2)
+
+
+@pytest.fixture
+def the_backbone(storage_for_dragon_fs: dragon_ddict.DDict) -> BackboneFeatureStore:
+    """A pre-initialized backbone featurestore with reserved writes enabled"""
+    return BackboneFeatureStore(storage_for_dragon_fs, allow_reserved_writes=True)
+
+
+@pytest.fixture
+def the_worker_queue(the_backbone: BackboneFeatureStore) -> DragonFLIChannel:
+    """A stand-in for the worker manager that ensures a worker queue
+    exists in the backbone"""
+
+    # create the FLI
+    to_worker_channel = Channel.make_process_local()
+    fli_ = fli.FLInterface(main_ch=to_worker_channel, manager_ch=None)
+    comm_channel = DragonFLIChannel(fli_, True)
+
+    # store the descriptor in the backbone
+    the_backbone[BackboneFeatureStore.MLI_WORKER_QUEUE] = comm_channel.descriptor
+
+    try:
+        comm_channel.send(b"foo")
+    except Exception as ex:
+        logger.warning(f"Test send on the worker queue failed: {ex}")
+
+    return comm_channel
+
+
+@pytest.fixture
+def storage_for_dragon_fs_with_req_queue(
+    storage_for_dragon_fs: dragon_ddict.DDict,
+) -> dragon_ddict.DDict:
+    # create a valid FLI so any call to attach does not fail
+    channel_ = Channel.make_process_local()
+    fli_ = fli.FLInterface(main_ch=channel_, manager_ch=None)
+    comm_channel = DragonFLIChannel(fli_, True)
+
+    storage_for_dragon_fs[WORK_QUEUE_KEY] = comm_channel.descriptor
+    return storage_for_dragon_fs
+
+
+@pytest.mark.parametrize(
+    "wait_timeout, exp_wait_max",
+    [
+        # aggregate the 1+1+1 into 3 on remaining parameters
+        pytest.param(1, 1 + 1 + 1, id="1s wait, 3 cycle steps"),
+        pytest.param(2, 3 + 2, id="2s wait, 4 cycle steps"),
+        pytest.param(4, 3 + 2 + 4, id="4s wait, 5 cycle steps"),
+    ],
+)
+def test_protoclient_timeout(
+    wait_timeout: float,
+    exp_wait_max: float,
+    the_backbone: BackboneFeatureStore,
+    monkeypatch: pytest.MonkeyPatch,
+):
+    """Verify that attempts to attach to the worker queue from the protoclient
+    time out in an appropriate amount of time. Note: due to the backoff, we
+    verify the elapsed time is less than a full cycle of waits
+
+    :param wait_timeout: a timeout for use when configuring a proto client
+    :param exp_wait_max: a ceiling for the expected time spent waiting for
+    the timeout
+    :param the_backbone: a pre-initialized backbone featurestore for setting up
+    the environment variable required by the client"""
+
+    # NOTE: exp_wait_max maps to the cycled backoff of [.1, .5, 1, 2, 4, 8]
+    # with leeway added (by allowing 1s each for the 0.1 and 0.5 steps)
+    start_time = time.time()
+    with monkeypatch.context() as ctx, pytest.raises(SmartSimError) as ex:
+        ctx.setenv("_SMARTSIM_INFRA_BACKBONE", the_backbone.descriptor)
+
+        ProtoClient(False, wait_timeout=wait_timeout)
+
+    end_time = time.time()
+    elapsed = end_time - start_time
+
+    # TODO: revisit - should this trigger any wait when the backbone is set above?
+    # confirm that we waited at least the configured timeout
+    # assert elapsed > wait_timeout, f"below configured timeout {wait_timeout}"
+
+    # confirm that the total wait time is aligned with the sleep cycle
+    assert elapsed < exp_wait_max, f"above expected max wait {exp_wait_max}"
+
+
+def test_protoclient_initialization_no_backbone():
+    """Verify that attempting to start the client when the backbone environment
+    variable is not set results in an exception"""
+
+    with pytest.raises(SmartSimError) as ex:
+        ProtoClient(timing_on=False)
+
+    # confirm the missing value error has been raised
+    assert {"backbone", "configuration"}.issubset(set(ex.value.args[0].split(" ")))
+
+
+def test_protoclient_initialization(
+    the_backbone: BackboneFeatureStore,
+    the_worker_queue: DragonFLIChannel,
+    monkeypatch: pytest.MonkeyPatch,
+):
+    """Verify that attempting to start the client with required env vars results
+    in a fully initialized client
+
+    :param the_backbone: a pre-initialized backbone featurestore
+    :param the_worker_queue: an FLI channel the client will retrieve
+    from the backbone"""
+
+    with monkeypatch.context() as ctx:
+        ctx.setenv("_SMARTSIM_INFRA_BACKBONE", the_backbone.descriptor)
+        # NOTE: backbone[BackboneFeatureStore.MLI_WORKER_QUEUE] set in the_worker_queue fixture
+
+        client = ProtoClient(timing_on=False)
+
+    # confirm the backbone was attached correctly
+    assert client._backbone is not None
+    assert client._backbone.descriptor == the_backbone.descriptor
+
+    # confirm the worker queue is created and attached correctly
+    assert client._to_worker_fli is not None
+    assert client._to_worker_fli.descriptor == the_worker_queue.descriptor
+
+    # confirm the worker channels are created
+    assert client._from_worker_ch is not None
+    assert client._from_worker_ch.descriptor
+
+    assert client._to_worker_ch is not None
+    assert client._to_worker_ch.descriptor
+
+    # confirm a publisher is created
+    assert client._publisher is not None
+
+
+def test_protoclient_write_model(
+    the_backbone: BackboneFeatureStore,
+    the_worker_queue: DragonFLIChannel,
+    monkeypatch: pytest.MonkeyPatch,
+):
+    """Verify that writing a model using the client causes the model data to be
+    written to a feature store and triggers a key-written event
+
+    :param the_backbone: a pre-initialized backbone featurestore
+    :param the_worker_queue: an FLI channel the client will retrieve
+    from the backbone"""
+
+    with monkeypatch.context() as ctx:
+        ctx.setenv("_SMARTSIM_INFRA_BACKBONE", the_backbone.descriptor)
+        # NOTE: backbone[BackboneFeatureStore.MLI_WORKER_QUEUE] set in the_worker_queue fixture
+
+        client = ProtoClient(timing_on=False)
+
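+        # any key/bytes pair works here: set_model stores the raw payload, and
+        # the test only asserts the write and the resulting event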
model_key = "my-model" + model_bytes = b"12345" + + client.set_model(model_key, model_bytes) + + # confirm the client modified the underlying feature store + assert client._backbone[model_key] == model_bytes + + publisher = t.cast(EventBroadcaster, client._publisher) + + # confirm the client raised the key-written event + assert len(publisher._event_buffer) == 1 + + event = t.cast(OnWriteFeatureStore, pickle.loads(publisher._event_buffer.pop())) + assert event.descriptor == the_backbone.descriptor + assert event.key == model_key diff --git a/tests/dragon/test_request_dispatcher.py b/tests/dragon/test_request_dispatcher.py index ccdbce58c..714492f37 100644 --- a/tests/dragon/test_request_dispatcher.py +++ b/tests/dragon/test_request_dispatcher.py @@ -53,6 +53,7 @@ import dragon.infrastructure.policy as dragon_policy import dragon.infrastructure.process_desc as dragon_process_desc import dragon.native.process as dragon_process +import torch.nn as nn from dragon import fli from dragon.channels import Channel from dragon.data.ddict.ddict import DDict @@ -86,6 +87,35 @@ pytestmark = pytest.mark.dragon +class MiniModel(nn.Module): + def __init__(self): + super().__init__() + + self._name = "mini-model" + self._net = torch.nn.Linear(2, 1) + + def forward(self, input): + return self._net(input) + + @property + def bytes(self) -> bytes: + """Returns the model serialized to a byte stream""" + buffer = io.BytesIO() + scripted = torch.jit.trace(self._net, self.get_batch()) + torch.jit.save(scripted, buffer) + return buffer.getvalue() + + @classmethod + def get_batch(cls) -> "torch.Tensor": + return torch.randn((100, 2), dtype=torch.float32) + + +def load_model() -> bytes: + """Create a simple torch model in memory for testing""" + mini_model = MiniModel() + return mini_model.bytes + + def persist_model_file(model_path: pathlib.Path) -> pathlib.Path: """Create a simple torch model and persist to disk for testing purposes. 
@@ -106,29 +136,17 @@ def persist_model_file(model_path: pathlib.Path) -> pathlib.Path: def mock_messages( request_dispatcher_queue: DragonFLIChannel, feature_store: FeatureStore, - feature_store_root_dir: pathlib.Path, - comm_channel_root_dir: pathlib.Path, ) -> None: """Mock event producer for triggering the inference pipeline""" - feature_store_root_dir.mkdir(parents=True, exist_ok=True) - comm_channel_root_dir.mkdir(parents=True, exist_ok=True) - - model_path = persist_model_file(feature_store_root_dir.parent / "model_original.pt") - model_bytes = model_path.read_bytes() - model_key = str(feature_store_root_dir / "model_fs.pt") - - feature_store[model_key] = model_bytes + model_key = "mini-model" for iteration_number in range(2): channel = Channel.make_process_local() callback_channel = DragonCommChannel(channel) + output_key = f"output-{iteration_number}" - input_path = feature_store_root_dir / f"{iteration_number}/input.pt" - output_path = feature_store_root_dir / f"{iteration_number}/output.pt" - - input_key = str(input_path) - output_key = str(output_path) + feature_store[model_key] = load_model() tensor = ( (iteration_number + 1) * torch.ones((1, 2), dtype=torch.float32) @@ -139,12 +157,13 @@ def mock_messages( "c", "float32", list(tensor.shape) ) - message_tensor_output_key = MessageHandler.build_tensor_key(output_key, fsd) - message_tensor_input_key = MessageHandler.build_tensor_key(input_key, fsd) - message_model_key = MessageHandler.build_model_key(model_key, fsd) + message_tensor_output_key = MessageHandler.build_feature_store_key( + output_key, fsd + ) + message_model_key = MessageHandler.build_feature_store_key(model_key, fsd) request = MessageHandler.build_request( - reply_channel=base64.b64encode(channel.serialize()).decode("utf-8"), + reply_channel=callback_channel.descriptor, model=message_model_key, inputs=[tensor_desc], outputs=[message_tensor_output_key], @@ -190,25 +209,20 @@ def service_as_dragon_proc( ) -def test_request_dispatcher(prepare_environment: pathlib.Path) -> None: +def test_request_dispatcher() -> None: """Test the request dispatcher batching and queueing system This also includes setting a queue to disposable, checking that it is no longer referenced by the dispatcher. 
""" - test_path = prepare_environment - fs_path = test_path / "feature_store" - comm_path = test_path / "comm_store" - to_worker_channel = dch.Channel.make_process_local() to_worker_fli = fli.FLInterface(main_ch=to_worker_channel, manager_ch=None) - to_worker_fli_serialized = to_worker_fli.serialize() + to_worker_fli_comm_ch = DragonFLIChannel(to_worker_fli, sender_supplied=True) # NOTE: env vars should be set prior to instantiating EnvironmentConfigLoader # or test environment may be unable to send messages w/queue - descriptor = base64.b64encode(to_worker_fli_serialized).decode("utf-8") - os.environ["_SMARTSIM_REQUEST_QUEUE"] = descriptor + os.environ["_SMARTSIM_REQUEST_QUEUE"] = to_worker_fli_comm_ch.descriptor ddict = DDict(1, 2, 4 * 1024**2) dragon_fs = DragonFeatureStore(ddict) @@ -216,15 +230,14 @@ def test_request_dispatcher(prepare_environment: pathlib.Path) -> None: config_loader = EnvironmentConfigLoader( featurestore_factory=DragonFeatureStore.from_descriptor, callback_factory=DragonCommChannel.from_descriptor, - queue_factory=DragonFLIChannel.from_descriptor, + queue_factory=DragonFLIChannel.from_sender_supplied_descriptor, ) - integrated_worker_type = TorchWorker request_dispatcher = RequestDispatcher( batch_timeout=0, batch_size=2, config_loader=config_loader, - worker_type=integrated_worker_type, + worker_type=TorchWorker, mem_pool_size=2 * 1024**2, ) @@ -241,9 +254,7 @@ def test_request_dispatcher(prepare_environment: pathlib.Path) -> None: batch: t.Optional[RequestBatch] = None mem_allocs = [] tensors = [] - fs_path = test_path / f"feature_store" - comm_path = test_path / f"comm_store" - model_key = str(fs_path / "model_fs.pt") + model_key = "mini-model" # create a mock client application to populate the request queue msg_pump = mp.Process( @@ -251,8 +262,6 @@ def test_request_dispatcher(prepare_environment: pathlib.Path) -> None: args=( worker_queue, dragon_fs, - fs_path, - comm_path, ), ) @@ -260,7 +269,7 @@ def test_request_dispatcher(prepare_environment: pathlib.Path) -> None: time.sleep(1) - for attempts in range(15): + for _ in range(15): try: request_dispatcher._on_iteration() batch = request_dispatcher.task_queue.get(timeout=1) diff --git a/tests/dragon/test_worker_manager.py b/tests/dragon/test_worker_manager.py index 1ebc512a5..43b8cc7ec 100644 --- a/tests/dragon/test_worker_manager.py +++ b/tests/dragon/test_worker_manager.py @@ -1,218 +1,339 @@ -# BSD 2-Clause License -# -# Copyright (c) 2021-2024, Hewlett Packard Enterprise -# All rights reserved. -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions are met: -# -# 1. Redistributions of source code must retain the above copyright notice, this -# list of conditions and the following disclaimer. -# -# 2. Redistributions in binary form must reproduce the above copyright notice, -# this list of conditions and the following disclaimer in the documentation -# and/or other materials provided with the distribution. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -# DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE -# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -import io -import logging -import pathlib -import time - -import pytest - -torch = pytest.importorskip("torch") -dragon = pytest.importorskip("dragon") - -import base64 -import multiprocessing as mp - -try: - mp.set_start_method("dragon") -except Exception: - pass - -import os - -import dragon.channels as dch -from dragon import fli -from dragon.mpbridge.queues import DragonQueue - -from smartsim._core.mli.comm.channel.channel import CommChannelBase -from smartsim._core.mli.comm.channel.dragon_fli import DragonFLIChannel -from smartsim._core.mli.infrastructure.control.worker_manager import ( - EnvironmentConfigLoader, - WorkerManager, -) -from smartsim._core.mli.infrastructure.storage.dragon_feature_store import ( - DragonFeatureStore, -) -from smartsim._core.mli.infrastructure.storage.feature_store import FeatureStore -from smartsim._core.mli.infrastructure.worker.torch_worker import TorchWorker -from smartsim._core.mli.message_handler import MessageHandler -from smartsim.log import get_logger - -from .feature_store import FileSystemFeatureStore -from .utils.channel import FileSystemCommChannel - -logger = get_logger(__name__) -# The tests in this file belong to the dragon group -pytestmark = pytest.mark.dragon - - -def persist_model_file(model_path: pathlib.Path) -> pathlib.Path: - """Create a simple torch model and persist to disk for - testing purposes. - - TODO: remove once unit tests are in place""" - # test_path = pathlib.Path(work_dir) - if not model_path.parent.exists(): - model_path.parent.mkdir(parents=True, exist_ok=True) - - model_path.unlink(missing_ok=True) - # model_path = test_path / "basic.pt" - - model = torch.nn.Linear(2, 1) - torch.save(model, model_path) - - return model_path - - -def mock_messages( - worker_manager_queue: CommChannelBase, - feature_store: FeatureStore, - feature_store_root_dir: pathlib.Path, - comm_channel_root_dir: pathlib.Path, -) -> None: - """Mock event producer for triggering the inference pipeline""" - feature_store_root_dir.mkdir(parents=True, exist_ok=True) - comm_channel_root_dir.mkdir(parents=True, exist_ok=True) - - model_path = persist_model_file(feature_store_root_dir.parent / "model_original.pt") - model_bytes = model_path.read_bytes() - model_key = str(feature_store_root_dir / "model_fs.pt") - - feature_store[model_key] = model_bytes - - iteration_number = 0 - - while True: - iteration_number += 1 - time.sleep(1) - # 1. for demo, ignore upstream and just put stuff into downstream - # 2. 
for demo, only one downstream but we'd normally have to filter - # msg content and send to the correct downstream (worker) queue - # timestamp = time.time_ns() - # mock_channel = test_path / f"brainstorm-{timestamp}.txt" - # mock_channel.touch() - - # thread - just look for key (wait for keys) - # call checkpoint, try to get non-persistent key, it blocks - # working set size > 1 has side-effects - # only incurs cost when working set size has been exceeded - - channel_key = comm_channel_root_dir / f"{iteration_number}/channel.txt" - callback_channel = FileSystemCommChannel(pathlib.Path(channel_key)) - - input_path = feature_store_root_dir / f"{iteration_number}/input.pt" - output_path = feature_store_root_dir / f"{iteration_number}/output.pt" - - input_key = str(input_path) - output_key = str(output_path) - - buffer = io.BytesIO() - tensor = torch.randn((1, 2), dtype=torch.float32) - torch.save(tensor, buffer) - feature_store[input_key] = buffer.getvalue() - fsd = feature_store.descriptor - - message_tensor_output_key = MessageHandler.build_tensor_key(output_key, fsd) - message_tensor_input_key = MessageHandler.build_tensor_key(input_key, fsd) - message_model_key = MessageHandler.build_model_key(model_key, fsd) - - request = MessageHandler.build_request( - reply_channel=callback_channel.descriptor, - model=message_model_key, - inputs=[message_tensor_input_key], - outputs=[message_tensor_output_key], - output_descriptors=[], - custom_attributes=None, - ) - request_bytes = MessageHandler.serialize_request(request) - worker_manager_queue.send(request_bytes) - - -@pytest.fixture -def prepare_environment(test_dir: str) -> pathlib.Path: - """Cleanup prior outputs to run demo repeatedly""" - path = pathlib.Path(f"{test_dir}/workermanager.log") - logging.basicConfig(filename=path.absolute(), level=logging.DEBUG) - return path - - -def test_worker_manager(prepare_environment: pathlib.Path) -> None: - """Test the worker manager""" - - test_path = prepare_environment - fs_path = test_path / "feature_store" - comm_path = test_path / "comm_store" - - to_worker_channel = dch.Channel.make_process_local() - to_worker_fli = fli.FLInterface(main_ch=to_worker_channel, manager_ch=None) - to_worker_fli_serialized = to_worker_fli.serialize() - - # NOTE: env vars should be set prior to instantiating EnvironmentConfigLoader - # or test environment may be unable to send messages w/queue - descriptor = base64.b64encode(to_worker_fli_serialized).decode("utf-8") - os.environ["_SMARTSIM_REQUEST_QUEUE"] = descriptor - - config_loader = EnvironmentConfigLoader( - featurestore_factory=DragonFeatureStore.from_descriptor, - callback_factory=FileSystemCommChannel.from_descriptor, - queue_factory=DragonFLIChannel.from_descriptor, - ) - integrated_worker_type = TorchWorker - - worker_manager = WorkerManager( - config_loader, - integrated_worker_type, - as_service=True, - cooldown=5, - device="cpu", - dispatcher_queue=mp.Queue(maxsize=0), - ) - - worker_queue = config_loader.get_queue() - if worker_queue is None: - logger.warn( - f"FLI input queue not loaded correctly from config_loader: {config_loader._queue_descriptor}" - ) - - # create a mock client application to populate the request queue - msg_pump = mp.Process( - target=mock_messages, - args=( - worker_queue, - FileSystemFeatureStore(fs_path), - fs_path, - comm_path, - ), - ) - msg_pump.start() - - # create a process to execute commands - process = mp.Process(target=worker_manager.execute) - process.start() - process.join(timeout=5) - process.kill() - msg_pump.kill() +# 
# BSD 2-Clause License +# # +# # Copyright (c) 2021-2024, Hewlett Packard Enterprise +# # All rights reserved. +# # +# # Redistribution and use in source and binary forms, with or without +# # modification, are permitted provided that the following conditions are met: +# # +# # 1. Redistributions of source code must retain the above copyright notice, this +# # list of conditions and the following disclaimer. +# # +# # 2. Redistributions in binary form must reproduce the above copyright notice, +# # this list of conditions and the following disclaimer in the documentation +# # and/or other materials provided with the distribution. +# # +# # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# # AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# # DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# # FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# # DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# # SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# # CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +# import io +# import logging +# import pathlib +# import time + +# import pytest + +# torch = pytest.importorskip("torch") +# dragon = pytest.importorskip("dragon") + +# import multiprocessing as mp + +# from smartsim._core.mli.infrastructure.storage.backbone_feature_store import ( +# BackboneFeatureStore, +# ) +# from smartsim._core.mli.mli_schemas.tensor.tensor_capnp import OutputDescriptor + +# try: +# mp.set_start_method("dragon") +# except Exception: +# pass + +# import os + +# import dragon.channels as dch +# import torch.nn as nn +# from dragon import fli +# from dragon.data.ddict.ddict import DDict + +# from smartsim._core.mli.comm.channel.dragon_fli import DragonFLIChannel +# from smartsim._core.mli.infrastructure.control.worker_manager import ( +# EnvironmentConfigLoader, +# WorkerManager, +# ) +# from smartsim._core.mli.infrastructure.storage.dragon_feature_store import ( +# DragonFeatureStore, +# ) +# from smartsim._core.mli.infrastructure.worker.torch_worker import TorchWorker +# from smartsim._core.mli.message_handler import MessageHandler +# from smartsim.log import get_logger + +# from .utils.channel import FileSystemCommChannel + +# logger = get_logger(__name__) +# # The tests in this file belong to the dragon group +# pytestmark = pytest.mark.dragon + + +# class MiniModel(nn.Module): +# def __init__(self): +# super().__init__() + +# self._name = "mini-model" +# self._net = torch.nn.Linear(2, 1) + +# def forward(self, input): +# return self._net(input) + +# @property +# def bytes(self) -> bytes: +# """Returns the model serialized to a byte stream""" +# buffer = io.BytesIO() +# scripted = torch.jit.trace(self._net, self.get_batch()) +# torch.jit.save(scripted, buffer) +# return buffer.getvalue() + +# @classmethod +# def get_batch(cls) -> "torch.Tensor": +# return torch.randn((100, 2), dtype=torch.float32) + + +# def create_model(model_path: pathlib.Path) -> pathlib.Path: +# """Create a simple torch model and persist to disk for +# testing purposes. 
+ +# TODO: remove once unit tests are in place""" +# if not model_path.parent.exists(): +# model_path.parent.mkdir(parents=True, exist_ok=True) + +# model_path.unlink(missing_ok=True) + +# mini_model = MiniModel() +# torch.save(mini_model, model_path) + +# return model_path + + +# def load_model() -> bytes: +# """Create a simple torch model in memory for testing""" +# mini_model = MiniModel() +# return mini_model.bytes + + +# def mock_messages( +# feature_store_root_dir: pathlib.Path, +# comm_channel_root_dir: pathlib.Path, +# kill_queue: mp.Queue, +# ) -> None: +# """Mock event producer for triggering the inference pipeline""" +# feature_store_root_dir.mkdir(parents=True, exist_ok=True) +# comm_channel_root_dir.mkdir(parents=True, exist_ok=True) + +# iteration_number = 0 + +# config_loader = EnvironmentConfigLoader( +# featurestore_factory=DragonFeatureStore.from_descriptor, +# callback_factory=FileSystemCommChannel.from_descriptor, +# queue_factory=DragonFLIChannel.from_sender_supplied_descriptor, +# ) +# backbone = config_loader.get_backbone() + +# worker_queue = config_loader.get_queue() +# if worker_queue is None: +# queue_desc = config_loader._queue_descriptor +# logger.warn( +# f"FLI input queue not loaded correctly from config_loader: {queue_desc}" +# ) + +# model_key = "mini-model" +# model_bytes = load_model() +# backbone[model_key] = model_bytes + +# message_model_key = MessageHandler.build_feature_store_key( +# model_key, backbone.descriptor +# ) + +# while True: +# if not kill_queue.empty(): +# return +# iteration_number += 1 +# time.sleep(1) +# # 1. for demo, ignore upstream and just put stuff into downstream +# # 2. for demo, only one downstream but we'd normally have to filter +# # msg content and send to the correct downstream (worker) queue +# # timestamp = time.time_ns() +# # mock_channel = test_path / f"brainstorm-{timestamp}.txt" +# # mock_channel.touch() + +# # thread - just look for key (wait for keys) +# # call checkpoint, try to get non-persistent key, it blocks +# # working set size > 1 has side-effects +# # only incurs cost when working set size has been exceeded + +# channel_key = comm_channel_root_dir / f"{iteration_number}/channel.txt" +# callback_channel = FileSystemCommChannel(pathlib.Path(channel_key)) + +# # input_key = f"my-input-{iteration_number}" +# output_key = f"my-output-{iteration_number}" + +# batch = MiniModel.get_batch() +# shape = batch.shape +# batch_bytes = batch.numpy().tobytes() +# # backbone[input_key] = batch_bytes + +# logger.debug(f"Model content: {backbone[model_key][:20]}") +# # logger.debug(f"Input content: {backbone[input_key][:20]}") + +# fsd = backbone.descriptor + +# # message_tensor_output_key = MessageHandler.build_feature_store_key( +# # output_key, fsd +# # ) +# # message_tensor_input_key = MessageHandler.build_feature_store_key( +# # input_key, fsd +# # ) + +# input_descriptor = MessageHandler.build_tensor_descriptor( +# "f", "float32", list(shape) +# ) + +# # output_descriptor = MessageHandler.build_output_tensor_descriptor( +# # "f", [], "float32", list(shape) +# # ) + +# # The first request is always the metadata... 
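+# # A hedged sketch of the consuming side of this two-part exchange
+# # (names such as `incoming_fli` are hypothetical; recvh/recv_bytes are
+# # assumed symmetric with the sendh/send_bytes calls used below, and this
+# # is illustrative only, not the worker manager's actual code):
+# #
+# #   with incoming_fli.recvh(timeout=None) as recvh:
+# #       request_bytes = recvh.recv_bytes(timeout=None)
+# #       request = MessageHandler.deserialize_request(request_bytes)
+# #       batch_bytes = recvh.recv_bytes(timeout=None)  # raw tensor payload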
+# request = MessageHandler.build_request( +# reply_channel=callback_channel.descriptor, +# # model=message_model_key, +# model=MessageHandler.build_model(model_bytes, "mini-model", "1.0"), +# # inputs=[message_tensor_input_key], +# inputs=[input_descriptor], +# # outputs=[message_tensor_output_key], +# outputs=[], +# # output_descriptors=[output_descriptor], +# output_descriptors=[], +# custom_attributes=None, +# ) +# request_bytes = MessageHandler.serialize_request(request) +# fli: DragonFLIChannel = worker_queue + +# with fli._fli.sendh(timeout=None, stream_channel=fli._channel) as sendh: +# sendh.send_bytes(request_bytes) +# sendh.send_bytes(batch_bytes) + +# # worker_queue.send(request_bytes) +# # follow up with the actual data +# # worker_queue.send(batch_bytes) + +# logger.info("published message") + +# if iteration_number > 5: +# return + + +# def mock_mli_infrastructure_mgr(): +# config_loader = EnvironmentConfigLoader( +# featurestore_factory=DragonFeatureStore.from_descriptor, +# callback_factory=FileSystemCommChannel.from_descriptor, +# queue_factory=DragonFLIChannel.from_sender_supplied_descriptor, +# ) + +# integrated_worker = TorchWorker + +# worker_manager = WorkerManager( +# config_loader, +# integrated_worker, +# as_service=True, +# cooldown=10, +# device="cpu", +# dispatcher_queue=mp.Queue(maxsize=0), +# ) +# worker_manager.execute() + + +# @pytest.fixture +# def prepare_environment(test_dir: str) -> pathlib.Path: +# """Cleanup prior outputs to run demo repeatedly""" +# path = pathlib.Path(f"{test_dir}/workermanager.log") +# logging.basicConfig(filename=path.absolute(), level=logging.DEBUG) +# return path + + +# def test_worker_manager(prepare_environment: pathlib.Path) -> None: +# """Test the worker manager""" + +# test_path = prepare_environment +# fs_path = test_path / "feature_store" +# comm_path = test_path / "comm_store" + +# # old instantiation code start +# # to_worker_channel = dch.Channel.make_process_local() +# # to_worker_fli = fli.FLInterface(main_ch=to_worker_channel, manager_ch=None) +# # to_worker_fli_serialized = to_worker_fli.serialize() + +# # # NOTE: env vars should be set prior to instantiating EnvironmentConfigLoader +# # # or test environment may be unable to send messages w/queue +# # descriptor = base64.b64encode(to_worker_fli_serialized).decode("utf-8") +# # os.environ["_SMARTSIM_REQUEST_QUEUE"] = descriptor + +# mgr_per_node = 1 +# num_nodes = 2 +# mem_per_node = 1024**3 +# total_mem = num_nodes * mem_per_node + +# storage = DDict( +# managers_per_node=mgr_per_node, +# n_nodes=num_nodes, +# total_mem=total_mem, +# ) +# backbone = BackboneFeatureStore(storage, allow_reserved_writes=True) + +# to_worker_channel = dch.Channel.make_process_local() +# to_worker_fli = fli.FLInterface(main_ch=to_worker_channel, manager_ch=None) + +# to_worker_fli_comm_channel = DragonFLIChannel(to_worker_fli, sender_supplied=True) + +# # NOTE: env vars must be set prior to instantiating EnvironmentConfigLoader +# # or test environment may be unable to send messages w/queue +# os.environ["_SMARTSIM_REQUEST_QUEUE"] = to_worker_fli_comm_channel.descriptor +# os.environ["_SMARTSIM_INFRA_BACKBONE"] = backbone.descriptor + +# config_loader = EnvironmentConfigLoader( +# featurestore_factory=DragonFeatureStore.from_descriptor, +# callback_factory=FileSystemCommChannel.from_descriptor, +# queue_factory=DragonFLIChannel.from_sender_supplied_descriptor, +# ) +# integrated_worker_type = TorchWorker + +# worker_manager = WorkerManager( +# config_loader, +# integrated_worker_type, 
+# as_service=True, +# cooldown=5, +# device="cpu", +# dispatcher_queue=mp.Queue(maxsize=0), +# ) + +# worker_queue = config_loader.get_queue() +# if worker_queue is None: +# logger.warn( +# f"FLI input queue not loaded correctly from config_loader: {config_loader._queue_descriptor}" +# ) +# backbone.worker_queue = to_worker_fli_comm_channel.descriptor + +# # create a mock client application to populate the request queue +# kill_queue = mp.Queue() +# msg_pump = mp.Process( +# target=mock_messages, +# args=(fs_path, comm_path, kill_queue), +# ) +# msg_pump.start() + +# # create a process to execute commands +# process = mp.Process(target=mock_mli_infrastructure_mgr) + +# # let it send some messages before starting the worker manager +# msg_pump.join(timeout=5) +# process.start() +# msg_pump.join(timeout=5) +# kill_queue.put_nowait("kill!") +# process.join(timeout=5) +# msg_pump.kill() +# process.kill() diff --git a/tests/dragon/utils/channel.py b/tests/dragon/utils/channel.py index 6cde6258f..09e1703bc 100644 --- a/tests/dragon/utils/channel.py +++ b/tests/dragon/utils/channel.py @@ -39,17 +39,14 @@ class FileSystemCommChannel(CommChannelBase): """Passes messages by writing to a file""" - def __init__(self, key: t.Union[bytes, pathlib.Path]) -> None: + def __init__(self, key: pathlib.Path) -> None: """Initialize the FileSystemCommChannel instance :param key: a path to the root directory of the feature store""" self._lock = threading.RLock() - if not isinstance(key, bytes): - super().__init__(key.as_posix().encode("utf-8")) - self._file_path = key - else: - super().__init__(key) - self._file_path = pathlib.Path(key.decode("utf-8")) + + super().__init__(key.as_posix()) + self._file_path = key if not self._file_path.parent.exists(): self._file_path.parent.mkdir(parents=True) @@ -110,17 +107,14 @@ def clear(self) -> None: @classmethod def from_descriptor( cls, - descriptor: t.Union[str, bytes], + descriptor: str, ) -> "FileSystemCommChannel": """A factory method that creates an instance from a descriptor string :param descriptor: The descriptor that uniquely identifies the resource :returns: An attached FileSystemCommChannel""" try: - if isinstance(descriptor, str): - path = pathlib.Path(descriptor) - else: - path = pathlib.Path(descriptor.decode("utf-8")) + path = pathlib.Path(descriptor) return FileSystemCommChannel(path) except: logger.warning(f"failed to create fs comm channel: {descriptor!r}") diff --git a/tests/mli/channel.py b/tests/mli/channel.py index 234878423..b00ba9aa2 100644 --- a/tests/mli/channel.py +++ b/tests/mli/channel.py @@ -39,17 +39,14 @@ class FileSystemCommChannel(CommChannelBase): """Passes messages by writing to a file""" - def __init__(self, key: t.Union[bytes, pathlib.Path]) -> None: + def __init__(self, key: pathlib.Path) -> None: """Initialize the FileSystemCommChannel instance :param key: a path to the root directory of the feature store""" self._lock = threading.RLock() - if isinstance(key, pathlib.Path): - super().__init__(key.as_posix().encode("utf-8")) - self._file_path = key - else: - super().__init__(key) - self._file_path = pathlib.Path(key.decode("utf-8")) + + super().__init__(key.as_posix()) + self._file_path = key if not self._file_path.parent.exists(): self._file_path.parent.mkdir(parents=True) @@ -110,17 +107,14 @@ def clear(self) -> None: @classmethod def from_descriptor( cls, - descriptor: t.Union[str, bytes], + descriptor: str, ) -> "FileSystemCommChannel": """A factory method that creates an instance from a descriptor string :param descriptor: The 
descriptor that uniquely identifies the resource :returns: An attached FileSystemCommChannel""" try: - if isinstance(descriptor, str): - path = pathlib.Path(descriptor) - else: - path = pathlib.Path(descriptor.decode("utf-8")) + path = pathlib.Path(descriptor) return FileSystemCommChannel(path) except: logger.warning(f"failed to create fs comm channel: {descriptor}") diff --git a/tests/mli/test_integrated_torch_worker.py b/tests/mli/test_integrated_torch_worker.py index 60f1f0c6b..67a9a4a9b 100644 --- a/tests/mli/test_integrated_torch_worker.py +++ b/tests/mli/test_integrated_torch_worker.py @@ -106,9 +106,9 @@ def persist_torch_model(test_dir: str) -> pathlib.Path: # output_key = f"demo-output" -# message_tensor_output_key = MessageHandler.build_tensor_key(output_key) -# message_tensor_input_key = MessageHandler.build_tensor_key(input_key) -# message_model_key = MessageHandler.build_model_key(model_key) +# message_tensor_output_key = MessageHandler.build_feature_store_key(output_key) +# message_tensor_input_key = MessageHandler.build_feature_store_key(input_key) +# message_model_key = MessageHandler.build_feature_store_key(model_key) # request = MessageHandler.build_request( # reply_channel=callback_channel.descriptor, @@ -146,9 +146,9 @@ def persist_torch_model(test_dir: str) -> pathlib.Path: # output_key = f"demo-output" -# message_tensor_output_key = MessageHandler.build_tensor_key(output_key) -# message_tensor_input_key = MessageHandler.build_tensor_key(input_key) -# # message_model_key = MessageHandler.build_model_key(model_key) +# message_tensor_output_key = MessageHandler.build_feature_store_key(output_key) +# message_tensor_input_key = MessageHandler.build_feature_store_key(input_key) +# # message_model_key = MessageHandler.build_feature_store_key(model_key) # request = MessageHandler.build_request( # reply_channel=callback_channel.descriptor, @@ -187,9 +187,9 @@ def persist_torch_model(test_dir: str) -> pathlib.Path: # output_key = f"demo-output" -# message_tensor_output_key = MessageHandler.build_tensor_key(output_key) -# # message_tensor_input_key = MessageHandler.build_tensor_key(input_key) -# # message_model_key = MessageHandler.build_model_key(model_key) +# message_tensor_output_key = MessageHandler.build_feature_store_key(output_key) +# # message_tensor_input_key = MessageHandler.build_feature_store_key(input_key) +# # message_model_key = MessageHandler.build_feature_store_key(model_key) # message_tensor_input = MessageHandler.build_tensor( # input_tensor, "c", "float32", [2] # ) @@ -231,9 +231,9 @@ def persist_torch_model(test_dir: str) -> pathlib.Path: # output_key = f"demo-output" -# # message_tensor_output_key = MessageHandler.build_tensor_key(output_key) -# # message_tensor_input_key = MessageHandler.build_tensor_key(input_key) -# message_model_key = MessageHandler.build_model_key(model_key) +# # message_tensor_output_key = MessageHandler.build_feature_store_key(output_key) +# # message_tensor_input_key = MessageHandler.build_feature_store_key(input_key) +# message_model_key = MessageHandler.build_feature_store_key(model_key) # message_tensor_input = MessageHandler.build_tensor( # input_tensor, "c", "float32", [2] # ) diff --git a/tests/test_featurestore.py b/tests/test_featurestore.py new file mode 100644 index 000000000..f0b122bcf --- /dev/null +++ b/tests/test_featurestore.py @@ -0,0 +1,711 @@ +# # BSD 2-Clause License +# # +# # Copyright (c) 2021-2024, Hewlett Packard Enterprise +# # All rights reserved. 
+# # +# # Redistribution and use in source and binary forms, with or without +# # modification, are permitted provided that the following conditions are met: +# # +# # 1. Redistributions of source code must retain the above copyright notice, this +# # list of conditions and the following disclaimer. +# # +# # 2. Redistributions in binary form must reproduce the above copyright notice, +# # this list of conditions and the following disclaimer in the documentation +# # and/or other materials provided with the distribution. +# # +# # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# # AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# # DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# # FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# # DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# # SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# # CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# import pathlib +# import time +# import typing as t +# import unittest.mock as mock + +# import pytest + +# dragon = pytest.importorskip("dragon") + +# from smartsim._core.mli.infrastructure.storage.backbone_feature_store import ( +# BackboneFeatureStore, +# EventBroadcaster, +# EventCategory, +# EventConsumer, +# OnCreateConsumer, +# OnWriteFeatureStore, +# ) +# from smartsim._core.mli.infrastructure.storage.backbone_feature_store import ( +# time as bbtime, +# ) +# from smartsim._core.mli.infrastructure.storage.dragon_feature_store import ( +# DragonFeatureStore, +# ) +# from smartsim._core.mli.infrastructure.storage.feature_store import ReservedKeys +# from smartsim.error import SmartSimError +# from tests.mli.channel import FileSystemCommChannel +# from tests.mli.feature_store import MemoryFeatureStore + +# if t.TYPE_CHECKING: +# import conftest + + +# # The tests in this file belong to the group_a group +# pytestmark = pytest.mark.group_a + +# WORK_QUEUE_KEY = "_SMARTSIM_REQUEST_QUEUE" +# RANDOMLY_SET_KEY = "_SOMETHING_ELSE" + + +# @pytest.fixture +# def storage_for_dragon_fs_with_req_queue() -> t.Dict[str, str]: +# storage = {WORK_QUEUE_KEY: "12345", RANDOMLY_SET_KEY: "67890"} +# return storage + + +# def boom(*args, **kwargs) -> None: +# """Helper function that blows up when used to mock up +# some other function""" +# raise Exception(f"you shall not pass! 
{args}, {kwargs}") + + +# def test_event_uid() -> None: +# """Verify that all events include a unique identifier""" +# uids: t.Set[str] = set() +# num_iters = 1000 + +# # generate a bunch of events and keep track all the IDs +# for i in range(num_iters): +# event_a = OnCreateConsumer(str(i), []) +# event_b = OnWriteFeatureStore(str(i), "key") + +# uids.add(event_a.uid) +# uids.add(event_b.uid) + +# # verify each event created a unique ID +# assert len(uids) == 2 * num_iters + + +# def test_mli_reserved_keys_conversion() -> None: +# """Verify that conversion from a string to an enum member +# works as expected""" + +# for reserved_key in ReservedKeys: +# # iterate through all keys and verify `from_string` works +# assert ReservedKeys.contains(reserved_key.value) + +# # show that the value (actual key) not the enum member name +# # will not be incorrectly identified as reserved +# assert not ReservedKeys.contains(str(reserved_key).split(".")[1]) + + +# def test_mli_reserved_keys_writes() -> None: +# """Verify that attempts to write to reserved keys are blocked from a +# standard DragonFeatureStore but enabled with the BackboneFeatureStore""" + +# mock_storage = {} +# dfs = DragonFeatureStore(mock_storage) +# backbone = BackboneFeatureStore(mock_storage, allow_reserved_writes=True) +# other = MemoryFeatureStore(mock_storage) + +# expected_value = "value" + +# for reserved_key in ReservedKeys: +# # we expect every reserved key to fail using DragonFeatureStore... +# with pytest.raises(SmartSimError) as ex: +# dfs[reserved_key] = expected_value + +# assert "reserved key" in ex.value.args[0] + +# # ... and expect other feature stores to respect reserved keys +# with pytest.raises(SmartSimError) as ex: +# other[reserved_key] = expected_value + +# assert "reserved key" in ex.value.args[0] + +# # ...and those same keys to succeed on the backbone +# backbone[reserved_key] = expected_value +# actual_value = backbone[reserved_key] +# assert actual_value == expected_value + + +# def test_mli_consumers_read_by_key() -> None: +# """Verify that the value returned from the mli consumers +# method is written to the correct key and reads are +# allowed via standard dragon feature store. 
+# NOTE: should reserved reads also be blocked""" + +# mock_storage = {} +# dfs = DragonFeatureStore(mock_storage) +# backbone = BackboneFeatureStore(mock_storage, allow_reserved_writes=True) +# other = MemoryFeatureStore(mock_storage) + +# expected_value = "value" + +# # write using backbone that has permission to write reserved keys +# backbone[ReservedKeys.MLI_NOTIFY_CONSUMERS] = expected_value + +# # confirm read-only access to reserved keys from any FeatureStore +# for fs in [dfs, backbone, other]: +# assert fs[ReservedKeys.MLI_NOTIFY_CONSUMERS] == expected_value + + +# def test_mli_consumers_read_by_backbone() -> None: +# """Verify that the backbone reads the correct location +# when using the backbone feature store API instead of mapping API""" + +# mock_storage = {} +# backbone = BackboneFeatureStore(mock_storage, allow_reserved_writes=True) +# expected_value = "value" + +# backbone[ReservedKeys.MLI_NOTIFY_CONSUMERS] = expected_value + +# # confirm reading via convenience method returns expected value +# assert backbone.notification_channels[0] == expected_value + + +# def test_mli_consumers_write_by_backbone() -> None: +# """Verify that the backbone writes the correct location +# when using the backbone feature store API instead of mapping API""" + +# mock_storage = {} +# backbone = BackboneFeatureStore(mock_storage, allow_reserved_writes=True) +# expected_value = ["value"] + +# backbone.notification_channels = expected_value + +# # confirm write using convenience method targets expected key +# assert backbone[ReservedKeys.MLI_NOTIFY_CONSUMERS] == ",".join(expected_value) + + +# def test_eventpublisher_broadcast_no_factory(test_dir: str) -> None: +# """Verify that a broadcast operation without any registered subscribers +# succeeds without raising Exceptions + +# :param test_dir: pytest fixture automatically generating unique working +# directories for individual test outputs""" +# storage_path = pathlib.Path(test_dir) / "features" +# mock_storage = {} +# consumer_descriptor = storage_path / "test-consumer" + +# # NOTE: we're not putting any consumers into the backbone here! 
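+# # The contract exercised below, restated as a compact sketch (drawn
+# # from this test's own assertions rather than separate API docs):
+# #
+# #   publisher = EventBroadcaster(backbone)  # no consumers registered
+# #   assert publisher.send(event) == 0       # nothing is delivered...
+# #   assert publisher.num_buffered == 1      # ...the event is buffered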
+# backbone = BackboneFeatureStore(mock_storage) + +# event = OnCreateConsumer(consumer_descriptor, []) + +# publisher = EventBroadcaster(backbone) +# num_receivers = 0 + +# # publishing this event without any known consumers registered should succeed +# # but report that it didn't have anybody to send the event to +# consumer_descriptor = storage_path / f"test-consumer" +# event = OnCreateConsumer(consumer_descriptor, []) + +# num_receivers += publisher.send(event) + +# # confirm no changes to the backbone occur when fetching the empty consumer key +# key_in_features_store = ReservedKeys.MLI_NOTIFY_CONSUMERS in backbone +# assert not key_in_features_store + +# # confirm that the broadcast reports no events published +# assert num_receivers == 0 +# # confirm that the broadcast buffered the event for a later send +# assert publisher.num_buffered == 1 + + +# def test_eventpublisher_broadcast_to_empty_consumer_list(test_dir: str) -> None: +# """Verify that a broadcast operation without any registered subscribers +# succeeds without raising Exceptions + +# :param test_dir: pytest fixture automatically generating unique working +# directories for individual test outputs""" +# storage_path = pathlib.Path(test_dir) / "features" +# mock_storage = {} + +# # note: file-system descriptors are just paths +# consumer_descriptor = storage_path / "test-consumer" + +# # prep our backbone with a consumer list +# backbone = BackboneFeatureStore(mock_storage, allow_reserved_writes=True) +# backbone.notification_channels = [] + +# event = OnCreateConsumer(consumer_descriptor, []) +# publisher = EventBroadcaster( +# backbone, channel_factory=FileSystemCommChannel.from_descriptor +# ) +# num_receivers = publisher.send(event) + +# registered_consumers = backbone[ReservedKeys.MLI_NOTIFY_CONSUMERS] + +# # confirm that no consumers exist in backbone to send to +# assert not registered_consumers +# # confirm that the broadcast reports no events published +# assert num_receivers == 0 +# # confirm that the broadcast buffered the event for a later send +# assert publisher.num_buffered == 1 + + +# def test_eventpublisher_broadcast_without_channel_factory(test_dir: str) -> None: +# """Verify that a broadcast operation reports an error if no channel +# factory was supplied for constructing the consumer channels + +# :param test_dir: pytest fixture automatically generating unique working +# directories for individual test outputs""" +# storage_path = pathlib.Path(test_dir) / "features" +# mock_storage = {} + +# # note: file-system descriptors are just paths +# consumer_descriptor = storage_path / "test-consumer" + +# # prep our backbone with a consumer list +# backbone = BackboneFeatureStore(mock_storage, allow_reserved_writes=True) +# backbone.notification_channels = [consumer_descriptor] + +# event = OnCreateConsumer(consumer_descriptor, []) +# publisher = EventBroadcaster( +# backbone, +# # channel_factory=FileSystemCommChannel.from_descriptor # <--- not supplied +# ) + +# with pytest.raises(SmartSimError) as ex: +# publisher.send(event) + +# assert "factory" in ex.value.args[0] + + +# def test_eventpublisher_broadcast_empties_buffer(test_dir: str) -> None: +# """Verify that a successful broadcast clears messages from the event +# buffer when a new message is sent and consumers are registered + +# :param test_dir: pytest fixture automatically generating unique working +# directories for individual test outputs""" +# storage_path = pathlib.Path(test_dir) / "features" +# mock_storage = {} + +# # note: file-system 
descriptors are just paths
+# consumer_descriptor = storage_path / "test-consumer"
+
+# backbone = BackboneFeatureStore(mock_storage, allow_reserved_writes=True)
+# backbone.notification_channels = (consumer_descriptor,)
+
+# publisher = EventBroadcaster(
+# backbone, channel_factory=FileSystemCommChannel.from_descriptor
+# )
+
+# # mock building up some buffered events
+# num_buffered_events = 14
+# for i in range(num_buffered_events):
+# event = OnCreateConsumer(storage_path / f"test-consumer-{str(i)}", [])
+# publisher._event_buffer.append(bytes(event))
+
+# event0 = OnCreateConsumer(
+# storage_path / f"test-consumer-{str(num_buffered_events + 1)}", []
+# )
+
+# num_receivers = publisher.send(event0)
+# # 1 receiver x 15 total events == 15 events
+# assert num_receivers == num_buffered_events + 1
+
+
+# @pytest.mark.parametrize(
+# "num_consumers, num_buffered, expected_num_sent",
+# [
+# pytest.param(0, 7, 0, id="0 x (7+1) - no consumers, multi-buffer"),
+# pytest.param(1, 7, 8, id="1 x (7+1) - single consumer, multi-buffer"),
+# pytest.param(2, 7, 16, id="2 x (7+1) - multi-consumer, multi-buffer"),
+# pytest.param(4, 4, 20, id="4 x (4+1) - multi-consumer, multi-buffer (odd #)"),
+# pytest.param(9, 0, 9, id="9 x (0+1) - multi-consumer, empty buffer"),
+# ],
+# )
+# def test_eventpublisher_broadcast_returns_total_sent(
+# test_dir: str, num_consumers: int, num_buffered: int, expected_num_sent: int
+# ) -> None:
+# """Verify that a successful broadcast returns the total number of events
+# sent, including buffered messages.
+
+# :param test_dir: pytest fixture automatically generating unique working
+# directories for individual test outputs
+# :param num_consumers: the number of consumers to mock setting up prior to send
+# :param num_buffered: the number of pre-buffered events to mock up
+# :param expected_num_sent: the expected result from calling send
+# """
+# storage_path = pathlib.Path(test_dir) / "features"
+# mock_storage = {}
+
+# # note: file-system descriptors are just paths
+# consumers = []
+# for i in range(num_consumers):
+# consumers.append(storage_path / f"test-consumer-{i}")
+
+# backbone = BackboneFeatureStore(mock_storage, allow_reserved_writes=True)
+# backbone.notification_channels = consumers
+
+# publisher = EventBroadcaster(
+# backbone, channel_factory=FileSystemCommChannel.from_descriptor
+# )
+
+# # mock building up some buffered events
+# for i in range(num_buffered):
+# event = OnCreateConsumer(storage_path / f"test-consumer-{str(i)}", [])
+# publisher._event_buffer.append(bytes(event))
+
+# assert publisher.num_buffered == num_buffered
+
+# # this event will trigger clearing anything already in buffer
+# event0 = OnCreateConsumer(storage_path / f"test-consumer-{num_buffered}", [])
+
+# # num_receivers should contain a number that computes w/all consumers and all events
+# num_receivers = publisher.send(event0)
+
+# assert num_receivers == expected_num_sent
+
+
+# def test_eventpublisher_prune_unused_consumer(test_dir: str) -> None:
+# """Verify that any unused consumers are pruned each time a new event is sent
+
+# :param test_dir: pytest fixture automatically generating unique working
+# directories for individual test outputs"""
+# storage_path = pathlib.Path(test_dir) / "features"
+# mock_storage = {}
+
+# # note: file-system descriptors are just paths
+# consumer_descriptor = storage_path / "test-consumer"
+
+# backbone = BackboneFeatureStore(mock_storage, allow_reserved_writes=True)
+
+# publisher = EventBroadcaster(
+# backbone, 
channel_factory=FileSystemCommChannel.from_descriptor
+# )
+
+# event = OnCreateConsumer(consumer_descriptor, [])
+
+# # the only registered consumer is in the event, expect no pruning
+# backbone.notification_channels = (consumer_descriptor,)
+
+# publisher.send(event)
+# assert str(consumer_descriptor) in publisher._channel_cache
+# assert len(publisher._channel_cache) == 1
+
+# # add a new descriptor for another event...
+# consumer_descriptor2 = storage_path / "test-consumer-2"
+# # ... and remove the old descriptor from the backbone when it's looked up
+# backbone.notification_channels = (consumer_descriptor2,)
+
+# event = OnCreateConsumer(consumer_descriptor2, [])
+
+# publisher.send(event)
+
+# assert str(consumer_descriptor2) in publisher._channel_cache
+# assert str(consumer_descriptor) not in publisher._channel_cache
+# assert len(publisher._channel_cache) == 1
+
+# # test multi-consumer pruning by caching some extra channels
+# prune0, prune1, prune2 = "abc", "def", "ghi"
+# publisher._channel_cache[prune0] = "doesnt-matter-if-it-is-pruned"
+# publisher._channel_cache[prune1] = "doesnt-matter-if-it-is-pruned"
+# publisher._channel_cache[prune2] = "doesnt-matter-if-it-is-pruned"
+
+# # add in one of our old channels so we prune the above items, send to these
+# backbone.notification_channels = (consumer_descriptor, consumer_descriptor2)
+
+# publisher.send(event)
+
+# assert str(consumer_descriptor2) in publisher._channel_cache
+
+# # NOTE: we should NOT prune something that isn't used by this message but
+# # does appear in `backbone.notification_channels`
+# assert str(consumer_descriptor) in publisher._channel_cache
+
+# # confirm all of our items that were not in the notification channels are gone
+# for pruned in [prune0, prune1, prune2]:
+# assert pruned not in publisher._channel_cache
+
+# # confirm we have only the two expected items in the channel cache
+# assert len(publisher._channel_cache) == 2
+
+
+# def test_eventpublisher_serialize_failure(
+# test_dir: str, monkeypatch: pytest.MonkeyPatch
+# ) -> None:
+# """Verify that errors during message serialization are raised to the caller
+
+# :param test_dir: pytest fixture automatically generating unique working
+# directories for individual test outputs
+# :param monkeypatch: pytest fixture for modifying behavior of existing code
+# with mock implementations"""
+# storage_path = pathlib.Path(test_dir) / "features"
+# storage_path.mkdir(parents=True, exist_ok=True)
+
+# mock_storage = {}
+
+# # note: file-system descriptors are just paths
+# target_descriptor = str(storage_path / "test-consumer")
+
+# backbone = BackboneFeatureStore(mock_storage, allow_reserved_writes=True)
+# publisher = EventBroadcaster(
+# backbone, channel_factory=FileSystemCommChannel.from_descriptor
+# )
+
+# with monkeypatch.context() as patch:
+# event = OnCreateConsumer(target_descriptor, [])
+
+# # patch the __bytes__ implementation to cause pickling to fail during send
+# patch.setattr(event, "__bytes__", lambda x: b"abc")
+
+# backbone.notification_channels = (target_descriptor,)
+
+# # send a message into the channel
+# with pytest.raises(ValueError) as ex:
+# publisher.send(event)
+
+# assert "serialize" in ex.value.args[0]
+
+
+# def test_eventpublisher_factory_failure(
+# test_dir: str, monkeypatch: pytest.MonkeyPatch
+# ) -> None:
+# """Verify that errors during channel construction are raised to the caller
+
+# :param test_dir: pytest fixture automatically generating unique working
+# directories for individual test outputs
+# :param
monkeypatch: pytest fixture for modifying behavior of existing code
+# with mock implementations"""
+# storage_path = pathlib.Path(test_dir) / "features"
+# storage_path.mkdir(parents=True, exist_ok=True)
+
+# mock_storage = {}
+
+# # note: file-system descriptors are just paths
+# target_descriptor = str(storage_path / "test-consumer")
+
+# def boom(descriptor: str) -> None:
+# raise Exception(f"you shall not pass! {descriptor}")
+
+# backbone = BackboneFeatureStore(mock_storage, allow_reserved_writes=True)
+# publisher = EventBroadcaster(backbone, channel_factory=boom)
+
+# with monkeypatch.context() as patch:
+# event = OnCreateConsumer(target_descriptor, [])
+
+# backbone.notification_channels = (target_descriptor,)
+
+# # send a message into the channel
+# with pytest.raises(SmartSimError) as ex:
+# publisher.send(event)
+
+# assert "construct" in ex.value.args[0]
+
+
+# def test_eventpublisher_failure(test_dir: str, monkeypatch: pytest.MonkeyPatch) -> None:
+# """Verify that unexpected errors during message send are caught and wrapped in a
+# SmartSimError so they are not propagated directly to the caller
+
+# :param test_dir: pytest fixture automatically generating unique working
+# directories for individual test outputs
+# :param monkeypatch: pytest fixture for modifying behavior of existing code
+# with mock implementations"""
+# storage_path = pathlib.Path(test_dir) / "features"
+# storage_path.mkdir(parents=True, exist_ok=True)
+
+# mock_storage = {}
+
+# # note: file-system descriptors are just paths
+# target_descriptor = str(storage_path / "test-consumer")
+
+# backbone = BackboneFeatureStore(mock_storage, allow_reserved_writes=True)
+# publisher = EventBroadcaster(
+# backbone, channel_factory=FileSystemCommChannel.from_descriptor
+# )
+
+# def boom(self) -> None:
+# raise Exception("That was unexpected...")
+
+# with monkeypatch.context() as patch:
+# event = OnCreateConsumer(target_descriptor, [])
+
+# # patch the _broadcast implementation to cause send to fail
+# # after the event has been pickled
+# patch.setattr(publisher, "_broadcast", boom)
+
+# backbone.notification_channels = (target_descriptor,)
+
+# # Here, the unexpected exception raised by broadcast is not allowed
+# # out directly; instead, it is wrapped in a SmartSimError
+# with pytest.raises(SmartSimError) as ex:
+# publisher.send(event)
+
+# assert "unexpected" in ex.value.args[0]
+
+
+# def test_eventconsumer_receive(test_dir: str) -> None:
+# """Verify that a consumer retrieves a message from the given channel
+
+# :param test_dir: pytest fixture automatically generating unique working
+# directories for individual test outputs"""
+# storage_path = pathlib.Path(test_dir) / "features"
+# storage_path.mkdir(parents=True, exist_ok=True)
+
+# mock_storage = {}
+
+# # note: file-system descriptors are just paths
+# target_descriptor = str(storage_path / "test-consumer")
+
+# backbone = BackboneFeatureStore(mock_storage, allow_reserved_writes=True)
+# comm_channel = FileSystemCommChannel.from_descriptor(target_descriptor)
+# event = OnCreateConsumer(target_descriptor, [])
+
+# # simulate a sent event by writing directly to the input comm channel
+# comm_channel.send(bytes(event))
+
+# consumer = EventConsumer(comm_channel, backbone)
+
+# all_received: t.List[OnCreateConsumer] = consumer.receive()
+# assert len(all_received) == 1
+
+# # verify we received the same event that was raised
+# assert all_received[0].category == event.category
+# assert all_received[0].descriptor == 
event.descriptor
+
+
+# @pytest.mark.parametrize("num_sent", [0, 1, 2, 4, 8, 16])
+# def test_eventconsumer_receive_multi(test_dir: str, num_sent: int) -> None:
+# """Verify that a consumer retrieves multiple messages from the given channel
+
+# :param test_dir: pytest fixture automatically generating unique working
+# directories for individual test outputs
+# :param num_sent: parameterized value used to vary the number of events
+# that are enqueued, so validations are checked at multiple queue sizes"""
+# storage_path = pathlib.Path(test_dir) / "features"
+# storage_path.mkdir(parents=True, exist_ok=True)
+
+# mock_storage = {}
+
+# # note: file-system descriptors are just paths
+# target_descriptor = str(storage_path / "test-consumer")
+
+# backbone = BackboneFeatureStore(mock_storage, allow_reserved_writes=True)
+# comm_channel = FileSystemCommChannel.from_descriptor(target_descriptor)
+
+# # simulate multiple sent events by writing directly to the input comm channel
+# for _ in range(num_sent):
+# event = OnCreateConsumer(target_descriptor, [])
+# comm_channel.send(bytes(event))
+
+# consumer = EventConsumer(comm_channel, backbone)
+
+# all_received: t.List[OnCreateConsumer] = consumer.receive()
+# assert len(all_received) == num_sent
+
+
+# def test_eventconsumer_receive_empty(test_dir: str) -> None:
+# """Verify that a consumer receiving an empty message ignores the
+# message and continues processing
+
+# :param test_dir: pytest fixture automatically generating unique working
+# directories for individual test outputs"""
+# storage_path = pathlib.Path(test_dir) / "features"
+# storage_path.mkdir(parents=True, exist_ok=True)
+
+# mock_storage = {}
+
+# # note: file-system descriptors are just paths
+# target_descriptor = str(storage_path / "test-consumer")
+
+# backbone = BackboneFeatureStore(mock_storage, allow_reserved_writes=True)
+# comm_channel = FileSystemCommChannel.from_descriptor(target_descriptor)
+
+# # simulate a sent event by writing directly to the input comm channel
+# comm_channel.send(bytes(b""))
+
+# consumer = EventConsumer(comm_channel, backbone)
+
+# messages = consumer.receive()
+
+# # the messages array should be empty
+# assert not messages
+
+
+# def test_eventconsumer_eventpublisher_integration(test_dir: str) -> None:
+# """Verify that the publisher and consumer integrate as expected when
+# multiple publishers and consumers are sending simultaneously. 
+ +# :param test_dir: pytest fixture automatically generating unique working +# directories for individual test outputs""" +# storage_path = pathlib.Path(test_dir) / "features" +# storage_path.mkdir(parents=True, exist_ok=True) + +# mock_storage = {} +# backbone = BackboneFeatureStore(mock_storage, allow_reserved_writes=True) +# mock_fs_descriptor = str(storage_path / f"mock-feature-store") + +# wmgr_channel = FileSystemCommChannel(storage_path / "test-wmgr") +# capp_channel = FileSystemCommChannel(storage_path / "test-capp") +# back_channel = FileSystemCommChannel(storage_path / "test-backend") + +# wmgr_consumer_descriptor = wmgr_channel.descriptor +# capp_consumer_descriptor = capp_channel.descriptor +# back_consumer_descriptor = back_channel.descriptor + +# # create some consumers to receive messages +# wmgr_consumer = EventConsumer( +# wmgr_channel, +# backbone, +# filters=[EventCategory.FEATURE_STORE_WRITTEN], +# ) +# capp_consumer = EventConsumer( +# capp_channel, +# backbone, +# ) +# back_consumer = EventConsumer( +# back_channel, +# backbone, +# filters=[EventCategory.CONSUMER_CREATED], +# ) + +# # create some broadcasters to publish messages +# mock_worker_mgr = EventBroadcaster( +# backbone, +# channel_factory=FileSystemCommChannel.from_descriptor, +# ) +# mock_client_app = EventBroadcaster( +# backbone, +# channel_factory=FileSystemCommChannel.from_descriptor, +# ) + +# # register all of the consumers even though the OnCreateConsumer really should +# # trigger its registration. event processing is tested elsewhere. +# backbone.notification_channels = [ +# wmgr_consumer_descriptor, +# capp_consumer_descriptor, +# back_consumer_descriptor, +# ] + +# # simulate worker manager sending a notification to backend that it's alive +# event_1 = OnCreateConsumer(wmgr_consumer_descriptor, []) +# mock_worker_mgr.send(event_1) + +# # simulate the app updating a model a few times +# event_2 = OnWriteFeatureStore(mock_fs_descriptor, "key-1") +# event_3 = OnWriteFeatureStore(mock_fs_descriptor, "key-2") +# event_4 = OnWriteFeatureStore(mock_fs_descriptor, "key-1") + +# mock_client_app.send(event_2) +# mock_client_app.send(event_3) +# mock_client_app.send(event_4) + +# # worker manager should only get updates about feature update +# wmgr_messages = wmgr_consumer.receive() +# assert len(wmgr_messages) == 3 + +# # the backend should only receive messages about consumer creation +# back_messages = back_consumer.receive() +# assert len(back_messages) == 1 + +# # hypothetical app has no filters and will get all events +# app_messages = capp_consumer.receive() +# assert len(app_messages) == 4 diff --git a/tests/test_message_handler/test_build_model_key.py b/tests/test_message_handler/test_build_model_key.py index c09c787fc..092ae4fe0 100644 --- a/tests/test_message_handler/test_build_model_key.py +++ b/tests/test_message_handler/test_build_model_key.py @@ -34,14 +34,14 @@ handler = MessageHandler() -def test_build_model_key_successful(): +def test_build_feature_store_key_successful(): fsd = "mock-feature-store-descriptor" - model_key = handler.build_model_key("tensor_key", fsd) + model_key = handler.build_feature_store_key("tensor_key", fsd) assert model_key.key == "tensor_key" - assert model_key.featureStoreDescriptor == fsd + assert model_key.descriptor == fsd -def test_build_model_key_unsuccessful(): +def test_build_feature_store_key_unsuccessful(): with pytest.raises(ValueError): fsd = "mock-feature-store-descriptor" - model_key = handler.build_model_key(100, fsd) + model_key = 
handler.build_feature_store_key(100, fsd) diff --git a/tests/test_message_handler/test_output_descriptor.py b/tests/test_message_handler/test_output_descriptor.py index beb9a4765..2b5575965 100644 --- a/tests/test_message_handler/test_output_descriptor.py +++ b/tests/test_message_handler/test_output_descriptor.py @@ -34,7 +34,7 @@ handler = MessageHandler() fsd = "mock-feature-store-descriptor" -tensor_key = handler.build_tensor_key("key", fsd) +tensor_key = handler.build_feature_store_key("key", fsd) @pytest.mark.parametrize( diff --git a/tests/test_message_handler/test_request.py b/tests/test_message_handler/test_request.py index 7ede41b50..751722534 100644 --- a/tests/test_message_handler/test_request.py +++ b/tests/test_message_handler/test_request.py @@ -33,14 +33,14 @@ fsd = "mock-feature-store-descriptor" -model_key = MessageHandler.build_model_key("model_key", fsd) +model_key = MessageHandler.build_feature_store_key("model_key", fsd) model = MessageHandler.build_model(b"model data", "model_name", "v0.0.1") -input_key1 = MessageHandler.build_tensor_key("input_key1", fsd) -input_key2 = MessageHandler.build_tensor_key("input_key2", fsd) +input_key1 = MessageHandler.build_feature_store_key("input_key1", fsd) +input_key2 = MessageHandler.build_feature_store_key("input_key2", fsd) -output_key1 = MessageHandler.build_tensor_key("output_key1", fsd) -output_key2 = MessageHandler.build_tensor_key("output_key2", fsd) +output_key1 = MessageHandler.build_feature_store_key("output_key1", fsd) +output_key2 = MessageHandler.build_feature_store_key("output_key2", fsd) output_descriptor1 = MessageHandler.build_output_tensor_descriptor( "c", [output_key1, output_key2], "int64", [] @@ -101,7 +101,7 @@ "reply_channel, model, input, output, output_descriptors, custom_attributes", [ pytest.param( - b"reply channel", + "reply channel", model_key, [input_key1, input_key2], [output_key1, output_key2], @@ -109,7 +109,7 @@ torch_attributes, ), pytest.param( - b"another reply channel", + "another reply channel", model, [input_key1], [output_key2], @@ -117,7 +117,7 @@ tf_attributes, ), pytest.param( - b"another reply channel", + "another reply channel", model, [input_key1], [output_key2], @@ -125,7 +125,7 @@ torch_attributes, ), pytest.param( - b"reply channel", + "reply channel", model_key, [input_key1], [output_key1], @@ -185,7 +185,7 @@ def test_build_request_indirect_successful( id="bad channel", ), pytest.param( - b"reply channel", + "reply channel", "bad model", [input_key1], [output_key2], @@ -194,7 +194,7 @@ def test_build_request_indirect_successful( id="bad model", ), pytest.param( - b"reply channel", + "reply channel", model_key, ["input_key1", "input_key2"], [output_key1, output_key2], @@ -212,7 +212,7 @@ def test_build_request_indirect_successful( id="bad input schema type", ), pytest.param( - b"reply channel", + "reply channel", model_key, [input_key1], ["output_key1", "output_key2"], @@ -230,7 +230,7 @@ def test_build_request_indirect_successful( id="bad output schema type", ), pytest.param( - b"reply channel", + "reply channel", model_key, [input_key1], [output_key1, output_key2], @@ -239,7 +239,7 @@ def test_build_request_indirect_successful( id="bad custom attributes", ), pytest.param( - b"reply channel", + "reply channel", model_key, [input_key1], [output_key1, output_key2], @@ -248,7 +248,7 @@ def test_build_request_indirect_successful( id="bad custom attributes schema type", ), pytest.param( - b"reply channel", + "reply channel", model_key, [input_key1], [output_key1, output_key2], @@ 
-276,7 +276,7 @@ def test_build_request_indirect_unsuccessful( "reply_channel, model, input, output, output_descriptors, custom_attributes", [ pytest.param( - b"reply channel", + "reply channel", model_key, [tensor_1, tensor_2], [], @@ -284,7 +284,7 @@ def test_build_request_indirect_unsuccessful( torch_attributes, ), pytest.param( - b"another reply channel", + "another reply channel", model, [tensor_1], [], @@ -292,7 +292,7 @@ def test_build_request_indirect_unsuccessful( tf_attributes, ), pytest.param( - b"another reply channel", + "another reply channel", model, [tensor_2], [], @@ -300,7 +300,7 @@ def test_build_request_indirect_unsuccessful( tf_attributes, ), pytest.param( - b"another reply channel", + "another reply channel", model, [tensor_1], [], diff --git a/tests/test_message_handler/test_response.py b/tests/test_message_handler/test_response.py index 86774132e..d0305407c 100644 --- a/tests/test_message_handler/test_response.py +++ b/tests/test_message_handler/test_response.py @@ -33,8 +33,8 @@ fsd = "mock-feature-store-descriptor" -result_key1 = MessageHandler.build_tensor_key("result_key1", fsd) -result_key2 = MessageHandler.build_tensor_key("result_key2", fsd) +result_key1 = MessageHandler.build_feature_store_key("result_key1", fsd) +result_key2 = MessageHandler.build_feature_store_key("result_key2", fsd) torch_attributes = MessageHandler.build_torch_response_attributes() tf_attributes = MessageHandler.build_tf_response_attributes() From 49e0da4bd5ffde73e7cd68a0e7519783de7b740d Mon Sep 17 00:00:00 2001 From: Chris McBride Date: Tue, 17 Sep 2024 19:46:30 -0500 Subject: [PATCH 02/40] Revert featurestorekey test changes Revert tensor/model key, update tests, enhance logging Tweak exception naming to follow standard Test remote queue delay and nowait Remove large test timeout Update new tests w/fsk reversion Modify import order for remote tester --- .../standalone_worker_manager.py | 29 +- smartsim/_core/_cli/scripts/dragon_install.py | 9 +- .../_core/launcher/dragon/dragonConnector.py | 8 +- .../_core/mli/comm/channel/dragon_channel.py | 76 +- .../control/request_dispatcher.py | 23 +- .../mli/infrastructure/environment_loader.py | 4 +- .../storage/backbone_feature_store.py | 54 +- .../infrastructure/storage/feature_store.py | 2 +- smartsim/protoclient.py | 10 +- tests/dragon/channel.py | 18 +- .../test_core_machine_learning_worker.py | 54 +- tests/dragon/test_device_manager.py | 15 +- tests/dragon/test_error_handling.py | 39 +- tests/dragon/test_featurestore.py | 87 ++- tests/dragon/test_featurestore_base.py | 4 +- tests/dragon/test_featurestore_integration.py | 8 +- tests/dragon/test_inference_reply.py | 6 +- tests/dragon/test_inference_request.py | 6 +- tests/dragon/test_protoclient.py | 400 +++++----- tests/dragon/test_request_dispatcher.py | 101 ++- tests/dragon/test_torch_worker.py | 4 +- tests/dragon/test_worker_manager.py | 6 +- tests/dragon/utils/channel.py | 2 +- tests/mli/test_integrated_torch_worker.py | 24 +- tests/test_dragon_installer.py | 18 +- tests/test_dragon_launcher.py | 19 + tests/test_featurestore.py | 711 ------------------ .../test_build_model_key.py | 8 +- .../test_output_descriptor.py | 2 +- tests/test_message_handler/test_request.py | 10 +- tests/test_message_handler/test_response.py | 4 +- 31 files changed, 603 insertions(+), 1158 deletions(-) delete mode 100644 tests/test_featurestore.py diff --git a/ex/high_throughput_inference/standalone_worker_manager.py b/ex/high_throughput_inference/standalone_worker_manager.py index e34df0ccd..1d0b11055 100644 
--- a/ex/high_throughput_inference/standalone_worker_manager.py +++ b/ex/high_throughput_inference/standalone_worker_manager.py @@ -146,16 +146,17 @@ def service_as_dragon_proc( to_worker_channel = Channel.make_process_local() to_worker_fli = fli.FLInterface(main_ch=to_worker_channel, manager_ch=None) - to_worker_fli_comm_channel = DragonFLIChannel(to_worker_fli, True) + to_worker_fli_comm_ch = DragonFLIChannel(to_worker_fli, True) - backbone.worker_queue = to_worker_fli_comm_channel.descriptor + backbone.worker_queue = to_worker_fli_comm_ch.descriptor + + os.environ[BackboneFeatureStore.MLI_WORKER_QUEUE] = to_worker_fli_comm_ch.descriptor + os.environ[BackboneFeatureStore.MLI_BACKBONE] = backbone.descriptor arg_worker_type = cloudpickle.loads( base64.b64decode(args.worker_class.encode("ascii")) ) - os.environ["_SMARTSIM_REQUEST_QUEUE"] = to_worker_fli_comm_channel.descriptor - config_loader = EnvironmentConfigLoader( featurestore_factory=DragonFeatureStore.from_descriptor, callback_factory=DragonCommChannel.from_descriptor, @@ -173,7 +174,7 @@ def service_as_dragon_proc( worker_device = args.device for wm_idx in range(args.num_workers): - worker_manager = WorkerManager( + worker_manager = WorkerManager( config_loader=config_loader, worker_type=arg_worker_type, as_service=True, @@ -191,21 +192,25 @@ def service_as_dragon_proc( # the GPU-to-CPU mapping is taken from the nvidia-smi tool # TODO can this be computed on the fly? gpu_to_cpu_aff: dict[int, list[int]] = {} - gpu_to_cpu_aff[0] = list(range(48,64)) + list(range(112,128)) - gpu_to_cpu_aff[1] = list(range(32,48)) + list(range(96,112)) - gpu_to_cpu_aff[2] = list(range(16,32)) + list(range(80,96)) - gpu_to_cpu_aff[3] = list(range(0,16)) + list(range(64,80)) + gpu_to_cpu_aff[0] = list(range(48, 64)) + list(range(112, 128)) + gpu_to_cpu_aff[1] = list(range(32, 48)) + list(range(96, 112)) + gpu_to_cpu_aff[2] = list(range(16, 32)) + list(range(80, 96)) + gpu_to_cpu_aff[3] = list(range(0, 16)) + list(range(64, 80)) worker_manager_procs = [] for worker_idx in range(args.num_workers): wm_cpus = len(gpu_to_cpu_aff[worker_idx]) - 4 wm_affinity = gpu_to_cpu_aff[worker_idx][:wm_cpus] disp_affinity.extend(gpu_to_cpu_aff[worker_idx][wm_cpus:]) - worker_manager_procs.append(service_as_dragon_proc( + worker_manager_procs.append( + service_as_dragon_proc( worker_manager, cpu_affinity=wm_affinity, gpu_affinity=[worker_idx] - )) + ) + ) - dispatcher_proc = service_as_dragon_proc(dispatcher, cpu_affinity=disp_affinity, gpu_affinity=[]) + dispatcher_proc = service_as_dragon_proc( + dispatcher, cpu_affinity=disp_affinity, gpu_affinity=[] + ) # TODO: use ProcessGroup and restart=True? 
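# A possible shape for the TODO above, sketched with Dragon's native process-group
# API. This is an illustrative assumption (ProcessGroup/ProcessTemplate as exposed
# by recent Dragon releases), not part of this patch; signatures should be verified
# against the Dragon version this change pins.

import typing as t

from dragon.native.process import ProcessTemplate
from dragon.native.process_group import ProcessGroup


def start_as_group(
    templates: t.List[ProcessTemplate], restart: bool = True
) -> ProcessGroup:
    """Supervise the dispatcher and worker managers under a single group so
    crashed members can be re-launched rather than tracked one-by-one."""
    group = ProcessGroup(restart=restart)
    for template in templates:
        # each template carries its own target, args, and placement policy
        group.add_process(nproc=1, template=template)
    group.init()
    group.start()
    return group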
all_procs = [dispatcher_proc, *worker_manager_procs] diff --git a/smartsim/_core/_cli/scripts/dragon_install.py b/smartsim/_core/_cli/scripts/dragon_install.py index 4fd0be300..662820fed 100644 --- a/smartsim/_core/_cli/scripts/dragon_install.py +++ b/smartsim/_core/_cli/scripts/dragon_install.py @@ -95,13 +95,14 @@ def get_auth_token(request: DragonInstallRequest) -> t.Optional[Token]: def create_dotenv(dragon_root_dir: pathlib.Path, dragon_version: str) -> None: """Create a .env file with required environment variables for the Dragon runtime""" dragon_root = str(dragon_root_dir) - dragon_inc_dir = str(dragon_root_dir / "include") - dragon_lib_dir = str(dragon_root_dir / "lib") - dragon_bin_dir = str(dragon_root_dir / "bin") + dragon_root_str = dragon_root + dragon_inc_dir = dragon_root + "/include" + dragon_lib_dir = dragon_root + "/lib" + dragon_bin_dir = dragon_root + "/bin" dragon_vars = { "DRAGON_BASE_DIR": dragon_root, - "DRAGON_ROOT_DIR": dragon_root, # note: same as base_dir + "DRAGON_ROOT_DIR": dragon_root_str, "DRAGON_INCLUDE_DIR": dragon_inc_dir, "DRAGON_LIB_DIR": dragon_lib_dir, "DRAGON_VERSION": dragon_version, diff --git a/smartsim/_core/launcher/dragon/dragonConnector.py b/smartsim/_core/launcher/dragon/dragonConnector.py index 0cd68c24e..9cbc55674 100644 --- a/smartsim/_core/launcher/dragon/dragonConnector.py +++ b/smartsim/_core/launcher/dragon/dragonConnector.py @@ -245,9 +245,11 @@ def load_persisted_env(self) -> t.Dict[str, str]: with open(config.dragon_dotenv, encoding="utf-8") as dot_env: for kvp in dot_env.readlines(): - split = kvp.strip().split("=", maxsplit=1) - key, value = split[0], split[-1] - self._env_vars[key] = value + # skip any commented lines + if not kvp.startswith("#"): + split = kvp.strip().split("=", maxsplit=1) + key, value = split[0], split[-1] + self._env_vars[key] = value return self._env_vars diff --git a/smartsim/_core/mli/comm/channel/dragon_channel.py b/smartsim/_core/mli/comm/channel/dragon_channel.py index a22ebe952..4f8d3e552 100644 --- a/smartsim/_core/mli/comm/channel/dragon_channel.py +++ b/smartsim/_core/mli/comm/channel/dragon_channel.py @@ -40,15 +40,44 @@ logger = get_logger(__name__) -import dragon.channels as dch - DEFAULT_CHANNEL_BUFFER_SIZE = 500 """Maximum number of messages that can be buffered. DragonCommChannel will raise an exception if no clients consume messages before the buffer is filled.""" +LAST_OFFSET = 0 +"""The last offset used to create a local channel. This is used to avoid +unnecessary retries when creating a local channel.""" + + +def _channel_to_descriptor(channel: dch.Channel) -> str: + """Utility method for converting a channel to a descriptor string. + + :param channel: The dragon channel to convert + :returns: The descriptor string + """ + if channel is None: + raise SmartSimError("Channel is not available to create a descriptor") + + serialized_ch = channel.serialize() + return base64.b64encode(serialized_ch).decode("utf-8") + + +def _pool_to_descriptor(pool: dm.MemoryPool) -> str: + """Utility method for converting a pool to a descriptor string. + + :param pool: The memory pool to convert + :returns: The descriptor string""" + if pool is None: + raise SmartSimError("Memory pool is not available to create a descriptor") + + serialized_pool = pool.serialize() + return base64.b64encode(serialized_pool).decode("utf-8") + def create_local(capacity: int = 0) -> dch.Channel: - """Creates a Channel attached to the local memory pool. 
Replacement for + direct calls to `dch.Channel.make_process_local()` to enable + supplying a channel capacity. :param capacity: The number of events the channel can buffer; uses the default buffer size `DEFAULT_CHANNEL_BUFFER_SIZE` when not supplied @@ -56,9 +85,14 @@ def create_local(capacity: int = 0) -> dch.Channel: :raises SmartSimError: If unable to attach local channel """ pool = dm.MemoryPool.attach(du.B64.str_to_bytes(dp.this_process.default_pd)) + pool_descriptor = _pool_to_descriptor(pool) channel: t.Optional[dch.Channel] = None offset = 0 + global LAST_OFFSET + if LAST_OFFSET: + offset = LAST_OFFSET + capacity = capacity if capacity > 0 else DEFAULT_CHANNEL_BUFFER_SIZE while not channel: @@ -66,18 +100,18 @@ offset += 1 cid = df.BASE_USER_MANAGED_CUID + offset try: - channel = dch.Channel( - mem_pool=pool, - c_uid=cid, - capacity=capacity, - ) + channel = dch.Channel(mem_pool=pool, c_uid=cid, capacity=capacity) + LAST_OFFSET = offset + descriptor = _channel_to_descriptor(channel) logger.debug( - f"Channel {cid} created in pool {pool.serialize()} w/capacity {capacity}" + "Local channel created: " + f"{cid=}, {pool_descriptor=}, {capacity=}, {descriptor=}" ) - except Exception as e: + except dch.ChannelError as e: if offset < 100: - logger.warning(f"Unable to attach to channel id {cid}. Retrying...") + logger.warning(f"Channel id {cid} is not open. Retrying...") else: + LAST_OFFSET = 0 logger.error(f"All attempts to attach local channel have failed") raise SmartSimError("Failed to attach local channel") from e @@ -92,8 +126,7 @@ def __init__(self, channel: "dch.Channel") -> None: :param channel: A channel to use for communications """ - serialized_ch = channel.serialize() - descriptor = base64.b64encode(serialized_ch).decode("utf-8") + descriptor = _channel_to_descriptor(channel) super().__init__(descriptor) self._channel = channel @@ -115,7 +148,7 @@ def send(self, value: bytes, timeout: float = 0.001) -> None: try: with self._channel.sendh(timeout=timeout) as sendh: sendh.send_bytes(value) - logger.debug(f"DragonCommChannel {self.descriptor!r} sent message") + logger.debug(f"DragonCommChannel {self.descriptor} sent message") except Exception as e: raise SmartSimError( f"Error sending message: DragonCommChannel {self.descriptor!r}" @@ -130,8 +163,6 @@ def recv(self, timeout: float = 0.001) -> t.List[bytes]: with self._channel.recvh(timeout=timeout) as recvh: messages: t.List[bytes] = [] - # todo: consider that this could (under load) never exit. do we need - # to configure a maximum number to pull at once? try: message_bytes = recvh.recv_bytes(timeout=timeout) messages.append(message_bytes) @@ -139,7 +170,7 @@ except dch.ChannelEmpty: # emptied the queue, ok to swallow this ex logger.debug(f"DragonCommChannel exhausted: {self.descriptor}") - except dch.ChannelRecvTimeout as ex: + except dch.ChannelRecvTimeout: logger.debug(f"Timeout exceeded on channel.recv: {self.descriptor}") return messages @@ -164,7 +195,7 @@ def descriptor_string(self) -> str: @classmethod def from_descriptor( cls, - descriptor: t.Union[bytes, str], + descriptor: str, ) -> "DragonCommChannel": """A factory method that creates an instance from a descriptor string. 
@@ -173,6 +204,9 @@ def from_descriptor( :returns: An attached DragonCommChannel :raises SmartSimError: If creation of comm channel fails""" try: + if isinstance(descriptor, bytes): + raise ValueError("Descriptor must be a string") + utf8_descriptor: t.Union[str, bytes] = descriptor if isinstance(descriptor, str): utf8_descriptor = descriptor.encode("utf-8") @@ -183,10 +217,10 @@ def from_descriptor( actual_descriptor = base64.b64decode(utf8_descriptor) channel = dch.Channel.attach(actual_descriptor) return DragonCommChannel(channel) - except Exception as ex: + except Exception as e: raise SmartSimError( - f"Failed to create dragon comm channel: {descriptor!r}" - ) from ex + f"Failed to create dragon comm channel: {descriptor}" + ) from e @classmethod def from_local(cls, _descriptor: t.Optional[str] = None) -> "DragonCommChannel": diff --git a/smartsim/_core/mli/infrastructure/control/request_dispatcher.py b/smartsim/_core/mli/infrastructure/control/request_dispatcher.py index 67797fe44..d14755f53 100644 --- a/smartsim/_core/mli/infrastructure/control/request_dispatcher.py +++ b/smartsim/_core/mli/infrastructure/control/request_dispatcher.py @@ -142,13 +142,22 @@ def ready(self) -> bool: :returns: True if the queue can be flushed, False otherwise """ if self.empty(): + logger.debug("Request dispatcher queue is empty") return False - timed_out = ( - self._batch_timeout > 0 and self._elapsed_time >= self._batch_timeout - ) - logger.debug(f"Is full: {self.full()} or has timed out: {timed_out}") - return self.full() or timed_out + timed_out = False + if self._batch_timeout > 0: + timed_out = self._elapsed_time >= self._batch_timeout + + if self.full(): + logger.debug("Request dispatcher ready to deliver full batch") + return True + + if timed_out: + logger.debug("Request dispatcher delivering partial batch") + return True + + return False def make_disposable(self) -> None: """Set this queue as disposable, and never use it again after it gets @@ -281,7 +290,7 @@ def _check_feature_stores(self, request: InferenceRequest) -> bool: fs_missing = fs_desired - fs_actual if not self.has_featurestore_factory: - logger.error("No feature store factory configured") + logger.error("No feature store factory is configured. Unable to dispatch.") return False # create the feature stores we need to service request @@ -463,7 +472,7 @@ def dispatch(self, request: InferenceRequest) -> None: ) self._active_queues[tmp_id] = tmp_queue self._queues[tmp_id] = [tmp_queue] - tmp_queue.put_nowait(request) + tmp_queue.put(request) tmp_queue.make_disposable() return diff --git a/smartsim/_core/mli/infrastructure/environment_loader.py b/smartsim/_core/mli/infrastructure/environment_loader.py index e67cc469a..2c89184d8 100644 --- a/smartsim/_core/mli/infrastructure/environment_loader.py +++ b/smartsim/_core/mli/infrastructure/environment_loader.py @@ -83,7 +83,9 @@ def get_backbone(self) -> t.Optional[FeatureStore]: return None if self._featurestore_factory is None: - logger.warning("No feature store factory is configured") + logger.warning( + "No feature store factory is configured. Backbone not created." 
+ ) return None self.backbone = self._featurestore_factory(descriptor) diff --git a/smartsim/_core/mli/infrastructure/storage/backbone_feature_store.py b/smartsim/_core/mli/infrastructure/storage/backbone_feature_store.py index 0db41f77a..9cc8a6bf9 100644 --- a/smartsim/_core/mli/infrastructure/storage/backbone_feature_store.py +++ b/smartsim/_core/mli/infrastructure/storage/backbone_feature_store.py @@ -27,6 +27,7 @@ import base64 import enum import itertools +import os import pickle import time import typing as t @@ -67,9 +68,10 @@ class BackboneFeatureStore(DragonFeatureStore): MLI_NOTIFY_CONSUMERS = "_SMARTSIM_MLI_NOTIFY_CONSUMERS" MLI_BACKEND_CONSUMER = "_SMARTIM_MLI_BACKEND_CONSUMER" - MLI_WORKER_QUEUE = "to_worker_fli" + MLI_WORKER_QUEUE = "_SMARTSIM_REQUEST_QUEUE" MLI_BACKBONE = "_SMARTSIM_INFRA_BACKBONE" _CREATED_ON = "creation" + _DEFAULT_WAIT_TIMEOUT = 30.0 def __init__( self, @@ -86,7 +88,7 @@ def __init__( self._enable_reserved_writes = allow_reserved_writes if self._CREATED_ON not in self: - self._record_creation_date() + self._record_creation_data() @property def wait_timeout(self) -> float: @@ -154,7 +156,7 @@ def creation_date(self) -> str: """Return the creation date for the backbone feature store""" return str(self[self._CREATED_ON]) - def _record_creation_date(self) -> None: + def _record_creation_data(self) -> None: """Write the creation timestamp to the feature store""" if self._CREATED_ON not in self: if not self._allow_reserved_writes: @@ -163,6 +165,9 @@ def _record_creation_date(self) -> None: ) self[self._CREATED_ON] = str(time.time()) + if os.environ.get(BackboneFeatureStore.MLI_BACKBONE, None) is None: + os.environ.update(self.get_env()) + @classmethod def from_writable_descriptor( cls, @@ -181,9 +186,8 @@ def from_writable_descriptor( f"Error creating dragon feature store: {descriptor}" ) from ex - @staticmethod def _check_wait_timeout( - start_time: float, timeout: float, indicators: t.Dict[str, bool] + self, start_time: float, timeout: float, indicators: t.Dict[str, bool] ) -> None: """Perform timeout verification @@ -193,11 +197,11 @@ def _check_wait_timeout( elapsed = time.time() - start_time if timeout and elapsed > timeout: raise SmartSimError( - f"Timeout retrieving all keys from backbone: {indicators}" + f"Backbone {self.descriptor=} timeout retrieving all keys: {indicators}" ) def wait_for( - self, keys: t.List[str], timeout: float = 0 + self, keys: t.List[str], timeout: float = _DEFAULT_WAIT_TIMEOUT ) -> t.Dict[str, t.Union[str, bytes, None]]: """Perform a blocking wait until all specified keys have been found in the backbone @@ -205,39 +209,39 @@ def wait_for( :param keys: The required collection of keys to retrieve :param timeout: The maximum wait time in seconds. Overrides class level setting """ + if timeout < 0: + timeout = self._DEFAULT_WAIT_TIMEOUT + logger.info(f"Using default wait_for timeout: {timeout}s") + + if not keys: + return {} - to_check = list(keys) - was_found = [False for _ in to_check] # add test ensuring dupes are handled.. 
- values: t.List[t.Union[str, bytes, None]] = [None for _ in to_check] + values: t.Dict[str, t.Union[str, bytes, None]] = {k: None for k in set(keys)} + is_found = {k: False for k in values.keys()} - backoff: t.List[float] = [0.1, 0.5, 1, 2, 4, 8] + backoff: t.List[float] = [0.1, 0.5, 1, 2, 4] backoff_iter = itertools.cycle(backoff) start_time = time.time() - while not all(was_found): + while not all(is_found.values()): delay = next(backoff_iter) - for index, key in enumerate(to_check): - if was_found[index]: - continue - + for key in [k for k, v in is_found.items() if not v]: try: - values[index] = self[key] - was_found[index] = True - except KeyError: + values[key] = self[key] + is_found[key] = True + except Exception: if delay == backoff[-1]: logger.debug(f"Re-attempting `{key}` retrieval in {delay}s") - if all(was_found): + if all(is_found.values()): + logger.debug(f"wait_for({keys}) retrieved all keys") continue - self._check_wait_timeout( - start_time, timeout, dict(zip(to_check, was_found)) - ) - + self._check_wait_timeout(start_time, timeout, is_found) time.sleep(delay) - return dict(zip(keys, values)) + return values def get_env(self) -> t.Dict[str, str]: """Returns a dictionary populated with environment variables necessary to diff --git a/smartsim/_core/mli/infrastructure/storage/feature_store.py b/smartsim/_core/mli/infrastructure/storage/feature_store.py index ac6cdaf31..8c85a352d 100644 --- a/smartsim/_core/mli/infrastructure/storage/feature_store.py +++ b/smartsim/_core/mli/infrastructure/storage/feature_store.py @@ -47,7 +47,7 @@ class ReservedKeys(str, enum.Enum): """Storage location for the channel used to send messages directly to the MLI backend""" - MLI_WORKER_QUEUE = "to_worker_fli" # todo: ensure this adheres to standard + MLI_WORKER_QUEUE = "_SMARTSIM_REQUEST_QUEUE" """Storage location for the channel used to send work requests to the available worker managers""" diff --git a/smartsim/protoclient.py b/smartsim/protoclient.py index bf195a756..b0e235f8c 100644 --- a/smartsim/protoclient.py +++ b/smartsim/protoclient.py @@ -68,6 +68,8 @@ class ProtoClient: + _DEFAULT_TIMEOUT = 30.0 + @staticmethod def _attach_to_backbone(wait_timeout: float = 0) -> BackboneFeatureStore: """Use the supplied environment variables to attach @@ -92,7 +94,9 @@ def _attach_to_backbone(wait_timeout: float = 0) -> BackboneFeatureStore: def _attach_to_worker_queue(self) -> DragonFLIChannel: """Wait until the backbone contains the worker queue configuration, then attach an FLI to the given worker queue""" - configuration = self._backbone.wait_for([BackboneFeatureStore.MLI_WORKER_QUEUE]) + configuration = self._backbone.wait_for( + [BackboneFeatureStore.MLI_WORKER_QUEUE], self._timeout + ) # descriptor = configuration.get(BackboneFeatureStore.MLI_WORKER_QUEUE, None) # NOTE: without wait_for, this MUST be in the backbone.... 
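# An aside on the `wait_for` contract relied upon here: it reduces to a
# cycled-backoff poll. A minimal standalone sketch of that pattern follows
# (hypothetical `store` mapping and helper name; the real logic lives in
# `BackboneFeatureStore.wait_for`, updated earlier in this patch):

import itertools
import time
import typing as t


def wait_for_keys(
    store: t.Mapping[str, str], keys: t.List[str], timeout: float = 30.0
) -> t.Dict[str, str]:
    """Poll `store` with a cycled backoff until every key appears or time runs out."""
    values: t.Dict[str, str] = {}
    backoff = itertools.cycle([0.1, 0.5, 1, 2, 4])
    deadline = time.time() + timeout
    # duplicate keys collapse via `set`; an empty key list returns immediately
    while pending := [k for k in set(keys) if k not in values]:
        for key in pending:
            if key in store:
                values[key] = store[key]
        if len(values) == len(set(keys)):
            break  # everything was found; skip the final sleep
        if time.time() > deadline:
            raise TimeoutError(f"keys not found before timeout: {pending}")
        time.sleep(next(backoff))
    return values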
# descriptor = self._backbone.worker_queue @@ -130,14 +134,14 @@ def __init__(self, timing_on: bool, wait_timeout: float = 0) -> None: :param timing_on: Flag indicating if timing information should be written to file - :param wait_timeout: Maximum wait time allowed to attach to the + :param wait_timeout: Maximum wait time (in seconds) allowed to attach to the worker queue :raises: SmartSimError if unable to attach to a backbone featurestore""" # comm = MPI.COMM_WORLD # rank = comm.Get_rank() rank: int = 0 - self._queue_timeout = wait_timeout + self._timeout = wait_timeout or self._DEFAULT_TIMEOUT connect_to_infrastructure() # ddict_str = os.environ["_SMARTSIM_INFRA_BACKBONE"] diff --git a/tests/dragon/channel.py b/tests/dragon/channel.py index 234878423..b00ba9aa2 100644 --- a/tests/dragon/channel.py +++ b/tests/dragon/channel.py @@ -39,17 +39,14 @@ class FileSystemCommChannel(CommChannelBase): """Passes messages by writing to a file""" - def __init__(self, key: t.Union[bytes, pathlib.Path]) -> None: + def __init__(self, key: pathlib.Path) -> None: """Initialize the FileSystemCommChannel instance :param key: a path to the root directory of the feature store""" self._lock = threading.RLock() - if isinstance(key, pathlib.Path): - super().__init__(key.as_posix().encode("utf-8")) - self._file_path = key - else: - super().__init__(key) - self._file_path = pathlib.Path(key.decode("utf-8")) + + super().__init__(key.as_posix()) + self._file_path = key if not self._file_path.parent.exists(): self._file_path.parent.mkdir(parents=True) @@ -110,17 +107,14 @@ def clear(self) -> None: @classmethod def from_descriptor( cls, - descriptor: t.Union[str, bytes], + descriptor: str, ) -> "FileSystemCommChannel": """A factory method that creates an instance from a descriptor string :param descriptor: The descriptor that uniquely identifies the resource :returns: An attached FileSystemCommChannel""" try: - if isinstance(descriptor, str): - path = pathlib.Path(descriptor) - else: - path = pathlib.Path(descriptor.decode("utf-8")) + path = pathlib.Path(descriptor) return FileSystemCommChannel(path) except: logger.warning(f"failed to create fs comm channel: {descriptor}") diff --git a/tests/dragon/test_core_machine_learning_worker.py b/tests/dragon/test_core_machine_learning_worker.py index ed9ac625c..e9c356b4e 100644 --- a/tests/dragon/test_core_machine_learning_worker.py +++ b/tests/dragon/test_core_machine_learning_worker.py @@ -34,7 +34,7 @@ import torch import smartsim.error as sse -from smartsim._core.mli.infrastructure.storage.feature_store import FeatureStoreKey +from smartsim._core.mli.infrastructure.storage.feature_store import ModelKey, TensorKey from smartsim._core.mli.infrastructure.worker.worker import ( InferenceRequest, MachineLearningWorkerCore, @@ -98,7 +98,7 @@ def test_fetch_model_disk(persist_torch_model: pathlib.Path, test_dir: str) -> N fsd = feature_store.descriptor feature_store[str(persist_torch_model)] = persist_torch_model.read_bytes() - model_key = FeatureStoreKey(key=key, descriptor=fsd) + model_key = ModelKey(key=key, descriptor=fsd) request = InferenceRequest(model_key=model_key) batch = RequestBatch([request], None, model_key) @@ -116,7 +116,7 @@ def test_fetch_model_disk_missing() -> None: key = "/path/that/doesnt/exist" - model_key = FeatureStoreKey(key=key, descriptor=fsd) + model_key = ModelKey(key=key, descriptor=fsd) request = InferenceRequest(model_key=model_key) batch = RequestBatch([request], None, model_key) @@ -141,7 +141,7 @@ def 
test_fetch_model_feature_store(persist_torch_model: pathlib.Path) -> None: fsd = feature_store.descriptor feature_store[key] = persist_torch_model.read_bytes() - model_key = FeatureStoreKey(key=key, descriptor=feature_store.descriptor) + model_key = ModelKey(key=key, descriptor=feature_store.descriptor) request = InferenceRequest(model_key=model_key) batch = RequestBatch([request], None, model_key) @@ -159,7 +159,7 @@ def test_fetch_model_feature_store_missing() -> None: feature_store = MemoryFeatureStore() fsd = feature_store.descriptor - model_key = FeatureStoreKey(key=key, descriptor=feature_store.descriptor) + model_key = ModelKey(key=key, descriptor=feature_store.descriptor) request = InferenceRequest(model_key=model_key) batch = RequestBatch([request], None, model_key) @@ -182,7 +182,7 @@ def test_fetch_model_memory(persist_torch_model: pathlib.Path) -> None: fsd = feature_store.descriptor feature_store[key] = persist_torch_model.read_bytes() - model_key = FeatureStoreKey(key=key, descriptor=feature_store.descriptor) + model_key = ModelKey(key=key, descriptor=feature_store.descriptor) request = InferenceRequest(model_key=model_key) batch = RequestBatch([request], None, model_key) @@ -199,11 +199,9 @@ def test_fetch_input_disk(persist_torch_tensor: pathlib.Path) -> None: feature_store = MemoryFeatureStore() fsd = feature_store.descriptor - request = InferenceRequest( - input_keys=[FeatureStoreKey(key=tensor_name, descriptor=fsd)] - ) + request = InferenceRequest(input_keys=[TensorKey(key=tensor_name, descriptor=fsd)]) - model_key = FeatureStoreKey(key="test-model", descriptor=fsd) + model_key = ModelKey(key="test-model", descriptor=fsd) batch = RequestBatch([request], None, model_key) worker = MachineLearningWorkerCore @@ -223,9 +221,9 @@ def test_fetch_input_disk_missing() -> None: fsd = feature_store.descriptor key = "/path/that/doesnt/exist" - request = InferenceRequest(input_keys=[FeatureStoreKey(key=key, descriptor=fsd)]) + request = InferenceRequest(input_keys=[TensorKey(key=key, descriptor=fsd)]) - model_key = FeatureStoreKey(key="test-model", descriptor=fsd) + model_key = ModelKey(key="test-model", descriptor=fsd) batch = RequestBatch([request], None, model_key) with pytest.raises(sse.SmartSimError) as ex: @@ -245,14 +243,12 @@ def test_fetch_input_feature_store(persist_torch_tensor: pathlib.Path) -> None: feature_store = MemoryFeatureStore() fsd = feature_store.descriptor - request = InferenceRequest( - input_keys=[FeatureStoreKey(key=tensor_name, descriptor=fsd)] - ) + request = InferenceRequest(input_keys=[TensorKey(key=tensor_name, descriptor=fsd)]) # put model bytes into the feature store feature_store[tensor_name] = persist_torch_tensor.read_bytes() - model_key = FeatureStoreKey(key="test-model", descriptor=fsd) + model_key = ModelKey(key="test-model", descriptor=fsd) batch = RequestBatch([request], None, model_key) fetch_result = worker.fetch_inputs(batch, {fsd: feature_store}) @@ -284,13 +280,13 @@ def test_fetch_multi_input_feature_store(persist_torch_tensor: pathlib.Path) -> request = InferenceRequest( input_keys=[ - FeatureStoreKey(key=tensor_name + "1", descriptor=fsd), - FeatureStoreKey(key=tensor_name + "2", descriptor=fsd), - FeatureStoreKey(key=tensor_name + "3", descriptor=fsd), + TensorKey(key=tensor_name + "1", descriptor=fsd), + TensorKey(key=tensor_name + "2", descriptor=fsd), + TensorKey(key=tensor_name + "3", descriptor=fsd), ] ) - model_key = FeatureStoreKey(key="test-model", descriptor=fsd) + model_key = ModelKey(key="test-model", descriptor=fsd) batch 
= RequestBatch([request], None, model_key) fetch_result = worker.fetch_inputs(batch, {fsd: feature_store}) @@ -310,9 +306,9 @@ def test_fetch_input_feature_store_missing() -> None: key = "bad-key" feature_store = MemoryFeatureStore() fsd = feature_store.descriptor - request = InferenceRequest(input_keys=[FeatureStoreKey(key=key, descriptor=fsd)]) + request = InferenceRequest(input_keys=[TensorKey(key=key, descriptor=fsd)]) - model_key = FeatureStoreKey(key="test-model", descriptor=fsd) + model_key = ModelKey(key="test-model", descriptor=fsd) batch = RequestBatch([request], None, model_key) with pytest.raises(sse.SmartSimError) as ex: @@ -332,9 +328,9 @@ def test_fetch_input_memory(persist_torch_tensor: pathlib.Path) -> None: key = "test-model" feature_store[key] = persist_torch_tensor.read_bytes() - request = InferenceRequest(input_keys=[FeatureStoreKey(key=key, descriptor=fsd)]) + request = InferenceRequest(input_keys=[TensorKey(key=key, descriptor=fsd)]) - model_key = FeatureStoreKey(key="test-model", descriptor=fsd) + model_key = ModelKey(key="test-model", descriptor=fsd) batch = RequestBatch([request], None, model_key) fetch_result = worker.fetch_inputs(batch, {fsd: feature_store}) @@ -351,9 +347,9 @@ def test_place_outputs() -> None: # create a key to retrieve from the feature store keys = [ - FeatureStoreKey(key=key_name + "1", descriptor=fsd), - FeatureStoreKey(key=key_name + "2", descriptor=fsd), - FeatureStoreKey(key=key_name + "3", descriptor=fsd), + TensorKey(key=key_name + "1", descriptor=fsd), + TensorKey(key=key_name + "2", descriptor=fsd), + TensorKey(key=key_name + "3", descriptor=fsd), ] data = [b"abcdef", b"ghijkl", b"mnopqr"] @@ -376,6 +372,6 @@ def test_place_outputs() -> None: pytest.param("key", "", id="invalid descriptor"), ], ) -def test_invalid_featurestorekey(key, descriptor) -> None: +def test_invalid_tensorkey(key, descriptor) -> None: with pytest.raises(ValueError): - fsk = FeatureStoreKey(key, descriptor) + fsk = TensorKey(key, descriptor) diff --git a/tests/dragon/test_device_manager.py b/tests/dragon/test_device_manager.py index c58879cb6..d270e921c 100644 --- a/tests/dragon/test_device_manager.py +++ b/tests/dragon/test_device_manager.py @@ -36,7 +36,8 @@ ) from smartsim._core.mli.infrastructure.storage.feature_store import ( FeatureStore, - FeatureStoreKey, + ModelKey, + TensorKey, ) from smartsim._core.mli.infrastructure.worker.worker import ( ExecuteResult, @@ -116,9 +117,9 @@ def test_device_manager_model_in_request(): worker = MockWorker() - tensor_key = FeatureStoreKey(key="key", descriptor="desc") - output_key = FeatureStoreKey(key="key", descriptor="desc") - model_key = FeatureStoreKey(key="model key", descriptor="desc") + tensor_key = TensorKey(key="key", descriptor="desc") + output_key = TensorKey(key="key", descriptor="desc") + model_key = ModelKey(key="model key", descriptor="desc") request = InferenceRequest( model_key=model_key, @@ -154,9 +155,9 @@ def test_device_manager_model_key(): worker = MockWorker() - tensor_key = FeatureStoreKey(key="key", descriptor="desc") - output_key = FeatureStoreKey(key="key", descriptor="desc") - model_key = FeatureStoreKey(key="model key", descriptor="desc") + tensor_key = TensorKey(key="key", descriptor="desc") + output_key = TensorKey(key="key", descriptor="desc") + model_key = ModelKey(key="model key", descriptor="desc") request = InferenceRequest( model_key=model_key, diff --git a/tests/dragon/test_error_handling.py b/tests/dragon/test_error_handling.py index 0f3e38f93..b0934b6f5 100644 --- 
a/tests/dragon/test_error_handling.py +++ b/tests/dragon/test_error_handling.py @@ -55,7 +55,8 @@ ) from smartsim._core.mli.infrastructure.storage.feature_store import ( FeatureStore, - FeatureStoreKey, + ModelKey, + TensorKey, ) from smartsim._core.mli.infrastructure.worker.worker import ( ExecuteResult, @@ -127,12 +128,8 @@ def setup_worker_manager_model_bytes( cooldown=3, ) - tensor_key = MessageHandler.build_feature_store_key( - "key", app_feature_store.descriptor - ) - output_key = MessageHandler.build_feature_store_key( - "key", app_feature_store.descriptor - ) + tensor_key = MessageHandler.build_tensor_key("key", app_feature_store.descriptor) + output_key = MessageHandler.build_tensor_key("key", app_feature_store.descriptor) inf_request = InferenceRequest( model_key=None, @@ -145,7 +142,7 @@ def setup_worker_manager_model_bytes( batch_size=0, ) - model_id = FeatureStoreKey(key="key", descriptor=app_feature_store.descriptor) + model_id = ModelKey(key="key", descriptor=app_feature_store.descriptor) request_batch = RequestBatch( [inf_request], @@ -190,9 +187,9 @@ def setup_worker_manager_model_key( cooldown=3, ) - tensor_key = FeatureStoreKey(key="key", descriptor=app_feature_store.descriptor) - output_key = FeatureStoreKey(key="key", descriptor=app_feature_store.descriptor) - model_id = FeatureStoreKey(key="model key", descriptor=app_feature_store.descriptor) + tensor_key = TensorKey(key="key", descriptor=app_feature_store.descriptor) + output_key = TensorKey(key="key", descriptor=app_feature_store.descriptor) + model_id = ModelKey(key="model key", descriptor=app_feature_store.descriptor) request = InferenceRequest( model_key=model_id, @@ -245,12 +242,8 @@ def setup_request_dispatcher_model_bytes( ) request_dispatcher._on_start() - tensor_key = MessageHandler.build_feature_store_key( - "key", app_feature_store.descriptor - ) - output_key = MessageHandler.build_feature_store_key( - "key", app_feature_store.descriptor - ) + tensor_key = MessageHandler.build_tensor_key("key", app_feature_store.descriptor) + output_key = MessageHandler.build_tensor_key("key", app_feature_store.descriptor) model = MessageHandler.build_model(b"model", "model name", "v 0.0.1") request = MessageHandler.build_request( test_dir, model, [tensor_key], [output_key], [], None @@ -293,14 +286,10 @@ def setup_request_dispatcher_model_key( ) request_dispatcher._on_start() - tensor_key = MessageHandler.build_feature_store_key( - "key", app_feature_store.descriptor - ) - output_key = MessageHandler.build_feature_store_key( - "key", app_feature_store.descriptor - ) - model_key = MessageHandler.build_feature_store_key( - key="model key", feature_store_descriptor=app_feature_store.descriptor + tensor_key = MessageHandler.build_tensor_key("key", app_feature_store.descriptor) + output_key = MessageHandler.build_tensor_key("key", app_feature_store.descriptor) + model_key = MessageHandler.build_model_key( + key="model key", descriptor=app_feature_store.descriptor ) request = MessageHandler.build_request( test_dir, model_key, [tensor_key], [output_key], [], None diff --git a/tests/dragon/test_featurestore.py b/tests/dragon/test_featurestore.py index a2c8118ac..7f1649741 100644 --- a/tests/dragon/test_featurestore.py +++ b/tests/dragon/test_featurestore.py @@ -66,7 +66,6 @@ # The tests in this file must run in a dragon environment pytestmark = pytest.mark.dragon -WORK_QUEUE_KEY = "_SMARTSIM_REQUEST_QUEUE" @pytest.fixture @@ -83,7 +82,9 @@ def storage_for_dragon_fs_with_req_queue( fli_ = fli.FLInterface(main_ch=channel_, 
manager_ch=None) comm_channel = DragonFLIChannel(fli_, True) - storage_for_dragon_fs[WORK_QUEUE_KEY] = comm_channel.descriptor + storage_for_dragon_fs[BackboneFeatureStore.MLI_WORKER_QUEUE] = ( + comm_channel.descriptor + ) return storage_for_dragon_fs @@ -97,7 +98,7 @@ def storage_for_dragon_fs_with_mock_req_queue( # comm_channel = DragonFLIChannel(fli_, True) mock_descriptor = "12345" - storage_for_dragon_fs[WORK_QUEUE_KEY] = mock_descriptor + storage_for_dragon_fs[BackboneFeatureStore.MLI_WORKER_QUEUE] = mock_descriptor return storage_for_dragon_fs @@ -192,6 +193,31 @@ def test_eventconsumer_eventpublisher_integration( assert len(app_messages) == 4 +def test_backbone_wait_for_no_keys( + storage_for_dragon_fs_with_req_queue: t.Any, monkeypatch: pytest.MonkeyPatch +) -> None: + """Verify that asking the backbone to wait for a value succeeds + immediately and does not cause a wait to occur if the supplied key + list is empty + + :param storage_for_dragon_fs_with_req_queue: the storage engine to use, + prepopulated with a worker queue descriptor + """ + # an empty key list should return immediately without waiting + storage = storage_for_dragon_fs_with_req_queue + + backbone = BackboneFeatureStore(storage) + + with monkeypatch.context() as ctx: + # all keys should be found and the timeout should never be checked. + ctx.setattr(bbtime, "sleep", mock.MagicMock()) + + values = backbone.wait_for([]) + assert len(values) == 0 + + # confirm that no wait occurred + bbtime.sleep.assert_not_called() + + def test_backbone_wait_for_prepopulated( storage_for_dragon_fs_with_req_queue: t.Any, monkeypatch: pytest.MonkeyPatch ) -> None: @@ -201,8 +227,6 @@ def test_backbone_wait_for_prepopulated( :param storage_for_dragon_fs: the storage engine to use, prepopulated with """ # set a very low timeout to confirm that it does not wait - wait_timeout = 0.1 - # storage = {WORK_QUEUE_KEY: "123456"} storage = storage_for_dragon_fs_with_req_queue backbone = BackboneFeatureStore(storage) @@ -211,18 +235,50 @@ # all keys should be found and the timeout should never be checked. ctx.setattr(bbtime, "sleep", mock.MagicMock()) - values = backbone.wait_for([WORK_QUEUE_KEY]) + values = backbone.wait_for([BackboneFeatureStore.MLI_WORKER_QUEUE]) # confirm that wait_for with one key returns one value assert len(values) == 1 # confirm that the descriptor is non-null w/some non-trivial value - assert len(values[WORK_QUEUE_KEY]) > 5 + assert len(values[BackboneFeatureStore.MLI_WORKER_QUEUE]) > 5 # confirm that no wait occurred bbtime.sleep.assert_not_called() +def test_backbone_wait_for_prepopulated_dupe( + storage_for_dragon_fs_with_req_queue: t.Any, monkeypatch: pytest.MonkeyPatch +) -> None: + """Verify that asking the backbone to wait for keys that are duplicated + results in a single value being returned for each key + + :param storage_for_dragon_fs_with_req_queue: the storage engine to use, + prepopulated with a worker queue descriptor + """ + # the keys are pre-populated below, so no wait should occur + storage = storage_for_dragon_fs_with_req_queue + + backbone = BackboneFeatureStore(storage) + key1, key2 = "key-1", "key-2" + value1, value2 = "i-am-value-1", "i-am-value-2" + backbone[key1] = value1 + backbone[key2] = value2 + + with monkeypatch.context() as ctx: + # all keys should be found and the timeout should never be checked. 
+ ctx.setattr(bbtime, "sleep", mock.MagicMock()) + + values = backbone.wait_for([key1, key2, key1]) # key1 is duplicated + + # confirm that the duplicated key collapses to a single entry per unique key + assert len(values) == 2 + assert key1 in values + assert key2 in values + + assert values[key1] == value1 + assert values[key2] == value2 + + def set_value_after_delay( descriptor: str, key: str, value: str, delay: float = 5 ) -> None: @@ -238,6 +294,7 @@ def set_value_after_delay( logger.debug(f"set_value_after_delay wrote `{value} to backbone[`{key}`]") +@pytest.mark.skip(reason="Using mp on build agent is not working correctly") @pytest.mark.parametrize("delay", [0, 1, 2, 4, 8]) def test_backbone_wait_for_partial_prepopulated( storage_for_dragon_fs_with_mock_req_queue: t.Any, delay: float @@ -264,7 +321,7 @@ def test_backbone_wait_for_partial_prepopulated( p2 = mp.Process( target=backbone.wait_for, - args=([WORK_QUEUE_KEY, key],), + args=([BackboneFeatureStore.MLI_WORKER_QUEUE, key],), kwargs={"timeout": wait_timeout}, ) p2.start() @@ -273,21 +330,25 @@ p2.join() # both values should be written at this time - ret_vals = backbone.wait_for([WORK_QUEUE_KEY, key], 0.1) + ret_vals = backbone.wait_for([key, BackboneFeatureStore.MLI_WORKER_QUEUE, key], 0.1) # confirm that wait_for with two keys returns two values assert len(ret_vals) == 2, "values should contain values for both awaited keys" # confirm the pre-populated value has the correct output - assert ret_vals[WORK_QUEUE_KEY] == "12345" # mock descriptor value from fixture + assert ( + ret_vals[BackboneFeatureStore.MLI_WORKER_QUEUE] == "12345" + ) # mock descriptor value from fixture # confirm the population process completed and the awaited value is correct assert ret_vals[key] == value, "verify order of values " +@pytest.mark.skip(reason="Using mp on build agent is not working correctly") @pytest.mark.parametrize("num_keys", [0, 1, 3, 7, 11]) def test_backbone_wait_for_multikey( storage_for_dragon_fs_with_req_queue: t.Any, num_keys: int, + test_dir: str, ) -> None: """Verify that asking the backbone to wait for multiple keys results in that number of values being returned @@ -317,7 +378,7 @@ p2 = mp.Process( target=backbone.wait_for, - args=([[*extra_keys]],), + args=(extra_keys,), kwargs={"timeout": max_delay * 2}, ) p2.start() @@ -328,7 +389,9 @@ ) # give it 10 seconds longer than p2 timeout for backoff # use without a wait to verify all values are written - actual_values = backbone.wait_for([*extra_keys], timeout=0.01) + num_keys = len(extra_keys) + actual_values = backbone.wait_for(extra_keys, timeout=0.01) + assert len(extra_keys) == num_keys # confirm that wait_for returns all the expected values assert len(actual_values) == num_keys diff --git a/tests/dragon/test_featurestore_base.py b/tests/dragon/test_featurestore_base.py index bb5dccad7..94733afc7 100644 --- a/tests/dragon/test_featurestore_base.py +++ b/tests/dragon/test_featurestore_base.py @@ -767,7 +767,9 @@ def test_backbone_wait_timeout(wait_timeout: float, exp_wait_max: float) -> None backbone = BackboneFeatureStore(storage) with pytest.raises(SmartSimError) as ex: - backbone.wait_for(["does-not-exist"]) + backbone.wait_for(["does-not-exist"], wait_timeout) + + assert "timeout" in str(ex.value.args[0]).lower() end_time = time.time() elapsed = end_time - start_time diff --git a/tests/dragon/test_featurestore_integration.py b/tests/dragon/test_featurestore_integration.py 
index 104acd914..b088df5b4 100644 --- a/tests/dragon/test_featurestore_integration.py +++ b/tests/dragon/test_featurestore_integration.py @@ -77,7 +77,6 @@ def test_eventconsumer_eventpublisher_integration( mock_storage = storage_for_dragon_fs backbone = BackboneFeatureStore(mock_storage, allow_reserved_writes=True) - mock_fs_descriptor = backbone.descriptor # verify ability to write and read from ddict backbone["test_dir"] = test_dir @@ -220,7 +219,12 @@ def test_eventconsumer_max_dequeue( pytest.param(0, id="use default: 500"), pytest.param(1, id="non-zero buffer size: 1"), pytest.param(500, id="buffer size: 500"), - pytest.param(1000, id="buffer size: 1000"), + pytest.param(800, id="buffer size: 800"), + pytest.param( + 1000, + id="buffer size: 1000, unreliable in dragon-v0.10", + marks=pytest.mark.skip, + ), ], ) def test_channel_buffer_size( diff --git a/tests/dragon/test_inference_reply.py b/tests/dragon/test_inference_reply.py index 1eb137ae6..bdc7be14b 100644 --- a/tests/dragon/test_inference_reply.py +++ b/tests/dragon/test_inference_reply.py @@ -28,7 +28,7 @@ dragon = pytest.importorskip("dragon") -from smartsim._core.mli.infrastructure.storage.feature_store import FeatureStoreKey +from smartsim._core.mli.infrastructure.storage.feature_store import TensorKey from smartsim._core.mli.infrastructure.worker.worker import InferenceReply from smartsim._core.mli.message_handler import MessageHandler @@ -44,8 +44,8 @@ def inference_reply() -> InferenceReply: @pytest.fixture -def fs_key() -> FeatureStoreKey: - return FeatureStoreKey("key", "descriptor") +def fs_key() -> TensorKey: + return TensorKey("key", "descriptor") @pytest.mark.parametrize( diff --git a/tests/dragon/test_inference_request.py b/tests/dragon/test_inference_request.py index 909d021d6..f5c8b9bdc 100644 --- a/tests/dragon/test_inference_request.py +++ b/tests/dragon/test_inference_request.py @@ -28,7 +28,7 @@ dragon = pytest.importorskip("dragon") -from smartsim._core.mli.infrastructure.storage.feature_store import FeatureStoreKey +from smartsim._core.mli.infrastructure.storage.feature_store import TensorKey from smartsim._core.mli.infrastructure.worker.worker import InferenceRequest from smartsim._core.mli.message_handler import MessageHandler @@ -44,8 +44,8 @@ def inference_request() -> InferenceRequest: @pytest.fixture -def fs_key() -> FeatureStoreKey: - return FeatureStoreKey("key", "descriptor") +def fs_key() -> TensorKey: + return TensorKey("key", "descriptor") @pytest.mark.parametrize( diff --git a/tests/dragon/test_protoclient.py b/tests/dragon/test_protoclient.py index 590780154..3eb800bb7 100644 --- a/tests/dragon/test_protoclient.py +++ b/tests/dragon/test_protoclient.py @@ -1,231 +1,231 @@ -# BSD 2-Clause License -# -# Copyright (c) 2021-2024, Hewlett Packard Enterprise -# All rights reserved. -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions are met: -# -# 1. Redistributions of source code must retain the above copyright notice, this -# list of conditions and the following disclaimer. -# -# 2. Redistributions in binary form must reproduce the above copyright notice, -# this list of conditions and the following disclaimer in the documentation -# and/or other materials provided with the distribution. 
-# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE -# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -import pickle -import time -import typing as t - -import pytest - -dragon = pytest.importorskip("dragon") - -from smartsim._core.mli.comm.channel.dragon_fli import DragonFLIChannel, create_local -from smartsim._core.mli.infrastructure.storage.backbone_feature_store import ( - BackboneFeatureStore, - EventBroadcaster, - OnWriteFeatureStore, -) -from smartsim._core.mli.infrastructure.storage.dragon_feature_store import dragon_ddict -from smartsim._core.mli.infrastructure.storage.feature_store import ReservedKeys -from smartsim.error.errors import SmartSimError -from smartsim.log import get_logger - -# isort: off -from dragon import fli -from dragon.channels import Channel - -# from ..ex..high_throughput_inference.mock_app import ProtoClient -from smartsim.protoclient import ProtoClient - - -# The tests in this file belong to the dragon group -pytestmark = pytest.mark.dragon -WORK_QUEUE_KEY = "_SMARTSIM_REQUEST_QUEUE" -logger = get_logger(__name__) - - -@pytest.fixture -def storage_for_dragon_fs() -> t.Dict[str, str]: - # return dragon_ddict.DDict(1, 2, total_mem=2 * 1024**3) - return dragon_ddict.DDict(1, 2, 4 * 1024**2) - - -@pytest.fixture -def the_backbone(storage_for_dragon_fs) -> BackboneFeatureStore: - return BackboneFeatureStore(storage_for_dragon_fs, allow_reserved_writes=True) - - -@pytest.fixture -def the_worker_queue(the_backbone: BackboneFeatureStore) -> DragonFLIChannel: - """a stand-in for the worker manager so a worker queue exists""" - - # create the FLI - to_worker_channel = Channel.make_process_local() - # to_worker_channel = create_local() - fli_ = fli.FLInterface(main_ch=to_worker_channel, manager_ch=None) - comm_channel = DragonFLIChannel(fli_, True) - - # store the descriptor in the backbone - # the_backbone.worker_queue = comm_channel.descriptor - the_backbone[BackboneFeatureStore.MLI_WORKER_QUEUE] = comm_channel.descriptor - - try: - comm_channel.send(b"foo") - except Exception as ex: - print(f"ohnooooo: {ex}") - - return comm_channel - - -@pytest.fixture -def storage_for_dragon_fs_with_req_queue( - storage_for_dragon_fs: t.Dict[str, str] -) -> t.Dict[str, str]: - # create a valid FLI so any call to attach does not fail - channel_ = Channel.make_process_local() - fli_ = fli.FLInterface(main_ch=channel_, manager_ch=None) - comm_channel = DragonFLIChannel(fli_, True) - - storage_for_dragon_fs[WORK_QUEUE_KEY] = comm_channel.descriptor - return storage_for_dragon_fs - - -@pytest.mark.parametrize( - "wait_timeout, exp_wait_max", - [ - # aggregate the 1+1+1 into 3 on remaining parameters - pytest.param(1, 1 + 1 + 1, id="1s wait, 3 cycle steps"), - pytest.param(2, 3 + 2, id="2s wait, 4 cycle steps"), - pytest.param(4, 3 + 2 + 4, id="4s wait, 5 cycle steps"), - ], -) -def test_protoclient_timeout( - 
wait_timeout: float, - exp_wait_max: float, - the_backbone: BackboneFeatureStore, - monkeypatch: pytest.MonkeyPatch, -): - """Verify that attempts to attach to the worker queue from the protoclient - timeout in an appropriate amount of time. Note: due to the backoff, we verify - the elapsed time is less than the 15s of a cycle of waits +# # BSD 2-Clause License +# # +# # Copyright (c) 2021-2024, Hewlett Packard Enterprise +# # All rights reserved. +# # +# # Redistribution and use in source and binary forms, with or without +# # modification, are permitted provided that the following conditions are met: +# # +# # 1. Redistributions of source code must retain the above copyright notice, this +# # list of conditions and the following disclaimer. +# # +# # 2. Redistributions in binary form must reproduce the above copyright notice, +# # this list of conditions and the following disclaimer in the documentation +# # and/or other materials provided with the distribution. +# # +# # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# # AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# # DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# # FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# # DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# # SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# # CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
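# While these tests are disabled, the `exp_wait_max` ceiling they assert derives
# from a partial sum over the client's backoff cycle. A small illustrative helper
# capturing that arithmetic (hypothetical; not part of the patch):

import itertools


def expected_max_wait(
    wait_timeout: float, backoff: tuple = (0.1, 0.5, 1, 2, 4, 8)
) -> float:
    """Accumulate backoff sleeps until the configured timeout is first exceeded."""
    elapsed = 0.0
    for step in itertools.cycle(backoff):
        elapsed += step
        if elapsed >= wait_timeout:
            # a caller may observe up to this much delay before the timeout fires
            return elapsed
    return elapsed  # pragma: no cover - unreachable, the cycle is infinite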
+ +# import pickle +# import time +# import typing as t + +# import pytest + +# dragon = pytest.importorskip("dragon") + +# from smartsim._core.mli.comm.channel.dragon_fli import DragonFLIChannel, create_local +# from smartsim._core.mli.infrastructure.storage.backbone_feature_store import ( +# BackboneFeatureStore, +# EventBroadcaster, +# OnWriteFeatureStore, +# ) +# from smartsim._core.mli.infrastructure.storage.dragon_feature_store import dragon_ddict +# from smartsim._core.mli.infrastructure.storage.feature_store import ReservedKeys +# from smartsim.error.errors import SmartSimError +# from smartsim.log import get_logger + +# # isort: off +# from dragon import fli +# from dragon.channels import Channel + +# # from ..ex..high_throughput_inference.mock_app import ProtoClient +# from smartsim.protoclient import ProtoClient + + +# # The tests in this file belong to the dragon group +# pytestmark = pytest.mark.dragon +# WORK_QUEUE_KEY = "_SMARTSIM_REQUEST_QUEUE" +# logger = get_logger(__name__) + + +# @pytest.fixture +# def storage_for_dragon_fs() -> t.Dict[str, str]: +# # return dragon_ddict.DDict(1, 2, total_mem=2 * 1024**3) +# return dragon_ddict.DDict(1, 2, 4 * 1024**2) + + +# @pytest.fixture +# def the_backbone(storage_for_dragon_fs) -> BackboneFeatureStore: +# return BackboneFeatureStore(storage_for_dragon_fs, allow_reserved_writes=True) + + +# @pytest.fixture +# def the_worker_queue(the_backbone: BackboneFeatureStore) -> DragonFLIChannel: +# """a stand-in for the worker manager so a worker queue exists""" + +# # create the FLI +# to_worker_channel = Channel.make_process_local() +# # to_worker_channel = create_local() +# fli_ = fli.FLInterface(main_ch=to_worker_channel, manager_ch=None) +# comm_channel = DragonFLIChannel(fli_, True) + +# # store the descriptor in the backbone +# # the_backbone.worker_queue = comm_channel.descriptor +# the_backbone[BackboneFeatureStore.MLI_WORKER_QUEUE] = comm_channel.descriptor + +# try: +# comm_channel.send(b"foo") +# except Exception as ex: +# print(f"ohnooooo: {ex}") + +# return comm_channel + + +# @pytest.fixture +# def storage_for_dragon_fs_with_req_queue( +# storage_for_dragon_fs: t.Dict[str, str] +# ) -> t.Dict[str, str]: +# # create a valid FLI so any call to attach does not fail +# channel_ = Channel.make_process_local() +# fli_ = fli.FLInterface(main_ch=channel_, manager_ch=None) +# comm_channel = DragonFLIChannel(fli_, True) + +# storage_for_dragon_fs[WORK_QUEUE_KEY] = comm_channel.descriptor +# return storage_for_dragon_fs + + +# @pytest.mark.parametrize( +# "wait_timeout, exp_wait_max", +# [ +# # aggregate the 1+1+1 into 3 on remaining parameters +# pytest.param(0.5, 1 + 1 + 1, id="0.5s wait, 3 cycle steps"), +# pytest.param(2, 3 + 2, id="2s wait, 4 cycle steps"), +# pytest.param(4, 3 + 2 + 4, id="4s wait, 5 cycle steps"), +# ], +# ) +# def test_protoclient_timeout( +# wait_timeout: float, +# exp_wait_max: float, +# the_backbone: BackboneFeatureStore, +# monkeypatch: pytest.MonkeyPatch, +# ): +# """Verify that attempts to attach to the worker queue from the protoclient +# timeout in an appropriate amount of time. 
Note: due to the backoff, we verify +# the elapsed time is less than the 15s of a cycle of waits - :param wait_timeout: a timeout for use when configuring a proto client - :param exp_wait_max: a ceiling for the expected time spent waiting for - the timeout - :param the_backbone: a pre-initialized backbone featurestore for setting up - the environment variable required by the client""" +# :param wait_timeout: a timeout for use when configuring a proto client +# :param exp_wait_max: a ceiling for the expected time spent waiting for +# the timeout +# :param the_backbone: a pre-initialized backbone featurestore for setting up +# the environment variable required by the client""" - # NOTE: exp_wait_time maps to the cycled backoff of [.1, .5, 1, 2, 4, 8] - # with leeway added (by allowing 1s each for the 0.1 and 0.5 steps) - start_time = time.time() - with monkeypatch.context() as ctx, pytest.raises(SmartSimError) as ex: - ctx.setenv("_SMARTSIM_INFRA_BACKBONE", the_backbone.descriptor) +# # NOTE: exp_wait_time maps to the cycled backoff of [.1, .5, 1, 2, 4, 8] +# # with leeway added (by allowing 1s each for the 0.1 and 0.5 steps) +# start_time = time.time() +# with monkeypatch.context() as ctx, pytest.raises(SmartSimError) as ex: +# ctx.setenv("_SMARTSIM_INFRA_BACKBONE", the_backbone.descriptor) - ProtoClient(False, wait_timeout=wait_timeout) +# ProtoClient(False, wait_timeout=wait_timeout) - end_time = time.time() - elapsed = end_time - start_time +# end_time = time.time() +# elapsed = end_time - start_time - # todo: revisit. should this trigger any wait if the backbone is set above? - # confirm that we met our timeout - # assert elapsed > wait_timeout, f"below configured timeout {wait_timeout}" +# # todo: revisit. should this trigger any wait if the backbone is set above? +# # confirm that we met our timeout +# # assert elapsed > wait_timeout, f"below configured timeout {wait_timeout}" - # confirm that the total wait time is aligned with the sleep cycle - assert elapsed < exp_wait_max, f"above expected max wait {exp_wait_max}" +# # confirm that the total wait time is aligned with the sleep cycle +# assert elapsed < exp_wait_max, f"above expected max wait {exp_wait_max}" -def test_protoclient_initialization_no_backbone(): - """Verify that attempting to start the client without required environment variables - results in an exception. NOTE: Backbone env var is not set""" +# def test_protoclient_initialization_no_backbone(): +# """Verify that attempting to start the client without required environment variables +# results in an exception. 
NOTE: Backbone env var is not set""" - with pytest.raises(SmartSimError) as ex: - ProtoClient(timing_on=False) +# with pytest.raises(SmartSimError) as ex: +# ProtoClient(timing_on=False) - # confirm the missing value error has been raised - assert {"backbone", "configuration"}.issubset(set(ex.value.args[0].split(" "))) +# # confirm the missing value error has been raised +# assert {"backbone", "configuration"}.issubset(set(ex.value.args[0].split(" "))) -def test_protoclient_initialization( - the_backbone: BackboneFeatureStore, - the_worker_queue: DragonFLIChannel, - monkeypatch: pytest.MonkeyPatch, -): - """Verify that attempting to start the client with required env vars results - in a fully initialized client +# def test_protoclient_initialization( +# the_backbone: BackboneFeatureStore, +# the_worker_queue: DragonFLIChannel, +# monkeypatch: pytest.MonkeyPatch, +# ): +# """Verify that attempting to start the client with required env vars results +# in a fully initialized client - :param the_backbone: a pre-initialized backbone featurestore - :param the_worker_queue: an FLI channel the client will retrieve - from the backbone""" +# :param the_backbone: a pre-initialized backbone featurestore +# :param the_worker_queue: an FLI channel the client will retrieve +# from the backbone""" - with monkeypatch.context() as ctx: - ctx.setenv("_SMARTSIM_INFRA_BACKBONE", the_backbone.descriptor) - # NOTE: backbone[BackboneFeatureStore.MLI_WORKER_QUEUE] set in the_worker_queue fixture +# with monkeypatch.context() as ctx: +# ctx.setenv("_SMARTSIM_INFRA_BACKBONE", the_backbone.descriptor) +# # NOTE: backbone[BackboneFeatureStore.MLI_WORKER_QUEUE] set in the_worker_queue fixture - client = ProtoClient(timing_on=False) +# client = ProtoClient(timing_on=False) - # confirm the backbone was attached correctly - assert client._backbone is not None - assert client._backbone.descriptor == the_backbone.descriptor +# # confirm the backbone was attached correctly +# assert client._backbone is not None +# assert client._backbone.descriptor == the_backbone.descriptor - # confirm the worker queue is created and attached correctly - assert client._to_worker_fli is not None - assert client._to_worker_fli.descriptor == the_worker_queue.descriptor +# # confirm the worker queue is created and attached correctly +# assert client._to_worker_fli is not None +# assert client._to_worker_fli.descriptor == the_worker_queue.descriptor - # confirm the worker channels are created - assert client._from_worker_ch is not None - assert client._from_worker_ch.descriptor +# # confirm the worker channels are created +# assert client._from_worker_ch is not None +# assert client._from_worker_ch.descriptor - assert client._to_worker_ch is not None - assert client._to_worker_ch.descriptor +# assert client._to_worker_ch is not None +# assert client._to_worker_ch.descriptor - # confirm a publisher is created - assert client._publisher is not None +# # confirm a publisher is created +# assert client._publisher is not None -def test_protoclient_write_model( - the_backbone: BackboneFeatureStore, - the_worker_queue: DragonFLIChannel, - monkeypatch: pytest.MonkeyPatch, -): - """Verify that writing a model using the client causes the model data to be - written to a feature store and triggers a key-written event +# def test_protoclient_write_model( +# the_backbone: BackboneFeatureStore, +# the_worker_queue: DragonFLIChannel, +# monkeypatch: pytest.MonkeyPatch, +# ): +# """Verify that writing a model using the client causes the model data to be +# 
written to a feature store and triggers a key-written event - :param the_backbone: a pre-initialized backbone featurestore - :param the_worker_queue: an FLI channel the client will retrieve - from the backbone""" +# :param the_backbone: a pre-initialized backbone featurestore +# :param the_worker_queue: an FLI channel the client will retrieve +# from the backbone""" - with monkeypatch.context() as ctx: - ctx.setenv("_SMARTSIM_INFRA_BACKBONE", the_backbone.descriptor) - # NOTE: backbone[BackboneFeatureStore.MLI_WORKER_QUEUE] set in the_worker_queue fixture +# with monkeypatch.context() as ctx: +# ctx.setenv("_SMARTSIM_INFRA_BACKBONE", the_backbone.descriptor) +# # NOTE: backbone[BackboneFeatureStore.MLI_WORKER_QUEUE] set in the_worker_queue fixture - client = ProtoClient(timing_on=False) +# client = ProtoClient(timing_on=False) - model_key = "my-model" - model_bytes = b"12345" +# model_key = "my-model" +# model_bytes = b"12345" - client.set_model(model_key, model_bytes) +# client.set_model(model_key, model_bytes) - # confirm the client modified the underlying feature store - assert client._backbone[model_key] == model_bytes +# # confirm the client modified the underlying feature store +# assert client._backbone[model_key] == model_bytes - publisher = t.cast(EventBroadcaster, client._publisher) +# publisher = t.cast(EventBroadcaster, client._publisher) - # confirm the client raised the key-written event - assert len(publisher._event_buffer) == 1 +# # confirm the client raised the key-written event +# assert len(publisher._event_buffer) == 1 - event = t.cast(OnWriteFeatureStore, pickle.loads(publisher._event_buffer.pop())) - assert event.descriptor == the_backbone.descriptor - assert event.key == model_key +# event = t.cast(OnWriteFeatureStore, pickle.loads(publisher._event_buffer.pop())) +# assert event.descriptor == the_backbone.descriptor +# assert event.key == model_key diff --git a/tests/dragon/test_request_dispatcher.py b/tests/dragon/test_request_dispatcher.py index 714492f37..e666710e6 100644 --- a/tests/dragon/test_request_dispatcher.py +++ b/tests/dragon/test_request_dispatcher.py @@ -27,6 +27,7 @@ import gc import io import logging +import os import pathlib import socket import time @@ -36,18 +37,16 @@ import numpy as np import pytest -torch = pytest.importorskip("torch") -dragon = pytest.importorskip("dragon") +pytest.importorskip("torch") +pytest.importorskip("dragon") -import base64 -import multiprocessing as mp -try: - mp.set_start_method("dragon") -except Exception: - pass +# isort: off +import dragon +import multiprocessing as mp +import torch -import os +# isort: on import dragon.channels as dch import dragon.infrastructure.policy as dragon_policy @@ -55,14 +54,14 @@ import dragon.native.process as dragon_process import torch.nn as nn from dragon import fli -from dragon.channels import Channel from dragon.data.ddict.ddict import DDict -from dragon.managed_memory import MemoryAlloc, MemoryPool -from dragon.mpbridge.queues import DragonQueue +from dragon.managed_memory import MemoryAlloc from smartsim._core.entrypoints.service import Service -from smartsim._core.mli.comm.channel.channel import CommChannelBase -from smartsim._core.mli.comm.channel.dragon_channel import DragonCommChannel +from smartsim._core.mli.comm.channel.dragon_channel import ( + DragonCommChannel, + create_local, +) from smartsim._core.mli.comm.channel.dragon_fli import DragonFLIChannel from smartsim._core.mli.infrastructure.control.request_dispatcher import ( RequestBatch, @@ -71,6 +70,9 @@ from 
smartsim._core.mli.infrastructure.control.worker_manager import ( EnvironmentConfigLoader, ) +from smartsim._core.mli.infrastructure.storage.backbone_feature_store import ( + BackboneFeatureStore, +) from smartsim._core.mli.infrastructure.storage.dragon_feature_store import ( DragonFeatureStore, ) @@ -79,13 +81,12 @@ from smartsim._core.mli.message_handler import MessageHandler from smartsim.log import get_logger -from .feature_store import FileSystemFeatureStore -from .utils.channel import FileSystemCommChannel - logger = get_logger(__name__) # The tests in this file belong to the dragon group pytestmark = pytest.mark.dragon +mp.set_start_method("dragon") + class MiniModel(nn.Module): def __init__(self): @@ -136,14 +137,18 @@ def persist_model_file(model_path: pathlib.Path) -> pathlib.Path: def mock_messages( request_dispatcher_queue: DragonFLIChannel, feature_store: FeatureStore, + parent_iteration: int, + callback_descriptor: str, ) -> None: """Mock event producer for triggering the inference pipeline""" model_key = "mini-model" + # mock_message sends 2 messages, so we offset by 2 * (# of iterations in caller) + offset = 2 * parent_iteration for iteration_number in range(2): + logged_iteration = offset + iteration_number + logger.debug(f"Sending mock message {logged_iteration}") - channel = Channel.make_process_local() - callback_channel = DragonCommChannel(channel) output_key = f"output-{iteration_number}" feature_store[model_key] = load_model() @@ -157,25 +162,35 @@ def mock_messages( "c", "float32", list(tensor.shape) ) - message_tensor_output_key = MessageHandler.build_feature_store_key( - output_key, fsd - ) - message_model_key = MessageHandler.build_feature_store_key(model_key, fsd) + message_tensor_output_key = MessageHandler.build_tensor_key(output_key, fsd) + message_model_key = MessageHandler.build_model_key(model_key, fsd) request = MessageHandler.build_request( - reply_channel=callback_channel.descriptor, + reply_channel=callback_descriptor, model=message_model_key, inputs=[tensor_desc], outputs=[message_tensor_output_key], output_descriptors=[], custom_attributes=None, ) + + logger.info(f"Sending request {iteration_number} to request_dispatcher_queue") request_bytes = MessageHandler.serialize_request(request) with request_dispatcher_queue._fli.sendh( timeout=None, stream_channel=request_dispatcher_queue._channel ) as sendh: sendh.send_bytes(request_bytes) sendh.send_bytes(tensor.tobytes()) + + logger.info( + f"Retrieving {iteration_number} from callback channel: {callback_descriptor}" + ) + callback_channel = DragonCommChannel.from_descriptor(callback_descriptor) + + # Results will be empty. The test pulls messages off the queue before they + # can be serviced by a worker. Just ensure the callback channel works. + results = callback_channel.recv(timeout=0.1) + logger.debug(f"Received mock message results on callback channel: {results}") time.sleep(1) @@ -216,16 +231,17 @@ def test_request_dispatcher() -> None: longer referenced by the dispatcher. 
""" - to_worker_channel = dch.Channel.make_process_local() + to_worker_channel = create_local() to_worker_fli = fli.FLInterface(main_ch=to_worker_channel, manager_ch=None) to_worker_fli_comm_ch = DragonFLIChannel(to_worker_fli, sender_supplied=True) + ddict = DDict(1, 2, 4 * 1024**2) + backbone_fs = BackboneFeatureStore(ddict, allow_reserved_writes=True) + # NOTE: env vars should be set prior to instantiating EnvironmentConfigLoader # or test environment may be unable to send messages w/queue - os.environ["_SMARTSIM_REQUEST_QUEUE"] = to_worker_fli_comm_ch.descriptor - - ddict = DDict(1, 2, 4 * 1024**2) - dragon_fs = DragonFeatureStore(ddict) + os.environ[BackboneFeatureStore.MLI_WORKER_QUEUE] = to_worker_fli_comm_ch.descriptor + os.environ[BackboneFeatureStore.MLI_BACKBONE] = backbone_fs.descriptor config_loader = EnvironmentConfigLoader( featurestore_factory=DragonFeatureStore.from_descriptor, @@ -243,46 +259,49 @@ def test_request_dispatcher() -> None: worker_queue = config_loader.get_queue() if worker_queue is None: - logger.warn( + logger.warning( "FLI input queue not loaded correctly from config_loader: " f"{config_loader._queue_descriptor}" ) request_dispatcher._on_start() - for _ in range(2): + for i in range(2): batch: t.Optional[RequestBatch] = None mem_allocs = [] tensors = [] - model_key = "mini-model" + + # NOTE: creating callbacks in test to avoid a local channel being torn + # down when mock_messages terms but before the final response message is sent + + callback_channel = DragonCommChannel.from_local() # create a mock client application to populate the request queue msg_pump = mp.Process( target=mock_messages, - args=( - worker_queue, - dragon_fs, - ), + args=(worker_queue, backbone_fs, i, callback_channel.descriptor), ) msg_pump.start() time.sleep(1) - for _ in range(15): + for _ in range(200): try: request_dispatcher._on_iteration() - batch = request_dispatcher.task_queue.get(timeout=1) + batch = request_dispatcher.task_queue.get(timeout=0.1) break except Empty: continue except Exception as exc: raise exc - try: - assert batch is not None - assert batch.has_valid_requests + assert batch is not None + assert batch.has_valid_requests + model_key = batch.model_id.key + + try: transform_result = batch.inputs for transformed, dims, dtype in zip( transform_result.transformed, diff --git a/tests/dragon/test_torch_worker.py b/tests/dragon/test_torch_worker.py index 9a5ed6309..2a9e7d01b 100644 --- a/tests/dragon/test_torch_worker.py +++ b/tests/dragon/test_torch_worker.py @@ -37,7 +37,7 @@ from torch import nn from torch.nn import functional as F -from smartsim._core.mli.infrastructure.storage.feature_store import FeatureStoreKey +from smartsim._core.mli.infrastructure.storage.feature_store import ModelKey from smartsim._core.mli.infrastructure.worker.torch_worker import TorchWorker from smartsim._core.mli.infrastructure.worker.worker import ( ExecuteResult, @@ -109,7 +109,7 @@ def get_request() -> InferenceRequest: ] return InferenceRequest( - model_key=FeatureStoreKey(key="model", descriptor="xyz"), + model_key=ModelKey(key="model", descriptor="xyz"), callback=None, raw_inputs=tensor_numpy, input_keys=None, diff --git a/tests/dragon/test_worker_manager.py b/tests/dragon/test_worker_manager.py index 43b8cc7ec..b2ddb3481 100644 --- a/tests/dragon/test_worker_manager.py +++ b/tests/dragon/test_worker_manager.py @@ -146,7 +146,7 @@ # model_bytes = load_model() # backbone[model_key] = model_bytes -# message_model_key = MessageHandler.build_feature_store_key( +# message_model_key = 
MessageHandler.build_model_key( # model_key, backbone.descriptor # ) @@ -183,10 +183,10 @@ # fsd = backbone.descriptor -# # message_tensor_output_key = MessageHandler.build_feature_store_key( +# # message_tensor_output_key = MessageHandler.build_tensor_key( # # output_key, fsd # # ) -# # message_tensor_input_key = MessageHandler.build_feature_store_key( +# # message_tensor_input_key = MessageHandler.build_tensor_key( # # input_key, fsd # # ) diff --git a/tests/dragon/utils/channel.py b/tests/dragon/utils/channel.py index 09e1703bc..b00ba9aa2 100644 --- a/tests/dragon/utils/channel.py +++ b/tests/dragon/utils/channel.py @@ -117,5 +117,5 @@ def from_descriptor( path = pathlib.Path(descriptor) return FileSystemCommChannel(path) except: - logger.warning(f"failed to create fs comm channel: {descriptor!r}") + logger.warning(f"failed to create fs comm channel: {descriptor}") raise diff --git a/tests/mli/test_integrated_torch_worker.py b/tests/mli/test_integrated_torch_worker.py index 67a9a4a9b..60f1f0c6b 100644 --- a/tests/mli/test_integrated_torch_worker.py +++ b/tests/mli/test_integrated_torch_worker.py @@ -106,9 +106,9 @@ def persist_torch_model(test_dir: str) -> pathlib.Path: # output_key = f"demo-output" -# message_tensor_output_key = MessageHandler.build_feature_store_key(output_key) -# message_tensor_input_key = MessageHandler.build_feature_store_key(input_key) -# message_model_key = MessageHandler.build_feature_store_key(model_key) +# message_tensor_output_key = MessageHandler.build_tensor_key(output_key) +# message_tensor_input_key = MessageHandler.build_tensor_key(input_key) +# message_model_key = MessageHandler.build_model_key(model_key) # request = MessageHandler.build_request( # reply_channel=callback_channel.descriptor, @@ -146,9 +146,9 @@ def persist_torch_model(test_dir: str) -> pathlib.Path: # output_key = f"demo-output" -# message_tensor_output_key = MessageHandler.build_feature_store_key(output_key) -# message_tensor_input_key = MessageHandler.build_feature_store_key(input_key) -# # message_model_key = MessageHandler.build_feature_store_key(model_key) +# message_tensor_output_key = MessageHandler.build_tensor_key(output_key) +# message_tensor_input_key = MessageHandler.build_tensor_key(input_key) +# # message_model_key = MessageHandler.build_model_key(model_key) # request = MessageHandler.build_request( # reply_channel=callback_channel.descriptor, @@ -187,9 +187,9 @@ def persist_torch_model(test_dir: str) -> pathlib.Path: # output_key = f"demo-output" -# message_tensor_output_key = MessageHandler.build_feature_store_key(output_key) -# # message_tensor_input_key = MessageHandler.build_feature_store_key(input_key) -# # message_model_key = MessageHandler.build_feature_store_key(model_key) +# message_tensor_output_key = MessageHandler.build_tensor_key(output_key) +# # message_tensor_input_key = MessageHandler.build_tensor_key(input_key) +# # message_model_key = MessageHandler.build_model_key(model_key) # message_tensor_input = MessageHandler.build_tensor( # input_tensor, "c", "float32", [2] # ) @@ -231,9 +231,9 @@ def persist_torch_model(test_dir: str) -> pathlib.Path: # output_key = f"demo-output" -# # message_tensor_output_key = MessageHandler.build_feature_store_key(output_key) -# # message_tensor_input_key = MessageHandler.build_feature_store_key(input_key) -# message_model_key = MessageHandler.build_feature_store_key(model_key) +# # message_tensor_output_key = MessageHandler.build_tensor_key(output_key) +# # message_tensor_input_key = MessageHandler.build_tensor_key(input_key) 
+# message_model_key = MessageHandler.build_model_key(model_key) # message_tensor_input = MessageHandler.build_tensor( # input_tensor, "c", "float32", [2] # ) diff --git a/tests/test_dragon_installer.py b/tests/test_dragon_installer.py index 7b678239a..b1d8cd34c 100644 --- a/tests/test_dragon_installer.py +++ b/tests/test_dragon_installer.py @@ -511,10 +511,18 @@ def test_create_dotenv_existing_dotenv(monkeypatch: pytest.MonkeyPatch, test_dir # ensure file was overwritten and env vars are not duplicated dotenv_content = exp_env_path.read_text(encoding="utf-8") - split_content = dotenv_content.split(var_name) - - # split to confirm env var only appars once - assert len(split_content) == 2 + lines = [ + line for line in dotenv_content.split("\n") if line and not "#" in line + ] + for line in lines: + if line.startswith(var_name): + # make sure the var isn't defined recursively + # DRAGON_BASE_DIR=$DRAGON_BASE_DIR + assert var_name not in line[len(var_name) + 1 :] + else: + # make sure any values reference the original base dir var + if var_name in line: + assert f"${var_name}" in line def test_create_dotenv_format(monkeypatch: pytest.MonkeyPatch, test_dir: str): @@ -532,7 +540,7 @@ def test_create_dotenv_format(monkeypatch: pytest.MonkeyPatch, test_dir: str): content = exp_env_path.read_text(encoding="utf-8") # ensure we have values written, but ignore empty lines - lines = [line for line in content.split("\n") if line] + lines = [line for line in content.split("\n") if line and not "#" in line] assert lines # ensure each line is formatted as key=value diff --git a/tests/test_dragon_launcher.py b/tests/test_dragon_launcher.py index 37c46a573..ea45a2cb7 100644 --- a/tests/test_dragon_launcher.py +++ b/tests/test_dragon_launcher.py @@ -510,7 +510,26 @@ def test_load_env_env_file_created(monkeypatch: pytest.MonkeyPatch, test_dir: st assert loaded_env # confirm .env was parsed as expected by inspecting a key + assert "DRAGON_BASE_DIR" in loaded_env + base_dir = loaded_env["DRAGON_BASE_DIR"] + assert "DRAGON_ROOT_DIR" in loaded_env + assert loaded_env["DRAGON_ROOT_DIR"] == base_dir + + assert "DRAGON_INCLUDE_DIR" in loaded_env + assert loaded_env["DRAGON_INCLUDE_DIR"] == f"{base_dir}/include" + + assert "DRAGON_LIB_DIR" in loaded_env + assert loaded_env["DRAGON_LIB_DIR"] == f"{base_dir}/lib" + + assert "DRAGON_VERSION" in loaded_env + assert loaded_env["DRAGON_VERSION"] == DEFAULT_DRAGON_VERSION + + assert "PATH" in loaded_env + assert loaded_env["PATH"] == f"{base_dir}/bin" + + assert "LD_LIBRARY_PATH" in loaded_env + assert loaded_env["LD_LIBRARY_PATH"] == f"{base_dir}/lib" def test_load_env_cached_env(monkeypatch: pytest.MonkeyPatch, test_dir: str): diff --git a/tests/test_featurestore.py b/tests/test_featurestore.py deleted file mode 100644 index f0b122bcf..000000000 --- a/tests/test_featurestore.py +++ /dev/null @@ -1,711 +0,0 @@ -# # BSD 2-Clause License -# # -# # Copyright (c) 2021-2024, Hewlett Packard Enterprise -# # All rights reserved. -# # -# # Redistribution and use in source and binary forms, with or without -# # modification, are permitted provided that the following conditions are met: -# # -# # 1. Redistributions of source code must retain the above copyright notice, this -# # list of conditions and the following disclaimer. -# # -# # 2. Redistributions in binary form must reproduce the above copyright notice, -# # this list of conditions and the following disclaimer in the documentation -# # and/or other materials provided with the distribution. 
-# # -# # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -# # AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -# # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -# # DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE -# # FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -# # DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -# # SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -# # CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -# # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -# import pathlib -# import time -# import typing as t -# import unittest.mock as mock - -# import pytest - -# dragon = pytest.importorskip("dragon") - -# from smartsim._core.mli.infrastructure.storage.backbone_feature_store import ( -# BackboneFeatureStore, -# EventBroadcaster, -# EventCategory, -# EventConsumer, -# OnCreateConsumer, -# OnWriteFeatureStore, -# ) -# from smartsim._core.mli.infrastructure.storage.backbone_feature_store import ( -# time as bbtime, -# ) -# from smartsim._core.mli.infrastructure.storage.dragon_feature_store import ( -# DragonFeatureStore, -# ) -# from smartsim._core.mli.infrastructure.storage.feature_store import ReservedKeys -# from smartsim.error import SmartSimError -# from tests.mli.channel import FileSystemCommChannel -# from tests.mli.feature_store import MemoryFeatureStore - -# if t.TYPE_CHECKING: -# import conftest - - -# # The tests in this file belong to the group_a group -# pytestmark = pytest.mark.group_a - -# WORK_QUEUE_KEY = "_SMARTSIM_REQUEST_QUEUE" -# RANDOMLY_SET_KEY = "_SOMETHING_ELSE" - - -# @pytest.fixture -# def storage_for_dragon_fs_with_req_queue() -> t.Dict[str, str]: -# storage = {WORK_QUEUE_KEY: "12345", RANDOMLY_SET_KEY: "67890"} -# return storage - - -# def boom(*args, **kwargs) -> None: -# """Helper function that blows up when used to mock up -# some other function""" -# raise Exception(f"you shall not pass! 
{args}, {kwargs}") - - -# def test_event_uid() -> None: -# """Verify that all events include a unique identifier""" -# uids: t.Set[str] = set() -# num_iters = 1000 - -# # generate a bunch of events and keep track all the IDs -# for i in range(num_iters): -# event_a = OnCreateConsumer(str(i), []) -# event_b = OnWriteFeatureStore(str(i), "key") - -# uids.add(event_a.uid) -# uids.add(event_b.uid) - -# # verify each event created a unique ID -# assert len(uids) == 2 * num_iters - - -# def test_mli_reserved_keys_conversion() -> None: -# """Verify that conversion from a string to an enum member -# works as expected""" - -# for reserved_key in ReservedKeys: -# # iterate through all keys and verify `from_string` works -# assert ReservedKeys.contains(reserved_key.value) - -# # show that the value (actual key) not the enum member name -# # will not be incorrectly identified as reserved -# assert not ReservedKeys.contains(str(reserved_key).split(".")[1]) - - -# def test_mli_reserved_keys_writes() -> None: -# """Verify that attempts to write to reserved keys are blocked from a -# standard DragonFeatureStore but enabled with the BackboneFeatureStore""" - -# mock_storage = {} -# dfs = DragonFeatureStore(mock_storage) -# backbone = BackboneFeatureStore(mock_storage, allow_reserved_writes=True) -# other = MemoryFeatureStore(mock_storage) - -# expected_value = "value" - -# for reserved_key in ReservedKeys: -# # we expect every reserved key to fail using DragonFeatureStore... -# with pytest.raises(SmartSimError) as ex: -# dfs[reserved_key] = expected_value - -# assert "reserved key" in ex.value.args[0] - -# # ... and expect other feature stores to respect reserved keys -# with pytest.raises(SmartSimError) as ex: -# other[reserved_key] = expected_value - -# assert "reserved key" in ex.value.args[0] - -# # ...and those same keys to succeed on the backbone -# backbone[reserved_key] = expected_value -# actual_value = backbone[reserved_key] -# assert actual_value == expected_value - - -# def test_mli_consumers_read_by_key() -> None: -# """Verify that the value returned from the mli consumers -# method is written to the correct key and reads are -# allowed via standard dragon feature store. 
-# NOTE: should reserved reads also be blocked""" - -# mock_storage = {} -# dfs = DragonFeatureStore(mock_storage) -# backbone = BackboneFeatureStore(mock_storage, allow_reserved_writes=True) -# other = MemoryFeatureStore(mock_storage) - -# expected_value = "value" - -# # write using backbone that has permission to write reserved keys -# backbone[ReservedKeys.MLI_NOTIFY_CONSUMERS] = expected_value - -# # confirm read-only access to reserved keys from any FeatureStore -# for fs in [dfs, backbone, other]: -# assert fs[ReservedKeys.MLI_NOTIFY_CONSUMERS] == expected_value - - -# def test_mli_consumers_read_by_backbone() -> None: -# """Verify that the backbone reads the correct location -# when using the backbone feature store API instead of mapping API""" - -# mock_storage = {} -# backbone = BackboneFeatureStore(mock_storage, allow_reserved_writes=True) -# expected_value = "value" - -# backbone[ReservedKeys.MLI_NOTIFY_CONSUMERS] = expected_value - -# # confirm reading via convenience method returns expected value -# assert backbone.notification_channels[0] == expected_value - - -# def test_mli_consumers_write_by_backbone() -> None: -# """Verify that the backbone writes the correct location -# when using the backbone feature store API instead of mapping API""" - -# mock_storage = {} -# backbone = BackboneFeatureStore(mock_storage, allow_reserved_writes=True) -# expected_value = ["value"] - -# backbone.notification_channels = expected_value - -# # confirm write using convenience method targets expected key -# assert backbone[ReservedKeys.MLI_NOTIFY_CONSUMERS] == ",".join(expected_value) - - -# def test_eventpublisher_broadcast_no_factory(test_dir: str) -> None: -# """Verify that a broadcast operation without any registered subscribers -# succeeds without raising Exceptions - -# :param test_dir: pytest fixture automatically generating unique working -# directories for individual test outputs""" -# storage_path = pathlib.Path(test_dir) / "features" -# mock_storage = {} -# consumer_descriptor = storage_path / "test-consumer" - -# # NOTE: we're not putting any consumers into the backbone here! 
-# backbone = BackboneFeatureStore(mock_storage) - -# event = OnCreateConsumer(consumer_descriptor, []) - -# publisher = EventBroadcaster(backbone) -# num_receivers = 0 - -# # publishing this event without any known consumers registered should succeed -# # but report that it didn't have anybody to send the event to -# consumer_descriptor = storage_path / f"test-consumer" -# event = OnCreateConsumer(consumer_descriptor, []) - -# num_receivers += publisher.send(event) - -# # confirm no changes to the backbone occur when fetching the empty consumer key -# key_in_features_store = ReservedKeys.MLI_NOTIFY_CONSUMERS in backbone -# assert not key_in_features_store - -# # confirm that the broadcast reports no events published -# assert num_receivers == 0 -# # confirm that the broadcast buffered the event for a later send -# assert publisher.num_buffered == 1 - - -# def test_eventpublisher_broadcast_to_empty_consumer_list(test_dir: str) -> None: -# """Verify that a broadcast operation without any registered subscribers -# succeeds without raising Exceptions - -# :param test_dir: pytest fixture automatically generating unique working -# directories for individual test outputs""" -# storage_path = pathlib.Path(test_dir) / "features" -# mock_storage = {} - -# # note: file-system descriptors are just paths -# consumer_descriptor = storage_path / "test-consumer" - -# # prep our backbone with a consumer list -# backbone = BackboneFeatureStore(mock_storage, allow_reserved_writes=True) -# backbone.notification_channels = [] - -# event = OnCreateConsumer(consumer_descriptor, []) -# publisher = EventBroadcaster( -# backbone, channel_factory=FileSystemCommChannel.from_descriptor -# ) -# num_receivers = publisher.send(event) - -# registered_consumers = backbone[ReservedKeys.MLI_NOTIFY_CONSUMERS] - -# # confirm that no consumers exist in backbone to send to -# assert not registered_consumers -# # confirm that the broadcast reports no events published -# assert num_receivers == 0 -# # confirm that the broadcast buffered the event for a later send -# assert publisher.num_buffered == 1 - - -# def test_eventpublisher_broadcast_without_channel_factory(test_dir: str) -> None: -# """Verify that a broadcast operation reports an error if no channel -# factory was supplied for constructing the consumer channels - -# :param test_dir: pytest fixture automatically generating unique working -# directories for individual test outputs""" -# storage_path = pathlib.Path(test_dir) / "features" -# mock_storage = {} - -# # note: file-system descriptors are just paths -# consumer_descriptor = storage_path / "test-consumer" - -# # prep our backbone with a consumer list -# backbone = BackboneFeatureStore(mock_storage, allow_reserved_writes=True) -# backbone.notification_channels = [consumer_descriptor] - -# event = OnCreateConsumer(consumer_descriptor, []) -# publisher = EventBroadcaster( -# backbone, -# # channel_factory=FileSystemCommChannel.from_descriptor # <--- not supplied -# ) - -# with pytest.raises(SmartSimError) as ex: -# publisher.send(event) - -# assert "factory" in ex.value.args[0] - - -# def test_eventpublisher_broadcast_empties_buffer(test_dir: str) -> None: -# """Verify that a successful broadcast clears messages from the event -# buffer when a new message is sent and consumers are registered - -# :param test_dir: pytest fixture automatically generating unique working -# directories for individual test outputs""" -# storage_path = pathlib.Path(test_dir) / "features" -# mock_storage = {} - -# # note: file-system 
descriptors are just paths -# consumer_descriptor = storage_path / "test-consumer" - -# backbone = BackboneFeatureStore(mock_storage, allow_reserved_writes=True) -# backbone.notification_channels = (consumer_descriptor,) - -# publisher = EventBroadcaster( -# backbone, channel_factory=FileSystemCommChannel.from_descriptor -# ) - -# # mock building up some buffered events -# num_buffered_events = 14 -# for i in range(num_buffered_events): -# event = OnCreateConsumer(storage_path / f"test-consumer-{str(i)}", []) -# publisher._event_buffer.append(bytes(event)) - -# event0 = OnCreateConsumer( -# storage_path / f"test-consumer-{str(num_buffered_events + 1)}", [] -# ) - -# num_receivers = publisher.send(event0) -# # 1 receiver x 15 total events == 15 events -# assert num_receivers == num_buffered_events + 1 - - -# @pytest.mark.parametrize( -# "num_consumers, num_buffered, expected_num_sent", -# [ -# pytest.param(0, 7, 0, id="0 x (7+1) - no consumers, multi-buffer"), -# pytest.param(1, 7, 8, id="1 x (7+1) - single consumer, multi-buffer"), -# pytest.param(2, 7, 16, id="2 x (7+1) - multi-consumer, multi-buffer"), -# pytest.param(4, 4, 20, id="4 x (4+1) - multi-consumer, multi-buffer (odd #)"), -# pytest.param(9, 0, 9, id="13 x (0+1) - multi-consumer, empty buffer"), -# ], -# ) -# def test_eventpublisher_broadcast_returns_total_sent( -# test_dir: str, num_consumers: int, num_buffered: int, expected_num_sent: int -# ) -> None: -# """Verify that a successful broadcast returns the total number of events -# sent, including buffered messages. - -# :param test_dir: pytest fixture automatically generating unique working -# directories for individual test outputs -# :param num_consumers: the number of consumers to mock setting up prior to send -# :param num_buffered: the number of pre-buffered events to mock up -# :param expected_num_sent: the expected result from calling send -# """ -# storage_path = pathlib.Path(test_dir) / "features" -# mock_storage = {} - -# # note: file-system descriptors are just paths -# consumers = [] -# for i in range(num_consumers): -# consumers.append(storage_path / f"test-consumer-{i}") - -# backbone = BackboneFeatureStore(mock_storage, allow_reserved_writes=True) -# backbone.notification_channels = consumers - -# publisher = EventBroadcaster( -# backbone, channel_factory=FileSystemCommChannel.from_descriptor -# ) - -# # mock building up some buffered events -# for i in range(num_buffered): -# event = OnCreateConsumer(storage_path / f"test-consumer-{str(i)}", []) -# publisher._event_buffer.append(bytes(event)) - -# assert publisher.num_buffered == num_buffered - -# # this event will trigger clearing anything already in buffer -# event0 = OnCreateConsumer(storage_path / f"test-consumer-{num_buffered}", []) - -# # num_receivers should contain a number that computes w/all consumers and all events -# num_receivers = publisher.send(event0) - -# assert num_receivers == expected_num_sent - - -# def test_eventpublisher_prune_unused_consumer(test_dir: str) -> None: -# """Verify that any unused consumers are pruned each time a new event is sent - -# :param test_dir: pytest fixture automatically generating unique working -# directories for individual test outputs""" -# storage_path = pathlib.Path(test_dir) / "features" -# mock_storage = {} - -# # note: file-system descriptors are just paths -# consumer_descriptor = storage_path / "test-consumer" - -# backbone = BackboneFeatureStore(mock_storage, allow_reserved_writes=True) - -# publisher = EventBroadcaster( -# backbone, 
channel_factory=FileSystemCommChannel.from_descriptor -# ) - -# event = OnCreateConsumer(consumer_descriptor, []) - -# # the only registered cnosumer is in the event, expect no pruning -# backbone.notification_channels = (consumer_descriptor,) - -# publisher.send(event) -# assert str(consumer_descriptor) in publisher._channel_cache -# assert len(publisher._channel_cache) == 1 - -# # add a new descriptor for another event... -# consumer_descriptor2 = storage_path / "test-consumer-2" -# # ... and remove the old descriptor from the backbone when it's looked up -# backbone.notification_channels = (consumer_descriptor2,) - -# event = OnCreateConsumer(consumer_descriptor2, []) - -# publisher.send(event) - -# assert str(consumer_descriptor2) in publisher._channel_cache -# assert str(consumer_descriptor) not in publisher._channel_cache -# assert len(publisher._channel_cache) == 1 - -# # test multi-consumer pruning by caching some extra channels -# prune0, prune1, prune2 = "abc", "def", "ghi" -# publisher._channel_cache[prune0] = "doesnt-matter-if-it-is-pruned" -# publisher._channel_cache[prune1] = "doesnt-matter-if-it-is-pruned" -# publisher._channel_cache[prune2] = "doesnt-matter-if-it-is-pruned" - -# # add in one of our old channels so we prune the above items, send to these -# backbone.notification_channels = (consumer_descriptor, consumer_descriptor2) - -# publisher.send(event) - -# assert str(consumer_descriptor2) in publisher._channel_cache - -# # NOTE: we should NOT prune something that isn't used by this message but -# # does appear in `backbone.notification_channels` -# assert str(consumer_descriptor) in publisher._channel_cache - -# # confirm all of our items that were not in the notification channels are gone -# for pruned in [prune0, prune1, prune2]: -# assert pruned not in publisher._channel_cache - -# # confirm we have only the two expected items in the channel cache -# assert len(publisher._channel_cache) == 2 - - -# def test_eventpublisher_serialize_failure( -# test_dir: str, monkeypatch: pytest.MonkeyPatch -# ) -> None: -# """Verify that errors during message serialization are raised to the caller - -# :param test_dir: pytest fixture automatically generating unique working -# directories for individual test outputs -# :param monkeypatch: pytest fixture for modifying behavior of existing code -# with mock implementations""" -# storage_path = pathlib.Path(test_dir) / "features" -# storage_path.mkdir(parents=True, exist_ok=True) - -# mock_storage = {} - -# # note: file-system descriptors are just paths -# target_descriptor = str(storage_path / "test-consumer") - -# backbone = BackboneFeatureStore(mock_storage, allow_reserved_writes=True) -# publisher = EventBroadcaster( -# backbone, channel_factory=FileSystemCommChannel.from_descriptor -# ) - -# with monkeypatch.context() as patch: -# event = OnCreateConsumer(target_descriptor, []) - -# # patch the __bytes__ implementation to cause pickling to fail during send -# patch.setattr(event, "__bytes__", lambda x: b"abc") - -# backbone.notification_channels = (target_descriptor,) - -# # send a message into the channel -# with pytest.raises(ValueError) as ex: -# publisher.send(event) - -# assert "serialize" in ex.value.args[0] - - -# def test_eventpublisher_factory_failure( -# test_dir: str, monkeypatch: pytest.MonkeyPatch -# ) -> None: -# """Verify that errors during channel construction are raised to the caller - -# :param test_dir: pytest fixture automatically generating unique working -# directories for individual test outputs -# :param 
monkeypatch: pytest fixture for modifying behavior of existing code -# with mock implementations""" -# storage_path = pathlib.Path(test_dir) / "features" -# storage_path.mkdir(parents=True, exist_ok=True) - -# mock_storage = {} - -# # note: file-system descriptors are just paths -# target_descriptor = str(storage_path / "test-consumer") - -# def boom(descriptor: str) -> None: -# raise Exception(f"you shall not pass! {descriptor}") - -# backbone = BackboneFeatureStore(mock_storage, allow_reserved_writes=True) -# publisher = EventBroadcaster(backbone, channel_factory=boom) - -# with monkeypatch.context() as patch: -# event = OnCreateConsumer(target_descriptor, []) - -# backbone.notification_channels = (target_descriptor,) - -# # send a message into the channel -# with pytest.raises(SmartSimError) as ex: -# publisher.send(event) - -# assert "construct" in ex.value.args[0] - - -# def test_eventpublisher_failure(test_dir: str, monkeypatch: pytest.MonkeyPatch) -> None: -# """Verify that unexpected errors during message send are caught and wrapped in a -# SmartSimError so they are not propagated directly to the caller - -# :param test_dir: pytest fixture automatically generating unique working -# directories for individual test outputs -# :param monkeypatch: pytest fixture for modifying behavior of existing code -# with mock implementations""" -# storage_path = pathlib.Path(test_dir) / "features" -# storage_path.mkdir(parents=True, exist_ok=True) - -# mock_storage = {} - -# # note: file-system descriptors are just paths -# target_descriptor = str(storage_path / "test-consumer") - -# backbone = BackboneFeatureStore(mock_storage, allow_reserved_writes=True) -# publisher = EventBroadcaster( -# backbone, channel_factory=FileSystemCommChannel.from_descriptor -# ) - -# def boom(self) -> None: -# raise Exception("That was unexpected...") - -# with monkeypatch.context() as patch: -# event = OnCreateConsumer(target_descriptor, []) - -# # patch the _broadcast implementation to cause send to fail after -# # after the event has been pickled -# patch.setattr(publisher, "_broadcast", boom) - -# backbone.notification_channels = (target_descriptor,) - -# # Here, we see the exception raised by broadcast that isn't expected -# # is not allowed directly out, and instead is wrapped in SmartSimError -# with pytest.raises(SmartSimError) as ex: -# publisher.send(event) - -# assert "unexpected" in ex.value.args[0] - - -# def test_eventconsumer_receive(test_dir: str) -> None: -# """Verify that a consumer retrieves a message from the given channel - -# :param test_dir: pytest fixture automatically generating unique working -# directories for individual test outputs""" -# storage_path = pathlib.Path(test_dir) / "features" -# storage_path.mkdir(parents=True, exist_ok=True) - -# mock_storage = {} - -# # note: file-system descriptors are just paths -# target_descriptor = str(storage_path / "test-consumer") - -# backbone = BackboneFeatureStore(mock_storage, allow_reserved_writes=True) -# comm_channel = FileSystemCommChannel.from_descriptor(target_descriptor) -# event = OnCreateConsumer(target_descriptor, []) - -# # simulate a sent event by writing directly to the input comm channel -# comm_channel.send(bytes(event)) - -# consumer = EventConsumer(comm_channel, backbone) - -# all_received: t.List[OnCreateConsumer] = consumer.receive() -# assert len(all_received) == 1 - -# # verify we received the same event that was raised -# assert all_received[0].category == event.category -# assert all_received[0].descriptor == 
event.descriptor - - -# @pytest.mark.parametrize("num_sent", [0, 1, 2, 4, 8, 16]) -# def test_eventconsumer_receive_multi(test_dir: str, num_sent: int) -> None: -# """Verify that a consumer retrieves multiple message from the given channel - -# :param test_dir: pytest fixture automatically generating unique working -# directories for individual test outputs -# :param num_sent: parameterized value used to vary the number of events -# that are enqueued and validations are checked at multiple queue sizes""" -# storage_path = pathlib.Path(test_dir) / "features" -# storage_path.mkdir(parents=True, exist_ok=True) - -# mock_storage = {} - -# # note: file-system descriptors are just paths -# target_descriptor = str(storage_path / "test-consumer") - -# backbone = BackboneFeatureStore(mock_storage, allow_reserved_writes=True) -# comm_channel = FileSystemCommChannel.from_descriptor(target_descriptor) - -# # simulate multiple sent events by writing directly to the input comm channel -# for _ in range(num_sent): -# event = OnCreateConsumer(target_descriptor, []) -# comm_channel.send(bytes(event)) - -# consumer = EventConsumer(comm_channel, backbone) - -# all_received: t.List[OnCreateConsumer] = consumer.receive() -# assert len(all_received) == num_sent - - -# def test_eventconsumer_receive_empty(test_dir: str) -> None: -# """Verify that a consumer receiving an empty message ignores the -# message and continues processing - -# :param test_dir: pytest fixture automatically generating unique working -# directories for individual test outputs""" -# storage_path = pathlib.Path(test_dir) / "features" -# storage_path.mkdir(parents=True, exist_ok=True) - -# mock_storage = {} - -# # note: file-system descriptors are just paths -# target_descriptor = str(storage_path / "test-consumer") - -# backbone = BackboneFeatureStore(mock_storage, allow_reserved_writes=True) -# comm_channel = FileSystemCommChannel.from_descriptor(target_descriptor) - -# # simulate a sent event by writing directly to the input comm channel -# comm_channel.send(bytes(b"")) - -# consumer = EventConsumer(comm_channel, backbone) - -# messages = consumer.receive() - -# # the messages array should be empty -# assert not messages - - -# def test_eventconsumer_eventpublisher_integration(test_dir: str) -> None: -# """Verify that the publisher and consumer integrate as expected when -# multiple publishers and consumers are sending simultaneously. 
- -# :param test_dir: pytest fixture automatically generating unique working -# directories for individual test outputs""" -# storage_path = pathlib.Path(test_dir) / "features" -# storage_path.mkdir(parents=True, exist_ok=True) - -# mock_storage = {} -# backbone = BackboneFeatureStore(mock_storage, allow_reserved_writes=True) -# mock_fs_descriptor = str(storage_path / f"mock-feature-store") - -# wmgr_channel = FileSystemCommChannel(storage_path / "test-wmgr") -# capp_channel = FileSystemCommChannel(storage_path / "test-capp") -# back_channel = FileSystemCommChannel(storage_path / "test-backend") - -# wmgr_consumer_descriptor = wmgr_channel.descriptor -# capp_consumer_descriptor = capp_channel.descriptor -# back_consumer_descriptor = back_channel.descriptor - -# # create some consumers to receive messages -# wmgr_consumer = EventConsumer( -# wmgr_channel, -# backbone, -# filters=[EventCategory.FEATURE_STORE_WRITTEN], -# ) -# capp_consumer = EventConsumer( -# capp_channel, -# backbone, -# ) -# back_consumer = EventConsumer( -# back_channel, -# backbone, -# filters=[EventCategory.CONSUMER_CREATED], -# ) - -# # create some broadcasters to publish messages -# mock_worker_mgr = EventBroadcaster( -# backbone, -# channel_factory=FileSystemCommChannel.from_descriptor, -# ) -# mock_client_app = EventBroadcaster( -# backbone, -# channel_factory=FileSystemCommChannel.from_descriptor, -# ) - -# # register all of the consumers even though the OnCreateConsumer really should -# # trigger its registration. event processing is tested elsewhere. -# backbone.notification_channels = [ -# wmgr_consumer_descriptor, -# capp_consumer_descriptor, -# back_consumer_descriptor, -# ] - -# # simulate worker manager sending a notification to backend that it's alive -# event_1 = OnCreateConsumer(wmgr_consumer_descriptor, []) -# mock_worker_mgr.send(event_1) - -# # simulate the app updating a model a few times -# event_2 = OnWriteFeatureStore(mock_fs_descriptor, "key-1") -# event_3 = OnWriteFeatureStore(mock_fs_descriptor, "key-2") -# event_4 = OnWriteFeatureStore(mock_fs_descriptor, "key-1") - -# mock_client_app.send(event_2) -# mock_client_app.send(event_3) -# mock_client_app.send(event_4) - -# # worker manager should only get updates about feature update -# wmgr_messages = wmgr_consumer.receive() -# assert len(wmgr_messages) == 3 - -# # the backend should only receive messages about consumer creation -# back_messages = back_consumer.receive() -# assert len(back_messages) == 1 - -# # hypothetical app has no filters and will get all events -# app_messages = capp_consumer.receive() -# assert len(app_messages) == 4 diff --git a/tests/test_message_handler/test_build_model_key.py b/tests/test_message_handler/test_build_model_key.py index 092ae4fe0..6c9b3dc95 100644 --- a/tests/test_message_handler/test_build_model_key.py +++ b/tests/test_message_handler/test_build_model_key.py @@ -34,14 +34,14 @@ handler = MessageHandler() -def test_build_feature_store_key_successful(): +def test_build_model_key_successful(): fsd = "mock-feature-store-descriptor" - model_key = handler.build_feature_store_key("tensor_key", fsd) + model_key = handler.build_model_key("tensor_key", fsd) assert model_key.key == "tensor_key" assert model_key.descriptor == fsd -def test_build_feature_store_key_unsuccessful(): +def test_build_model_key_unsuccessful(): with pytest.raises(ValueError): fsd = "mock-feature-store-descriptor" - model_key = handler.build_feature_store_key(100, fsd) + model_key = handler.build_model_key(100, fsd) diff --git 
a/tests/test_message_handler/test_output_descriptor.py b/tests/test_message_handler/test_output_descriptor.py index 2b5575965..beb9a4765 100644 --- a/tests/test_message_handler/test_output_descriptor.py +++ b/tests/test_message_handler/test_output_descriptor.py @@ -34,7 +34,7 @@ handler = MessageHandler() fsd = "mock-feature-store-descriptor" -tensor_key = handler.build_feature_store_key("key", fsd) +tensor_key = handler.build_tensor_key("key", fsd) @pytest.mark.parametrize( diff --git a/tests/test_message_handler/test_request.py b/tests/test_message_handler/test_request.py index 751722534..a60818f7d 100644 --- a/tests/test_message_handler/test_request.py +++ b/tests/test_message_handler/test_request.py @@ -33,14 +33,14 @@ fsd = "mock-feature-store-descriptor" -model_key = MessageHandler.build_feature_store_key("model_key", fsd) +model_key = MessageHandler.build_model_key("model_key", fsd) model = MessageHandler.build_model(b"model data", "model_name", "v0.0.1") -input_key1 = MessageHandler.build_feature_store_key("input_key1", fsd) -input_key2 = MessageHandler.build_feature_store_key("input_key2", fsd) +input_key1 = MessageHandler.build_tensor_key("input_key1", fsd) +input_key2 = MessageHandler.build_tensor_key("input_key2", fsd) -output_key1 = MessageHandler.build_feature_store_key("output_key1", fsd) -output_key2 = MessageHandler.build_feature_store_key("output_key2", fsd) +output_key1 = MessageHandler.build_tensor_key("output_key1", fsd) +output_key2 = MessageHandler.build_tensor_key("output_key2", fsd) output_descriptor1 = MessageHandler.build_output_tensor_descriptor( "c", [output_key1, output_key2], "int64", [] diff --git a/tests/test_message_handler/test_response.py b/tests/test_message_handler/test_response.py index d0305407c..86774132e 100644 --- a/tests/test_message_handler/test_response.py +++ b/tests/test_message_handler/test_response.py @@ -33,8 +33,8 @@ fsd = "mock-feature-store-descriptor" -result_key1 = MessageHandler.build_feature_store_key("result_key1", fsd) -result_key2 = MessageHandler.build_feature_store_key("result_key2", fsd) +result_key1 = MessageHandler.build_tensor_key("result_key1", fsd) +result_key2 = MessageHandler.build_tensor_key("result_key2", fsd) torch_attributes = MessageHandler.build_torch_response_attributes() tf_attributes = MessageHandler.build_tf_response_attributes() From fd0a5ecbe5ad7596adb0067aacbdc77cf9c4e8ea Mon Sep 17 00:00:00 2001 From: ankona <3595025+ankona@users.noreply.github.com> Date: Thu, 19 Sep 2024 20:43:57 -0500 Subject: [PATCH 03/40] capture dragon start method ex --- tests/dragon/test_request_dispatcher.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/tests/dragon/test_request_dispatcher.py b/tests/dragon/test_request_dispatcher.py index e666710e6..54dfcb68d 100644 --- a/tests/dragon/test_request_dispatcher.py +++ b/tests/dragon/test_request_dispatcher.py @@ -85,7 +85,10 @@ # The tests in this file belong to the dragon group pytestmark = pytest.mark.dragon -mp.set_start_method("dragon") +try: + mp.set_start_method("dragon") +except Exception: + pass class MiniModel(nn.Module): From 00a4496678d13e2b686cd3191097b9172b563ac1 Mon Sep 17 00:00:00 2001 From: ankona <3595025+ankona@users.noreply.github.com> Date: Thu, 19 Sep 2024 23:34:29 -0500 Subject: [PATCH 04/40] Split message pump into separate module from dispatcher test --- tests/dragon/test_request_dispatcher.py | 179 +++------------------- tests/dragon/utils/msg_pump.py | 194 ++++++++++++++++++++++++ 2 files changed, 217 insertions(+), 156 deletions(-) 
create mode 100644 tests/dragon/utils/msg_pump.py diff --git a/tests/dragon/test_request_dispatcher.py b/tests/dragon/test_request_dispatcher.py index 54dfcb68d..352b2d538 100644 --- a/tests/dragon/test_request_dispatcher.py +++ b/tests/dragon/test_request_dispatcher.py @@ -25,11 +25,11 @@ # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. import gc -import io import logging import os import pathlib -import socket +import subprocess as sp +import sys import time import typing as t from queue import Empty @@ -37,7 +37,6 @@ import numpy as np import pytest -pytest.importorskip("torch") pytest.importorskip("dragon") @@ -48,16 +47,10 @@ # isort: on -import dragon.channels as dch -import dragon.infrastructure.policy as dragon_policy -import dragon.infrastructure.process_desc as dragon_process_desc -import dragon.native.process as dragon_process -import torch.nn as nn from dragon import fli from dragon.data.ddict.ddict import DDict from dragon.managed_memory import MemoryAlloc -from smartsim._core.entrypoints.service import Service from smartsim._core.mli.comm.channel.dragon_channel import ( DragonCommChannel, create_local, @@ -76,9 +69,7 @@ from smartsim._core.mli.infrastructure.storage.dragon_feature_store import ( DragonFeatureStore, ) -from smartsim._core.mli.infrastructure.storage.feature_store import FeatureStore from smartsim._core.mli.infrastructure.worker.torch_worker import TorchWorker -from smartsim._core.mli.message_handler import MessageHandler from smartsim.log import get_logger logger = get_logger(__name__) @@ -91,142 +82,6 @@ pass -class MiniModel(nn.Module): - def __init__(self): - super().__init__() - - self._name = "mini-model" - self._net = torch.nn.Linear(2, 1) - - def forward(self, input): - return self._net(input) - - @property - def bytes(self) -> bytes: - """Returns the model serialized to a byte stream""" - buffer = io.BytesIO() - scripted = torch.jit.trace(self._net, self.get_batch()) - torch.jit.save(scripted, buffer) - return buffer.getvalue() - - @classmethod - def get_batch(cls) -> "torch.Tensor": - return torch.randn((100, 2), dtype=torch.float32) - - -def load_model() -> bytes: - """Create a simple torch model in memory for testing""" - mini_model = MiniModel() - return mini_model.bytes - - -def persist_model_file(model_path: pathlib.Path) -> pathlib.Path: - """Create a simple torch model and persist to disk for - testing purposes. 
- - TODO: remove once unit tests are in place""" - # test_path = pathlib.Path(work_dir) - if not model_path.parent.exists(): - model_path.parent.mkdir(parents=True, exist_ok=True) - - model_path.unlink(missing_ok=True) - - model = torch.nn.Linear(2, 1) - torch.save(model, model_path) - - return model_path - - -def mock_messages( - request_dispatcher_queue: DragonFLIChannel, - feature_store: FeatureStore, - parent_iteration: int, - callback_descriptor: str, -) -> None: - """Mock event producer for triggering the inference pipeline""" - model_key = "mini-model" - # mock_message sends 2 messages, so we offset by 2 * (# of iterations in caller) - offset = 2 * parent_iteration - - for iteration_number in range(2): - logged_iteration = offset + iteration_number - logger.debug(f"Sending mock message {logged_iteration}") - - output_key = f"output-{iteration_number}" - - feature_store[model_key] = load_model() - - tensor = ( - (iteration_number + 1) * torch.ones((1, 2), dtype=torch.float32) - ).numpy() - fsd = feature_store.descriptor - - tensor_desc = MessageHandler.build_tensor_descriptor( - "c", "float32", list(tensor.shape) - ) - - message_tensor_output_key = MessageHandler.build_tensor_key(output_key, fsd) - message_model_key = MessageHandler.build_model_key(model_key, fsd) - - request = MessageHandler.build_request( - reply_channel=callback_descriptor, - model=message_model_key, - inputs=[tensor_desc], - outputs=[message_tensor_output_key], - output_descriptors=[], - custom_attributes=None, - ) - - logger.info(f"Sending request {iteration_number} to request_dispatcher_queue") - request_bytes = MessageHandler.serialize_request(request) - with request_dispatcher_queue._fli.sendh( - timeout=None, stream_channel=request_dispatcher_queue._channel - ) as sendh: - sendh.send_bytes(request_bytes) - sendh.send_bytes(tensor.tobytes()) - - logger.info( - f"Retrieving {iteration_number} from callback channel: {callback_descriptor}" - ) - callback_channel = DragonCommChannel.from_descriptor(callback_descriptor) - - # Results will be empty. The test pulls messages off the queue before they - # can be serviced by a worker. Just ensure the callback channel works. 
- results = callback_channel.recv(timeout=0.1) - logger.debug(f"Received mock message results on callback channel: {results}") - time.sleep(1) - - -@pytest.fixture -def prepare_environment(test_dir: str) -> pathlib.Path: - """Cleanup prior outputs to run demo repeatedly""" - path = pathlib.Path(f"{test_dir}/workermanager.log") - logging.basicConfig(filename=path.absolute(), level=logging.DEBUG) - return path - - -def service_as_dragon_proc( - service: Service, cpu_affinity: list[int], gpu_affinity: list[int] -) -> dragon_process.Process: - - options = dragon_process_desc.ProcessOptions(make_inf_channels=True) - local_policy = dragon_policy.Policy( - placement=dragon_policy.Policy.Placement.HOST_NAME, - host_name=socket.gethostname(), - cpu_affinity=cpu_affinity, - gpu_affinity=gpu_affinity, - ) - return dragon_process.Process( - target=service.execute, - args=[], - cwd=os.getcwd(), - policy=local_policy, - options=options, - stderr=dragon_process.Popen.STDOUT, - stdout=dragon_process.Popen.STDOUT, - ) - - def test_request_dispatcher() -> None: """Test the request dispatcher batching and queueing system @@ -279,15 +134,27 @@ def test_request_dispatcher() -> None: callback_channel = DragonCommChannel.from_local() - # create a mock client application to populate the request queue - msg_pump = mp.Process( - target=mock_messages, - args=(worker_queue, backbone_fs, i, callback_channel.descriptor), + fp = pathlib.Path(__file__).parent / "utils" / "msg_pump.py" + cmd = [ + sys.executable, + str(fp.absolute()), + "--dispatch-fli-descriptor", + worker_queue.descriptor, + "--fs-descriptor", + backbone_fs.descriptor, + "--parent-iteration", + str(i), + "--callback-descriptor", + callback_channel.descriptor, + ] + + popen = sp.Popen( + args=cmd, + stdout=sp.PIPE, + stderr=sp.PIPE, ) - msg_pump.start() - - time.sleep(1) + time.sleep(2) for _ in range(200): try: @@ -347,8 +214,6 @@ def test_request_dispatcher() -> None: for mem_alloc in mem_allocs: mem_alloc.free() - msg_pump.kill() - request_dispatcher._active_queues[model_key].make_disposable() assert request_dispatcher._active_queues[model_key].can_be_removed @@ -357,6 +222,8 @@ def test_request_dispatcher() -> None: assert model_key not in request_dispatcher._active_queues assert model_key not in request_dispatcher._queues + popen.wait() + # Try to remove the dispatcher and free the memory del request_dispatcher gc.collect() diff --git a/tests/dragon/utils/msg_pump.py b/tests/dragon/utils/msg_pump.py new file mode 100644 index 000000000..e54cdf7fd --- /dev/null +++ b/tests/dragon/utils/msg_pump.py @@ -0,0 +1,194 @@ +# BSD 2-Clause License +# +# Copyright (c) 2021-2024, Hewlett Packard Enterprise +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import io +import logging +import pathlib +import time +import typing as t + +import pytest + +pytest.importorskip("torch") +pytest.importorskip("dragon") + + +# isort: off +import dragon +import multiprocessing as mp +import torch +import torch.nn as nn + +# isort: on + +from smartsim._core.mli.comm.channel.dragon_channel import DragonCommChannel +from smartsim._core.mli.comm.channel.dragon_fli import DragonFLIChannel +from smartsim._core.mli.infrastructure.storage.backbone_feature_store import ( + BackboneFeatureStore, +) +from smartsim._core.mli.message_handler import MessageHandler +from smartsim.log import get_logger + +logger = get_logger(__name__, log_level=logging.DEBUG) + +# The tests in this file belong to the dragon group +pytestmark = pytest.mark.dragon + +try: + mp.set_start_method("dragon") +except Exception: + pass + + +class MiniModel(nn.Module): + def __init__(self): + super().__init__() + + self._name = "mini-model" + self._net = torch.nn.Linear(2, 1) + + def forward(self, input): + return self._net(input) + + @property + def bytes(self) -> bytes: + """Returns the model serialized to a byte stream""" + buffer = io.BytesIO() + scripted = torch.jit.trace(self._net, self.get_batch()) + torch.jit.save(scripted, buffer) + return buffer.getvalue() + + @classmethod + def get_batch(cls) -> "torch.Tensor": + return torch.randn((100, 2), dtype=torch.float32) + + +def load_model() -> bytes: + """Create a simple torch model in memory for testing""" + mini_model = MiniModel() + return mini_model.bytes + + +def persist_model_file(model_path: pathlib.Path) -> pathlib.Path: + """Create a simple torch model and persist to disk for + testing purposes. 
+
+    TODO: remove once unit tests are in place"""
+    if not model_path.parent.exists():
+        model_path.parent.mkdir(parents=True, exist_ok=True)
+
+    model_path.unlink(missing_ok=True)
+
+    model = torch.nn.Linear(2, 1)
+    torch.save(model, model_path)
+
+    return model_path
+
+
+def mock_messages(
+    dispatch_fli_descriptor: str,
+    fs_descriptor: str,
+    parent_iteration: int,
+    callback_descriptor: str,
+) -> None:
+    """Mock event producer for triggering the inference pipeline.
+
+    :param dispatch_fli_descriptor: Descriptor of the request dispatcher FLI
+    :param fs_descriptor: Descriptor of the backbone feature store
+    :param parent_iteration: Iteration index supplied by the caller, used to
+    keep log output from repeated pump invocations distinguishable
+    :param callback_descriptor: Descriptor of the channel to receive results on
+    """
+    model_key = "mini-model"
+    # mock_messages sends 2 messages, so we offset by 2 * (# of iterations in caller)
+    offset = 2 * parent_iteration
+
+    feature_store = BackboneFeatureStore.from_descriptor(fs_descriptor)
+    request_dispatcher_queue = DragonFLIChannel.from_sender_supplied_descriptor(
+        dispatch_fli_descriptor
+    )
+
+    for iteration_number in range(2):
+        logged_iteration = offset + iteration_number
+        logger.debug(f"Sending mock message {logged_iteration}")
+
+        output_key = f"output-{iteration_number}"
+
+        feature_store[model_key] = load_model()
+
+        tensor = (
+            (iteration_number + 1) * torch.ones((1, 2), dtype=torch.float32)
+        ).numpy()
+        fsd = feature_store.descriptor
+
+        tensor_desc = MessageHandler.build_tensor_descriptor(
+            "c", "float32", list(tensor.shape)
+        )
+
+        message_tensor_output_key = MessageHandler.build_tensor_key(output_key, fsd)
+        message_model_key = MessageHandler.build_model_key(model_key, fsd)
+
+        request = MessageHandler.build_request(
+            reply_channel=callback_descriptor,
+            model=message_model_key,
+            inputs=[tensor_desc],
+            outputs=[message_tensor_output_key],
+            output_descriptors=[],
+            custom_attributes=None,
+        )
+
+        logger.info(f"Sending request {iteration_number} to request_dispatcher_queue")
+        request_bytes = MessageHandler.serialize_request(request)
+        with request_dispatcher_queue._fli.sendh(
+            timeout=None, stream_channel=request_dispatcher_queue._channel
+        ) as sendh:
+            sendh.send_bytes(request_bytes)
+            sendh.send_bytes(tensor.tobytes())
+
+        logger.info(
+            f"Retrieving {iteration_number} from callback channel: {callback_descriptor}"
+        )
+        callback_channel = DragonCommChannel.from_descriptor(callback_descriptor)
+
+        # Results will be empty. The test pulls messages off the queue before they
+        # can be serviced by a worker. Just ensure the callback channel works.
+        results = callback_channel.recv(timeout=0.1)
+        logger.debug(f"Received mock message results on callback channel: {results}")
+        time.sleep(1)
+
+
+if __name__ == "__main__":
+    import argparse
+
+    parser = argparse.ArgumentParser()
+
+    parser.add_argument("--dispatch-fli-descriptor", type=str)
+    parser.add_argument("--fs-descriptor", type=str)
+    parser.add_argument("--parent-iteration", type=int)
+    parser.add_argument("--callback-descriptor", type=str)
+
+    args = parser.parse_args()
+
+    mock_messages(
+        args.dispatch_fli_descriptor,
+        args.fs_descriptor,
+        args.parent_iteration,
+        args.callback_descriptor,
+    )

From a87aba7a0a83e17e8ff679104e9170bcf0681bbe Mon Sep 17 00:00:00 2001
From: Chris McBride
Date: Mon, 23 Sep 2024 16:28:12 -0500
Subject: [PATCH 05/40] extract msg_pump_factory for reuse in
 test_worker_manager.py

---
 conftest.py                             | 74 ++++++++++++++++++++++++-
 tests/dragon/test_request_dispatcher.py | 46 ++++++++++-------
 2 files changed, 99 insertions(+), 21 deletions(-)

diff --git a/conftest.py b/conftest.py
index 991c0d17b..622dd7a7c 100644
--- a/conftest.py
+++ b/conftest.py
@@ -227,7 +227,6 @@ def kill_all_test_spawned_processes() -> None:
             print("Not all processes were killed after test")
 
 
-
 def get_hostlist() -> t.Optional[t.List[str]]:
     global test_hostlist
     if not test_hostlist:
@@ -1022,3 +1021,76 @@ def _prepare_db(db_config: DBConfiguration) -> PrepareDatabaseOutput:
         return PrepareDatabaseOutput(db, new_db)
 
     return _prepare_db
+
+
+class MsgPumpRequest(t.NamedTuple):
+    """Fields required for starting a simulated inference request producer."""
+
+    backbone_descriptor: str
+    """The descriptor to use when connecting the message pump to a
+    backbone featurestore.
+
+    Passed to the message pump as `--fs-descriptor`
+    """
+    work_queue_descriptor: str
+    """The descriptor to use for sending work from the pump to the worker manager.
+
+    Passed to the message pump as `--dispatch-fli-descriptor`
+    """
+    callback_descriptor: str
+    """The descriptor the worker should use to return results.
+
+    Passed to the message pump as `--callback-descriptor`
+    """
+    iteration_index: int = 1
+    """If calling the message pump repeatedly, supply an iteration index to ensure
+    that logged messages appear unique instead of appearing to be duplicated logs.
+
+    Passed to the message pump as `--parent-iteration`
+    """
+
+    def as_command(self) -> t.List[str]:
+        """Produce the CLI arguments used to execute the msg pump
+        via subprocess.Popen.
+
+        NOTE: does NOT include the `[sys.executable, msg_pump_path, ...]`
+        portion of the necessary parameters to Popen.
+
+        :returns: A list of strings containing the arguments of the request
+        formatted for inclusion in a call to subprocess.Popen"""
+        return [
+            "--dispatch-fli-descriptor",
+            self.work_queue_descriptor,
+            "--fs-descriptor",
+            self.backbone_descriptor,
+            "--parent-iteration",
+            str(self.iteration_index),
+            "--callback-descriptor",
+            self.callback_descriptor,
+        ]
+
+
+@pytest.fixture(scope="session")
+def msg_pump_factory() -> t.Callable[[MsgPumpRequest], subprocess.Popen]:
+    """A pytest fixture used to create a mock event producer capable of
+    feeding asynchronous inference requests to tests requiring them.
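+
+    A sketch of typical use within a test (the descriptors shown are
+    hypothetical placeholders):
+
+        request = MsgPumpRequest("fs-desc", "fli-desc", "cb-desc")
+        msg_pump = msg_pump_factory(request)
+        msg_pump.wait()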
+ + :returns: A function that can be passed appropriate descriptors + for starting a message pump.""" + + def run_message_pump(request: MsgPumpRequest) -> subprocess.Popen: + """Invokes the message pump entry-point""" + # /tests/dragon/utils/msg_pump.py + msg_pump_script = "tests/dragon/utils/msg_pump.py" + msg_pump_path = pathlib.Path(__file__).parent / msg_pump_script + + cmd = [sys.executable, str(msg_pump_path.absolute()), *request.as_command()] + + popen = subprocess.Popen( + args=cmd, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + ) + return popen + + return run_message_pump diff --git a/tests/dragon/test_request_dispatcher.py b/tests/dragon/test_request_dispatcher.py index 352b2d538..e111f8c74 100644 --- a/tests/dragon/test_request_dispatcher.py +++ b/tests/dragon/test_request_dispatcher.py @@ -37,12 +37,15 @@ import numpy as np import pytest +import conftest + pytest.importorskip("dragon") # isort: off import dragon import multiprocessing as mp + import torch # isort: on @@ -73,16 +76,23 @@ from smartsim.log import get_logger logger = get_logger(__name__) +mock_msg_pump_path = pathlib.Path(__file__).parent / "utils" / "msg_pump.py" +_MsgPumpFactory = t.Callable[[conftest.MsgPumpRequest], sp.Popen] + # The tests in this file belong to the dragon group pytestmark = pytest.mark.dragon + try: mp.set_start_method("dragon") except Exception: pass -def test_request_dispatcher() -> None: +@pytest.mark.parametrize("num_iterations", [4]) +def test_request_dispatcher( + msg_pump_factory: _MsgPumpFactory, num_iterations: int +) -> None: """Test the request dispatcher batching and queueing system This also includes setting a queue to disposable, checking that it is no @@ -123,8 +133,9 @@ def test_request_dispatcher() -> None: ) request_dispatcher._on_start() + pump_processes: t.List[sp.Popen] = [] - for i in range(2): + for i in range(num_iterations): batch: t.Optional[RequestBatch] = None mem_allocs = [] tensors = [] @@ -134,27 +145,17 @@ def test_request_dispatcher() -> None: callback_channel = DragonCommChannel.from_local() - fp = pathlib.Path(__file__).parent / "utils" / "msg_pump.py" - cmd = [ - sys.executable, - str(fp.absolute()), - "--dispatch-fli-descriptor", - worker_queue.descriptor, - "--fs-descriptor", + request = conftest.MsgPumpRequest( backbone_fs.descriptor, - "--parent-iteration", - str(i), - "--callback-descriptor", + worker_queue.descriptor, callback_channel.descriptor, - ] - - popen = sp.Popen( - args=cmd, - stdout=sp.PIPE, - stderr=sp.PIPE, + i, ) - time.sleep(2) + msg_pump = msg_pump_factory(request) + pump_processes.append(msg_pump) + + time.sleep(1) for _ in range(200): try: @@ -222,7 +223,12 @@ def test_request_dispatcher() -> None: assert model_key not in request_dispatcher._active_queues assert model_key not in request_dispatcher._queues - popen.wait() + msg_pump.wait() + + for msg_pump in pump_processes: + if msg_pump.returncode is not None: + continue + msg_pump.terminate() # Try to remove the dispatcher and free the memory del request_dispatcher From 48791226307e987092bc2f3b0219917772f2c33e Mon Sep 17 00:00:00 2001 From: Chris McBride Date: Tue, 24 Sep 2024 14:33:57 -0500 Subject: [PATCH 06/40] add test verifying protoclient event raises --- .../_core/launcher/dragon/dragonBackend.py | 30 +- .../storage/backbone_feature_store.py | 57 ++- smartsim/protoclient.py | 109 ++-- tests/dragon/test_protoclient.py | 473 ++++++++++-------- 4 files changed, 377 insertions(+), 292 deletions(-) diff --git a/smartsim/_core/launcher/dragon/dragonBackend.py 
b/smartsim/_core/launcher/dragon/dragonBackend.py index 0f8121ab5..a1367af2a 100644 --- a/smartsim/_core/launcher/dragon/dragonBackend.py +++ b/smartsim/_core/launcher/dragon/dragonBackend.py @@ -277,6 +277,7 @@ def status_message(self) -> str: ) def _heartbeat(self) -> None: + """Update the value of the last heartbeat to the current time.""" self._last_beat = self.current_time @property @@ -621,38 +622,15 @@ def _create_eventing(self, backbone: BackboneFeatureStore) -> EventConsumer: name="BackendConsumerRegistrar", event_handler=self._on_consumer_created, ) + while consumer.register(): + # wait for the consumer to complete registration + ... # self._backbone.backend_channel = # consumer.descriptor # i want to get rid of this extra channel # self._bootstrap_event_listeners(backbone, consumer) self._event_consumer = consumer - # options = dragon_process_desc. - # ProcessOptions(make_inf_channels=True) # what is this!? - # grp_consumer = dragon_process_group.ProcessGroup( - # restart=False, pmi_enabled=False - # ) - # self._event_consumer_process = dragon_process.ProcessTemplate( - # target=self._event_consumer.listen, - # # args=request.exe_args, - # # cwd=request.path, - # env={ - # # **request.current_env, - # # **request.env, - # **self._backbone.get_env(), - # }, - # stdout=dragon_process.Popen.PIPE, - # stderr=dragon_process.Popen.PIPE, - # # policy=local_policy, - # options=options, - # ) - # grp_consumer.add(self._event_consumer_process) - # # self._event_consumer_process = - # mp.Process(target=self._event_consumer.listen) - # # self._event_consumer_process.start() - # grp_consumer.init() - # grp_consumer.start() - logger.info("Created event consumer") return self._event_consumer diff --git a/smartsim/_core/mli/infrastructure/storage/backbone_feature_store.py b/smartsim/_core/mli/infrastructure/storage/backbone_feature_store.py index 9cc8a6bf9..e48f4e4e9 100644 --- a/smartsim/_core/mli/infrastructure/storage/backbone_feature_store.py +++ b/smartsim/_core/mli/infrastructure/storage/backbone_feature_store.py @@ -104,7 +104,7 @@ def notification_channels(self) -> t.Sequence[str]: :returns: The list of descriptors """ - if "_SMARTSIM_MLI_NOTIFY_CONSUMERS" in self: + if self.MLI_NOTIFY_CONSUMERS in self: stored_consumers = self[self.MLI_NOTIFY_CONSUMERS] return str(stored_consumers).split(",") return [] @@ -367,6 +367,8 @@ def send(self, event: EventBase) -> int: raise Exception("No channel to send on") num_sent = 0 + logger.debug(f"Sending {event} to {self._channel.descriptor}") + try: event_bytes = bytes(event) self._channel.send(event_bytes) @@ -399,7 +401,7 @@ def __init__( ) """A mapping of instantiated channels that can be re-used. Automatically calls the channel factory if a descriptor is not already in the collection""" - self._event_buffer: t.Deque[bytes] = deque() + self._event_buffer: t.Deque[EventBase] = deque() """A buffer for storing events when a consumer list is not found""" self._descriptors: t.Set[str] """Stores the most recent list of broadcast consumers. Updated automatically @@ -416,15 +418,15 @@ def num_buffered(self) -> int: return len(self._event_buffer) def _save_to_buffer(self, event: EventBase) -> None: - """Places a serialized event in the buffer to be sent once a consumer + """Places the event in the buffer to be sent once a consumer list is available. 
:param event: The event to serialize and buffer
         :raises ValueError: If the event cannot be serialized
         """
         try:
-            event_bytes = bytes(event)
-            self._event_buffer.append(event_bytes)
+            self._event_buffer.append(event)
+            logger.debug(f"Buffered event {event=}")
         except Exception as ex:
             raise ValueError(f"Unable to serialize event from {self._uid}") from ex
 
@@ -459,7 +461,7 @@ def _get_comm_channel(self, descriptor: str) -> CommChannelBase:
 
         :param descriptor: The descriptor to pass to the channel factory
         :returns: The instantiated channel
-        :raises SmartSimError: If the channel fails to build
+        :raises SmartSimError: If the channel fails to attach
         """
         comm_channel = self._channel_cache[descriptor]
         if comm_channel is not None:
@@ -477,11 +479,24 @@ def _get_comm_channel(self, descriptor: str) -> CommChannelBase:
             logger.error(msg, exc_info=True)
             raise SmartSimError(msg) from ex
 
+    def _get_next_event(self) -> t.Optional[EventBase]:
+        """Pop the next event to be sent from the queue.
+
+        :returns: The next event to send if any events are enqueued, otherwise `None`.
+        """
+        try:
+            return self._event_buffer.popleft()
+        except IndexError:
+            logger.debug(f"Broadcast buffer exhausted for {self._uid}")
+
+        return None
+
     def _broadcast(self, timeout: float = 0.001) -> int:
         """Broadcasts all buffered events to registered event consumers.
 
         :param timeout: Maximum time to wait (in seconds) for messages to send
         :returns: The number of events broadcasted to consumers
+        :raises SmartSimError: If the channel fails to attach
         :raises SmartSimError: If broadcasting fails
         """
         # allow descriptors to be empty since events are buffered
@@ -493,31 +508,26 @@ def _broadcast(self, timeout: float = 0.001) -> int:
         self._prune_unused_consumers()
         self._log_broadcast_start()
 
-        num_sent: int = 0
-        next_event: t.Optional[bytes] = self._event_buffer.popleft()
+        num_sent = 0
+        num_listeners = len(self._descriptors)
 
         # send each event to every consumer
-        while next_event is not None:
-            for descriptor in map(str, self._descriptors):
+        while event := self._get_next_event():
+            logger.debug(f"Broadcasting {event=} to {num_listeners} listeners")
+            event_bytes = bytes(event)
+
+            for i, descriptor in enumerate(self._descriptors):
                 comm_channel = self._get_comm_channel(descriptor)
 
                 try:
-                    # todo: given a failure, the message is not sent to any other
-                    # recipients. consider retrying, adding a dead letter queue, or
-                    # logging the message details more intentionally
-                    comm_channel.send(next_event, timeout)
+                    comm_channel.send(event_bytes, timeout)
                     num_sent += 1
                 except Exception as ex:
                     raise SmartSimError(
-                        f"Failed broadcast to channel {descriptor} from {self._uid}"
+                        f"Broadcast {i}/{num_listeners} for event {event.uid} to "
+                        f"channel {descriptor} from {self._uid} failed."
                     ) from ex
 
-            try:
-                next_event = self._event_buffer.popleft()
-            except IndexError:
-                next_event = None
-                logger.debug(f"Broadcast buffer exhausted for {self._uid}")
-
         return num_sent
 
     def send(self, event: EventBase, timeout: float = 0.001) -> int:
@@ -629,9 +639,12 @@ def register(self) -> t.Generator[bool, None, None]:
         """Send an event to register this consumer as a listener"""
         awaiting_confirmation = True
         descriptor = self._comm_channel.descriptor
-        backoffs = itertools.cycle((0.1, 0.5, 1.0, 2.0, 4.0, 8.0))
+        backoffs = itertools.cycle((0.1, 0.5, 1.0, 2.0, 4.0))
 
         event = OnCreateConsumer(descriptor, self._global_filters)
 
+        # create a temporary publisher to broadcast my own existence.
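+        # the OnCreateConsumer event is what the backend listener reacts to;
+        # it responds by adding this descriptor to the notification channels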
+
+        publisher = EventBroadcaster(self._backbone, DragonCommChannel.from_local)
+
         # we're going to sit in this loop to wait for the backbone to get
         # updated with the registration (to avoid SEND/ACK)
         while awaiting_confirmation:
diff --git a/smartsim/protoclient.py b/smartsim/protoclient.py
index b0e235f8c..c2b7ebaf0 100644
--- a/smartsim/protoclient.py
+++ b/smartsim/protoclient.py
@@ -43,7 +43,10 @@
 import numpy
 import torch
 
-from smartsim._core.mli.comm.channel.dragon_channel import DragonCommChannel
+from smartsim._core.mli.comm.channel.dragon_channel import (
+    create_local,
+    DragonCommChannel,
+)
 from smartsim._core.mli.comm.channel.dragon_fli import DragonFLIChannel
 from smartsim._core.mli.infrastructure.storage.backbone_feature_store import (
     BackboneFeatureStore,
@@ -56,8 +59,12 @@
 from smartsim.error.errors import SmartSimError
 from smartsim.log import get_logger
 
-# from mpi4py import MPI
 
+try:
+    from mpi4py import MPI
+except Exception:
+    MPI = None
+    print("Unable to import `mpi4py` package")
 
 _TimingDict = OrderedDict[str, list[str]]
 
@@ -68,7 +75,16 @@
 
 
 class ProtoClient:
-    _DEFAULT_TIMEOUT = 30.0
+    """Proof of concept implementation of a client enabling user applications
+    to interact with MLI resources."""
+
+    _DEFAULT_BACKBONE_TIMEOUT = 30.0
+    """A default timeout period applied to connection attempts with the
+    backbone feature store."""
+
+    _DEFAULT_WORK_QUEUE_SIZE = 500
+    """A default number of events to be buffered in the work queue before
+    triggering QueueFull exceptions."""
 
     @staticmethod
     def _attach_to_backbone(wait_timeout: float = 0) -> BackboneFeatureStore:
@@ -82,7 +98,8 @@ def _attach_to_backbone(wait_timeout: float = 0) -> BackboneFeatureStore:
         descriptor = os.environ.get(BackboneFeatureStore.MLI_BACKBONE, None)
         if descriptor is None:
             raise SmartSimError(
-                "Missing required backbone configuration in environment"
+                "Missing required backbone configuration in environment: "
+                f"{BackboneFeatureStore.MLI_BACKBONE}"
             )
 
         backbone = t.cast(
@@ -94,37 +111,43 @@ def _attach_to_backbone(wait_timeout: float = 0) -> BackboneFeatureStore:
     def _attach_to_worker_queue(self) -> DragonFLIChannel:
         """Wait until the backbone contains the worker queue configuration,
         then attach an FLI to the given worker queue"""
-        configuration = self._backbone.wait_for(
-            [BackboneFeatureStore.MLI_WORKER_QUEUE], self._timeout
-        )
 
-        # descriptor = configuration.get(BackboneFeatureStore.MLI_WORKER_QUEUE, None)
-        # NOTE: without wait_for, this MUST be in the backbone....
-        # descriptor = self._backbone.worker_queue
-        descriptor = str(configuration[BackboneFeatureStore.MLI_WORKER_QUEUE])
-        if not descriptor:
-            raise ValueError("Unable to locate worker queue using backbone")
+        descriptor = ""
+        try:
+            # NOTE: without wait_for, this MUST be in the backbone....
+            config = self._backbone.wait_for(
+                [BackboneFeatureStore.MLI_WORKER_QUEUE], self.backbone_timeout
+            )
+            descriptor = str(config[BackboneFeatureStore.MLI_WORKER_QUEUE])
+        except Exception as ex:
+            logger.info(
+                f"Unable to retrieve {BackboneFeatureStore.MLI_WORKER_QUEUE} "
+                "to attach to the worker queue."
+ ) + raise ValueError("Unable to locate worker queue using backbone") from ex - # self._to_worker_fli = DragonFLIChannel.from_descriptor(descriptor) - return DragonFLIChannel.from_descriptor(str(descriptor)) + return DragonFLIChannel.from_descriptor(descriptor) - @staticmethod - def _create_worker_channels() -> t.Tuple[DragonCommChannel, DragonCommChannel]: - """Create channels to be used in the worker queue""" - # self._from_worker_ch = Channel.make_process_local() - _from_worker_ch = DragonCommChannel.from_local() - # self._from_worker_ch_serialized = self._from_worker_ch.serialize() - # self._to_worker_ch = Channel.make_process_local() - _to_worker_ch = DragonCommChannel.from_local() + @classmethod + def _create_worker_channels( + cls, + ) -> t.Tuple[dragon.channels.Channel, dragon.channels.Channel]: + """Create channels to be used for communication to and from the worker queue. - return _from_worker_ch, _to_worker_ch + :returns: A tuple containing the native from and to Channels as (from_channel, to_channel). + """ + + _from_worker_ch_raw = create_local(cls._DEFAULT_WORK_QUEUE_SIZE) + _to_worker_ch_raw = create_local(cls._DEFAULT_WORK_QUEUE_SIZE) + + return _from_worker_ch_raw, _to_worker_ch_raw def _create_broadcaster(self) -> EventProducer: """Create an event publisher that will broadcast updates to other MLI components. This publisher :returns: the event publisher instance""" - broadcaster: EventProducer = EventBroadcaster( + broadcaster = EventBroadcaster( self._backbone, DragonCommChannel.from_descriptor ) return broadcaster @@ -138,30 +161,21 @@ def __init__(self, timing_on: bool, wait_timeout: float = 0) -> None: worker queue :raises: SmartSimError if unable to attach to a backbone featurestore""" - # comm = MPI.COMM_WORLD - # rank = comm.Get_rank() - rank: int = 0 - self._timeout = wait_timeout or self._DEFAULT_TIMEOUT + # todo: determine a way to make this work in tests. + # - consider catching the import exception and defaulting rank to 0 + if MPI is not None: + comm = MPI.COMM_WORLD + rank = comm.Get_rank() + else: + rank: int = 0 + + self._backbone_timeout = wait_timeout connect_to_infrastructure() - # ddict_str = os.environ["_SMARTSIM_INFRA_BACKBONE"] - # self._ddict = DDict.attach(ddict_str) - # self._backbone_descriptor = DragonFeatureStore(self._ddict).descriptor - self._backbone = self._attach_to_backbone(wait_timeout=wait_timeout) - - # # to_worker_fli_str = None - # # while to_worker_fli_str is None: - # # try: - # # to_worker_fli_str = self._ddict["to_worker_fli"] - # # self._to_worker_fli = fli.FLInterface.attach(to_worker_fli_str) - # # except KeyError: - # # time.sleep(1) + self._backbone = self._attach_to_backbone(wait_timeout=self.backbone_timeout) self._to_worker_fli = self._attach_to_worker_queue() - # # # self._from_worker_ch = Channel.make_process_local() - # # # self._from_worker_ch_serialized = self._from_worker_ch.serialize() - # # # self._to_worker_ch = Channel.make_process_local() channels = self._create_worker_channels() self._from_worker_ch = channels[0] self._to_worker_ch = channels[1] @@ -176,6 +190,13 @@ def __init__(self, timing_on: bool, wait_timeout: float = 0) -> None: self._timings: _TimingDict = OrderedDict() self._timing_on = timing_on + @property + def backbone_timeout(self) -> float: + """The timeout (in seconds) applied to retrievals from the backbone feature store. 
+ + :returns: A float indicating the number of seconds to allow""" + return self._backbone_timeout or self._DEFAULT_BACKBONE_TIMEOUT + def _add_label_to_timings(self, label: str) -> None: if label not in self._timings: self._timings[label] = [] diff --git a/tests/dragon/test_protoclient.py b/tests/dragon/test_protoclient.py index 3eb800bb7..01b280d08 100644 --- a/tests/dragon/test_protoclient.py +++ b/tests/dragon/test_protoclient.py @@ -1,231 +1,304 @@ -# # BSD 2-Clause License -# # -# # Copyright (c) 2021-2024, Hewlett Packard Enterprise -# # All rights reserved. -# # -# # Redistribution and use in source and binary forms, with or without -# # modification, are permitted provided that the following conditions are met: -# # -# # 1. Redistributions of source code must retain the above copyright notice, this -# # list of conditions and the following disclaimer. -# # -# # 2. Redistributions in binary form must reproduce the above copyright notice, -# # this list of conditions and the following disclaimer in the documentation -# # and/or other materials provided with the distribution. -# # -# # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -# # AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -# # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -# # DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE -# # FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -# # DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -# # SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -# # CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -# # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
- -# import pickle -# import time -# import typing as t - -# import pytest - -# dragon = pytest.importorskip("dragon") - -# from smartsim._core.mli.comm.channel.dragon_fli import DragonFLIChannel, create_local -# from smartsim._core.mli.infrastructure.storage.backbone_feature_store import ( -# BackboneFeatureStore, -# EventBroadcaster, -# OnWriteFeatureStore, -# ) -# from smartsim._core.mli.infrastructure.storage.dragon_feature_store import dragon_ddict -# from smartsim._core.mli.infrastructure.storage.feature_store import ReservedKeys -# from smartsim.error.errors import SmartSimError -# from smartsim.log import get_logger - -# # isort: off -# from dragon import fli -# from dragon.channels import Channel - -# # from ..ex..high_throughput_inference.mock_app import ProtoClient -# from smartsim.protoclient import ProtoClient - - -# # The tests in this file belong to the dragon group -# pytestmark = pytest.mark.dragon -# WORK_QUEUE_KEY = "_SMARTSIM_REQUEST_QUEUE" -# logger = get_logger(__name__) - - -# @pytest.fixture -# def storage_for_dragon_fs() -> t.Dict[str, str]: -# # return dragon_ddict.DDict(1, 2, total_mem=2 * 1024**3) -# return dragon_ddict.DDict(1, 2, 4 * 1024**2) - - -# @pytest.fixture -# def the_backbone(storage_for_dragon_fs) -> BackboneFeatureStore: -# return BackboneFeatureStore(storage_for_dragon_fs, allow_reserved_writes=True) - - -# @pytest.fixture -# def the_worker_queue(the_backbone: BackboneFeatureStore) -> DragonFLIChannel: -# """a stand-in for the worker manager so a worker queue exists""" - -# # create the FLI -# to_worker_channel = Channel.make_process_local() -# # to_worker_channel = create_local() -# fli_ = fli.FLInterface(main_ch=to_worker_channel, manager_ch=None) -# comm_channel = DragonFLIChannel(fli_, True) - -# # store the descriptor in the backbone -# # the_backbone.worker_queue = comm_channel.descriptor -# the_backbone[BackboneFeatureStore.MLI_WORKER_QUEUE] = comm_channel.descriptor - -# try: -# comm_channel.send(b"foo") -# except Exception as ex: -# print(f"ohnooooo: {ex}") - -# return comm_channel - - -# @pytest.fixture -# def storage_for_dragon_fs_with_req_queue( -# storage_for_dragon_fs: t.Dict[str, str] -# ) -> t.Dict[str, str]: -# # create a valid FLI so any call to attach does not fail -# channel_ = Channel.make_process_local() -# fli_ = fli.FLInterface(main_ch=channel_, manager_ch=None) -# comm_channel = DragonFLIChannel(fli_, True) - -# storage_for_dragon_fs[WORK_QUEUE_KEY] = comm_channel.descriptor -# return storage_for_dragon_fs - - -# @pytest.mark.parametrize( -# "wait_timeout, exp_wait_max", -# [ -# # aggregate the 1+1+1 into 3 on remaining parameters -# pytest.param(0.5, 1 + 1 + 1, id="0.5s wait, 3 cycle steps"), -# pytest.param(2, 3 + 2, id="2s wait, 4 cycle steps"), -# pytest.param(4, 3 + 2 + 4, id="4s wait, 5 cycle steps"), -# ], -# ) -# def test_protoclient_timeout( -# wait_timeout: float, -# exp_wait_max: float, -# the_backbone: BackboneFeatureStore, -# monkeypatch: pytest.MonkeyPatch, -# ): -# """Verify that attempts to attach to the worker queue from the protoclient -# timeout in an appropriate amount of time. Note: due to the backoff, we verify -# the elapsed time is less than the 15s of a cycle of waits +# BSD 2-Clause License +# +# Copyright (c) 2021-2024, Hewlett Packard Enterprise +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. 
Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import os +import pickle +import time +import typing as t +from unittest.mock import MagicMock + +import pytest + +dragon = pytest.importorskip("dragon") + +from smartsim._core.mli.comm.channel.dragon_channel import DragonCommChannel +from smartsim._core.mli.comm.channel.dragon_fli import DragonFLIChannel, create_local +from smartsim._core.mli.infrastructure.storage.backbone_feature_store import ( + BackboneFeatureStore, + EventBroadcaster, + OnWriteFeatureStore, +) +from smartsim._core.mli.infrastructure.storage.dragon_feature_store import dragon_ddict +from smartsim._core.mli.infrastructure.storage.feature_store import ReservedKeys +from smartsim.error.errors import SmartSimError +from smartsim.log import get_logger + +# isort: off +from dragon import fli +from dragon.channels import Channel + +# from ..ex..high_throughput_inference.mock_app import ProtoClient +from smartsim.protoclient import ProtoClient + + +# The tests in this file belong to the dragon group +pytestmark = pytest.mark.dragon +WORK_QUEUE_KEY = "_SMARTSIM_REQUEST_QUEUE" +logger = get_logger(__name__) + + +@pytest.fixture +def storage_for_dragon_fs() -> t.Dict[str, str]: + # return dragon_ddict.DDict(1, 2, total_mem=2 * 1024**3) + return dragon_ddict.DDict(1, 2, 4 * 1024**2) + + +@pytest.fixture +def the_backbone(storage_for_dragon_fs) -> BackboneFeatureStore: + return BackboneFeatureStore(storage_for_dragon_fs, allow_reserved_writes=True) + + +@pytest.fixture +def the_worker_queue(the_backbone: BackboneFeatureStore) -> DragonFLIChannel: + """a stand-in for the worker manager so a worker queue exists""" + + # create the FLI + to_worker_channel = create_local() + fli_ = fli.FLInterface(main_ch=to_worker_channel, manager_ch=None) + comm_channel = DragonFLIChannel(fli_, True) + + # store the descriptor in the backbone + the_backbone.worker_queue = comm_channel.descriptor + # the_backbone[BackboneFeatureStore.MLI_WORKER_QUEUE] = comm_channel.descriptor + + try: + comm_channel.send(b"foo") + except Exception as ex: + print(f"ohnooooo: {ex}") + + return comm_channel + + +@pytest.fixture +def storage_for_dragon_fs_with_req_queue( + storage_for_dragon_fs: t.Dict[str, str] +) -> t.Dict[str, str]: + # create a valid FLI so any call to attach does not fail + channel_ = Channel.make_process_local() + fli_ = fli.FLInterface(main_ch=channel_, manager_ch=None) + comm_channel = DragonFLIChannel(fli_, True) + + 
storage_for_dragon_fs[WORK_QUEUE_KEY] = comm_channel.descriptor + return storage_for_dragon_fs + + +@pytest.mark.parametrize( + "wait_timeout, exp_wait_max", + [ + # aggregate the 1+1+1 into 3 on remaining parameters + pytest.param( + 0.5, 1 + 1 + 1, id="0.5s wait, 3 cycle steps", marks=pytest.mark.skip + ), + pytest.param(2, 3 + 2, id="2s wait, 4 cycle steps", marks=pytest.mark.skip), + pytest.param(4, 3 + 2 + 4, id="4s wait, 5 cycle steps", marks=pytest.mark.skip), + ], +) +def test_protoclient_timeout( + wait_timeout: float, + exp_wait_max: float, + the_backbone: BackboneFeatureStore, + monkeypatch: pytest.MonkeyPatch, +): + """Verify that attempts to attach to the worker queue from the protoclient + timeout in an appropriate amount of time. Note: due to the backoff, we verify + the elapsed time is less than the 15s of a cycle of waits -# :param wait_timeout: a timeout for use when configuring a proto client -# :param exp_wait_max: a ceiling for the expected time spent waiting for -# the timeout -# :param the_backbone: a pre-initialized backbone featurestore for setting up -# the environment variable required by the client""" + :param wait_timeout: a timeout for use when configuring a proto client + :param exp_wait_max: a ceiling for the expected time spent waiting for + the timeout + :param the_backbone: a pre-initialized backbone featurestore for setting up + the environment variable required by the client""" -# # NOTE: exp_wait_time maps to the cycled backoff of [.1, .5, 1, 2, 4, 8] -# # with leeway added (by allowing 1s each for the 0.1 and 0.5 steps) -# start_time = time.time() -# with monkeypatch.context() as ctx, pytest.raises(SmartSimError) as ex: -# ctx.setenv("_SMARTSIM_INFRA_BACKBONE", the_backbone.descriptor) + # NOTE: exp_wait_time maps to the cycled backoff of [.1, .5, 1, 2, 4, 8] + # with leeway added (by allowing 1s each for the 0.1 and 0.5 steps) + start_time = time.time() + with monkeypatch.context() as ctx, pytest.raises(SmartSimError) as ex: + ctx.setenv(BackboneFeatureStore.MLI_BACKBONE, the_backbone.descriptor) -# ProtoClient(False, wait_timeout=wait_timeout) + ProtoClient(False, wait_timeout=wait_timeout) -# end_time = time.time() -# elapsed = end_time - start_time + end_time = time.time() + elapsed = end_time - start_time -# # todo: revisit. should this trigger any wait if the backbone is set above? -# # confirm that we met our timeout -# # assert elapsed > wait_timeout, f"below configured timeout {wait_timeout}" + # todo: revisit. should this trigger any wait if the backbone is set above? + # confirm that we met our timeout + # assert elapsed > wait_timeout, f"below configured timeout {wait_timeout}" -# # confirm that the total wait time is aligned with the sleep cycle -# assert elapsed < exp_wait_max, f"above expected max wait {exp_wait_max}" + # confirm that the total wait time is aligned with the sleep cycle + assert elapsed < exp_wait_max, f"above expected max wait {exp_wait_max}" -# def test_protoclient_initialization_no_backbone(): -# """Verify that attempting to start the client without required environment variables -# results in an exception. NOTE: Backbone env var is not set""" +def test_protoclient_initialization_no_backbone(): + """Verify that attempting to start the client without required environment variables + results in an exception. 
NOTE: Backbone env var is not set""" -# with pytest.raises(SmartSimError) as ex: -# ProtoClient(timing_on=False) + with pytest.raises(SmartSimError) as ex: + ProtoClient(timing_on=False) -# # confirm the missing value error has been raised -# assert {"backbone", "configuration"}.issubset(set(ex.value.args[0].split(" "))) + # confirm the missing value error has been raised + assert {"backbone", "configuration"}.issubset(set(ex.value.args[0].split(" "))) -# def test_protoclient_initialization( -# the_backbone: BackboneFeatureStore, -# the_worker_queue: DragonFLIChannel, -# monkeypatch: pytest.MonkeyPatch, -# ): -# """Verify that attempting to start the client with required env vars results -# in a fully initialized client +def test_protoclient_initialization( + the_backbone: BackboneFeatureStore, + the_worker_queue: DragonFLIChannel, + monkeypatch: pytest.MonkeyPatch, +): + """Verify that attempting to start the client with required env vars results + in a fully initialized client -# :param the_backbone: a pre-initialized backbone featurestore -# :param the_worker_queue: an FLI channel the client will retrieve -# from the backbone""" + :param the_backbone: a pre-initialized backbone featurestore + :param the_worker_queue: an FLI channel the client will retrieve + from the backbone""" -# with monkeypatch.context() as ctx: -# ctx.setenv("_SMARTSIM_INFRA_BACKBONE", the_backbone.descriptor) -# # NOTE: backbone[BackboneFeatureStore.MLI_WORKER_QUEUE] set in the_worker_queue fixture + with monkeypatch.context() as ctx: + ctx.setenv(BackboneFeatureStore.MLI_BACKBONE, the_backbone.descriptor) + # NOTE: rely on `the_worker_queue` fixture to put MLI_WORKER_QUEUE in backbone -# client = ProtoClient(timing_on=False) + client = ProtoClient(timing_on=False) -# # confirm the backbone was attached correctly -# assert client._backbone is not None -# assert client._backbone.descriptor == the_backbone.descriptor + fs_descriptor = the_backbone.descriptor + wq_descriptor = the_worker_queue.descriptor -# # confirm the worker queue is created and attached correctly -# assert client._to_worker_fli is not None -# assert client._to_worker_fli.descriptor == the_worker_queue.descriptor + # confirm the backbone was attached correctly + assert client._backbone is not None + assert client._backbone.descriptor == fs_descriptor -# # confirm the worker channels are created -# assert client._from_worker_ch is not None -# assert client._from_worker_ch.descriptor + # we expect the backbone to add its descriptor to the local env + assert os.environ[BackboneFeatureStore.MLI_BACKBONE] == fs_descriptor -# assert client._to_worker_ch is not None -# assert client._to_worker_ch.descriptor + # confirm the worker queue is created and attached correctly + assert client._to_worker_fli is not None + assert client._to_worker_fli.descriptor == wq_descriptor + + # we expect the worker queue descriptor to be placed into the backbone + # we do NOT expect _from_worker_ch to be placed anywhere. 
it's a specific callback
+        assert the_backbone[BackboneFeatureStore.MLI_WORKER_QUEUE] == wq_descriptor
 
-#     assert client._to_worker_ch is not None
-#     assert client._to_worker_ch.descriptor
+        # confirm the worker channels are created
+        assert client._from_worker_ch is not None
+        assert client._to_worker_ch is not None
+
+        # wrap the channels just to easily verify they produce a descriptor
+        assert DragonCommChannel(client._from_worker_ch).descriptor
+        assert DragonCommChannel(client._to_worker_ch).descriptor
+
+        # confirm a publisher is created
+        assert client._publisher is not None
+
+
+def test_protoclient_write_model(
+    the_backbone: BackboneFeatureStore,
+    the_worker_queue: DragonFLIChannel,
+    monkeypatch: pytest.MonkeyPatch,
+):
+    """Verify that writing a model using the client causes the model data to be
+    written to a feature store.
 
-# def test_protoclient_write_model(
-#     the_backbone: BackboneFeatureStore,
-#     the_worker_queue: DragonFLIChannel,
-#     monkeypatch: pytest.MonkeyPatch,
-# ):
-#     """Verify that writing a model using the client causes the model data to be
-#     written to a feature store and triggers a key-written event
+    :param the_backbone: a pre-initialized backbone featurestore
+    :param the_worker_queue: an FLI channel the client will retrieve
+    from the backbone"""
 
-#     :param the_backbone: a pre-initialized backbone featurestore
-#     :param the_worker_queue: an FLI channel the client will retrieve
-#     from the backbone"""
+    with monkeypatch.context() as ctx:
+        ctx.setenv(BackboneFeatureStore.MLI_BACKBONE, the_backbone.descriptor)
+        # NOTE: rely on `the_worker_queue` fixture to put MLI_WORKER_QUEUE in backbone
 
-#     with monkeypatch.context() as ctx:
-#         ctx.setenv("_SMARTSIM_INFRA_BACKBONE", the_backbone.descriptor)
-#         # NOTE: backbone[BackboneFeatureStore.MLI_WORKER_QUEUE] set in the_worker_queue fixture
+        client = ProtoClient(timing_on=False)
 
-#     client = ProtoClient(timing_on=False)
+        model_key = "my-model"
+        model_bytes = b"12345"
 
-#     model_key = "my-model"
-#     model_bytes = b"12345"
+        client.set_model(model_key, model_bytes)
 
-#     client.set_model(model_key, model_bytes)
+        # confirm the client modified the underlying feature store
+        assert client._backbone[model_key] == model_bytes
 
-#     # confirm the client modified the underlying feature store
-#     assert client._backbone[model_key] == model_bytes
 
-#     publisher = t.cast(EventBroadcaster, client._publisher)
+@pytest.mark.parametrize("num_listeners", [1, 2, 4])
+def test_protoclient_write_model_notification_sent(
+    the_backbone: BackboneFeatureStore,
+    the_worker_queue: DragonFLIChannel,
+    monkeypatch: pytest.MonkeyPatch,
+    num_listeners: int,
+):
+    """Verify that writing a model sends a key-written event
 
-#     # confirm the client raised the key-written event
-#     assert len(publisher._event_buffer) == 1
+    :param the_backbone: a pre-initialized backbone featurestore
+    :param the_worker_queue: an FLI channel the client will retrieve
+    from the backbone
+    :param num_listeners: vary the number of registered listeners
+    to verify that the event is broadcast to everyone
+    """
 
-#     event = t.cast(OnWriteFeatureStore, pickle.loads(publisher._event_buffer.pop()))
-#     assert event.descriptor == the_backbone.descriptor
-#     assert event.key == model_key
+    # we won't actually send here, but it won't try without registered listeners
+    listeners = [f"mock-ch-desc-{i}" for i in range(num_listeners)]
+    the_backbone[BackboneFeatureStore.MLI_NOTIFY_CONSUMERS] = ",".join(listeners)
+
+    with monkeypatch.context() as ctx:
+        ctx.setenv(BackboneFeatureStore.MLI_BACKBONE, the_backbone.descriptor)
+        # NOTE: rely on `the_worker_queue` fixture to put MLI_WORKER_QUEUE in backbone
+
+        client = ProtoClient(timing_on=False)
+
+        publisher = t.cast(EventBroadcaster, client._publisher)
+
+        # mock attaching to a channel given the mock-ch-desc in backbone
+        mock_send = MagicMock(return_value=None)
+        mock_comm_channel = MagicMock(**{"send": mock_send}, spec=DragonCommChannel)
+        mock_get_comm_channel = MagicMock(return_value=mock_comm_channel)
+        ctx.setattr(publisher, "_get_comm_channel", mock_get_comm_channel)
+
+        model_key = "my-model"
+        model_bytes = b"12345"
+
+        client.set_model(model_key, model_bytes)
+
+        # confirm that a listener channel was attached
+        # once for each registered listener in backbone
+        assert mock_get_comm_channel.call_count == num_listeners
+
+        # confirm the client raised the key-written event
+        assert (
+            mock_send.call_count == num_listeners
+        ), f"Expected {num_listeners} sends with {num_listeners} registrations"
+
+        # with at least 1 consumer registered, we can verify the message is sent
+        for call_args in mock_send.call_args_list:
+            send_args = call_args.args
+            event_bytes, timeout = send_args[0], send_args[1]
+
+            assert event_bytes, "Expected event bytes to be supplied to send"
+            assert (
+                timeout == 0.001
+            ), "Expected default timeout on call to `publisher.send`"
+
+            # confirm the correct event was raised
+            event = t.cast(OnWriteFeatureStore, pickle.loads(event_bytes))
+            assert event.descriptor == the_backbone.descriptor
+            assert event.key == model_key

From 6f1cba79e18b8339fa3ddcd657a6187bba6128d7 Mon Sep 17 00:00:00 2001
From: Chris McBride
Date: Tue, 24 Sep 2024 17:37:52 -0500
Subject: [PATCH 07/40] reduce timeouts & backoffs, share backbone across
 protoclient tests

---
 .../_core/launcher/dragon/dragonBackend.py    |  5 ++--
 smartsim/_core/mli/comm/channel/channel.py    |  2 +-
 .../_core/mli/comm/channel/dragon_channel.py  |  2 +-
 smartsim/_core/mli/comm/channel/dragon_fli.py | 17 +++++------
 .../storage/backbone_feature_store.py         | 26 +++++++++++------
 smartsim/protoclient.py                       | 25 +++++++++--------
 tests/dragon/channel.py                       |  7 +++--
 tests/dragon/test_featurestore_base.py        |  2 +-
 tests/dragon/test_protoclient.py              | 28 ++++++++++++-------
 tests/dragon/utils/channel.py                 |  9 +++---
 tests/mli/channel.py                          |  3 +-
 11 files changed, 74 insertions(+), 52 deletions(-)

diff --git a/smartsim/_core/launcher/dragon/dragonBackend.py b/smartsim/_core/launcher/dragon/dragonBackend.py
index a1367af2a..577b95119 100644
--- a/smartsim/_core/launcher/dragon/dragonBackend.py
+++ b/smartsim/_core/launcher/dragon/dragonBackend.py
@@ -622,9 +622,8 @@ def _create_eventing(self, backbone: BackboneFeatureStore) -> EventConsumer:
             name="BackendConsumerRegistrar",
             event_handler=self._on_consumer_created,
         )
-        while consumer.register():
-            # wait for the consumer to complete registration
-            ...
+        consumer.register()
+        logger.info(f"Consumer `{consumer.name}` registration completed.")
 
         # self._backbone.backend_channel =
         # consumer.descriptor # i want to get rid of this extra channel
diff --git a/smartsim/_core/mli/comm/channel/channel.py b/smartsim/_core/mli/comm/channel/channel.py
index 90d81cb9b..bfa1c50fb 100644
--- a/smartsim/_core/mli/comm/channel/channel.py
+++ b/smartsim/_core/mli/comm/channel/channel.py
@@ -55,8 +55,8 @@ def __init__(
     def send(self, value: bytes, timeout: float = 0) -> None:
         """Send a message through the underlying communication channel.
- :param timeout: Maximum time to wait (in seconds) for messages to send :param value: The value to send + :param timeout: Maximum time to wait (in seconds) for messages to send :raises SmartSimError: If sending message fails """ diff --git a/smartsim/_core/mli/comm/channel/dragon_channel.py b/smartsim/_core/mli/comm/channel/dragon_channel.py index 4f8d3e552..0b73080d6 100644 --- a/smartsim/_core/mli/comm/channel/dragon_channel.py +++ b/smartsim/_core/mli/comm/channel/dragon_channel.py @@ -151,7 +151,7 @@ def send(self, value: bytes, timeout: float = 0.001) -> None: logger.debug(f"DragonCommChannel {self.descriptor} sent message") except Exception as e: raise SmartSimError( - f"Error sending message: DragonCommChannel {self.descriptor!r}" + f"Error sending via DragonCommChannel {self.descriptor}" ) from e def recv(self, timeout: float = 0.001) -> t.List[bytes]: diff --git a/smartsim/_core/mli/comm/channel/dragon_fli.py b/smartsim/_core/mli/comm/channel/dragon_fli.py index 325f6b779..22593f63c 100644 --- a/smartsim/_core/mli/comm/channel/dragon_fli.py +++ b/smartsim/_core/mli/comm/channel/dragon_fli.py @@ -68,20 +68,23 @@ def __init__( create_local(buffer_size) if sender_supplied else None ) - def send(self, value: bytes, timeout: float = 0.001) -> None: + def send( + self, value: bytes, timeout: float = 0.001, blocking: bool = False + ) -> None: """Send a message through the underlying communication channel. - :param timeout: Maximum time to wait (in seconds) for messages to send :param value: The value to send + :param timeout: Maximum time to wait (in seconds) for messages to send + :param blocking: Block returning until the message has been received :raises SmartSimError: If sending message fails """ try: with self._fli.sendh(timeout=None, stream_channel=self._channel) as sendh: sendh.send_bytes(value, timeout=timeout) - logger.debug(f"DragonFLIChannel {self.descriptor!r} sent message") + logger.debug(f"DragonFLIChannel {self.descriptor} sent message") except Exception as e: raise SmartSimError( - f"Error sending message: DragonFLIChannel {self.descriptor!r}" + f"Error sending message: DragonFLIChannel {self.descriptor}" ) from e def recv(self, timeout: float = 0.001) -> t.List[bytes]: @@ -98,14 +101,12 @@ def recv(self, timeout: float = 0.001) -> t.List[bytes]: try: message, _ = recvh.recv_bytes(timeout=timeout) messages.append(message) - logger.debug( - f"DragonFLIChannel {self.descriptor!r} received message" - ) + logger.debug(f"DragonFLIChannel {self.descriptor} received message") except fli.FLIEOT: eot = True except Exception as e: raise SmartSimError( - f"Error receiving messages: DragonFLIChannel {self.descriptor!r}" + f"Error receiving messages: DragonFLIChannel {self.descriptor}" ) from e return messages diff --git a/smartsim/_core/mli/infrastructure/storage/backbone_feature_store.py b/smartsim/_core/mli/infrastructure/storage/backbone_feature_store.py index e48f4e4e9..1110dc812 100644 --- a/smartsim/_core/mli/infrastructure/storage/backbone_feature_store.py +++ b/smartsim/_core/mli/infrastructure/storage/backbone_feature_store.py @@ -71,7 +71,7 @@ class BackboneFeatureStore(DragonFeatureStore): MLI_WORKER_QUEUE = "_SMARTSIM_REQUEST_QUEUE" MLI_BACKBONE = "_SMARTSIM_INFRA_BACKBONE" _CREATED_ON = "creation" - _DEFAULT_WAIT_TIMEOUT = 30.0 + _DEFAULT_WAIT_TIMEOUT = 1.0 def __init__( self, @@ -219,7 +219,7 @@ def wait_for( values: t.Dict[str, t.Union[str, bytes, None]] = {k: None for k in set(keys)} is_found = {k: False for k in values.keys()} - backoff: t.List[float] = [0.1, 0.5, 
1, 2, 4]
+        backoff = (0.1, 0.2, 0.4, 0.8)
         backoff_iter = itertools.cycle(backoff)
 
         start_time = time.time()
@@ -360,7 +360,7 @@ def __init__(
         self._backbone = backbone
         self._channel: t.Optional[CommChannelBase] = channel
 
-    def send(self, event: EventBase) -> int:
+    def send(self, event: EventBase, timeout: float = 0.001) -> int:
         """The send operation"""
         if self._channel is None:
             # self._channel = self._channel_factory(event)
@@ -371,7 +371,7 @@ def send(self, event: EventBase) -> int:
 
         try:
             event_bytes = bytes(event)
-            self._channel.send(event_bytes)
+            self._channel.send(event_bytes, timeout)
             num_sent += 1
         except Exception as ex:
             raise SmartSimError(f"Failed broadcast to channel: {self._channel}") from ex
@@ -589,6 +589,17 @@ def descriptor(self) -> str:
         :returns: The comm channel descriptor"""
         return self._comm_channel.descriptor
 
+    @property
+    def name(self) -> str:
+        """The friendly name assigned to the consumer.
+
+        :returns: The consumer name if one is assigned, otherwise a unique
+        id assigned by the system.
+        """
+        if self._name is None:
+            self._name = str(uuid.uuid4())
+        return self._name
+
     def receive(
         self, filters: t.Optional[t.List[EventCategory]] = None, timeout: float = 0
     ) -> t.List[EventBase]:
@@ -635,11 +646,11 @@ def receive(
 
         return messages
 
-    def register(self) -> t.Generator[bool, None, None]:
+    def register(self) -> None:
         """Send an event to register this consumer as a listener"""
         awaiting_confirmation = True
         descriptor = self._comm_channel.descriptor
-        backoffs = itertools.cycle((0.1, 0.5, 1.0, 2.0, 4.0))
+        backoffs = itertools.cycle((0.1, 0.2, 0.4, 0.8))
 
         event = OnCreateConsumer(descriptor, self._global_filters)
 
         # create a temporary publisher to broadcast my own existence.
@@ -654,7 +665,6 @@ def register(self) -> None:
             if descriptor in registered_channels:
                 awaiting_confirmation = False
 
-            yield not awaiting_confirmation
             time.sleep(next(backoffs))
 
         # if backend_descriptor := self._backbone.backend_channel:
@@ -665,7 +675,7 @@ def register(self) -> None:
 
         # broadcast that this consumer is now ready to mingle
         publisher = EventBroadcaster(self._backbone, DragonCommChannel.from_local)
-        publisher.send(event, timeout=0.1)
+        publisher.send(event, timeout=0.01)
 
 # def register_callback(self, callback: t.Callable[[EventBase], None]) -> None: ...
diff --git a/smartsim/protoclient.py b/smartsim/protoclient.py
index c2b7ebaf0..3e786cf05 100644
--- a/smartsim/protoclient.py
+++ b/smartsim/protoclient.py
@@ -31,6 +31,12 @@
 import dragon.channels
 from dragon.globalservices.api_setup import connect_to_infrastructure
 
+try:
+    from mpi4py import MPI  # type: ignore[import-not-found]
+except Exception:
+    MPI = None
+    print("Unable to import `mpi4py` package")
+
 # isort: on
 # pylint: enable=unused-import,import-error
 
@@ -44,8 +50,8 @@
 import torch
 
 from smartsim._core.mli.comm.channel.dragon_channel import (
-    create_local,
     DragonCommChannel,
+    create_local,
 )
 from smartsim._core.mli.comm.channel.dragon_fli import DragonFLIChannel
 from smartsim._core.mli.infrastructure.storage.backbone_feature_store import (
@@ -56,8 +59,12 @@
 from smartsim.error.errors import SmartSimError
 from smartsim.log import get_logger
 
-
-try:
-    from mpi4py import MPI
-except Exception:
-    MPI = None
-    print("Unable to import `mpi4py` package")
-
 _TimingDict = OrderedDict[str, list[str]]
@@ -134,7 +133,8 @@ def _create_worker_channels(
     ) -> t.Tuple[dragon.channels.Channel, dragon.channels.Channel]:
         """Create channels to be used for communication to and from the worker queue.
- :returns: A tuple containing the native from and to Channels as (from_channel, to_channel). + :returns: A tuple containing the native from and to + Channels as (from_channel, to_channel). """ _from_worker_ch_raw = create_local(cls._DEFAULT_WORK_QUEUE_SIZE) @@ -165,9 +165,9 @@ def __init__(self, timing_on: bool, wait_timeout: float = 0) -> None: # - consider catching the import exception and defaulting rank to 0 if MPI is not None: comm = MPI.COMM_WORLD - rank = comm.Get_rank() + rank: int = comm.Get_rank() else: - rank: int = 0 + rank = 0 self._backbone_timeout = wait_timeout @@ -192,7 +192,8 @@ def __init__(self, timing_on: bool, wait_timeout: float = 0) -> None: @property def backbone_timeout(self) -> float: - """The timeout (in seconds) applied to retrievals from the backbone feature store. + """The timeout (in seconds) applied to retrievals + from the backbone feature store. :returns: A float indicating the number of seconds to allow""" return self._backbone_timeout or self._DEFAULT_BACKBONE_TIMEOUT diff --git a/tests/dragon/channel.py b/tests/dragon/channel.py index b00ba9aa2..efabb00c0 100644 --- a/tests/dragon/channel.py +++ b/tests/dragon/channel.py @@ -54,10 +54,11 @@ def __init__(self, key: pathlib.Path) -> None: self._file_path.touch() def send(self, value: bytes, timeout: float = 0) -> None: - """Send a message throuh the underlying communication channel + """Send a message throuh the underlying communication channel. - :param timeout: maximum time to wait (in seconds) for messages to send - :param value: The value to send""" + :param value: The value to send + :param timeout: Maximum time to wait (in seconds) for messages to send + """ with self._lock: # write as text so we can add newlines as delimiters with open(self._file_path, "a") as fp: diff --git a/tests/dragon/test_featurestore_base.py b/tests/dragon/test_featurestore_base.py index 94733afc7..87536c5ba 100644 --- a/tests/dragon/test_featurestore_base.py +++ b/tests/dragon/test_featurestore_base.py @@ -759,7 +759,7 @@ def test_backbone_wait_timeout(wait_timeout: float, exp_wait_max: float) -> None :param storage_for_dragon_fs: the dragon storage engine to use """ - # NOTE: exp_wait_time maps to the cycled backoff of [.1, .5, 1, 2, 4, 8] + # NOTE: exp_wait_time maps to the cycled backoff of [0.1, 0.2, 0.4, 0.8] # with leeway added (by allowing 1s each for the 0.1 and 0.5 steps) start_time = time.time() diff --git a/tests/dragon/test_protoclient.py b/tests/dragon/test_protoclient.py index 01b280d08..f5a55a381 100644 --- a/tests/dragon/test_protoclient.py +++ b/tests/dragon/test_protoclient.py @@ -60,18 +60,18 @@ logger = get_logger(__name__) -@pytest.fixture +@pytest.fixture(scope="session") def storage_for_dragon_fs() -> t.Dict[str, str]: # return dragon_ddict.DDict(1, 2, total_mem=2 * 1024**3) return dragon_ddict.DDict(1, 2, 4 * 1024**2) -@pytest.fixture +@pytest.fixture(scope="session") def the_backbone(storage_for_dragon_fs) -> BackboneFeatureStore: return BackboneFeatureStore(storage_for_dragon_fs, allow_reserved_writes=True) -@pytest.fixture +@pytest.fixture(scope="session") def the_worker_queue(the_backbone: BackboneFeatureStore) -> DragonFLIChannel: """a stand-in for the worker manager so a worker queue exists""" @@ -82,12 +82,11 @@ def the_worker_queue(the_backbone: BackboneFeatureStore) -> DragonFLIChannel: # store the descriptor in the backbone the_backbone.worker_queue = comm_channel.descriptor - # the_backbone[BackboneFeatureStore.MLI_WORKER_QUEUE] = comm_channel.descriptor try: comm_channel.send(b"foo") 
except Exception as ex:
-        print(f"ohnooooo: {ex}")
+        logger.exception("Test send from worker channel failed")
 
     return comm_channel
 
@@ -132,7 +131,7 @@ def test_protoclient_timeout(
     :param the_backbone: a pre-initialized backbone featurestore for setting up
     the environment variable required by the client"""
 
-    # NOTE: exp_wait_time maps to the cycled backoff of [.1, .5, 1, 2, 4, 8]
+    # NOTE: exp_wait_time maps to the cycled backoff of [0.1, 0.2, 0.4, 0.8]
     # with leeway added (by allowing 1s each for the 0.1 and 0.5 steps)
     start_time = time.time()
     with monkeypatch.context() as ctx, pytest.raises(SmartSimError) as ex:
@@ -240,12 +239,16 @@ def test_protoclient_write_model(
         assert client._backbone[model_key] == model_bytes
 
 
-@pytest.mark.parametrize("num_listeners", [1, 2, 4])
+@pytest.mark.parametrize(
+    "num_listeners, num_model_updates",
+    [(1, 1), (1, 4), (2, 4), (16, 4), (64, 8)],
+)
 def test_protoclient_write_model_notification_sent(
     the_backbone: BackboneFeatureStore,
     the_worker_queue: DragonFLIChannel,
     monkeypatch: pytest.MonkeyPatch,
     num_listeners: int,
+    num_model_updates: int,
 ):
     """Verify that writing a model sends a key-written event
@@ -258,7 +261,11 @@ def test_protoclient_write_model_notification_sent(
 
     # we won't actually send here, but it won't try without registered listeners
     listeners = [f"mock-ch-desc-{i}" for i in range(num_listeners)]
+
+    the_backbone[BackboneFeatureStore.MLI_BACKBONE] = the_backbone.descriptor
+    the_backbone[BackboneFeatureStore.MLI_WORKER_QUEUE] = the_worker_queue.descriptor
     the_backbone[BackboneFeatureStore.MLI_NOTIFY_CONSUMERS] = ",".join(listeners)
+    the_backbone[BackboneFeatureStore.MLI_BACKEND_CONSUMER] = None
 
     with monkeypatch.context() as ctx:
         ctx.setenv(BackboneFeatureStore.MLI_BACKBONE, the_backbone.descriptor)
@@ -277,15 +284,16 @@ def test_protoclient_write_model_notification_sent(
         model_key = "my-model"
         model_bytes = b"12345"
 
-        client.set_model(model_key, model_bytes)
+        for i in range(num_model_updates):
+            client.set_model(model_key, model_bytes)
 
         # confirm that a listener channel was attached
         # once for each registered listener in backbone
-        assert mock_get_comm_channel.call_count == num_listeners
+        assert mock_get_comm_channel.call_count == num_listeners * num_model_updates
 
         # confirm the client raised the key-written event
         assert (
-            mock_send.call_count == num_listeners
+            mock_send.call_count == num_listeners * num_model_updates
         ), f"Expected {num_listeners} sends with {num_listeners} registrations"
 
        # with at least 1 consumer registered, we can verify the message is sent
diff --git a/tests/dragon/utils/channel.py b/tests/dragon/utils/channel.py
index b00ba9aa2..003d79400 100644
--- a/tests/dragon/utils/channel.py
+++ b/tests/dragon/utils/channel.py
@@ -40,7 +40,7 @@ class FileSystemCommChannel(CommChannelBase):
     """Passes messages by writing to a file"""
 
    def __init__(self, key: pathlib.Path) -> None:
-        """Initialize the FileSystemCommChannel instance
+        """Initialize the FileSystemCommChannel instance.
 
         :param key: a path to the root directory of the feature store"""
         self._lock = threading.RLock()
@@ -54,10 +54,11 @@ def __init__(self, key: pathlib.Path) -> None:
         self._file_path.touch()
 
     def send(self, value: bytes, timeout: float = 0) -> None:
-        """Send a message throuh the underlying communication channel
+        """Send a message through the underlying communication channel.
- :param timeout: maximum time to wait (in seconds) for messages to send - :param value: The value to send""" + :param value: The value to send + :param timeout: Maximum time to wait (in seconds) for messages to send + """ with self._lock: # write as text so we can add newlines as delimiters with open(self._file_path, "a") as fp: diff --git a/tests/mli/channel.py b/tests/mli/channel.py index b00ba9aa2..1bbf159b1 100644 --- a/tests/mli/channel.py +++ b/tests/mli/channel.py @@ -56,8 +56,9 @@ def __init__(self, key: pathlib.Path) -> None: def send(self, value: bytes, timeout: float = 0) -> None: """Send a message throuh the underlying communication channel + :param value: The value to send :param timeout: maximum time to wait (in seconds) for messages to send - :param value: The value to send""" + """ with self._lock: # write as text so we can add newlines as delimiters with open(self._file_path, "a") as fp: From c0a1bca6108de64d4b5f3c19714a61decfb8bdf7 Mon Sep 17 00:00:00 2001 From: Chris McBride Date: Tue, 24 Sep 2024 21:38:03 -0500 Subject: [PATCH 08/40] Test eventing end-to-end in single process --- .../_core/launcher/dragon/dragonBackend.py | 47 +-- smartsim/_core/mli/comm/channel/channel.py | 4 +- .../_core/mli/comm/channel/dragon_channel.py | 2 +- .../storage/backbone_feature_store.py | 75 ++-- tests/dragon/test_dragon_backend.py | 322 ++++++++++-------- tests/dragon/test_featurestore.py | 6 +- tests/dragon/test_featurestore_base.py | 12 +- tests/dragon/test_featurestore_integration.py | 8 +- 8 files changed, 259 insertions(+), 217 deletions(-) diff --git a/smartsim/_core/launcher/dragon/dragonBackend.py b/smartsim/_core/launcher/dragon/dragonBackend.py index 577b95119..3fe120a9d 100644 --- a/smartsim/_core/launcher/dragon/dragonBackend.py +++ b/smartsim/_core/launcher/dragon/dragonBackend.py @@ -27,6 +27,7 @@ import functools import itertools import multiprocessing as mp +import os import time import typing as t from dataclasses import dataclass, field @@ -48,13 +49,17 @@ import dragon.native.machine as dragon_machine from smartsim._core.launcher.dragon.pqueue import NodePrioritizer, PrioritizerFilter -from smartsim._core.mli.comm.channel.dragon_channel import DragonCommChannel +from smartsim._core.mli.comm.channel.dragon_channel import ( + DragonCommChannel, + create_local, +) from smartsim._core.mli.infrastructure.storage.backbone_feature_store import ( BackboneFeatureStore, EventBase, # EventBroadcaster, EventCategory, EventConsumer, + OnCreateConsumer, ) # pylint: enable=import-error @@ -572,12 +577,21 @@ def _create_backbone(self) -> BackboneFeatureStore: self._backbone = BackboneFeatureStore( backbone_storage, allow_reserved_writes=True ) + + # put the backbone descriptor in the env vars + os.environ.update(self._backbone.get_env()) logger.info(self._backbone.creation_date) return self._backbone def _on_consumer_created(self, event: EventBase) -> None: """Event handler for""" + if isinstance(event, OnCreateConsumer) and self._backbone is not None: + notify_list = set(self._backbone.notification_channels) + notify_list.add(event.descriptor) + self._backbone.notification_channels = list(notify_list) + return + logger.warning(f"Unhandled event received: {event}") def _bootstrap_event_listeners( @@ -591,7 +605,7 @@ def _bootstrap_event_listeners( # Update directly to avoid SEND/ACK pattern notify_descriptors.append(consumer.descriptor) - # consumer.register() # this will loop infinitely waiting for itself + notify_descriptors = list(set(notify_descriptors)) 
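+        # de-duplicate the merged list so a re-bootstrapped backend cannot
+        # register the same consumer descriptor twice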
backbone.notification_channels = notify_descriptors @@ -605,16 +619,11 @@ def _create_eventing(self, backbone: BackboneFeatureStore) -> EventConsumer: attempting to connect any eventing clients. :returns: The newly created EventConsumer instance """ - # if self._event_producer is None: - # logger.info("Creating event publisher") - # # todo: ensure DCC.from_descriptor and not DCC.from_local - # self._event_producer = - # EventBroadcaster(backbone, DragonCommChannel.from_descriptor) - # logger.info("Created event publisher") if self._event_consumer is None: logger.info("Creating event consumer") - event_channel = DragonCommChannel.from_local() + dragon_channel = create_local(500) + event_channel = DragonCommChannel(dragon_channel) consumer = EventConsumer( event_channel, backbone, @@ -622,24 +631,20 @@ def _create_eventing(self, backbone: BackboneFeatureStore) -> EventConsumer: name="BackendConsumerRegistrar", event_handler=self._on_consumer_created, ) - consumer.register() - logger.info(f"Consumer `{consumer.name}` registration completed.") - # self._backbone.backend_channel = - # consumer.descriptor # i want to get rid of this extra channel - # self._bootstrap_event_listeners(backbone, consumer) self._event_consumer = consumer - - logger.info("Created event consumer") + backbone[BackboneFeatureStore.MLI_BACKEND_CONSUMER] = consumer.descriptor + logger.info(f"Backend consumer `{consumer.name}` created.") return self._event_consumer + def listen_to_registrations(self, timeout: float = 0.001) -> None: + if self._event_consumer is not None: + self._event_consumer.listen_once(timeout) + def _start_eventing_listeners(self) -> None: - if self._event_consumer: - self._event_consumer_process = mp.Process( - target=self._event_consumer.listen - ) - self._event_consumer_process.start() + # todo: start external listener entrypoint + ... @staticmethod def create_run_policy( diff --git a/smartsim/_core/mli/comm/channel/channel.py b/smartsim/_core/mli/comm/channel/channel.py index bfa1c50fb..a581e8e2a 100644 --- a/smartsim/_core/mli/comm/channel/channel.py +++ b/smartsim/_core/mli/comm/channel/channel.py @@ -52,7 +52,7 @@ def __init__( """A user-friendly identifier for channel-related logging""" @abstractmethod - def send(self, value: bytes, timeout: float = 0) -> None: + def send(self, value: bytes, timeout: float = 0.001) -> None: """Send a message through the underlying communication channel. :param value: The value to send @@ -61,7 +61,7 @@ def send(self, value: bytes, timeout: float = 0) -> None: """ @abstractmethod - def recv(self, timeout: float = 0) -> t.List[bytes]: + def recv(self, timeout: float = 0.001) -> t.List[bytes]: """Receives message(s) through the underlying communication channel. 
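
        Implementations are expected to return an empty list, rather than
        raise, when no message arrives before the timeout expires.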
:param timeout: Maximum time to wait (in seconds) for messages to arrive diff --git a/smartsim/_core/mli/comm/channel/dragon_channel.py b/smartsim/_core/mli/comm/channel/dragon_channel.py index 0b73080d6..9c0ac3423 100644 --- a/smartsim/_core/mli/comm/channel/dragon_channel.py +++ b/smartsim/_core/mli/comm/channel/dragon_channel.py @@ -147,7 +147,7 @@ def send(self, value: bytes, timeout: float = 0.001) -> None: """ try: with self._channel.sendh(timeout=timeout) as sendh: - sendh.send_bytes(value) + sendh.send_bytes(value, blocking=False) logger.debug(f"DragonCommChannel {self.descriptor} sent message") except Exception as e: raise SmartSimError( diff --git a/smartsim/_core/mli/infrastructure/storage/backbone_feature_store.py b/smartsim/_core/mli/infrastructure/storage/backbone_feature_store.py index 1110dc812..83c255fe7 100644 --- a/smartsim/_core/mli/infrastructure/storage/backbone_feature_store.py +++ b/smartsim/_core/mli/infrastructure/storage/backbone_feature_store.py @@ -122,8 +122,8 @@ def backend_channel(self) -> t.Optional[str]: """Retrieve the channel descriptor exposed by the MLI backend for events :returns: a stringified channel descriptor""" - if self.MLI_NOTIFY_CONSUMERS in self: - return str(self[self.MLI_NOTIFY_CONSUMERS]) + if self.MLI_BACKEND_CONSUMER in self: + return str(self[self.MLI_BACKEND_CONSUMER]) return None @backend_channel.setter @@ -131,7 +131,7 @@ def backend_channel(self, value: str) -> None: """Set the channel exposed by the MLI backend for events :param value: a stringified channel descriptor""" - self[self.MLI_NOTIFY_CONSUMERS] = value + self[self.MLI_BACKEND_CONSUMER] = value @property def worker_queue(self) -> t.Optional[str]: @@ -165,8 +165,7 @@ def _record_creation_data(self) -> None: ) self[self._CREATED_ON] = str(time.time()) - if os.environ.get(BackboneFeatureStore.MLI_BACKBONE, None) is None: - os.environ.update(self.get_env()) + os.environ[self.MLI_BACKBONE] = self.descriptor @classmethod def from_writable_descriptor( @@ -479,7 +478,7 @@ def _get_comm_channel(self, descriptor: str) -> CommChannelBase: logger.error(msg, exc_info=True) raise SmartSimError(msg) from ex - def _get_next_event_event(self) -> t.Optional[EventBase]: + def _get_next_event(self) -> t.Optional[EventBase]: """Pop the next event to be sent from the queue. :returns: The next event to send if any events are enqueued, otherwise `None`. @@ -512,7 +511,7 @@ def _broadcast(self, timeout: float = 0.001) -> int: num_listeners = len(self._descriptors) # send each event to every consumer - while event := self._get_next_event_event(): + while event := self._get_next_event(): logger.debug(f"Broadcasting {event=} to {num_listeners} listeners") event_bytes = bytes(event) @@ -524,7 +523,7 @@ def _broadcast(self, timeout: float = 0.001) -> int: num_sent += 1 except Exception as ex: raise SmartSimError( - f"Broadcast {i}/{num_listeners} for event {event.uid} to " + f"Broadcast {i+1}/{num_listeners} for event {event.uid} to " f"channel {descriptor} from {self._uid} failed." 
) from ex @@ -547,6 +546,7 @@ def send(self, event: EventBase, timeout: float = 0.001) -> int: except (KeyError, ValueError, SmartSimError): raise except Exception as ex: + logger.exception("An unexpected exception occurred while sending") raise SmartSimError("An unexpected failure occurred while sending") from ex @@ -600,8 +600,8 @@ def name(self) -> str: self._name = str(uuid.uuid4()) return self._name - def receive( - self, filters: t.Optional[t.List[EventCategory]] = None, timeout: float = 0 + def recv( + self, filters: t.Optional[t.List[EventCategory]] = None, timeout: float = 0.001 ) -> t.List[EventBase]: """Receives available published event(s). @@ -648,44 +648,35 @@ def receive( def register(self) -> None: """Send an event to register this consumer as a listener""" - awaiting_confirmation = True descriptor = self._comm_channel.descriptor - backoffs = itertools.cycle((0.1, 0.2, 0.4, 0.8)) event = OnCreateConsumer(descriptor, self._global_filters) - # create a temporary publisher to broadcast my own existence. - publisher = EventBroadcaster(self._backbone, DragonCommChannel.from_local) - - # we're going to sit in this loop to wait for the backbone to get - # updated with the registration (to avoid SEND/ACK) - while awaiting_confirmation: - registered_channels = self._backbone.notification_channels - # todo: this should probably be descriptor_string? maybe i need to - # get rid of descriptor as bytes or just make desc_string required in ABC - if descriptor in registered_channels: - awaiting_confirmation = False + registrar_key = BackboneFeatureStore.MLI_BACKEND_CONSUMER + config = self._backbone.wait_for([registrar_key], 2.0) - time.sleep(next(backoffs)) + registrar_descriptor = str(config.get(registrar_key, None)) - # if backend_descriptor := self._backbone.backend_channel: - # backend_channel = DragonCommChannel. - # from_descriptor(backend_descriptor) - # backend = EventSender(self._backbone, backend_channel) - # backend.send(event) + if registrar_descriptor: + logger.debug(f"Sending registration for {self.name}") - # broadcast that this consumer is now ready to mingle - publisher = EventBroadcaster(self._backbone, DragonCommChannel.from_local) - publisher.send(event, timeout=0.01) + registrar_channel = DragonCommChannel.from_descriptor(registrar_descriptor) + registrar_channel.send(bytes(event), timeout=1.0) - # def register_callback(self, callback: t.Callable[[EventBase], None]) -> None: ... + logger.debug(f"Registration for {self.name} sent") + else: + logger.warning("Unable to register. 
No registrar channel found.") - def listen(self) -> None: + def listen_once(self, timeout: float = 0.001) -> None: """Function to handle incoming events""" - print("starting listener...") - - while True: - print("awaiting new message") - incoming_messages = self.receive() - for message in incoming_messages: - if self._event_handler: - self._event_handler(message) + logger.debug(f"Starting event listener with {timeout} second timeout") + logger.debug("Awaiting new messages") + + incoming_messages = self.recv(timeout=timeout) + + if not incoming_messages: + logger.debug("Consumer received empty message list.") + + for message in incoming_messages: + logger.debug(f"Sending event {message=} to handler.") + if self._event_handler: + self._event_handler(message) diff --git a/tests/dragon/test_dragon_backend.py b/tests/dragon/test_dragon_backend.py index a4e61d430..0631e11e6 100644 --- a/tests/dragon/test_dragon_backend.py +++ b/tests/dragon/test_dragon_backend.py @@ -24,151 +24,197 @@ # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +import os +import typing as t import unittest.mock as mock import pytest -# from smartsim._core.launcher.dragon.dragonBackend import DragonBackend, NodePrioritizer -# from smartsim._core.mli.infrastructure.storage.backbone_feature_store import EventSender, OnCreateConsumer - -# dragon = pytest.importorskip("dragon") - -# import dragon.utils as du -# from dragon.channels import Channel -# from dragon.data.ddict.ddict import DDict -# from dragon.fli import DragonFLIError, FLInterface - -# from smartsim._core.mli.comm.channel.dragon_channel import DragonCommChannel -# from smartsim._core.mli.comm.channel.dragon_fli import DragonFLIChannel -# from smartsim._core.mli.infrastructure.environment_loader import EnvironmentConfigLoader -# from smartsim._core.mli.infrastructure.storage.dragon_feature_store import ( -# DragonFeatureStore, -# ) +from smartsim._core.launcher.dragon.dragonBackend import DragonBackend, NodePrioritizer +from smartsim._core.mli.infrastructure.storage.backbone_feature_store import ( + BackboneFeatureStore, + EventBase, + EventBroadcaster, + EventConsumer, + EventSender, + OnCreateConsumer, +) +from smartsim.log import get_logger + +dragon = pytest.importorskip("dragon") + +import dragon.utils as du +from dragon.channels import Channel +from dragon.data.ddict.ddict import DDict +from dragon.fli import DragonFLIError, FLInterface + +from smartsim._core.mli.comm.channel.dragon_channel import DragonCommChannel +from smartsim._core.mli.comm.channel.dragon_fli import DragonFLIChannel +from smartsim._core.mli.infrastructure.environment_loader import EnvironmentConfigLoader +from smartsim._core.mli.infrastructure.storage.dragon_feature_store import ( + DragonFeatureStore, +) # The tests in this file belong to the dragon group pytestmark = pytest.mark.dragon +logger = get_logger(__name__) def test_dragonbackend_listener_boostrapping(monkeypatch: pytest.MonkeyPatch): - """Verify that an event listener is started""" - # backend_channel = DragonCommChannel.from_local() - assert True - - # with monkeypatch.context() as patcher: - # # patcher.setattr("smartsim._core.launcher.dragon.dragonBackend", "NodePrioritizer", mock.MagicMock()) - # patcher.setattr(NodePrioritizer, "__init__", lambda self, nodes, lock: None) - # patcher.setattr(DragonBackend, "_initialize_hosts", lambda self: None) - - # backend = DragonBackend(pid=9999) - # backend._create_backbone() - - # # 
create the consumer and start a listener process - # backend_consumer = backend._create_eventing(backend._backbone) - - # # ensure the consumer that was created is retained - # assert backend._event_consumer is not None - # assert backend._event_consumer == backend_consumer - - # assert backend._backbone.notification_channels == [backend_consumer.descriptor] - - # # create components to publish events - # # sender_channel = DragonCommChannel.from_local() - # sender = EventSender(backend._backbone, backend_channel) - - # # simulate a new consumer registration - # new_consumer_channel = DragonCommChannel.from_local() - # registration = OnCreateConsumer(new_consumer_channel.descriptor) - # new_consumer_channel.send(bytes(registration), 0.1) - - # events = backend_consumer.receive() - # assert len(events) == 1 - - -# @pytest.mark.parametrize( -# "content", -# [ -# pytest.param(b"a"), -# pytest.param(b"new byte string"), -# ], -# ) -# def test_environment_loader_attach_fli(content: bytes, monkeypatch: pytest.MonkeyPatch): -# """A descriptor can be stored, loaded, and reattached""" -# chan = Channel.make_process_local() -# queue = FLInterface(main_ch=chan) -# monkeypatch.setenv( -# "_SMARTSIM_REQUEST_QUEUE", du.B64.bytes_to_str(queue.serialize()) -# ) - -# config = EnvironmentConfigLoader( -# featurestore_factory=DragonFeatureStore.from_descriptor, -# callback_factory=DragonCommChannel.from_descriptor, -# queue_factory=DragonFLIChannel.from_sender_supplied_descriptor, -# ) -# config_queue = config.get_queue() - -# _ = config_queue.send(content) - -# old_recv = queue.recvh() -# result, _ = old_recv.recv_bytes() -# assert result == content - - -# def test_environment_loader_serialize_fli(monkeypatch: pytest.MonkeyPatch): -# """The serialized descriptors of a loaded and unloaded -# queue are the same""" -# chan = Channel.make_process_local() -# queue = FLInterface(main_ch=chan) -# monkeypatch.setenv( -# "_SMARTSIM_REQUEST_QUEUE", du.B64.bytes_to_str(queue.serialize()) -# ) - -# config = EnvironmentConfigLoader( -# featurestore_factory=DragonFeatureStore.from_descriptor, -# callback_factory=DragonCommChannel.from_descriptor, -# queue_factory=DragonFLIChannel.from_descriptor, -# ) -# config_queue = config.get_queue() -# assert config_queue._fli.serialize() == queue.serialize() - - -# def test_environment_loader_flifails(monkeypatch: pytest.MonkeyPatch): -# """An incorrect serialized descriptor will fails to attach""" -# monkeypatch.setenv("_SMARTSIM_REQUEST_QUEUE", "randomstring") -# config = EnvironmentConfigLoader( -# featurestore_factory=DragonFeatureStore.from_descriptor, -# callback_factory=None, -# queue_factory=DragonFLIChannel.from_descriptor, -# ) - -# with pytest.raises(DragonFLIError): -# config.get_queue() - - -# def test_environment_loader_backbone_load_dfs(monkeypatch: pytest.MonkeyPatch): -# """Verify the dragon feature store is loaded correctly by the -# EnvironmentConfigLoader to demonstrate featurestore_factory correctness""" -# feature_store = DragonFeatureStore(DDict()) -# monkeypatch.setenv("_SMARTSIM_INFRA_BACKBONE", feature_store.descriptor) - -# config = EnvironmentConfigLoader( -# featurestore_factory=DragonFeatureStore.from_descriptor, -# callback_factory=None, -# queue_factory=None, -# ) - -# print(f"calling config.get_backbone: `{feature_store.descriptor}`") - -# backbone = config.get_backbone() -# assert backbone is not None - - -# def test_environment_variables_not_set(): -# """EnvironmentConfigLoader getters return None when environment -# variables are not set""" -# 
config = EnvironmentConfigLoader( -# featurestore_factory=DragonFeatureStore.from_descriptor, -# callback_factory=DragonCommChannel.from_descriptor, -# queue_factory=DragonCommChannel.from_descriptor, -# ) -# assert config.get_backbone() is None -# assert config.get_queue() is None + """Verify that the dragon backend registration channel correctly + registers new consumers in the backbone and begins sending events + to the new consumers""" + + backend = DragonBackend(pid=9999) + + backend._create_backbone() + backbone = backend._backbone + + def mock_event_handler(event: EventBase) -> None: + logger.debug(f"Handling event in mock handler: {event}") + + bb_descriptor = os.environ.get(BackboneFeatureStore.MLI_BACKBONE, None) + assert bb_descriptor + + fs = BackboneFeatureStore.from_descriptor(bb_descriptor) + fs[event.uid] = "received" + + # create the consumer and start a listener process + backend_consumer = backend._create_eventing(backbone) + registrar_descriptor = backend._event_consumer.descriptor + + # ensure the consumer is stored to backend & published to backbone + assert backend._event_consumer == backend_consumer + assert backbone.backend_channel == registrar_descriptor + assert os.environ.get(BackboneFeatureStore.MLI_BACKBONE, None) + + # simulate a new consumer registration + new_consumer_ch = DragonCommChannel.from_local() + new_consumer = EventConsumer( + new_consumer_ch, + backbone, + [], + name="test-consumer-a", + event_handler=mock_event_handler, + ) + assert new_consumer, "new_consumer construction failed" + + # send registration to registrar channel + new_consumer.register() + + # the backend consumer should handle updating the notify list and the new + # consumer that just broadcast its registration should be registered... + # backend_consumer.listen_once(timeout=2.0) + backend.listen_to_registrations(timeout=0.1) + + # # confirm the backend registrar consumer registerd the new listener + assert new_consumer_ch.descriptor in backbone.notification_channels + + broadcaster = EventBroadcaster(backbone, DragonCommChannel.from_descriptor) + + # re-send the same thing because i'm too lazy to create a new consumer + broadcast_event = OnCreateConsumer(registrar_descriptor, []) + broadcaster.send(broadcast_event, timeout=0.1) + + new_consumer.listen_once(timeout=0.1) + + values = backbone.wait_for( + [broadcast_event.uid, BackboneFeatureStore.MLI_NOTIFY_CONSUMERS], 1.0 + ) + stored = values[broadcast_event.uid] + assert stored == "received", "The handler didn't update the backbone" + + # confirm that directly retrieving the value isn't different from + # using backbone.notification_channels helper method + notify_list = str(values[BackboneFeatureStore.MLI_NOTIFY_CONSUMERS]).split(",") + assert new_consumer.descriptor in set(notify_list) + + +@pytest.mark.parametrize( + "content", + [ + pytest.param(b"a"), + pytest.param(b"new byte string"), + ], +) +def test_environment_loader_attach_fli(content: bytes, monkeypatch: pytest.MonkeyPatch): + """A descriptor can be stored, loaded, and reattached""" + chan = Channel.make_process_local() + queue = FLInterface(main_ch=chan) + monkeypatch.setenv( + "_SMARTSIM_REQUEST_QUEUE", du.B64.bytes_to_str(queue.serialize()) + ) + + config = EnvironmentConfigLoader( + featurestore_factory=DragonFeatureStore.from_descriptor, + callback_factory=DragonCommChannel.from_descriptor, + queue_factory=DragonFLIChannel.from_sender_supplied_descriptor, + ) + config_queue = config.get_queue() + + _ = config_queue.send(content) + + old_recv = queue.recvh() + 
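+    # read the payload back through the original FLI handle to confirm the
+    # loader attached to the same underlying queue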
result, _ = old_recv.recv_bytes()
+    assert result == content
+
+
+def test_environment_loader_serialize_fli(monkeypatch: pytest.MonkeyPatch):
+    """The serialized descriptors of a loaded and unloaded
+    queue are the same"""
+    chan = Channel.make_process_local()
+    queue = FLInterface(main_ch=chan)
+    monkeypatch.setenv(
+        "_SMARTSIM_REQUEST_QUEUE", du.B64.bytes_to_str(queue.serialize())
+    )
+
+    config = EnvironmentConfigLoader(
+        featurestore_factory=DragonFeatureStore.from_descriptor,
+        callback_factory=DragonCommChannel.from_descriptor,
+        queue_factory=DragonFLIChannel.from_descriptor,
+    )
+    config_queue = config.get_queue()
+    assert config_queue._fli.serialize() == queue.serialize()
+
+
+def test_environment_loader_flifails(monkeypatch: pytest.MonkeyPatch):
+    """An incorrect serialized descriptor fails to attach"""
+    monkeypatch.setenv("_SMARTSIM_REQUEST_QUEUE", "randomstring")
+    config = EnvironmentConfigLoader(
+        featurestore_factory=DragonFeatureStore.from_descriptor,
+        callback_factory=None,
+        queue_factory=DragonFLIChannel.from_descriptor,
+    )
+
+    with pytest.raises(DragonFLIError):
+        config.get_queue()
+
+
+def test_environment_loader_backbone_load_dfs(monkeypatch: pytest.MonkeyPatch):
+    """Verify the dragon feature store is loaded correctly by the
+    EnvironmentConfigLoader to demonstrate featurestore_factory correctness"""
+    feature_store = DragonFeatureStore(DDict())
+    monkeypatch.setenv("_SMARTSIM_INFRA_BACKBONE", feature_store.descriptor)
+
+    config = EnvironmentConfigLoader(
+        featurestore_factory=DragonFeatureStore.from_descriptor,
+        callback_factory=None,
+        queue_factory=None,
+    )
+
+    logger.debug(f"calling config.get_backbone: `{feature_store.descriptor}`")
+
+    backbone = config.get_backbone()
+    assert backbone is not None
+
+
+def test_environment_variables_not_set():
+    """EnvironmentConfigLoader getters return None when environment
+    variables are not set"""
+    config = EnvironmentConfigLoader(
+        featurestore_factory=DragonFeatureStore.from_descriptor,
+        callback_factory=DragonCommChannel.from_descriptor,
+        queue_factory=DragonCommChannel.from_descriptor,
+    )
+    assert config.get_backbone() is None
+    assert config.get_queue() is None
diff --git a/tests/dragon/test_featurestore.py b/tests/dragon/test_featurestore.py
index 7f1649741..434bc5eab 100644
--- a/tests/dragon/test_featurestore.py
+++ b/tests/dragon/test_featurestore.py
@@ -181,15 +181,15 @@ def test_eventconsumer_eventpublisher_integration(
     mock_client_app.send(event_4)

     # worker manager should only get updates about feature update
-    wmgr_messages = wmgr_consumer.receive()
+    wmgr_messages = wmgr_consumer.recv()
     assert len(wmgr_messages) == 3

     # the backend should only receive messages about consumer creation
-    back_messages = back_consumer.receive()
+    back_messages = back_consumer.recv()
     assert len(back_messages) == 1

     # hypothetical app has no filters and will get all events
-    app_messages = capp_consumer.receive()
+    app_messages = capp_consumer.recv()
     assert len(app_messages) == 4

diff --git a/tests/dragon/test_featurestore_base.py b/tests/dragon/test_featurestore_base.py
index 87536c5ba..59a30a3e8 100644
--- a/tests/dragon/test_featurestore_base.py
+++ b/tests/dragon/test_featurestore_base.py
@@ -561,7 +561,7 @@ def test_eventconsumer_receive(test_dir: str) -> None:

     consumer = EventConsumer(comm_channel, backbone)

-    all_received: t.List[OnCreateConsumer] = consumer.receive()
+    all_received: t.List[OnCreateConsumer] = consumer.recv()
     assert len(all_received) == 1

     # verify we received the same event that was raised
@@
-595,7 +595,7 @@ def test_eventconsumer_receive_multi(test_dir: str, num_sent: int) -> None: consumer = EventConsumer(comm_channel, backbone) - all_received: t.List[OnCreateConsumer] = consumer.receive() + all_received: t.List[OnCreateConsumer] = consumer.recv() assert len(all_received) == num_sent @@ -621,7 +621,7 @@ def test_eventconsumer_receive_empty(test_dir: str) -> None: consumer = EventConsumer(comm_channel, backbone) - messages = consumer.receive() + messages = consumer.recv() # the messages array should be empty assert not messages @@ -696,15 +696,15 @@ def test_eventconsumer_eventpublisher_integration(test_dir: str) -> None: mock_client_app.send(event_4) # worker manager should only get updates about feature update - wmgr_messages = wmgr_consumer.receive() + wmgr_messages = wmgr_consumer.recv() assert len(wmgr_messages) == 3 # the backend should only receive messages about consumer creation - back_messages = back_consumer.receive() + back_messages = back_consumer.recv() assert len(back_messages) == 1 # hypothetical app has no filters and will get all events - app_messages = capp_consumer.receive() + app_messages = capp_consumer.recv() assert len(app_messages) == 4 diff --git a/tests/dragon/test_featurestore_integration.py b/tests/dragon/test_featurestore_integration.py index b088df5b4..ccc63def7 100644 --- a/tests/dragon/test_featurestore_integration.py +++ b/tests/dragon/test_featurestore_integration.py @@ -138,15 +138,15 @@ def test_eventconsumer_eventpublisher_integration( mock_client_app.send(event, timeout=0.1) # worker manager should only get updates about feature update - wmgr_messages = wmgr_consumer.receive() + wmgr_messages = wmgr_consumer.recv() assert len(wmgr_messages) == 3 # the backend should only receive messages about consumer creation - back_messages = back_consumer.receive() + back_messages = back_consumer.recv() assert len(back_messages) == 1 # hypothetical app has no filters and will get all events - app_messages = capp_consumer.receive() + app_messages = capp_consumer.recv() assert len(app_messages) == 4 @@ -204,7 +204,7 @@ def test_eventconsumer_max_dequeue( num_dequeued = 0 - while wmgr_messages := wmgr_consumer.receive(timeout=0.01): + while wmgr_messages := wmgr_consumer.recv(timeout=0.01): # worker manager should not get more than `max_num_msgs` events num_dequeued += len(wmgr_messages) From d81334d37c865f96900bd7287c182b6db58aa2bb Mon Sep 17 00:00:00 2001 From: Chris McBride Date: Wed, 25 Sep 2024 17:45:35 -0500 Subject: [PATCH 09/40] docstrings & miscellaneous minor fixes (reuse descriptor code, add dragon utils module, fix missing test env vars, ... 
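The descriptor helpers consolidated into the new `dragon_util` module all
share one base64 round-trip. A minimal sketch of the intended usage (names
taken from the module added below; illustrative only, and it must run
inside a dragon-backed process):

    import smartsim._core.mli.comm.channel.dragon_util as drg_util

    channel = drg_util.create_local(500)                  # local channel with explicit capacity
    descriptor = drg_util.channel_to_descriptor(channel)  # base64 string, env-var safe
    attached = drg_util.descriptor_to_channel(descriptor) # reattach anywhere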
--- conftest.py | 28 +- ex/high_throughput_inference/mock_app.py | 22 +- .../_core/launcher/dragon/dragonBackend.py | 85 ++- smartsim/_core/mli/comm/channel/channel.py | 7 +- .../_core/mli/comm/channel/dragon_channel.py | 120 +--- smartsim/_core/mli/comm/channel/dragon_fli.py | 27 +- .../_core/mli/comm/channel/dragon_util.py | 137 ++++ .../storage/backbone_feature_store.py | 41 +- .../storage/dragon_feature_store.py | 3 +- smartsim/log.py | 6 +- smartsim/protoclient.py | 105 +-- tests/dragon/channel.py | 19 +- tests/dragon/test_dragon_backend.py | 90 --- tests/dragon/test_environment_loader.py | 22 +- tests/dragon/test_featurestore.py | 6 +- tests/dragon/test_featurestore_integration.py | 32 +- tests/dragon/test_protoclient.py | 62 +- tests/dragon/test_request_dispatcher.py | 8 +- tests/dragon/test_worker_manager.py | 652 +++++++++--------- tests/dragon/utils/channel.py | 17 +- tests/mli/channel.py | 19 +- 21 files changed, 743 insertions(+), 765 deletions(-) create mode 100644 smartsim/_core/mli/comm/channel/dragon_util.py diff --git a/conftest.py b/conftest.py index 622dd7a7c..098a4a0c5 100644 --- a/conftest.py +++ b/conftest.py @@ -459,10 +459,15 @@ def environment_cleanup(monkeypatch: pytest.MonkeyPatch) -> None: @pytest.fixture(scope="function", autouse=True) def check_output_dir() -> None: - global test_output_dirs - assert os.path.isdir(test_output_root) - assert len(os.listdir(test_output_root)) >= test_output_dirs - test_output_dirs = len(os.listdir(test_output_root)) + try: + global test_output_dirs + assert os.path.isdir(test_output_root) + assert len(os.listdir(test_output_root)) >= test_output_dirs + test_output_dirs = len(os.listdir(test_output_root)) + except Exception: + # swallow error when the tests can't clean up test dirs + # and let the next run do the job. + ... @pytest.fixture @@ -1056,8 +1061,8 @@ def as_command(self) -> t.List[str]: NOTE: does NOT include the `[sys.executable, msg_pump_path, ...]` portion of the necessary parameters to Popen. - :returns: A list of strings containing the arguments of the request - formatted for inclusion in a call to subprocess.Popen""" + :returns: The arguments of the request formatted appropriately to + Popen the `/tests/dragon/utils/msg_pump.py`""" return [ "--dispatch-fli-descriptor", self.work_queue_descriptor, @@ -1075,11 +1080,16 @@ def msg_pump_factory() -> t.Callable[[MsgPumpRequest], subprocess.Popen]: """A pytest fixture used to create a mock event producer capable of feeding asynchronous inference requests to tests requiring them. - :returns: A function that can be passed appropriate descriptors - for starting a message pump.""" + :returns: A function that opens a subprocess running a mock message pump + """ def run_message_pump(request: MsgPumpRequest) -> subprocess.Popen: - """Invokes the message pump entry-point""" + """Invoke the message pump entry-point with the descriptors + from the request. 
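+
+        The subprocess executes `sys.executable` against
+        `tests/dragon/utils/msg_pump.py`, appending the arguments produced
+        by the request's `as_command` output.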
+ + :param request: A request containing all parameters required to + invoke the message pump entrypoint + :returns: The Popen object for the subprocess that was started""" # /tests/dragon/utils/msg_pump.py msg_pump_script = "tests/dragon/utils/msg_pump.py" msg_pump_path = pathlib.Path(__file__).parent / msg_pump_script diff --git a/ex/high_throughput_inference/mock_app.py b/ex/high_throughput_inference/mock_app.py index 31195c7e6..2886bd5f9 100644 --- a/ex/high_throughput_inference/mock_app.py +++ b/ex/high_throughput_inference/mock_app.py @@ -60,7 +60,12 @@ class ResNetWrapper: + """Wrapper around a pre-rained ResNet model.""" def __init__(self, name: str, model: str): + """Initialize the instance. + + :param name: The name to use for the model + :param model: The path to the pre-trained PyTorch model""" self._model = torch.jit.load(model) self._name = name buffer = io.BytesIO() @@ -69,14 +74,25 @@ def __init__(self, name: str, model: str): self._serialized_model = buffer.getvalue() def get_batch(self, batch_size: int = 32): + """Create a random batch of data with the correct dimensions to + invoke a ResNet model. + + :param batch_size: The desired number of samples to produce + :returns: A PyTorch tensor""" return torch.randn((batch_size, 3, 224, 224), dtype=torch.float32) @property - def model(self): + def model(self) -> bytes: + """The content of a model file. + + :returns: The model bytes""" return self._serialized_model @property - def name(self): + def name(self) -> str: + """The name applied to the model. + + :returns: The name""" return self._name @@ -90,7 +106,7 @@ def name(self): resnet = ResNetWrapper("resnet50", f"resnet50.{args.device}.pt") client = ProtoClient(timing_on=True, wait_timeout=0) - # client.set_model(resnet.name, resnet.model) + client.set_model(resnet.name, resnet.model) if CHECK_RESULTS_AND_MAKE_ALL_SLOWER: # TODO: adapt to non-Nvidia devices diff --git a/smartsim/_core/launcher/dragon/dragonBackend.py b/smartsim/_core/launcher/dragon/dragonBackend.py index 3fe120a9d..6dc61516e 100644 --- a/smartsim/_core/launcher/dragon/dragonBackend.py +++ b/smartsim/_core/launcher/dragon/dragonBackend.py @@ -26,7 +26,6 @@ import collections import functools import itertools -import multiprocessing as mp import os import time import typing as t @@ -36,27 +35,23 @@ from tabulate import tabulate -# pylint: disable=import-error,C0302,R0915,R6301 +# pylint: disable=import-error,C0302,R0915 # isort: off import dragon.data.ddict.ddict as dragon_ddict import dragon.infrastructure.connection as dragon_connection import dragon.infrastructure.policy as dragon_policy import dragon.infrastructure.process_desc as dragon_process_desc -# import dragon.native.group_state as dragon_group_state import dragon.native.process as dragon_process import dragon.native.process_group as dragon_process_group import dragon.native.machine as dragon_machine from smartsim._core.launcher.dragon.pqueue import NodePrioritizer, PrioritizerFilter -from smartsim._core.mli.comm.channel.dragon_channel import ( - DragonCommChannel, - create_local, -) +from smartsim._core.mli.comm.channel.dragon_channel import DragonCommChannel +from smartsim._core.mli.comm.channel.dragon_util import create_local from smartsim._core.mli.infrastructure.storage.backbone_feature_store import ( BackboneFeatureStore, EventBase, - # EventBroadcaster, EventCategory, EventConsumer, OnCreateConsumer, @@ -86,9 +81,11 @@ logger = get_logger(__name__) +# TODO: create ticket for follow-up task to replace defunct +# dragon_group_state.Running() & 
.Error() class DragonStatus(str, Enum): - ERROR = "Error" # str(dragon_group_state.Error()) - RUNNING = "Running" # str(dragon_group_state.Running()) + ERROR = "Error" + RUNNING = "Running" def __str__(self) -> str: return self.value @@ -195,20 +192,13 @@ def __init__(self, pid: int) -> None: """Whether the server frontend should shut down when the backend does""" self._shutdown_initiation_time: t.Optional[float] = None """The time at which the server initiated shutdown""" - smartsim_config = get_config() - self._cooldown_period = ( - smartsim_config.telemetry_frequency * 2 + 5 - if smartsim_config.telemetry_enabled - else 5 - ) - """Time in seconds needed to server to complete shutdown""" + self._cooldown_period = self._initialize_cooldown() + """Time in seconds needed by the server to complete shutdown""" self._backbone: t.Optional[BackboneFeatureStore] = None """The backbone feature store""" self._event_consumer: t.Optional[EventConsumer] = None - """A listener registered to listen for new consumers and update the shared + """A consumer registered to listen for new consumers and update the shared consumer registrations list""" - self._event_consumer_process: t.Optional[mp.Process] = None - """The process executing the event consumers `listen` method""" """An event consumer for receiving events from MLI resources""" self._nodes: t.List["dragon_machine.Node"] = [] @@ -223,8 +213,6 @@ def __init__(self, pid: int) -> None: """Mapping with hostnames as keys and a set of running step IDs as the value""" self._initialize_hosts() - self._view = DragonBackendView(self) - logger.debug(self._view.host_desc) self._prioritizer = NodePrioritizer(self._nodes, self._queue_lock) @property @@ -276,10 +264,8 @@ def status_message(self) -> str: :returns: a status message """ - return ( - "Dragon server backend update\n" - f"{self._view.host_table}\n{self._view.step_table}" - ) + view = DragonBackendView(self) + return "Dragon server backend update\n" f"{view.host_table}\n{view.step_table}" def _heartbeat(self) -> None: """Update the value of the last heartbeat to the current time.""" @@ -580,12 +566,15 @@ def _create_backbone(self) -> BackboneFeatureStore: # put the backbone descriptor in the env vars os.environ.update(self._backbone.get_env()) - logger.info(self._backbone.creation_date) return self._backbone def _on_consumer_created(self, event: EventBase) -> None: - """Event handler for""" + """Event handler for updating the backbone when new event consumers + are registered. + + :param event: The event that was received + """ if isinstance(event, OnCreateConsumer) and self._backbone is not None: notify_list = set(self._backbone.notification_channels) notify_list.add(event.descriptor) @@ -594,29 +583,29 @@ def _on_consumer_created(self, event: EventBase) -> None: logger.warning(f"Unhandled event received: {event}") - def _bootstrap_event_listeners( - self, backbone: BackboneFeatureStore, consumer: EventConsumer - ) -> None: - """Update the list of notification channels registered in the backbone. - - :param backbone: The backbone feature store to update""" - # Copy the consumer list so a backend restart doesn't clear registrations - notify_descriptors = list(backbone.notification_channels) - - # Update directly to avoid SEND/ACK pattern - notify_descriptors.append(consumer.descriptor) - notify_descriptors = list(set(notify_descriptors)) + @staticmethod + def _initialize_cooldown() -> int: + """Load environment configuration and determine the correct cooldown + period to apply to the backend process. 
- backbone.notification_channels = notify_descriptors + :returns: The calculated cooldown (in seconds) + """ + smartsim_config = get_config() + return ( + smartsim_config.telemetry_frequency * 2 + 5 + if smartsim_config.telemetry_enabled + else 5 + ) def _create_eventing(self, backbone: BackboneFeatureStore) -> EventConsumer: """ Create an event publisher and event consumer for communicating with other MLI resources. - :param backbone: The backbone feature store used by the MLI backend. NOTE: - passing backbone as a parameter to ensure the backbone is initialized before - attempting to connect any eventing clients. + :param backbone: The backbone feature store used by the MLI backend. + + NOTE: the backbone must be initialized before connecting to eventing clients. + :returns: The newly created EventConsumer instance """ @@ -639,10 +628,14 @@ def _create_eventing(self, backbone: BackboneFeatureStore) -> EventConsumer: return self._event_consumer def listen_to_registrations(self, timeout: float = 0.001) -> None: + """Execute the listener for registration events. + + :param timeout: Maximum time to wait (in seconds) for a new event""" if self._event_consumer is not None: self._event_consumer.listen_once(timeout) - def _start_eventing_listeners(self) -> None: + @staticmethod + def _start_eventing_listeners() -> None: # todo: start external listener entrypoint ... @@ -969,6 +962,8 @@ def __init__(self, backend: DragonBackend) -> None: self._backend = backend """A dragon backend used to produce the view""" + logger.debug(self.host_desc) + @property def host_desc(self) -> str: hosts = self._backend.hosts diff --git a/smartsim/_core/mli/comm/channel/channel.py b/smartsim/_core/mli/comm/channel/channel.py index a581e8e2a..104333ce7 100644 --- a/smartsim/_core/mli/comm/channel/channel.py +++ b/smartsim/_core/mli/comm/channel/channel.py @@ -76,12 +76,7 @@ def descriptor(self) -> str: """ return self._descriptor - @property - def decoded_descriptor(self) -> bytes: - """Return the descriptor decoded from a string into bytes""" - return base64.b64decode(self._descriptor.encode("utf-8")) - def __str__(self) -> str: - """Build a string representation of the channel useful for printing""" + """Build a string representation of the channel useful for printing.""" classname = type(self).__class__.__name__ return f"{classname}('{self._name}', '{self._descriptor}')" diff --git a/smartsim/_core/mli/comm/channel/dragon_channel.py b/smartsim/_core/mli/comm/channel/dragon_channel.py index 9c0ac3423..7534719e7 100644 --- a/smartsim/_core/mli/comm/channel/dragon_channel.py +++ b/smartsim/_core/mli/comm/channel/dragon_channel.py @@ -24,17 +24,12 @@ # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -import base64 -import sys import typing as t import dragon.channels as dch -import dragon.infrastructure.facts as df -import dragon.infrastructure.parameters as dp -import dragon.managed_memory as dm -import dragon.utils as du import smartsim._core.mli.comm.channel.channel as cch +import smartsim._core.mli.comm.channel.dragon_util as drg_util from smartsim.error.errors import SmartSimError from smartsim.log import get_logger @@ -49,75 +44,6 @@ unnecessary retries when creating a local channel.""" -def _channel_to_descriptor(channel: dch.Channel) -> str: - """Utility method for converting a channel to a descriptor string. 
- - :param channel: The dragon channel to convert - :returns: The descriptor string - """ - if channel is None: - raise SmartSimError("Channel is not available to create a descriptor") - - serialized_ch = channel.serialize() - return base64.b64encode(serialized_ch).decode("utf-8") - - -def _pool_to_descriptor(pool: dm.MemoryPool) -> str: - """Utility method for converting a pool to a descriptor string. - - :param pool: The memory pool to convert - :returns: The descriptor string""" - if pool is None: - raise SmartSimError("Memory pool is not available to create a descriptor") - - serialized_pool = pool.serialize() - return base64.b64encode(serialized_pool).decode("utf-8") - - -def create_local(capacity: int = 0) -> dch.Channel: - """Creates a Channel attached to the local memory pool. Replacement for - direct calls to `dch.Channel.make_process_local()` to enable - supplying a channel capacity. - - :param capacity: The number of events the channel can buffer; uses the default - buffer size `DEFAULT_CHANNEL_BUFFER_SIZE` when not supplied - :returns: The instantiated channel - :raises SmartSimError: If unable to attach local channel - """ - pool = dm.MemoryPool.attach(du.B64.str_to_bytes(dp.this_process.default_pd)) - pool_descriptor = _pool_to_descriptor(pool) - channel: t.Optional[dch.Channel] = None - offset = 0 - - global LAST_OFFSET - if LAST_OFFSET: - offset = LAST_OFFSET - - capacity = capacity if capacity > 0 else DEFAULT_CHANNEL_BUFFER_SIZE - - while not channel: - # search for an open channel ID - offset += 1 - cid = df.BASE_USER_MANAGED_CUID + offset - try: - channel = dch.Channel(mem_pool=pool, c_uid=cid, capacity=capacity) - LAST_OFFSET = offset - descriptor = _channel_to_descriptor(channel) - logger.debug( - "Local channel creatd: " - f"{cid=}, {pool_descriptor=}, {capacity=}, {descriptor=}" - ) - except dch.ChannelError as e: - if offset < 100: - logger.warning(f"Channnel id {cid} is not open. Retrying...") - else: - LAST_OFFSET = 0 - logger.error(f"All attempts to attach local channel have failed") - raise SmartSimError("Failed to attach local channel") from e - - return channel - - class DragonCommChannel(cch.CommChannelBase): """Passes messages by writing to a Dragon channel.""" @@ -126,7 +52,7 @@ def __init__(self, channel: "dch.Channel") -> None: :param channel: A channel to use for communications """ - descriptor = _channel_to_descriptor(channel) + descriptor = drg_util.channel_to_descriptor(channel) super().__init__(descriptor) self._channel = channel @@ -175,23 +101,6 @@ def recv(self, timeout: float = 0.001) -> t.List[bytes]: return messages - @property - def descriptor_string(self) -> str: - """Return the channel descriptor for the underlying dragon channel - as a string. Automatically performs base64 encoding to ensure the - string can be used in a call to `from_descriptor`. - - :returns: String representation of channel descriptor - :raises ValueError: If unable to convert descriptor to a string - """ - if isinstance(self._descriptor, str): - return self._descriptor - - if isinstance(self._descriptor, bytes): - return base64.b64encode(self._descriptor).decode("utf-8") - - raise ValueError(f"Unable to convert channel descriptor: {self._descriptor}") - @classmethod def from_descriptor( cls, @@ -199,36 +108,29 @@ def from_descriptor( ) -> "DragonCommChannel": """A factory method that creates an instance from a descriptor string. - :param descriptor: The descriptor that uniquely identifies the resource. Output - from `descriptor_string` is correctly encoded. 
+ :param descriptor: The descriptor that uniquely identifies the resource. :returns: An attached DragonCommChannel - :raises SmartSimError: If creation of comm channel fails""" + :raises SmartSimError: If creation of comm channel fails + """ try: if isinstance(descriptor, bytes): raise ValueError("Descriptor must be a string") - utf8_descriptor: t.Union[str, bytes] = descriptor - if isinstance(descriptor, str): - utf8_descriptor = descriptor.encode("utf-8") - - # todo: ensure the bytes argument and condition are removed - # after refactoring the RPC models - - actual_descriptor = base64.b64decode(utf8_descriptor) - channel = dch.Channel.attach(actual_descriptor) + channel = drg_util.descriptor_to_channel(descriptor) return DragonCommChannel(channel) - except Exception as e: + except Exception as ex: raise SmartSimError( f"Failed to create dragon comm channel: {descriptor}" - ) from e + ) from ex @classmethod def from_local(cls, _descriptor: t.Optional[str] = None) -> "DragonCommChannel": - """A factory method that creates a local channel instance + """A factory method that creates a local channel instance. + :param _descriptor: Unused placeholder :returns: An attached DragonCommChannel""" try: - channel = dch.Channel.make_process_local() + channel = drg_util.create_local() return DragonCommChannel(channel) except: logger.error(f"Failed to create local dragon comm channel", exc_info=True) diff --git a/smartsim/_core/mli/comm/channel/dragon_fli.py b/smartsim/_core/mli/comm/channel/dragon_fli.py index 22593f63c..13eb58a2e 100644 --- a/smartsim/_core/mli/comm/channel/dragon_fli.py +++ b/smartsim/_core/mli/comm/channel/dragon_fli.py @@ -27,18 +27,13 @@ # isort: off from dragon import fli import dragon.channels as dch -import dragon.infrastructure.facts as df -import dragon.infrastructure.parameters as dp -import dragon.managed_memory as dm -import dragon.utils as du # isort: on -import base64 import typing as t import smartsim._core.mli.comm.channel.channel as cch -from smartsim._core.mli.comm.channel.dragon_channel import create_local +import smartsim._core.mli.comm.channel.dragon_util as drg_util from smartsim.error.errors import SmartSimError from smartsim.log import get_logger @@ -60,22 +55,19 @@ def __init__( :param sender_supplied: Flag indicating if the FLI uses sender-supplied streams :param buffer_size: Maximum number of sent messages that can be buffered """ - descriptor = base64.b64encode(fli_.serialize()).decode("utf-8") + descriptor = drg_util.channel_to_descriptor(fli_) super().__init__(descriptor) self._fli = fli_ self._channel: t.Optional["dch.Channel"] = ( - create_local(buffer_size) if sender_supplied else None + drg_util.create_local(buffer_size) if sender_supplied else None ) - def send( - self, value: bytes, timeout: float = 0.001, blocking: bool = False - ) -> None: + def send(self, value: bytes, timeout: float = 0.001) -> None: """Send a message through the underlying communication channel. 
:param value: The value to send :param timeout: Maximum time to wait (in seconds) for messages to send - :param blocking: Block returning until the message has been received :raises SmartSimError: If sending message fails """ try: @@ -110,13 +102,6 @@ def recv(self, timeout: float = 0.001) -> t.List[bytes]: ) from e return messages - @classmethod - def _string_descriptor_to_fli(cls, descriptor: str) -> "fli.FLInterface": - """Helper method to convert a string-safe, encoded descriptor back - into its original byte format""" - descriptor_ = base64.b64decode(descriptor.encode("utf-8")) - return fli.FLInterface.attach(descriptor_) - @classmethod def from_sender_supplied_descriptor( cls, @@ -128,7 +113,7 @@ def from_sender_supplied_descriptor( :returns: An attached DragonFLIChannel""" try: return DragonFLIChannel( - fli_=cls._string_descriptor_to_fli(descriptor), + fli_=drg_util.descriptor_to_fli(descriptor), sender_supplied=True, ) except: @@ -153,7 +138,7 @@ def from_descriptor( try: return DragonFLIChannel( - fli_=cls._string_descriptor_to_fli(descriptor), + fli_=drg_util.descriptor_to_fli(descriptor), sender_supplied=False, ) except Exception as e: diff --git a/smartsim/_core/mli/comm/channel/dragon_util.py b/smartsim/_core/mli/comm/channel/dragon_util.py new file mode 100644 index 000000000..2980dc9a6 --- /dev/null +++ b/smartsim/_core/mli/comm/channel/dragon_util.py @@ -0,0 +1,137 @@ +# BSD 2-Clause License +# +# Copyright (c) 2021-2024, Hewlett Packard Enterprise +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import base64 +import typing as t + +import dragon.channels as dch +import dragon.fli as fli +import dragon.infrastructure.facts as df +import dragon.infrastructure.parameters as dp +import dragon.managed_memory as dm +import dragon.utils as du + +from smartsim.error.errors import SmartSimError +from smartsim.log import get_logger + +logger = get_logger(__name__) + +DEFAULT_CHANNEL_BUFFER_SIZE = 500 +"""Maximum number of messages that can be buffered. DragonCommChannel will +raise an exception if no clients consume messages before the buffer is filled.""" + +LAST_OFFSET = 0 +"""The last offset used to create a local channel. 
This is used to avoid +unnecessary retries when creating a local channel.""" + + +def channel_to_descriptor(channel: t.Union[dch.Channel, fli.FLInterface]) -> str: + """Utility method for converting a channel to a descriptor string. + + :param channel: The dragon channel to convert + :returns: The descriptor string + """ + if channel is None: + raise SmartSimError("Channel is not available to create a descriptor") + + serialized_ch = channel.serialize() + return base64.b64encode(serialized_ch).decode("utf-8") + + +def pool_to_descriptor(pool: dm.MemoryPool) -> str: + """Utility method for converting a pool to a descriptor string. + + :param pool: The memory pool to convert + :returns: The descriptor string""" + if pool is None: + raise SmartSimError("Memory pool is not available to create a descriptor") + + serialized_pool = pool.serialize() + return base64.b64encode(serialized_pool).decode("utf-8") + + +def descriptor_to_fli(descriptor: str) -> "fli.FLInterface": + """Helper method to attach a new FLI instance given + the string-encoded descriptor. + + :param descriptor: The descriptor of an FLI to attach to + :returns: The attached dragon FLI""" + descriptor_ = base64.b64decode(descriptor.encode("utf-8")) + return fli.FLInterface.attach(descriptor_) + + +def descriptor_to_channel(descriptor: str) -> dch.Channel: + """Helper method to attach a new Channel instance given + the string-encoded descriptor. + + :param descriptor: The descriptor of a channel to attach to + :returns: The attached dragon Channel""" + descriptor_ = base64.b64decode(descriptor.encode("utf-8")) + return dch.Channel.attach(descriptor_) + + +def create_local(capacity: int = 0) -> dch.Channel: + """Creates a Channel attached to the local memory pool. Replacement for + direct calls to `dch.Channel.make_process_local()` to enable + supplying a channel capacity. + + :param capacity: The number of events the channel can buffer; uses the default + buffer size `DEFAULT_CHANNEL_BUFFER_SIZE` when not supplied + :returns: The instantiated channel + :raises SmartSimError: If unable to attach local channel + """ + pool = dm.MemoryPool.attach(du.B64.str_to_bytes(dp.this_process.default_pd)) + pool_descriptor = pool_to_descriptor(pool) + channel: t.Optional[dch.Channel] = None + offset = 0 + + global LAST_OFFSET + if LAST_OFFSET: + offset = LAST_OFFSET + + capacity = capacity if capacity > 0 else DEFAULT_CHANNEL_BUFFER_SIZE + + while not channel: + # search for an open channel ID + offset += 1 + channel_id = df.BASE_USER_MANAGED_CUID + offset + try: + channel = dch.Channel(mem_pool=pool, c_uid=channel_id, capacity=capacity) + LAST_OFFSET = offset + descriptor = channel_to_descriptor(channel) + logger.debug( + "Local channel created: " + f"{channel_id=}, {pool_descriptor=}, {capacity=}, {descriptor=}" + ) + except dch.ChannelError as e: + if offset < 100: + logger.warning(f"Channnel id `{channel_id}` is not open. 
Retrying...") + else: + LAST_OFFSET = 0 + logger.error(f"All attempts to attach local channel have failed") + raise SmartSimError("Failed to attach local channel") from e + + return channel diff --git a/smartsim/_core/mli/infrastructure/storage/backbone_feature_store.py b/smartsim/_core/mli/infrastructure/storage/backbone_feature_store.py index 83c255fe7..f8515220f 100644 --- a/smartsim/_core/mli/infrastructure/storage/backbone_feature_store.py +++ b/smartsim/_core/mli/infrastructure/storage/backbone_feature_store.py @@ -92,17 +92,27 @@ def __init__( @property def wait_timeout(self) -> float: + """Retrieve the wait timeout for this feature store. The wait timeout is + applied to all calls to `wait_for`. + + :returns: The wait timeout (in seconds). + """ return self._wait_timeout @wait_timeout.setter def wait_timeout(self, value: float) -> None: + """Set the wait timeout (in seconds) for this feature store. The wait + timeout is applied to all calls to `wait_for`. + + :param value: The new value to set + """ self._wait_timeout = value @property def notification_channels(self) -> t.Sequence[str]: """Retrieve descriptors for all registered MLI notification channels. - :returns: The list of descriptors + :returns: The list of channel descriptors """ if self.MLI_NOTIFY_CONSUMERS in self: stored_consumers = self[self.MLI_NOTIFY_CONSUMERS] @@ -119,26 +129,26 @@ def notification_channels(self, values: t.Sequence[str]) -> None: @property def backend_channel(self) -> t.Optional[str]: - """Retrieve the channel descriptor exposed by the MLI backend for events + """Retrieve the channel descriptor exposed by the MLI backend for events. - :returns: a stringified channel descriptor""" + :returns: The channel descriptor""" if self.MLI_BACKEND_CONSUMER in self: return str(self[self.MLI_BACKEND_CONSUMER]) return None @backend_channel.setter def backend_channel(self, value: str) -> None: - """Set the channel exposed by the MLI backend for events + """Set the channel exposed by the MLI backend for events. - :param value: a stringified channel descriptor""" + :param value: The stringified channel descriptor""" self[self.MLI_BACKEND_CONSUMER] = value @property def worker_queue(self) -> t.Optional[str]: """Retrieve the channel descriptor exposed by the MLI - backend to send work to an MLI worker manager instance + backend to send work to an MLI worker manager instance. - :returns: a stringified channel descriptor""" + :returns: The channel descriptor, if found. Otherwise, `None`""" if self.MLI_WORKER_QUEUE in self: return str(self[self.MLI_WORKER_QUEUE]) return None @@ -146,18 +156,20 @@ def worker_queue(self) -> t.Optional[str]: @worker_queue.setter def worker_queue(self, value: str) -> None: """Set the channel descriptor exposed by the MLI - backend to send work to an MLI worker manager instance + backend to send work to an MLI worker manager instance. - :param value: a stringified channel descriptor""" + :param value: The channel descriptor""" self[self.MLI_WORKER_QUEUE] = value @property def creation_date(self) -> str: - """Return the creation date for the backbone feature store""" + """Return the creation date for the backbone feature store. 
+ + :returns: The string-formatted date when feature store was created""" return str(self[self._CREATED_ON]) def _record_creation_data(self) -> None: - """Write the creation timestamp to the feature store""" + """Write the creation timestamp to the feature store.""" if self._CREATED_ON not in self: if not self._allow_reserved_writes: logger.warning( @@ -180,9 +192,8 @@ def from_writable_descriptor( try: return BackboneFeatureStore(dragon_ddict.DDict.attach(descriptor), True) except Exception as ex: - logger.error(f"Error creating dragon feature store: {descriptor}") raise SmartSimError( - f"Error creating dragon feature store: {descriptor}" + f"Error creating backbone feature store: {descriptor}" ) from ex def _check_wait_timeout( @@ -568,8 +579,8 @@ def __init__( :param backbone: The MLI backbone feature store :param filters: A list of event types to deliver. when empty, all events will be delivered - :param timeout: Maximum time to wait for messages to arrive; may be overridden - on individual calls to `receive` + :param name: A user-friendly name for logging. If not provided, an + auto-generated GUID will be used :raises ValueError: If batch_timeout <= 0 """ if batch_timeout is not None and batch_timeout <= 0: diff --git a/smartsim/_core/mli/infrastructure/storage/dragon_feature_store.py b/smartsim/_core/mli/infrastructure/storage/dragon_feature_store.py index 0256b1a51..4eeeac32f 100644 --- a/smartsim/_core/mli/infrastructure/storage/dragon_feature_store.py +++ b/smartsim/_core/mli/infrastructure/storage/dragon_feature_store.py @@ -101,7 +101,6 @@ def from_descriptor( logger.debug(f"Attaching to FeatureStore with descriptor: {descriptor}") return cls(dragon_ddict.DDict.attach(descriptor)) except Exception as ex: - logger.error(f"Error creating dragon feature store: {descriptor}") raise SmartSimError( - f"Error creating dragon feature store: {descriptor}" + f"Error creating dragon feature store from descriptor: {descriptor}" ) from ex diff --git a/smartsim/log.py b/smartsim/log.py index a28112efa..c8fed9329 100644 --- a/smartsim/log.py +++ b/smartsim/log.py @@ -258,10 +258,12 @@ def log_to_file( """Installs a second filestream handler to the root logger, allowing subsequent logging calls to be sent to filename. - :param filename: the name of the desired log file. - :param log_level: as defined in get_logger. Can be specified + :param filename: The name of the desired log file. + :param log_level: As defined in get_logger. Can be specified to allow the file to store more or less verbose logging information. + :param logger: If supplied, a logger to add the file stream logging + behavior to. By default, a new logger is instantiated. 
""" if logger is None: logger = logging.getLogger("SmartSim") diff --git a/smartsim/protoclient.py b/smartsim/protoclient.py index 3e786cf05..c248300ca 100644 --- a/smartsim/protoclient.py +++ b/smartsim/protoclient.py @@ -27,7 +27,6 @@ # isort: off # pylint: disable=unused-import,import-error import dragon -from dragon import fli import dragon.channels from dragon.globalservices.api_setup import connect_to_infrastructure @@ -49,11 +48,9 @@ import numpy import torch -from smartsim._core.mli.comm.channel.dragon_channel import ( - DragonCommChannel, - create_local, -) +from smartsim._core.mli.comm.channel.dragon_channel import DragonCommChannel from smartsim._core.mli.comm.channel.dragon_fli import DragonFLIChannel +from smartsim._core.mli.comm.channel.dragon_util import create_local from smartsim._core.mli.infrastructure.storage.backbone_feature_store import ( BackboneFeatureStore, EventBroadcaster, @@ -86,16 +83,17 @@ class ProtoClient: triggering QueueFull exceptions.""" @staticmethod - def _attach_to_backbone(wait_timeout: float = 0) -> BackboneFeatureStore: + def _attach_to_backbone() -> BackboneFeatureStore: """Use the supplied environment variables to attach to a pre-existing backbone featurestore. Requires the environment to contain `_SMARTSIM_INFRA_BACKBONE` - environment variable + environment variable. - :returns: the attached backbone featurestore""" + :returns: The attached backbone featurestore + """ # todo: ensure this env var from config loader or constant descriptor = os.environ.get(BackboneFeatureStore.MLI_BACKBONE, None) - if descriptor is None: + if descriptor is None or not descriptor: raise SmartSimError( "Missing required backbone configuration in environment: " f"{BackboneFeatureStore.MLI_BACKBONE}" @@ -104,12 +102,16 @@ def _attach_to_backbone(wait_timeout: float = 0) -> BackboneFeatureStore: backbone = t.cast( BackboneFeatureStore, BackboneFeatureStore.from_descriptor(descriptor) ) - backbone.wait_timeout = wait_timeout return backbone def _attach_to_worker_queue(self) -> DragonFLIChannel: """Wait until the backbone contains the worker queue configuration, - then attach an FLI to the given worker queue""" + then attach an FLI to the given worker queue. + + :returns: The attached FLI channel + :raises: SmartSimError if the required configuration is not found in the + backbone feature store + """ descriptor = "" try: @@ -120,50 +122,36 @@ def _attach_to_worker_queue(self) -> DragonFLIChannel: descriptor = str(config[BackboneFeatureStore.MLI_WORKER_QUEUE]) except Exception as ex: logger.info( - f"Unable to rerieve {BackboneFeatureStore.MLI_WORKER_QUEUE} " + f"Unable to retrieve {BackboneFeatureStore.MLI_WORKER_QUEUE} " "to attach to the worker queue." ) - raise ValueError("Unable to locate worker queue using backbone") from ex + raise SmartSimError("Unable to locate worker queue using backbone") from ex return DragonFLIChannel.from_descriptor(descriptor) - @classmethod - def _create_worker_channels( - cls, - ) -> t.Tuple[dragon.channels.Channel, dragon.channels.Channel]: - """Create channels to be used for communication to and from the worker queue. - - :returns: A tuple containing the native from and to - Channels as (from_channel, to_channel). - """ - - _from_worker_ch_raw = create_local(cls._DEFAULT_WORK_QUEUE_SIZE) - _to_worker_ch_raw = create_local(cls._DEFAULT_WORK_QUEUE_SIZE) - - return _from_worker_ch_raw, _to_worker_ch_raw - def _create_broadcaster(self) -> EventProducer: """Create an event publisher that will broadcast updates to other MLI components. 
 
-        :returns: the event publisher instance"""
+        :returns: the event publisher instance
+        """
         broadcaster = EventBroadcaster(
             self._backbone, DragonCommChannel.from_descriptor
         )
         return broadcaster
 
     def __init__(self, timing_on: bool, wait_timeout: float = 0) -> None:
-        """Initialize the client instance
+        """Initialize the client instance.
 
         :param timing_on: Flag indicating if timing information should be
         written to file
         :param wait_timeout: Maximum wait time (in seconds) allowed to attach to the
         worker queue
-
-        :raises: SmartSimError if unable to attach to a backbone featurestore"""
-        # todo: determine a way to make this work in tests.
-        # - consider catching the import exception and defaulting rank to 0
+        :raises SmartSimError: If unable to attach to a backbone featurestore
+        """
         if MPI is not None:
+            # todo: determine a way to make MPI work in the test environment
+            # - consider catching the import exception and defaulting rank to 0
            comm = MPI.COMM_WORLD
            rank: int = comm.Get_rank()
        else:
@@ -173,12 +161,12 @@ def __init__(self, timing_on: bool, wait_timeout: float = 0) -> None:
 
         connect_to_infrastructure()
 
-        self._backbone = self._attach_to_backbone(wait_timeout=self.backbone_timeout)
+        self._backbone = self._attach_to_backbone()
+        self._backbone.wait_timeout = self.backbone_timeout
         self._to_worker_fli = self._attach_to_worker_queue()
 
-        channels = self._create_worker_channels()
-        self._from_worker_ch = channels[0]
-        self._to_worker_ch = channels[1]
+        self._from_worker_ch = create_local(self._DEFAULT_WORK_QUEUE_SIZE)
+        self._to_worker_ch = create_local(self._DEFAULT_WORK_QUEUE_SIZE)
 
         self._publisher = self._create_broadcaster()
 
@@ -199,14 +187,29 @@ def backbone_timeout(self) -> float:
         return self._backbone_timeout or self._DEFAULT_BACKBONE_TIMEOUT
 
     def _add_label_to_timings(self, label: str) -> None:
+        """Adds a new label into the timing dictionary to prepare for
+        receiving timing events.
+
+        :param label: The label to create storage for
+        """
         if label not in self._timings:
             self._timings[label] = []
 
     @staticmethod
     def _format_number(number: t.Union[numbers.Number, float]) -> str:
+        """Utility function for formatting numbers consistently for logs.
+
+        :param number: The number to convert to a formatted string
+        :returns: The formatted string containing the number
+        """
         return f"{number:0.4e}"
 
     def start_timings(self, batch_size: numbers.Number) -> None:
+        """Configure the client to begin storing timing information.
+
+        :param batch_size: The size of batches to generate as inputs
+        to the model
+        """
         if self._timing_on:
             self._add_label_to_timings("batch_size")
             self._timings["batch_size"].append(self._format_number(batch_size))
@@ -214,6 +217,7 @@ def start_timings(self, batch_size: numbers.Number) -> None:
             self._interm = time.perf_counter()
 
     def end_timings(self) -> None:
+        """Configure the client to stop storing timing information."""
         if self._timing_on and self._start is not None:
             self._add_label_to_timings("total_time")
             self._timings["total_time"].append(
@@ -221,6 +225,10 @@ def end_timings(self) -> None:
             )
 
     def measure_time(self, label: str) -> None:
+        """Measures elapsed time since the last recorded signal.
+
+        :param label: The label to measure time for
+        """
         if self._timing_on and self._interm is not None:
             self._add_label_to_timings(label)
             self._timings[label].append(
@@ -229,6 +237,11 @@ def measure_time(self, label: str) -> None:
             self._interm = time.perf_counter()
 
     def print_timings(self, to_file: bool = False) -> None:
+        """Print timing information to standard output.
+
+        :param to_file: If `True`, also saves timing information
+        to the files `timings.npy` and `timings.txt`
+        """
         print(" ".join(self._timings.keys()))
         value_array = numpy.array(self._timings.values(), dtype=float)
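Taken together, these helpers bracket a unit of work: `start_timings` opens a timing window, `measure_time` records named intervals, `end_timings` closes the window, and `print_timings` reports what was collected. A minimal usage sketch (hypothetical driver code; assumes a dragon backbone and worker queue are already running so the client can construct):

    import torch
    from smartsim.protoclient import ProtoClient

    client = ProtoClient(timing_on=True)
    batch = torch.randn((32, 2), dtype=torch.float32)

    client.start_timings(batch.shape[0])  # open the timing window
    client.measure_time("preprocess")     # record interval since the last mark
    client.end_timings()                  # append total_time and close the window
    client.print_timings(to_file=True)    # also writes timings.npy / timings.txt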
@@ -240,6 +253,14 @@
         numpy.savetxt("timings.txt", value_array)
 
     def run_model(self, model: t.Union[bytes, str], batch: torch.Tensor) -> t.Any:
+        """Execute a batch of inference requests with the supplied ML model.
+
+        :param model: The raw bytes or path to a pytorch model
+        :param batch: The tensor batch to perform inference on
+        :returns: The inference results
+        :raises ValueError: If the worker queue is not configured properly
+        in the environment variables
+        """
         tensors = [batch.numpy()]
         self.perf_timer.start_timings("batch_size", batch.shape[0])
         built_tensor_desc = MessageHandler.build_tensor_descriptor(
@@ -301,9 +322,11 @@ def run_model(self, model: t.Union[bytes, str], batch: torch.Tensor) -> t.Any:
         return result
 
     def set_model(self, key: str, model: bytes) -> None:
-        # todo: incorrect usage of backbone here to store
-        # user models? are we using the backbone if they do NOT
-        # have a feature store of their own?
+        """Write the supplied model to the feature store.
+
+        :param key: The unique key used to identify the model
+        :param model: The raw bytes of the model to execute
+        """
         self._backbone[key] = model
 
         # notify components of a change in the data at this key
diff --git a/tests/dragon/channel.py b/tests/dragon/channel.py
index efabb00c0..4c46359c2 100644
--- a/tests/dragon/channel.py
+++ b/tests/dragon/channel.py
@@ -40,9 +40,10 @@ class FileSystemCommChannel(CommChannelBase):
     """Passes messages by writing to a file"""
 
     def __init__(self, key: pathlib.Path) -> None:
-        """Initialize the FileSystemCommChannel instance
+        """Initialize the FileSystemCommChannel instance.
 
-        :param key: a path to the root directory of the feature store"""
+        :param key: a path to the root directory of the feature store
+        """
         self._lock = threading.RLock()
 
         super().__init__(key.as_posix())
@@ -57,7 +58,7 @@ def send(self, value: bytes, timeout: float = 0) -> None:
         """Send a message through the underlying communication channel.
 
         :param value: The value to send
-        :param timeout: Maximum time to wait (in seconds) for messages to send
+        :param timeout: maximum time to wait (in seconds) for messages to send
         """
         with self._lock:
             # write as text so we can add newlines as delimiters
@@ -67,11 +68,12 @@ def send(self, value: bytes, timeout: float = 0) -> None:
             logger.debug(f"FileSystemCommChannel {self._file_path} sent message")
 
     def recv(self, timeout: float = 0) -> t.List[bytes]:
-        """Receives message(s) through the underlying communication channel
+        """Receives message(s) through the underlying communication channel.
:param timeout: maximum time to wait (in seconds) for messages to arrive :returns: the received message - :raises SmartSimError: if the descriptor points to a missing file""" + :raises SmartSimError: if the descriptor points to a missing file + """ with self._lock: messages: t.List[bytes] = [] if not self._file_path.exists(): @@ -100,7 +102,7 @@ def recv(self, timeout: float = 0) -> t.List[bytes]: return messages def clear(self) -> None: - """Create an empty file for events""" + """Create an empty file for events.""" if self._file_path.exists(): self._file_path.unlink() self._file_path.touch() @@ -110,10 +112,11 @@ def from_descriptor( cls, descriptor: str, ) -> "FileSystemCommChannel": - """A factory method that creates an instance from a descriptor string + """A factory method that creates an instance from a descriptor string. :param descriptor: The descriptor that uniquely identifies the resource - :returns: An attached FileSystemCommChannel""" + :returns: An attached FileSystemCommChannel + """ try: path = pathlib.Path(descriptor) return FileSystemCommChannel(path) diff --git a/tests/dragon/test_dragon_backend.py b/tests/dragon/test_dragon_backend.py index 0631e11e6..0e16be5e2 100644 --- a/tests/dragon/test_dragon_backend.py +++ b/tests/dragon/test_dragon_backend.py @@ -128,93 +128,3 @@ def mock_event_handler(event: EventBase) -> None: # using backbone.notification_channels helper method notify_list = str(values[BackboneFeatureStore.MLI_NOTIFY_CONSUMERS]).split(",") assert new_consumer.descriptor in set(notify_list) - - -@pytest.mark.parametrize( - "content", - [ - pytest.param(b"a"), - pytest.param(b"new byte string"), - ], -) -def test_environment_loader_attach_fli(content: bytes, monkeypatch: pytest.MonkeyPatch): - """A descriptor can be stored, loaded, and reattached""" - chan = Channel.make_process_local() - queue = FLInterface(main_ch=chan) - monkeypatch.setenv( - "_SMARTSIM_REQUEST_QUEUE", du.B64.bytes_to_str(queue.serialize()) - ) - - config = EnvironmentConfigLoader( - featurestore_factory=DragonFeatureStore.from_descriptor, - callback_factory=DragonCommChannel.from_descriptor, - queue_factory=DragonFLIChannel.from_sender_supplied_descriptor, - ) - config_queue = config.get_queue() - - _ = config_queue.send(content) - - old_recv = queue.recvh() - result, _ = old_recv.recv_bytes() - assert result == content - - -def test_environment_loader_serialize_fli(monkeypatch: pytest.MonkeyPatch): - """The serialized descriptors of a loaded and unloaded - queue are the same""" - chan = Channel.make_process_local() - queue = FLInterface(main_ch=chan) - monkeypatch.setenv( - "_SMARTSIM_REQUEST_QUEUE", du.B64.bytes_to_str(queue.serialize()) - ) - - config = EnvironmentConfigLoader( - featurestore_factory=DragonFeatureStore.from_descriptor, - callback_factory=DragonCommChannel.from_descriptor, - queue_factory=DragonFLIChannel.from_descriptor, - ) - config_queue = config.get_queue() - assert config_queue._fli.serialize() == queue.serialize() - - -def test_environment_loader_flifails(monkeypatch: pytest.MonkeyPatch): - """An incorrect serialized descriptor will fails to attach""" - monkeypatch.setenv("_SMARTSIM_REQUEST_QUEUE", "randomstring") - config = EnvironmentConfigLoader( - featurestore_factory=DragonFeatureStore.from_descriptor, - callback_factory=None, - queue_factory=DragonFLIChannel.from_descriptor, - ) - - with pytest.raises(DragonFLIError): - config.get_queue() - - -def test_environment_loader_backbone_load_dfs(monkeypatch: pytest.MonkeyPatch): - """Verify the dragon feature store 
is loaded correctly by the - EnvironmentConfigLoader to demonstrate featurestore_factory correctness""" - feature_store = DragonFeatureStore(DDict()) - monkeypatch.setenv("_SMARTSIM_INFRA_BACKBONE", feature_store.descriptor) - - config = EnvironmentConfigLoader( - featurestore_factory=DragonFeatureStore.from_descriptor, - callback_factory=None, - queue_factory=None, - ) - - print(f"calling config.get_backbone: `{feature_store.descriptor}`") - - backbone = config.get_backbone() - assert backbone is not None - - -def test_environment_variables_not_set(): - """EnvironmentConfigLoader getters return None when environment - variables are not set""" - config = EnvironmentConfigLoader( - featurestore_factory=DragonFeatureStore.from_descriptor, - callback_factory=DragonCommChannel.from_descriptor, - queue_factory=DragonCommChannel.from_descriptor, - ) - assert config.get_backbone() is None - assert config.get_queue() is None diff --git a/tests/dragon/test_environment_loader.py b/tests/dragon/test_environment_loader.py index b8c2af9c0..4f45614d9 100644 --- a/tests/dragon/test_environment_loader.py +++ b/tests/dragon/test_environment_loader.py @@ -94,7 +94,9 @@ def test_environment_loader_serialize_fli(monkeypatch: pytest.MonkeyPatch): def test_environment_loader_flifails(monkeypatch: pytest.MonkeyPatch): """An incorrect serialized descriptor will fails to attach""" + monkeypatch.setenv("_SMARTSIM_REQUEST_QUEUE", "randomstring") + config = EnvironmentConfigLoader( featurestore_factory=DragonFeatureStore.from_descriptor, callback_factory=None, @@ -123,13 +125,17 @@ def test_environment_loader_backbone_load_dfs(monkeypatch: pytest.MonkeyPatch): assert backbone is not None -def test_environment_variables_not_set(): +def test_environment_variables_not_set(monkeypatch: pytest.MonkeyPatch): """EnvironmentConfigLoader getters return None when environment variables are not set""" - config = EnvironmentConfigLoader( - featurestore_factory=DragonFeatureStore.from_descriptor, - callback_factory=DragonCommChannel.from_descriptor, - queue_factory=DragonCommChannel.from_descriptor, - ) - assert config.get_backbone() is None - assert config.get_queue() is None + with monkeypatch.context() as patch: + patch.setenv("_SMARTSIM_INFRA_BACKBONE", "") + patch.setenv("_SMARTSIM_REQUEST_QUEUE", "") + + config = EnvironmentConfigLoader( + featurestore_factory=DragonFeatureStore.from_descriptor, + callback_factory=DragonCommChannel.from_descriptor, + queue_factory=DragonCommChannel.from_descriptor, + ) + assert config.get_backbone() is None + assert config.get_queue() is None diff --git a/tests/dragon/test_featurestore.py b/tests/dragon/test_featurestore.py index 434bc5eab..f59501df1 100644 --- a/tests/dragon/test_featurestore.py +++ b/tests/dragon/test_featurestore.py @@ -129,9 +129,9 @@ def test_eventconsumer_eventpublisher_integration( capp_channel = DragonCommChannel(capp_channel_) back_channel = DragonCommChannel(back_channel_) - wmgr_consumer_descriptor = wmgr_channel.descriptor_string - capp_consumer_descriptor = capp_channel.descriptor_string - back_consumer_descriptor = back_channel.descriptor_string + wmgr_consumer_descriptor = wmgr_channel.descriptor + capp_consumer_descriptor = capp_channel.descriptor + back_consumer_descriptor = back_channel.descriptor # create some consumers to receive messages wmgr_consumer = EventConsumer( diff --git a/tests/dragon/test_featurestore_integration.py b/tests/dragon/test_featurestore_integration.py index ccc63def7..fa6f99001 100644 --- 
a/tests/dragon/test_featurestore_integration.py +++ b/tests/dragon/test_featurestore_integration.py @@ -33,8 +33,8 @@ from smartsim._core.mli.comm.channel.dragon_channel import ( DEFAULT_CHANNEL_BUFFER_SIZE, DragonCommChannel, - create_local, ) +from smartsim._core.mli.comm.channel.dragon_util import create_local from smartsim._core.mli.infrastructure.storage.backbone_feature_store import ( BackboneFeatureStore, EventBroadcaster, @@ -82,17 +82,13 @@ def test_eventconsumer_eventpublisher_integration( backbone["test_dir"] = test_dir assert backbone["test_dir"] == test_dir - wmgr_channel_ = Channel.make_process_local() - capp_channel_ = Channel.make_process_local() - back_channel_ = Channel.make_process_local() - - wmgr_channel = DragonCommChannel(wmgr_channel_) - capp_channel = DragonCommChannel(capp_channel_) - back_channel = DragonCommChannel(back_channel_) + wmgr_channel = DragonCommChannel(create_local()) + capp_channel = DragonCommChannel(create_local()) + back_channel = DragonCommChannel(create_local()) - wmgr_consumer_descriptor = wmgr_channel.descriptor_string - capp_consumer_descriptor = capp_channel.descriptor_string - back_consumer_descriptor = back_channel.descriptor_string + wmgr_consumer_descriptor = wmgr_channel.descriptor + capp_consumer_descriptor = capp_channel.descriptor + back_consumer_descriptor = back_channel.descriptor # create some consumers to receive messages wmgr_consumer = EventConsumer( @@ -166,18 +162,20 @@ def test_eventconsumer_max_dequeue( storage_for_dragon_fs: t.Any, ) -> None: """Verify that a consumer does not sit and collect messages indefinitely - by checking that a consumer returns after a maximum timeout is exceeded + by checking that a consumer returns after a maximum timeout is exceeded. - :param num_events: the total number of events to raise in the test - :param batch_timeout: the maximum wait time for a message to be sent. - :param storage_for_dragon_fs: the dragon storage engine to use""" + :param num_events: Total number of events to raise in the test + :param batch_timeout: Maximum wait time (in seconds) for a message to be sent + :param max_batches_expected: Maximum number of receives that should occur + :param storage_for_dragon_fs: Dragon storage engine to use + """ mock_storage = storage_for_dragon_fs backbone = BackboneFeatureStore(mock_storage, allow_reserved_writes=True) wmgr_channel_ = Channel.make_process_local() wmgr_channel = DragonCommChannel(wmgr_channel_) - wmgr_consumer_descriptor = wmgr_channel.descriptor_string + wmgr_consumer_descriptor = wmgr_channel.descriptor # create some consumers to receive messages wmgr_consumer = EventConsumer( @@ -242,7 +240,7 @@ def test_channel_buffer_size( wmgr_channel_ = create_local(buffer_size) # <--- vary buffer size wmgr_channel = DragonCommChannel(wmgr_channel_) - wmgr_consumer_descriptor = wmgr_channel.descriptor_string + wmgr_consumer_descriptor = wmgr_channel.descriptor # create a broadcaster to publish messages. 
create no consumers to
     # push the number of sent messages past the allotted buffer size
diff --git a/tests/dragon/test_protoclient.py b/tests/dragon/test_protoclient.py
index f5a55a381..4310b6de0 100644
--- a/tests/dragon/test_protoclient.py
+++ b/tests/dragon/test_protoclient.py
@@ -35,14 +35,14 @@
 dragon = pytest.importorskip("dragon")
 
 from smartsim._core.mli.comm.channel.dragon_channel import DragonCommChannel
-from smartsim._core.mli.comm.channel.dragon_fli import DragonFLIChannel, create_local
+from smartsim._core.mli.comm.channel.dragon_fli import DragonFLIChannel
+from smartsim._core.mli.comm.channel.dragon_util import create_local
 from smartsim._core.mli.infrastructure.storage.backbone_feature_store import (
     BackboneFeatureStore,
     EventBroadcaster,
     OnWriteFeatureStore,
 )
 from smartsim._core.mli.infrastructure.storage.dragon_feature_store import dragon_ddict
-from smartsim._core.mli.infrastructure.storage.feature_store import ReservedKeys
 from smartsim.error.errors import SmartSimError
 from smartsim.log import get_logger
 
@@ -56,24 +56,40 @@
 # The tests in this file belong to the dragon group
 pytestmark = pytest.mark.dragon
 
-WORK_QUEUE_KEY = "_SMARTSIM_REQUEST_QUEUE"
+WORK_QUEUE_KEY = BackboneFeatureStore.MLI_WORKER_QUEUE
 
 logger = get_logger(__name__)
 
 
 @pytest.fixture(scope="session")
 def storage_for_dragon_fs() -> t.Dict[str, str]:
-    # return dragon_ddict.DDict(1, 2, total_mem=2 * 1024**3)
+    """Fixture that creates a dragon distributed dictionary.
+
+    :returns: The created distributed dictionary
+    """
     return dragon_ddict.DDict(1, 2, 4 * 1024**2)
 
 
 @pytest.fixture(scope="session")
 def the_backbone(storage_for_dragon_fs) -> BackboneFeatureStore:
+    """Fixture that creates a dragon backbone feature store.
+
+    :param storage_for_dragon_fs: The dragon storage engine to use
+    :returns: The attached `BackboneFeatureStore`
+    """
+
    return BackboneFeatureStore(storage_for_dragon_fs, allow_reserved_writes=True)
 
 
 @pytest.fixture(scope="session")
 def the_worker_queue(the_backbone: BackboneFeatureStore) -> DragonFLIChannel:
-    """a stand-in for the worker manager so a worker queue exists"""
+    """Fixture that creates a dragon FLI channel as a stand-in for the
+    worker queue created by the worker.
+
+    :param the_backbone: The backbone feature store to update
+    with the worker queue descriptor.
+ :returns: The attached `DragonFLIChannel` + """ # create the FLI to_worker_channel = create_local() @@ -91,28 +107,13 @@ def the_worker_queue(the_backbone: BackboneFeatureStore) -> DragonFLIChannel: return comm_channel -@pytest.fixture -def storage_for_dragon_fs_with_req_queue( - storage_for_dragon_fs: t.Dict[str, str] -) -> t.Dict[str, str]: - # create a valid FLI so any call to attach does not fail - channel_ = Channel.make_process_local() - fli_ = fli.FLInterface(main_ch=channel_, manager_ch=None) - comm_channel = DragonFLIChannel(fli_, True) - - storage_for_dragon_fs[WORK_QUEUE_KEY] = comm_channel.descriptor - return storage_for_dragon_fs - - @pytest.mark.parametrize( "wait_timeout, exp_wait_max", [ # aggregate the 1+1+1 into 3 on remaining parameters - pytest.param( - 0.5, 1 + 1 + 1, id="0.5s wait, 3 cycle steps", marks=pytest.mark.skip - ), - pytest.param(2, 3 + 2, id="2s wait, 4 cycle steps", marks=pytest.mark.skip), - pytest.param(4, 3 + 2 + 4, id="4s wait, 5 cycle steps", marks=pytest.mark.skip), + pytest.param(0.5, 1 + 1 + 1, id="0.5s wait, 3 cycle steps"), + pytest.param(2, 3 + 2, id="2s wait, 4 cycle steps"), + pytest.param(4, 3 + 2 + 4, id="4s wait, 5 cycle steps"), ], ) def test_protoclient_timeout( @@ -150,11 +151,20 @@ def test_protoclient_timeout( assert elapsed < exp_wait_max, f"above expected max wait {exp_wait_max}" -def test_protoclient_initialization_no_backbone(): +def test_protoclient_initialization_no_backbone( + monkeypatch: pytest.MonkeyPatch, the_worker_queue: DragonFLIChannel +): """Verify that attempting to start the client without required environment variables - results in an exception. NOTE: Backbone env var is not set""" + results in an exception. + + :param the_worker_queue: Passing the worker queue fixture to ensure + the worker queue environment is correctly configured. + + NOTE: os.environ[BackboneFeatureStore.MLI_BACKBONE] is not set""" + + with monkeypatch.context() as patch, pytest.raises(SmartSimError) as ex: + patch.setenv(BackboneFeatureStore.MLI_BACKBONE, "") - with pytest.raises(SmartSimError) as ex: ProtoClient(timing_on=False) # confirm the missing value error has been raised diff --git a/tests/dragon/test_request_dispatcher.py b/tests/dragon/test_request_dispatcher.py index e111f8c74..b6be86177 100644 --- a/tests/dragon/test_request_dispatcher.py +++ b/tests/dragon/test_request_dispatcher.py @@ -25,11 +25,9 @@ # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. import gc -import logging import os import pathlib import subprocess as sp -import sys import time import typing as t from queue import Empty @@ -54,11 +52,9 @@ from dragon.data.ddict.ddict import DDict from dragon.managed_memory import MemoryAlloc -from smartsim._core.mli.comm.channel.dragon_channel import ( - DragonCommChannel, - create_local, -) +from smartsim._core.mli.comm.channel.dragon_channel import DragonCommChannel from smartsim._core.mli.comm.channel.dragon_fli import DragonFLIChannel +from smartsim._core.mli.comm.channel.dragon_util import create_local from smartsim._core.mli.infrastructure.control.request_dispatcher import ( RequestBatch, RequestDispatcher, diff --git a/tests/dragon/test_worker_manager.py b/tests/dragon/test_worker_manager.py index b2ddb3481..0feefdb51 100644 --- a/tests/dragon/test_worker_manager.py +++ b/tests/dragon/test_worker_manager.py @@ -1,339 +1,313 @@ -# # BSD 2-Clause License -# # -# # Copyright (c) 2021-2024, Hewlett Packard Enterprise -# # All rights reserved. 
-# # -# # Redistribution and use in source and binary forms, with or without -# # modification, are permitted provided that the following conditions are met: -# # -# # 1. Redistributions of source code must retain the above copyright notice, this -# # list of conditions and the following disclaimer. -# # -# # 2. Redistributions in binary form must reproduce the above copyright notice, -# # this list of conditions and the following disclaimer in the documentation -# # and/or other materials provided with the distribution. -# # -# # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -# # AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -# # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -# # DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE -# # FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -# # DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -# # SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -# # CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -# # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -# import io -# import logging -# import pathlib -# import time - -# import pytest - -# torch = pytest.importorskip("torch") -# dragon = pytest.importorskip("dragon") - -# import multiprocessing as mp - -# from smartsim._core.mli.infrastructure.storage.backbone_feature_store import ( -# BackboneFeatureStore, -# ) -# from smartsim._core.mli.mli_schemas.tensor.tensor_capnp import OutputDescriptor - -# try: -# mp.set_start_method("dragon") -# except Exception: -# pass - -# import os - -# import dragon.channels as dch -# import torch.nn as nn -# from dragon import fli -# from dragon.data.ddict.ddict import DDict - -# from smartsim._core.mli.comm.channel.dragon_fli import DragonFLIChannel -# from smartsim._core.mli.infrastructure.control.worker_manager import ( -# EnvironmentConfigLoader, -# WorkerManager, -# ) -# from smartsim._core.mli.infrastructure.storage.dragon_feature_store import ( -# DragonFeatureStore, -# ) -# from smartsim._core.mli.infrastructure.worker.torch_worker import TorchWorker -# from smartsim._core.mli.message_handler import MessageHandler -# from smartsim.log import get_logger - -# from .utils.channel import FileSystemCommChannel - -# logger = get_logger(__name__) -# # The tests in this file belong to the dragon group -# pytestmark = pytest.mark.dragon - - -# class MiniModel(nn.Module): -# def __init__(self): -# super().__init__() - -# self._name = "mini-model" -# self._net = torch.nn.Linear(2, 1) - -# def forward(self, input): -# return self._net(input) - -# @property -# def bytes(self) -> bytes: -# """Returns the model serialized to a byte stream""" -# buffer = io.BytesIO() -# scripted = torch.jit.trace(self._net, self.get_batch()) -# torch.jit.save(scripted, buffer) -# return buffer.getvalue() - -# @classmethod -# def get_batch(cls) -> "torch.Tensor": -# return torch.randn((100, 2), dtype=torch.float32) - - -# def create_model(model_path: pathlib.Path) -> pathlib.Path: -# """Create a simple torch model and persist to disk for -# testing purposes. 
- -# TODO: remove once unit tests are in place""" -# if not model_path.parent.exists(): -# model_path.parent.mkdir(parents=True, exist_ok=True) - -# model_path.unlink(missing_ok=True) - -# mini_model = MiniModel() -# torch.save(mini_model, model_path) - -# return model_path - - -# def load_model() -> bytes: -# """Create a simple torch model in memory for testing""" -# mini_model = MiniModel() -# return mini_model.bytes - - -# def mock_messages( -# feature_store_root_dir: pathlib.Path, -# comm_channel_root_dir: pathlib.Path, -# kill_queue: mp.Queue, -# ) -> None: -# """Mock event producer for triggering the inference pipeline""" -# feature_store_root_dir.mkdir(parents=True, exist_ok=True) -# comm_channel_root_dir.mkdir(parents=True, exist_ok=True) - -# iteration_number = 0 - -# config_loader = EnvironmentConfigLoader( -# featurestore_factory=DragonFeatureStore.from_descriptor, -# callback_factory=FileSystemCommChannel.from_descriptor, -# queue_factory=DragonFLIChannel.from_sender_supplied_descriptor, -# ) -# backbone = config_loader.get_backbone() - -# worker_queue = config_loader.get_queue() -# if worker_queue is None: -# queue_desc = config_loader._queue_descriptor -# logger.warn( -# f"FLI input queue not loaded correctly from config_loader: {queue_desc}" -# ) - -# model_key = "mini-model" -# model_bytes = load_model() -# backbone[model_key] = model_bytes - -# message_model_key = MessageHandler.build_model_key( -# model_key, backbone.descriptor -# ) - -# while True: -# if not kill_queue.empty(): -# return -# iteration_number += 1 -# time.sleep(1) -# # 1. for demo, ignore upstream and just put stuff into downstream -# # 2. for demo, only one downstream but we'd normally have to filter -# # msg content and send to the correct downstream (worker) queue -# # timestamp = time.time_ns() -# # mock_channel = test_path / f"brainstorm-{timestamp}.txt" -# # mock_channel.touch() - -# # thread - just look for key (wait for keys) -# # call checkpoint, try to get non-persistent key, it blocks -# # working set size > 1 has side-effects -# # only incurs cost when working set size has been exceeded - -# channel_key = comm_channel_root_dir / f"{iteration_number}/channel.txt" -# callback_channel = FileSystemCommChannel(pathlib.Path(channel_key)) - -# # input_key = f"my-input-{iteration_number}" -# output_key = f"my-output-{iteration_number}" - -# batch = MiniModel.get_batch() -# shape = batch.shape -# batch_bytes = batch.numpy().tobytes() -# # backbone[input_key] = batch_bytes - -# logger.debug(f"Model content: {backbone[model_key][:20]}") -# # logger.debug(f"Input content: {backbone[input_key][:20]}") - -# fsd = backbone.descriptor - -# # message_tensor_output_key = MessageHandler.build_tensor_key( -# # output_key, fsd -# # ) -# # message_tensor_input_key = MessageHandler.build_tensor_key( -# # input_key, fsd -# # ) - -# input_descriptor = MessageHandler.build_tensor_descriptor( -# "f", "float32", list(shape) -# ) - -# # output_descriptor = MessageHandler.build_output_tensor_descriptor( -# # "f", [], "float32", list(shape) -# # ) - -# # The first request is always the metadata... 
-# request = MessageHandler.build_request( -# reply_channel=callback_channel.descriptor, -# # model=message_model_key, -# model=MessageHandler.build_model(model_bytes, "mini-model", "1.0"), -# # inputs=[message_tensor_input_key], -# inputs=[input_descriptor], -# # outputs=[message_tensor_output_key], -# outputs=[], -# # output_descriptors=[output_descriptor], -# output_descriptors=[], -# custom_attributes=None, -# ) -# request_bytes = MessageHandler.serialize_request(request) -# fli: DragonFLIChannel = worker_queue - -# with fli._fli.sendh(timeout=None, stream_channel=fli._channel) as sendh: -# sendh.send_bytes(request_bytes) -# sendh.send_bytes(batch_bytes) - -# # worker_queue.send(request_bytes) -# # follow up with the actual data -# # worker_queue.send(batch_bytes) - -# logger.info("published message") - -# if iteration_number > 5: -# return - - -# def mock_mli_infrastructure_mgr(): -# config_loader = EnvironmentConfigLoader( -# featurestore_factory=DragonFeatureStore.from_descriptor, -# callback_factory=FileSystemCommChannel.from_descriptor, -# queue_factory=DragonFLIChannel.from_sender_supplied_descriptor, -# ) - -# integrated_worker = TorchWorker - -# worker_manager = WorkerManager( -# config_loader, -# integrated_worker, -# as_service=True, -# cooldown=10, -# device="cpu", -# dispatcher_queue=mp.Queue(maxsize=0), -# ) -# worker_manager.execute() - - -# @pytest.fixture -# def prepare_environment(test_dir: str) -> pathlib.Path: -# """Cleanup prior outputs to run demo repeatedly""" -# path = pathlib.Path(f"{test_dir}/workermanager.log") -# logging.basicConfig(filename=path.absolute(), level=logging.DEBUG) -# return path - - -# def test_worker_manager(prepare_environment: pathlib.Path) -> None: -# """Test the worker manager""" - -# test_path = prepare_environment -# fs_path = test_path / "feature_store" -# comm_path = test_path / "comm_store" - -# # old instantiation code start -# # to_worker_channel = dch.Channel.make_process_local() -# # to_worker_fli = fli.FLInterface(main_ch=to_worker_channel, manager_ch=None) -# # to_worker_fli_serialized = to_worker_fli.serialize() - -# # # NOTE: env vars should be set prior to instantiating EnvironmentConfigLoader -# # # or test environment may be unable to send messages w/queue -# # descriptor = base64.b64encode(to_worker_fli_serialized).decode("utf-8") -# # os.environ["_SMARTSIM_REQUEST_QUEUE"] = descriptor - -# mgr_per_node = 1 -# num_nodes = 2 -# mem_per_node = 1024**3 -# total_mem = num_nodes * mem_per_node - -# storage = DDict( -# managers_per_node=mgr_per_node, -# n_nodes=num_nodes, -# total_mem=total_mem, -# ) -# backbone = BackboneFeatureStore(storage, allow_reserved_writes=True) - -# to_worker_channel = dch.Channel.make_process_local() -# to_worker_fli = fli.FLInterface(main_ch=to_worker_channel, manager_ch=None) - -# to_worker_fli_comm_channel = DragonFLIChannel(to_worker_fli, sender_supplied=True) - -# # NOTE: env vars must be set prior to instantiating EnvironmentConfigLoader -# # or test environment may be unable to send messages w/queue -# os.environ["_SMARTSIM_REQUEST_QUEUE"] = to_worker_fli_comm_channel.descriptor -# os.environ["_SMARTSIM_INFRA_BACKBONE"] = backbone.descriptor - -# config_loader = EnvironmentConfigLoader( -# featurestore_factory=DragonFeatureStore.from_descriptor, -# callback_factory=FileSystemCommChannel.from_descriptor, -# queue_factory=DragonFLIChannel.from_sender_supplied_descriptor, -# ) -# integrated_worker_type = TorchWorker - -# worker_manager = WorkerManager( -# config_loader, -# integrated_worker_type, 
-# as_service=True, -# cooldown=5, -# device="cpu", -# dispatcher_queue=mp.Queue(maxsize=0), -# ) - -# worker_queue = config_loader.get_queue() -# if worker_queue is None: -# logger.warn( -# f"FLI input queue not loaded correctly from config_loader: {config_loader._queue_descriptor}" -# ) -# backbone.worker_queue = to_worker_fli_comm_channel.descriptor - -# # create a mock client application to populate the request queue -# kill_queue = mp.Queue() -# msg_pump = mp.Process( -# target=mock_messages, -# args=(fs_path, comm_path, kill_queue), -# ) -# msg_pump.start() - -# # create a process to execute commands -# process = mp.Process(target=mock_mli_infrastructure_mgr) - -# # let it send some messages before starting the worker manager -# msg_pump.join(timeout=5) -# process.start() -# msg_pump.join(timeout=5) -# kill_queue.put_nowait("kill!") -# process.join(timeout=5) -# msg_pump.kill() -# process.kill() +# BSD 2-Clause License +# +# Copyright (c) 2021-2024, Hewlett Packard Enterprise +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +import io +import logging +import pathlib +import time + +import pytest + +from smartsim._core.mli.comm.channel.dragon_util import create_local + +torch = pytest.importorskip("torch") +dragon = pytest.importorskip("dragon") + +import multiprocessing as mp + +from smartsim._core.mli.infrastructure.storage.backbone_feature_store import ( + BackboneFeatureStore, +) +from smartsim._core.mli.mli_schemas.tensor.tensor_capnp import OutputDescriptor + +try: + mp.set_start_method("dragon") +except Exception: + pass + +import os + +import dragon.channels as dch +import torch.nn as nn +from dragon import fli +from dragon.data.ddict.ddict import DDict + +from smartsim._core.mli.comm.channel.dragon_fli import DragonFLIChannel +from smartsim._core.mli.infrastructure.control.worker_manager import ( + EnvironmentConfigLoader, + WorkerManager, +) +from smartsim._core.mli.infrastructure.storage.dragon_feature_store import ( + DragonFeatureStore, +) +from smartsim._core.mli.infrastructure.worker.torch_worker import TorchWorker +from smartsim._core.mli.message_handler import MessageHandler +from smartsim.log import get_logger + +from .utils.channel import FileSystemCommChannel + +logger = get_logger(__name__) +# The tests in this file belong to the dragon group +pytestmark = pytest.mark.dragon + + +class MiniModel(nn.Module): + """A torch model that can be executed by the default torch worker""" + + def __init__(self): + """Initialize the model.""" + super().__init__() + + self._name = "mini-model" + self._net = torch.nn.Linear(2, 1) + + def forward(self, input): + """Execute a forward pass.""" + return self._net(input) + + @property + def bytes(self) -> bytes: + """Retrieve the serialized model + + :returns: The byte stream of the model file + """ + buffer = io.BytesIO() + scripted = torch.jit.trace(self._net, self.get_batch()) + torch.jit.save(scripted, buffer) + return buffer.getvalue() + + @classmethod + def get_batch(cls) -> "torch.Tensor": + """Generate a single batch of data with the correct + shape for inference. + + :returns: The batch as a torch tensor + """ + return torch.randn((100, 2), dtype=torch.float32) + + +def create_model(model_path: pathlib.Path) -> pathlib.Path: + """Create a simple torch model and persist to disk for + testing purposes. 
+
+    :param model_path: The path to the torch model file
+    """
+    if not model_path.parent.exists():
+        model_path.parent.mkdir(parents=True, exist_ok=True)
+
+    model_path.unlink(missing_ok=True)
+
+    mini_model = MiniModel()
+    torch.save(mini_model, model_path)
+
+    return model_path
+
+
+def load_model() -> bytes:
+    """Create a simple torch model in memory for testing."""
+    mini_model = MiniModel()
+    return mini_model.bytes
+
+
+def mock_messages(
+    feature_store_root_dir: pathlib.Path,
+    comm_channel_root_dir: pathlib.Path,
+    kill_queue: mp.Queue,
+) -> None:
+    """Mock event producer for triggering the inference pipeline"""
+    feature_store_root_dir.mkdir(parents=True, exist_ok=True)
+    comm_channel_root_dir.mkdir(parents=True, exist_ok=True)
+
+    iteration_number = 0
+
+    config_loader = EnvironmentConfigLoader(
+        featurestore_factory=DragonFeatureStore.from_descriptor,
+        callback_factory=FileSystemCommChannel.from_descriptor,
+        queue_factory=DragonFLIChannel.from_sender_supplied_descriptor,
+    )
+    backbone = config_loader.get_backbone()
+
+    worker_queue = config_loader.get_queue()
+    if worker_queue is None:
+        queue_desc = config_loader._queue_descriptor
+        logger.warning(
+            f"FLI input queue not loaded correctly from config_loader: {queue_desc}"
+        )
+
+    model_key = "mini-model"
+    model_bytes = load_model()
+    backbone[model_key] = model_bytes
+
+    while True:
+        if not kill_queue.empty():
+            return
+        iteration_number += 1
+        time.sleep(1)
+
+        channel_key = comm_channel_root_dir / f"{iteration_number}/channel.txt"
+        callback_channel = FileSystemCommChannel(pathlib.Path(channel_key))
+
+        batch = MiniModel.get_batch()
+        shape = batch.shape
+        batch_bytes = batch.numpy().tobytes()
+
+        logger.debug(f"Model content: {backbone[model_key][:20]}")
+
+        input_descriptor = MessageHandler.build_tensor_descriptor(
+            "f", "float32", list(shape)
+        )
+
+        # The first request is always the metadata...
+        request = MessageHandler.build_request(
+            reply_channel=callback_channel.descriptor,
+            model=MessageHandler.build_model(model_bytes, "mini-model", "1.0"),
+            inputs=[input_descriptor],
+            outputs=[],
+            output_descriptors=[],
+            custom_attributes=None,
+        )
+        request_bytes = MessageHandler.serialize_request(request)
+        fli: DragonFLIChannel = worker_queue
+
+        with fli._fli.sendh(timeout=None, stream_channel=fli._channel) as sendh:
+            sendh.send_bytes(request_bytes)
+            sendh.send_bytes(batch_bytes)
+
+        logger.info("published message")
+
+        if iteration_number > 5:
+            return
+
+
+def mock_mli_infrastructure_mgr() -> None:
+    """Create resources normally instantiated by the infrastructure
+    management portion of the DragonBackend
+    """
+    config_loader = EnvironmentConfigLoader(
+        featurestore_factory=DragonFeatureStore.from_descriptor,
+        callback_factory=FileSystemCommChannel.from_descriptor,
+        queue_factory=DragonFLIChannel.from_sender_supplied_descriptor,
+    )
+
+    integrated_worker = TorchWorker
+
+    worker_manager = WorkerManager(
+        config_loader,
+        integrated_worker,
+        as_service=True,
+        cooldown=10,
+        device="cpu",
+        dispatcher_queue=mp.Queue(maxsize=0),
+    )
+    worker_manager.execute()
+
+
+@pytest.fixture
+def prepare_environment(test_dir: str) -> pathlib.Path:
+    """Cleanup prior outputs to run demo repeatedly.
+
+    :param test_dir: the directory to prepare
+    :returns: The path to the log file"""
+    path = pathlib.Path(f"{test_dir}/workermanager.log")
+    logging.basicConfig(filename=path.absolute(), level=logging.DEBUG)
+    return path
+
+
+def test_worker_manager(prepare_environment: pathlib.Path) -> None:
+    """Test the worker manager.
+ + :param prepare_environment: Pass this fixture to configure + global resources before the worker manager executes + """ + + test_path = prepare_environment + fs_path = test_path / "feature_store" + comm_path = test_path / "comm_store" + + mgr_per_node = 1 + num_nodes = 2 + mem_per_node = 1024**3 + total_mem = num_nodes * mem_per_node + + storage = DDict( + managers_per_node=mgr_per_node, + n_nodes=num_nodes, + total_mem=total_mem, + ) + backbone = BackboneFeatureStore(storage, allow_reserved_writes=True) + + to_worker_channel = create_local() + to_worker_fli = fli.FLInterface(main_ch=to_worker_channel, manager_ch=None) + + to_worker_fli_comm_channel = DragonFLIChannel(to_worker_fli, sender_supplied=True) + + # NOTE: env vars must be set prior to instantiating EnvironmentConfigLoader + # or test environment may be unable to send messages w/queue + os.environ["_SMARTSIM_REQUEST_QUEUE"] = to_worker_fli_comm_channel.descriptor + os.environ["_SMARTSIM_INFRA_BACKBONE"] = backbone.descriptor + + config_loader = EnvironmentConfigLoader( + featurestore_factory=DragonFeatureStore.from_descriptor, + callback_factory=FileSystemCommChannel.from_descriptor, + queue_factory=DragonFLIChannel.from_sender_supplied_descriptor, + ) + integrated_worker_type = TorchWorker + + worker_manager = WorkerManager( + config_loader, + integrated_worker_type, + as_service=True, + cooldown=5, + device="cpu", + dispatcher_queue=mp.Queue(maxsize=0), + ) + + worker_queue = config_loader.get_queue() + if worker_queue is None: + logger.warn( + f"FLI input queue not loaded correctly from config_loader: {config_loader._queue_descriptor}" + ) + backbone.worker_queue = to_worker_fli_comm_channel.descriptor + + # create a mock client application to populate the request queue + kill_queue = mp.Queue() + msg_pump = mp.Process( + target=mock_messages, + args=(fs_path, comm_path, kill_queue), + ) + msg_pump.start() + + # create a process to execute commands + process = mp.Process(target=mock_mli_infrastructure_mgr) + + # let it send some messages before starting the worker manager + msg_pump.join(timeout=5) + process.start() + msg_pump.join(timeout=5) + kill_queue.put_nowait("kill!") + process.join(timeout=5) + msg_pump.kill() + process.kill() diff --git a/tests/dragon/utils/channel.py b/tests/dragon/utils/channel.py index 003d79400..4c46359c2 100644 --- a/tests/dragon/utils/channel.py +++ b/tests/dragon/utils/channel.py @@ -42,7 +42,8 @@ class FileSystemCommChannel(CommChannelBase): def __init__(self, key: pathlib.Path) -> None: """Initialize the FileSystemCommChannel instance. - :param key: a path to the root directory of the feature store""" + :param key: a path to the root directory of the feature store + """ self._lock = threading.RLock() super().__init__(key.as_posix()) @@ -57,7 +58,7 @@ def send(self, value: bytes, timeout: float = 0) -> None: """Send a message throuh the underlying communication channel. :param value: The value to send - :param timeout: Maximum time to wait (in seconds) for messages to send + :param timeout: maximum time to wait (in seconds) for messages to send """ with self._lock: # write as text so we can add newlines as delimiters @@ -67,11 +68,12 @@ def send(self, value: bytes, timeout: float = 0) -> None: logger.debug(f"FileSystemCommChannel {self._file_path} sent message") def recv(self, timeout: float = 0) -> t.List[bytes]: - """Receives message(s) through the underlying communication channel + """Receives message(s) through the underlying communication channel. 
:param timeout: maximum time to wait (in seconds) for messages to arrive :returns: the received message - :raises SmartSimError: if the descriptor points to a missing file""" + :raises SmartSimError: if the descriptor points to a missing file + """ with self._lock: messages: t.List[bytes] = [] if not self._file_path.exists(): @@ -100,7 +102,7 @@ def recv(self, timeout: float = 0) -> t.List[bytes]: return messages def clear(self) -> None: - """Create an empty file for events""" + """Create an empty file for events.""" if self._file_path.exists(): self._file_path.unlink() self._file_path.touch() @@ -110,10 +112,11 @@ def from_descriptor( cls, descriptor: str, ) -> "FileSystemCommChannel": - """A factory method that creates an instance from a descriptor string + """A factory method that creates an instance from a descriptor string. :param descriptor: The descriptor that uniquely identifies the resource - :returns: An attached FileSystemCommChannel""" + :returns: An attached FileSystemCommChannel + """ try: path = pathlib.Path(descriptor) return FileSystemCommChannel(path) diff --git a/tests/mli/channel.py b/tests/mli/channel.py index 1bbf159b1..4c46359c2 100644 --- a/tests/mli/channel.py +++ b/tests/mli/channel.py @@ -40,9 +40,10 @@ class FileSystemCommChannel(CommChannelBase): """Passes messages by writing to a file""" def __init__(self, key: pathlib.Path) -> None: - """Initialize the FileSystemCommChannel instance + """Initialize the FileSystemCommChannel instance. - :param key: a path to the root directory of the feature store""" + :param key: a path to the root directory of the feature store + """ self._lock = threading.RLock() super().__init__(key.as_posix()) @@ -54,7 +55,7 @@ def __init__(self, key: pathlib.Path) -> None: self._file_path.touch() def send(self, value: bytes, timeout: float = 0) -> None: - """Send a message throuh the underlying communication channel + """Send a message throuh the underlying communication channel. :param value: The value to send :param timeout: maximum time to wait (in seconds) for messages to send @@ -67,11 +68,12 @@ def send(self, value: bytes, timeout: float = 0) -> None: logger.debug(f"FileSystemCommChannel {self._file_path} sent message") def recv(self, timeout: float = 0) -> t.List[bytes]: - """Receives message(s) through the underlying communication channel + """Receives message(s) through the underlying communication channel. :param timeout: maximum time to wait (in seconds) for messages to arrive :returns: the received message - :raises SmartSimError: if the descriptor points to a missing file""" + :raises SmartSimError: if the descriptor points to a missing file + """ with self._lock: messages: t.List[bytes] = [] if not self._file_path.exists(): @@ -100,7 +102,7 @@ def recv(self, timeout: float = 0) -> t.List[bytes]: return messages def clear(self) -> None: - """Create an empty file for events""" + """Create an empty file for events.""" if self._file_path.exists(): self._file_path.unlink() self._file_path.touch() @@ -110,10 +112,11 @@ def from_descriptor( cls, descriptor: str, ) -> "FileSystemCommChannel": - """A factory method that creates an instance from a descriptor string + """A factory method that creates an instance from a descriptor string. 
 
         :param descriptor: The descriptor that uniquely identifies the resource
-        :returns: An attached FileSystemCommChannel"""
+        :returns: An attached FileSystemCommChannel
+        """
         try:
             path = pathlib.Path(descriptor)
             return FileSystemCommChannel(path)

From f5b7b7d3ada9e48cf350f9f93e27df088ea1f927 Mon Sep 17 00:00:00 2001
From: Chris McBride
Date: Wed, 25 Sep 2024 18:07:55 -0500
Subject: [PATCH 10/40] fix infinite loop bug in consumer batch receive

---
 .../storage/backbone_feature_store.py         | 72 ++++++++++--------
 tests/dragon/test_featurestore_integration.py | 73 +++++++++++--------
 2 files changed, 85 insertions(+), 60 deletions(-)
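This change moves batch control into `recv` itself: each call now drains the channel for at most `batch_timeout` seconds instead of relying on consumer-wide timeout state, which could leave the receive loop spinning. A sketch of the intended polling pattern (the `consumer` wiring and `handle` callback are assumed, not part of this patch):

    # assumes `consumer` is an EventConsumer attached to a channel and backbone
    while True:
        events = consumer.recv(timeout=0.1, batch_timeout=1.0)
        if not events:
            break  # batch window closed with nothing received
        for event in events:
            handle(event)  # hypothetical application callback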
diff --git a/smartsim/_core/mli/infrastructure/storage/backbone_feature_store.py b/smartsim/_core/mli/infrastructure/storage/backbone_feature_store.py
index f8515220f..d247c8952 100644
--- a/smartsim/_core/mli/infrastructure/storage/backbone_feature_store.py
+++ b/smartsim/_core/mli/infrastructure/storage/backbone_feature_store.py
@@ -569,7 +569,6 @@ def __init__(
         comm_channel: CommChannelBase,
         backbone: BackboneFeatureStore,
         filters: t.Optional[t.List[EventCategory]] = None,
-        batch_timeout: t.Optional[float] = None,
         name: t.Optional[str] = None,
         event_handler: t.Optional[t.Callable[[EventBase], None]] = None,
     ) -> None:
@@ -583,13 +582,9 @@ def __init__(
         auto-generated GUID will be used
-        :raises ValueError: If batch_timeout <= 0
         """
-        if batch_timeout is not None and batch_timeout <= 0:
-            raise ValueError("batch_timeout must be a non-zero, positive value")
-
         self._comm_channel = comm_channel
         self._backbone = backbone
         self._global_filters = filters or []
-        self._global_timeout = batch_timeout or 1.0
         self._name = name
         self._event_handler = event_handler
@@ -612,50 +607,67 @@ def name(self) -> str:
         return self._name
 
     def recv(
-        self, filters: t.Optional[t.List[EventCategory]] = None, timeout: float = 0.001
+        self,
+        filters: t.Optional[t.List[EventCategory]] = None,
+        timeout: float = 0.001,
+        batch_timeout: float = 1.0,
     ) -> t.List[EventBase]:
         """Receives available published event(s).
 
         :param filters: Additional filters to add to the global filters configured
         on the EventConsumer instance
-        :param timeout: Maximum time to wait for messages to arrive
+        :param timeout: Maximum time to wait for a single message to arrive
+        :param batch_timeout: Maximum time to wait for messages to arrive; allows
+        multiple batches to be retrieved in one call to `recv`
         :returns: A list of events that pass any configured filters
+        :raises ValueError: If batch_timeout is not a positive value
         """
         if filters is None:
             filters = []
 
+        if batch_timeout <= 0:
+            raise ValueError("batch_timeout must be a non-zero, positive value")
+
         filter_set = {*self._global_filters, *filters}
-        messages: t.List[t.Any] = []
+        all_message_bytes: t.List[bytes] = []
 
-        # use the local timeout to override a global setting
-        start_at = time.time_ns()
+        # firehose as many messages as possible within the batch_timeout
+        start_at = time.time()
+        remaining = batch_timeout
 
-        while msg_bytes_list := self._comm_channel.recv(timeout=timeout):
+        batch_message_bytes = self._comm_channel.recv(timeout=timeout)
+        while batch_message_bytes:
             # remove any empty messages that will fail to decode
-            msg_bytes_list = [msg for msg in msg_bytes_list if msg]
+            all_message_bytes.extend(batch_message_bytes)
+            batch_message_bytes = []
+
+            # avoid getting stuck indefinitely waiting for the channel
+            elapsed = time.time() - start_at
+            remaining = batch_timeout - elapsed
 
-            msg: t.Optional[EventBase] = None
-            if msg_bytes_list:
-                for message in msg_bytes_list:
-                    msg = pickle.loads(message)
+            if remaining > 0:
+                batch_message_bytes = self._comm_channel.recv(timeout=timeout)
 
-                    if not msg:
-                        logger.warning("Unable to unpickle message")
-                        continue
+        events_received: t.List[EventBase] = []
 
-            # ignore anything that doesn't match a filter (if one is
-            # supplied), otherwise return everything
-            if not filter_set or msg.category in filter_set:
-                messages.append(msg)
+        # Timeout elapsed or no messages received - return the empty list
+        if not all_message_bytes:
+            return events_received
 
-            # avoid getting stuck indefinitely waiting for the channel
-            elapsed = (time.time_ns() - start_at) / 1000000000
-            remaining = elapsed - self._global_timeout
-            if remaining > 0:
-                logger.debug(f"Consumer batch timeout exceeded by: {abs(remaining)}")
-                break
+        for message in all_message_bytes:
+            if not message:
+                continue
+
+            event = pickle.loads(message)
+            if not event:
+                logger.warning("Unable to unpickle message")
+                continue
+
+            # skip events that don't pass a filter
+            if filter_set and event.category not in filter_set:
+                continue
+
+            events_received.append(event)
 
-        return messages
+        return events_received
 
     def register(self) -> None:
         """Send an event to register this consumer as a listener"""
diff --git a/tests/dragon/test_featurestore_integration.py b/tests/dragon/test_featurestore_integration.py
index fa6f99001..0c8679224 100644
--- a/tests/dragon/test_featurestore_integration.py
+++ b/tests/dragon/test_featurestore_integration.py
@@ -58,11 +58,23 @@
 pytestmark = pytest.mark.dragon
 
 
-@pytest.fixture
+@pytest.fixture(scope="session")
 def storage_for_dragon_fs() -> t.Dict[str, str]:
     return dragon_ddict.DDict()
 
 
+@pytest.fixture(scope="session")
+def the_worker_channel() -> DragonCommChannel:
+    wmgr_channel_ = create_local()
+    wmgr_channel = DragonCommChannel(wmgr_channel_)
+    return wmgr_channel
+
+
+@pytest.fixture(scope="session")
+def the_backbone(storage_for_dragon_fs: t.Any) -> BackboneFeatureStore:
+    return BackboneFeatureStore(storage_for_dragon_fs, allow_reserved_writes=True)
+
+
 def 
test_eventconsumer_eventpublisher_integration( storage_for_dragon_fs: t.Any, test_dir: str ) -> None: @@ -147,19 +159,21 @@ def test_eventconsumer_eventpublisher_integration( @pytest.mark.parametrize( - "num_events, batch_timeout", + "num_events, batch_timeout, max_batches_expected", [ - pytest.param(1, 1.0, id="under 1s timeout"), - pytest.param(20, 1.0, id="test 1s timeout w/20"), - pytest.param(50, 1.0, id="test 1s timeout w/50"), - pytest.param(60, 0.1, id="small batches"), - pytest.param(100, 0.1, id="many small batches"), + pytest.param(1, 1.0, 2, id="under 1s timeout"), + pytest.param(20, 1.0, 3, id="test 1s timeout 20x"), + pytest.param(30, 0.2, 5, id="test 0.2s timeout 30x"), + pytest.param(60, 0.4, 4, id="small batches"), + pytest.param(100, 0.1, 10, id="many small batches"), ], ) def test_eventconsumer_max_dequeue( num_events: int, batch_timeout: float, - storage_for_dragon_fs: t.Any, + max_batches_expected: int, + the_worker_channel: DragonCommChannel, + the_backbone: BackboneFeatureStore, ) -> None: """Verify that a consumer does not sit and collect messages indefinitely by checking that a consumer returns after a maximum timeout is exceeded. @@ -170,57 +184,56 @@ def test_eventconsumer_max_dequeue( :param storage_for_dragon_fs: Dragon storage engine to use """ - mock_storage = storage_for_dragon_fs - backbone = BackboneFeatureStore(mock_storage, allow_reserved_writes=True) - - wmgr_channel_ = Channel.make_process_local() - wmgr_channel = DragonCommChannel(wmgr_channel_) - wmgr_consumer_descriptor = wmgr_channel.descriptor - # create some consumers to receive messages wmgr_consumer = EventConsumer( - wmgr_channel, - backbone, + the_worker_channel, + the_backbone, filters=[EventCategory.FEATURE_STORE_WRITTEN], - batch_timeout=batch_timeout, ) # create a broadcaster to publish messages mock_client_app = EventBroadcaster( - backbone, + the_backbone, channel_factory=DragonCommChannel.from_descriptor, ) # register all of the consumers even though the OnCreateConsumer really should # trigger its registration. event processing is tested elsewhere. 
-    backbone.notification_channels = [wmgr_consumer_descriptor]
+    the_backbone.notification_channels = [the_worker_channel.descriptor]

     # simulate the app updating a model a lot of times
     for key in (f"key-{i}" for i in range(num_events)):
-        event = OnWriteFeatureStore(backbone.descriptor, key)
-        mock_client_app.send(event, timeout=0.1)
+        event = OnWriteFeatureStore(the_backbone.descriptor, key)
+        mock_client_app.send(event, timeout=0.01)

     num_dequeued = 0
+    num_batches = 0

-    while wmgr_messages := wmgr_consumer.recv(timeout=0.01):
+    while wmgr_messages := wmgr_consumer.recv(
+        timeout=0.1,
+        batch_timeout=batch_timeout,
+    ):
         # worker manager should not receive more events than were sent
         num_dequeued += len(wmgr_messages)
+        num_batches += 1

     # make sure we made all the expected dequeue calls and got everything
     assert num_dequeued == num_events
+    assert num_batches > 0
+    assert num_batches < max_batches_expected, "too many recv calls were made"


 @pytest.mark.parametrize(
     "buffer_size",
     [
-        pytest.param(-1, id="use default: 500"),
-        pytest.param(0, id="use default: 500"),
+        pytest.param(-1, id="replace negative, default to 500"),
+        pytest.param(0, id="replace zero, default to 500"),
         pytest.param(1, id="non-zero buffer size: 1"),
-        pytest.param(500, id="buffer size: 500"),
-        pytest.param(800, id="buffer size: 800"),
+        pytest.param(550, id="larger than default: 550"),
+        pytest.param(800, id="much larger than default: 800"),
         pytest.param(
             1000,
-            id="buffer size: 1000, unreliable in dragon-v0.10",
+            id="very large buffer: 1000, unreliable in dragon-v0.10",
             marks=pytest.mark.skip,
         ),
     ],
 )
@@ -261,8 +274,8 @@ def test_channel_buffer_size(
     # simulate the app updating a model a lot of times
     for key in (f"key-{i}" for i in range(buffer_size)):
         event = OnWriteFeatureStore(backbone.descriptor, key)
-        mock_client_app.send(event, timeout=0.1)
+        mock_client_app.send(event, timeout=0.01)

     # adding 1 more over the configured buffer size should report the error
     with pytest.raises(Exception) as ex:
-        mock_client_app.send(event, timeout=0.1)
+        mock_client_app.send(event, timeout=0.01)
From bd037112bd848102e2d538a17fd74eddd1b0c046 Mon Sep 17 00:00:00 2001
From: Chris McBride
Date: Wed, 25 Sep 2024 18:18:39 -0500
Subject: [PATCH 11/40] Sort imports to solve dragon import issue in non-dragon
 tests

---
 tests/dragon/test_dragon_backend.py | 24 ++++++++----------------
 tests/dragon/test_worker_manager.py |  3 +--
 2 files changed, 9 insertions(+), 18 deletions(-)

diff --git a/tests/dragon/test_dragon_backend.py b/tests/dragon/test_dragon_backend.py
index 0e16be5e2..dc2aceeaa 100644
--- a/tests/dragon/test_dragon_backend.py
+++ b/tests/dragon/test_dragon_backend.py
@@ -30,30 +30,22 @@

 import pytest

-from smartsim._core.launcher.dragon.dragonBackend import DragonBackend, NodePrioritizer
-from smartsim._core.mli.infrastructure.storage.backbone_feature_store import (
-    BackboneFeatureStore,
-    EventBase,
-    EventBroadcaster,
-    EventConsumer,
-    EventSender,
-    OnCreateConsumer,
-)
-from smartsim.log import get_logger
-
 dragon = pytest.importorskip("dragon")

-import dragon.utils as du
 from dragon.channels import Channel
 from dragon.data.ddict.ddict import DDict
 from dragon.fli import DragonFLIError, FLInterface

+from smartsim._core.launcher.dragon.dragonBackend import DragonBackend
 from smartsim._core.mli.comm.channel.dragon_channel import DragonCommChannel
-from smartsim._core.mli.comm.channel.dragon_fli import DragonFLIChannel
-from smartsim._core.mli.infrastructure.environment_loader import EnvironmentConfigLoader
-from 
smartsim._core.mli.infrastructure.storage.dragon_feature_store import ( - DragonFeatureStore, +from smartsim._core.mli.infrastructure.storage.backbone_feature_store import ( + BackboneFeatureStore, + EventBase, + EventBroadcaster, + EventConsumer, + OnCreateConsumer, ) +from smartsim.log import get_logger # The tests in this file belong to the dragon group pytestmark = pytest.mark.dragon diff --git a/tests/dragon/test_worker_manager.py b/tests/dragon/test_worker_manager.py index 0feefdb51..69d962940 100644 --- a/tests/dragon/test_worker_manager.py +++ b/tests/dragon/test_worker_manager.py @@ -31,8 +31,6 @@ import pytest -from smartsim._core.mli.comm.channel.dragon_util import create_local - torch = pytest.importorskip("torch") dragon = pytest.importorskip("dragon") @@ -56,6 +54,7 @@ from dragon.data.ddict.ddict import DDict from smartsim._core.mli.comm.channel.dragon_fli import DragonFLIChannel +from smartsim._core.mli.comm.channel.dragon_util import create_local from smartsim._core.mli.infrastructure.control.worker_manager import ( EnvironmentConfigLoader, WorkerManager, From 98260f3eadd6184c53d5c1a04ddd2f3a2a999241 Mon Sep 17 00:00:00 2001 From: Chris McBride Date: Wed, 25 Sep 2024 19:28:05 -0500 Subject: [PATCH 12/40] swap session scopes to module to destroy dragon resources --- tests/dragon/test_featurestore_integration.py | 6 +++--- tests/dragon/test_protoclient.py | 6 +++--- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/tests/dragon/test_featurestore_integration.py b/tests/dragon/test_featurestore_integration.py index 0c8679224..091610592 100644 --- a/tests/dragon/test_featurestore_integration.py +++ b/tests/dragon/test_featurestore_integration.py @@ -58,19 +58,19 @@ pytestmark = pytest.mark.dragon -@pytest.fixture(scope="session") +@pytest.fixture(scope="module") def storage_for_dragon_fs() -> t.Dict[str, str]: return dragon_ddict.DDict() -@pytest.fixture(scope="session") +@pytest.fixture(scope="module") def the_worker_channel() -> DragonCommChannel: wmgr_channel_ = create_local() wmgr_channel = DragonCommChannel(wmgr_channel_) return wmgr_channel -@pytest.fixture(scope="session") +@pytest.fixture(scope="module") def the_backbone(storage_for_dragon_fs: t.Any) -> BackboneFeatureStore: return BackboneFeatureStore(storage_for_dragon_fs, allow_reserved_writes=True) diff --git a/tests/dragon/test_protoclient.py b/tests/dragon/test_protoclient.py index 4310b6de0..c758ce971 100644 --- a/tests/dragon/test_protoclient.py +++ b/tests/dragon/test_protoclient.py @@ -60,7 +60,7 @@ logger = get_logger(__name__) -@pytest.fixture(scope="session") +@pytest.fixture(scope="module") def storage_for_dragon_fs() -> t.Dict[str, str]: """Fixture that creates a dragon distributed dictionary. @@ -69,7 +69,7 @@ def storage_for_dragon_fs() -> t.Dict[str, str]: return dragon_ddict.DDict(1, 2, 4 * 1024**2) -@pytest.fixture(scope="session") +@pytest.fixture(scope="module") def the_backbone(storage_for_dragon_fs) -> BackboneFeatureStore: """Fixture that creates a dragon backbone feature store. @@ -81,7 +81,7 @@ def the_backbone(storage_for_dragon_fs) -> BackboneFeatureStore: return BackboneFeatureStore(storage_for_dragon_fs, allow_reserved_writes=True) -@pytest.fixture(scope="session") +@pytest.fixture(scope="module") def the_worker_queue(the_backbone: BackboneFeatureStore) -> DragonFLIChannel: """Fixture that creates a dragon FLI channel as a stand-in for the worker queue created by the worker. 
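
The scope swaps in the patch above (and the fixture reuse for speed in the next
patch) rely on standard pytest fixture semantics: a `module`-scoped fixture is
created once per test module and finalized when that module's tests complete,
so dragon-backed resources are released between modules instead of living for
the entire session. A minimal sketch of the pattern, using a plain dict as a
stand-in for a dragon `DDict` (all names here are illustrative, not taken from
the patches):

import pytest


@pytest.fixture(scope="module")
def shared_resource() -> dict:
    # setup runs once per test module, not once per test function
    resource = {"connected": True}  # stand-in for an expensive dragon resource
    yield resource
    # teardown runs after the last test in the module, releasing the
    # resource before the next module's fixtures are created
    resource["connected"] = False


def test_first_use(shared_resource: dict) -> None:
    assert shared_resource["connected"]


def test_reuse_same_instance(shared_resource: dict) -> None:
    # the same object is reused within the module, which is what makes
    # module scope faster than function scope for costly setup
    assert shared_resource["connected"]
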
From 5ba2a4216a328b6712a73adaebdc056d7c36e8cc Mon Sep 17 00:00:00 2001 From: Chris McBride Date: Thu, 26 Sep 2024 12:46:46 -0500 Subject: [PATCH 13/40] use make process local to avoid MPI issue, fix some test regressions, reuse some test fixtures for speed --- .../standalone_worker_manager.py | 9 +- .../_core/mli/comm/channel/dragon_util.py | 75 ++++--- .../storage/backbone_feature_store.py | 4 +- tests/dragon/test_environment_loader.py | 8 +- tests/dragon/test_error_handling.py | 48 ++-- tests/dragon/test_featurestore.py | 207 +++++++++++------- tests/dragon/test_featurestore_base.py | 12 +- tests/dragon/test_featurestore_integration.py | 46 +++- 8 files changed, 248 insertions(+), 161 deletions(-) diff --git a/ex/high_throughput_inference/standalone_worker_manager.py b/ex/high_throughput_inference/standalone_worker_manager.py index 1d0b11055..fdef4268a 100644 --- a/ex/high_throughput_inference/standalone_worker_manager.py +++ b/ex/high_throughput_inference/standalone_worker_manager.py @@ -46,20 +46,17 @@ import argparse import base64 import multiprocessing as mp -import optparse import os -import pickle import socket -import sys import time import typing as t import cloudpickle from smartsim._core.entrypoints.service import Service -from smartsim._core.mli.comm.channel.channel import CommChannelBase from smartsim._core.mli.comm.channel.dragon_channel import DragonCommChannel from smartsim._core.mli.comm.channel.dragon_fli import DragonFLIChannel +from smartsim._core.mli.comm.channel.dragon_util import create_local from smartsim._core.mli.infrastructure.control.request_dispatcher import ( RequestDispatcher, ) @@ -71,8 +68,6 @@ from smartsim._core.mli.infrastructure.storage.dragon_feature_store import ( DragonFeatureStore, ) -from smartsim._core.mli.infrastructure.storage.feature_store import ReservedKeys -from smartsim._core.mli.infrastructure.worker.worker import MachineLearningWorkerBase from smartsim.log import get_logger logger = get_logger("Worker Manager Entry Point") @@ -144,7 +139,7 @@ def service_as_dragon_proc( backbone = BackboneFeatureStore.from_descriptor(ddict_str) - to_worker_channel = Channel.make_process_local() + to_worker_channel = create_local() to_worker_fli = fli.FLInterface(main_ch=to_worker_channel, manager_ch=None) to_worker_fli_comm_ch = DragonFLIChannel(to_worker_fli, True) diff --git a/smartsim/_core/mli/comm/channel/dragon_util.py b/smartsim/_core/mli/comm/channel/dragon_util.py index 2980dc9a6..014e9c0a4 100644 --- a/smartsim/_core/mli/comm/channel/dragon_util.py +++ b/smartsim/_core/mli/comm/channel/dragon_util.py @@ -49,7 +49,7 @@ def channel_to_descriptor(channel: t.Union[dch.Channel, fli.FLInterface]) -> str: - """Utility method for converting a channel to a descriptor string. + """Convert a dragon channel to a descriptor string. :param channel: The dragon channel to convert :returns: The descriptor string @@ -62,7 +62,7 @@ def channel_to_descriptor(channel: t.Union[dch.Channel, fli.FLInterface]) -> str def pool_to_descriptor(pool: dm.MemoryPool) -> str: - """Utility method for converting a pool to a descriptor string. + """Convert a dragon memory pool to a descriptor string. :param pool: The memory pool to convert :returns: The descriptor string""" @@ -74,7 +74,7 @@ def pool_to_descriptor(pool: dm.MemoryPool) -> str: def descriptor_to_fli(descriptor: str) -> "fli.FLInterface": - """Helper method to attach a new FLI instance given + """Create and attach a new FLI instance given the string-encoded descriptor. 
:param descriptor: The descriptor of an FLI to attach to
@@ -84,7 +84,7 @@ def descriptor_to_fli(descriptor: str) -> "fli.FLInterface":


 def descriptor_to_channel(descriptor: str) -> dch.Channel:
-    """Helper method to attach a new Channel instance given
+    """Create and attach a new Channel instance given
     the string-encoded descriptor.

     :param descriptor: The descriptor of a channel to attach to
@@ -93,7 +93,7 @@
     return dch.Channel.attach(descriptor_)


-def create_local(capacity: int = 0) -> dch.Channel:
+def create_local(_capacity: int = 0) -> dch.Channel:
     """Creates a Channel attached to the local memory pool. Replacement for
     direct calls to `dch.Channel.make_process_local()` to enable
     supplying a channel capacity.
@@ -103,35 +103,38 @@
     :returns: The instantiated channel
     :raises SmartSimError: If unable to attach local channel
     """
-    pool = dm.MemoryPool.attach(du.B64.str_to_bytes(dp.this_process.default_pd))
-    pool_descriptor = pool_to_descriptor(pool)
-    channel: t.Optional[dch.Channel] = None
-    offset = 0
-
-    global LAST_OFFSET
-    if LAST_OFFSET:
-        offset = LAST_OFFSET
-
-    capacity = capacity if capacity > 0 else DEFAULT_CHANNEL_BUFFER_SIZE
-
-    while not channel:
-        # search for an open channel ID
-        offset += 1
-        channel_id = df.BASE_USER_MANAGED_CUID + offset
-        try:
-            channel = dch.Channel(mem_pool=pool, c_uid=channel_id, capacity=capacity)
-            LAST_OFFSET = offset
-            descriptor = channel_to_descriptor(channel)
-            logger.debug(
-                "Local channel created: "
-                f"{channel_id=}, {pool_descriptor=}, {capacity=}, {descriptor=}"
-            )
-        except dch.ChannelError as e:
-            if offset < 100:
-                logger.warning(f"Channel id `{channel_id}` is not open. Retrying...")
-            else:
-                LAST_OFFSET = 0
-                logger.error(f"All attempts to attach local channel have failed")
-                raise SmartSimError("Failed to attach local channel") from e
-
+    # current implementation has a bug wrt MPI that must be fixed.
+    # falling back to `make_process_local` and disabling buffer size tests
+
+    # pool = dm.MemoryPool.attach(du.B64.str_to_bytes(dp.this_process.default_pd))
+    # pool_descriptor = pool_to_descriptor(pool)
+    # channel: t.Optional[dch.Channel] = None
+    # offset = 0
+
+    # global LAST_OFFSET
+    # if LAST_OFFSET:
+    #     offset = LAST_OFFSET
+
+    # capacity = capacity if capacity > 0 else DEFAULT_CHANNEL_BUFFER_SIZE
+
+    # while not channel:
+    #     # search for an open channel ID
+    #     offset += 1
+    #     channel_id = df.BASE_USER_MANAGED_CUID + offset
+    #     try:
+    #         channel = dch.Channel(mem_pool=pool, c_uid=channel_id, capacity=capacity)
+    #         LAST_OFFSET = offset
+    #         descriptor = channel_to_descriptor(channel)
+    #         logger.debug(
+    #             "Local channel created: "
+    #             f"{channel_id=}, {pool_descriptor=}, {capacity=}, {descriptor=}"
+    #         )
+    #     except dch.ChannelError as e:
+    #         if offset < 100:
+    #             logger.warning(f"Channel id `{channel_id}` is not open. 
Retrying...") + # else: + # LAST_OFFSET = 0 + # logger.error(f"All attempts to attach local channel have failed") + # raise SmartSimError("Failed to attach local channel") from e + channel = dch.Channel.make_process_local() return channel diff --git a/smartsim/_core/mli/infrastructure/storage/backbone_feature_store.py b/smartsim/_core/mli/infrastructure/storage/backbone_feature_store.py index d247c8952..1542f3811 100644 --- a/smartsim/_core/mli/infrastructure/storage/backbone_feature_store.py +++ b/smartsim/_core/mli/infrastructure/storage/backbone_feature_store.py @@ -548,16 +548,16 @@ def send(self, event: EventBase, timeout: float = 0.001) -> int: :param timeout: Maximum time to wait (in seconds) for messages to send :returns: The number of events successfully published :raises ValueError: If event serialization fails + :raises AttributeError: If event cannot be serialized :raises KeyError: If channel fails to attach using registered descriptors :raises SmartSimError: If any unexpected error occurs during send """ try: self._save_to_buffer(event) return self._broadcast(timeout) - except (KeyError, ValueError, SmartSimError): + except (KeyError, ValueError, AttributeError, SmartSimError): raise except Exception as ex: - logger.exception("An unexpected exception occurred while sending") raise SmartSimError("An unexpected failure occurred while sending") from ex diff --git a/tests/dragon/test_environment_loader.py b/tests/dragon/test_environment_loader.py index 4f45614d9..47e75109a 100644 --- a/tests/dragon/test_environment_loader.py +++ b/tests/dragon/test_environment_loader.py @@ -29,12 +29,12 @@ dragon = pytest.importorskip("dragon") import dragon.utils as du -from dragon.channels import Channel from dragon.data.ddict.ddict import DDict -from dragon.fli import DragonFLIError, FLInterface +from dragon.fli import FLInterface from smartsim._core.mli.comm.channel.dragon_channel import DragonCommChannel from smartsim._core.mli.comm.channel.dragon_fli import DragonFLIChannel +from smartsim._core.mli.comm.channel.dragon_util import create_local from smartsim._core.mli.infrastructure.environment_loader import EnvironmentConfigLoader from smartsim._core.mli.infrastructure.storage.dragon_feature_store import ( DragonFeatureStore, @@ -54,7 +54,7 @@ ) def test_environment_loader_attach_fli(content: bytes, monkeypatch: pytest.MonkeyPatch): """A descriptor can be stored, loaded, and reattached""" - chan = Channel.make_process_local() + chan = create_local() queue = FLInterface(main_ch=chan) monkeypatch.setenv( "_SMARTSIM_REQUEST_QUEUE", du.B64.bytes_to_str(queue.serialize()) @@ -77,7 +77,7 @@ def test_environment_loader_attach_fli(content: bytes, monkeypatch: pytest.Monke def test_environment_loader_serialize_fli(monkeypatch: pytest.MonkeyPatch): """The serialized descriptors of a loaded and unloaded queue are the same""" - chan = Channel.make_process_local() + chan = create_local() queue = FLInterface(main_ch=chan) monkeypatch.setenv( "_SMARTSIM_REQUEST_QUEUE", du.B64.bytes_to_str(queue.serialize()) diff --git a/tests/dragon/test_error_handling.py b/tests/dragon/test_error_handling.py index b0934b6f5..1e659168b 100644 --- a/tests/dragon/test_error_handling.py +++ b/tests/dragon/test_error_handling.py @@ -33,7 +33,6 @@ import multiprocessing as mp -import dragon.utils as du from dragon.channels import Channel from dragon.data.ddict.ddict import DDict from dragon.fli import FLInterface @@ -41,7 +40,7 @@ from smartsim._core.mli.comm.channel.channel import CommChannelBase from 
smartsim._core.mli.comm.channel.dragon_fli import DragonFLIChannel -from smartsim._core.mli.infrastructure.control.device_manager import WorkerDevice +from smartsim._core.mli.comm.channel.dragon_util import create_local from smartsim._core.mli.infrastructure.control.request_dispatcher import ( RequestDispatcher, ) @@ -62,7 +61,6 @@ ExecuteResult, FetchInputResult, FetchModelResult, - InferenceReply, InferenceRequest, LoadModelResult, MachineLearningWorkerBase, @@ -80,14 +78,26 @@ pytestmark = pytest.mark.dragon -@pytest.fixture +@pytest.fixture(scope="module") +def the_worker_channel() -> DragonFLIChannel: + """Fixture to create a valid descriptor for a worker channel + that can be attached to. + + NOTE: using module scoped fixtures drastically improves test run-time""" + channel_ = create_local() + fli_ = FLInterface(main_ch=channel_, manager_ch=None) + comm_channel = DragonFLIChannel(fli_, True) + return comm_channel + + +@pytest.fixture(scope="module") def backbone_descriptor() -> str: # create a shared backbone featurestore feature_store = DragonFeatureStore(DDict()) return feature_store.descriptor -@pytest.fixture +@pytest.fixture(scope="module") def app_feature_store() -> FeatureStore: # create a standalone feature store to mimic a user application putting # data into an application-owned resource (app should not access backbone) @@ -101,14 +111,11 @@ def setup_worker_manager_model_bytes( monkeypatch: pytest.MonkeyPatch, backbone_descriptor: str, app_feature_store: FeatureStore, + the_worker_channel: DragonFLIChannel, ): integrated_worker_type = IntegratedTorchWorker - chan = Channel.make_process_local() - queue = FLInterface(main_ch=chan) - monkeypatch.setenv( - "_SMARTSIM_REQUEST_QUEUE", du.B64.bytes_to_str(queue.serialize()) - ) + monkeypatch.setenv("_SMARTSIM_REQUEST_QUEUE", the_worker_channel.descriptor) # Put backbone descriptor into env var for the `EnvironmentConfigLoader` monkeypatch.setenv("_SMARTSIM_INFRA_BACKBONE", backbone_descriptor) @@ -160,14 +167,11 @@ def setup_worker_manager_model_key( monkeypatch: pytest.MonkeyPatch, backbone_descriptor: str, app_feature_store: FeatureStore, + the_worker_channel: DragonFLIChannel, ): integrated_worker_type = IntegratedTorchWorker - chan = Channel.make_process_local() - queue = FLInterface(main_ch=chan) - monkeypatch.setenv( - "_SMARTSIM_REQUEST_QUEUE", du.B64.bytes_to_str(queue.serialize()) - ) + monkeypatch.setenv("_SMARTSIM_REQUEST_QUEUE", the_worker_channel.descriptor) # Put backbone descriptor into env var for the `EnvironmentConfigLoader` monkeypatch.setenv("_SMARTSIM_INFRA_BACKBONE", backbone_descriptor) @@ -217,14 +221,11 @@ def setup_request_dispatcher_model_bytes( monkeypatch: pytest.MonkeyPatch, backbone_descriptor: str, app_feature_store: FeatureStore, + the_worker_channel: DragonFLIChannel, ): integrated_worker_type = IntegratedTorchWorker - chan = Channel.make_process_local() - queue = FLInterface(main_ch=chan) - monkeypatch.setenv( - "_SMARTSIM_REQUEST_QUEUE", du.B64.bytes_to_str(queue.serialize()) - ) + monkeypatch.setenv("_SMARTSIM_REQUEST_QUEUE", the_worker_channel.descriptor) # Put backbone descriptor into env var for the `EnvironmentConfigLoader` monkeypatch.setenv("_SMARTSIM_INFRA_BACKBONE", backbone_descriptor) @@ -261,14 +262,11 @@ def setup_request_dispatcher_model_key( monkeypatch: pytest.MonkeyPatch, backbone_descriptor: str, app_feature_store: FeatureStore, + the_worker_channel: DragonFLIChannel, ): integrated_worker_type = IntegratedTorchWorker - chan = Channel.make_process_local() - queue = 
FLInterface(main_ch=chan) - monkeypatch.setenv( - "_SMARTSIM_REQUEST_QUEUE", du.B64.bytes_to_str(queue.serialize()) - ) + monkeypatch.setenv("_SMARTSIM_REQUEST_QUEUE", the_worker_channel.descriptor) # Put backbone descriptor into env var for the `EnvironmentConfigLoader` monkeypatch.setenv("_SMARTSIM_INFRA_BACKBONE", backbone_descriptor) diff --git a/tests/dragon/test_featurestore.py b/tests/dragon/test_featurestore.py index f59501df1..32e1c3a82 100644 --- a/tests/dragon/test_featurestore.py +++ b/tests/dragon/test_featurestore.py @@ -38,6 +38,7 @@ from smartsim._core.mli.comm.channel.dragon_channel import DragonCommChannel from smartsim._core.mli.comm.channel.dragon_fli import DragonFLIChannel +from smartsim._core.mli.comm.channel.dragon_util import create_local from smartsim._core.mli.infrastructure.storage.backbone_feature_store import ( BackboneFeatureStore, EventBroadcaster, @@ -68,42 +69,46 @@ pytestmark = pytest.mark.dragon -@pytest.fixture +@pytest.fixture(scope="module") def storage_for_dragon_fs() -> t.Dict[str, str]: + """Fixture to instantiate a dragon distributed dictionary. + + NOTE: using module scoped fixtures drastically improves test run-time""" return dragon_ddict.DDict(1, 2, total_mem=2 * 1024**3) -@pytest.fixture -def storage_for_dragon_fs_with_req_queue( - storage_for_dragon_fs: t.Dict[str, str] -) -> t.Dict[str, str]: - # create a valid FLI so any call to attach does not fail - channel_ = Channel.make_process_local() +@pytest.fixture(scope="module") +def the_worker_channel() -> DragonFLIChannel: + """Fixture to create a valid descriptor for a worker channel + that can be attached to. + + NOTE: using module scoped fixtures drastically improves test run-time""" + # wmgr_channel_ = create_local() + # wmgr_channel = DragonCommChannel(wmgr_channel_) + # return wmgr_channel + channel_ = create_local() fli_ = fli.FLInterface(main_ch=channel_, manager_ch=None) comm_channel = DragonFLIChannel(fli_, True) + return comm_channel - storage_for_dragon_fs[BackboneFeatureStore.MLI_WORKER_QUEUE] = ( - comm_channel.descriptor - ) - return storage_for_dragon_fs +@pytest.fixture(scope="module") +def the_backbone( + storage_for_dragon_fs: t.Any, the_worker_channel: DragonFLIChannel +) -> BackboneFeatureStore: + """Fixture to create a distributed dragon dictionary and wrap it + in a BackboneFeatureStore. -@pytest.fixture -def storage_for_dragon_fs_with_mock_req_queue( - storage_for_dragon_fs: t.Dict[str, str] -) -> t.Dict[str, str]: - # # create a valid FLI so any call to attach does not fail - # channel_ = Channel.make_process_local() - # fli_ = fli.FLInterface(main_ch=channel_, manager_ch=None) - # comm_channel = DragonFLIChannel(fli_, True) + NOTE: using module scoped fixtures drastically improves test run-time""" - mock_descriptor = "12345" - storage_for_dragon_fs[BackboneFeatureStore.MLI_WORKER_QUEUE] = mock_descriptor - return storage_for_dragon_fs + backbone = BackboneFeatureStore(storage_for_dragon_fs, allow_reserved_writes=True) + backbone[BackboneFeatureStore.MLI_WORKER_QUEUE] = the_worker_channel.descriptor + + return backbone def test_eventconsumer_eventpublisher_integration( - storage_for_dragon_fs: t.Any, test_dir: str + the_backbone: BackboneFeatureStore, test_dir: str ) -> None: """Verify that the publisher and consumer integrate as expected when multiple publishers and consumers are sending simultaneously. 
This @@ -114,20 +119,13 @@ def test_eventconsumer_eventpublisher_integration( :param test_dir: pytest fixture automatically generating unique working directories for individual test outputs""" - mock_storage = storage_for_dragon_fs - backbone = BackboneFeatureStore(mock_storage, allow_reserved_writes=True) - # verify ability to write and read from ddict - backbone["test_dir"] = test_dir - assert backbone["test_dir"] == test_dir - - wmgr_channel_ = Channel.make_process_local() - capp_channel_ = Channel.make_process_local() - back_channel_ = Channel.make_process_local() + the_backbone["test_dir"] = test_dir + assert the_backbone["test_dir"] == test_dir - wmgr_channel = DragonCommChannel(wmgr_channel_) - capp_channel = DragonCommChannel(capp_channel_) - back_channel = DragonCommChannel(back_channel_) + wmgr_channel = DragonCommChannel(create_local()) + capp_channel = DragonCommChannel(create_local()) + back_channel = DragonCommChannel(create_local()) wmgr_consumer_descriptor = wmgr_channel.descriptor capp_consumer_descriptor = capp_channel.descriptor @@ -136,32 +134,32 @@ def test_eventconsumer_eventpublisher_integration( # create some consumers to receive messages wmgr_consumer = EventConsumer( wmgr_channel, - backbone, + the_backbone, filters=[EventCategory.FEATURE_STORE_WRITTEN], ) capp_consumer = EventConsumer( capp_channel, - backbone, + the_backbone, ) back_consumer = EventConsumer( back_channel, - backbone, + the_backbone, filters=[EventCategory.CONSUMER_CREATED], ) # create some broadcasters to publish messages mock_worker_mgr = EventBroadcaster( - backbone, + the_backbone, channel_factory=DragonCommChannel.from_descriptor, ) mock_client_app = EventBroadcaster( - backbone, + the_backbone, channel_factory=DragonCommChannel.from_descriptor, ) # register all of the consumers even though the OnCreateConsumer really should # trigger its registration. event processing is tested elsewhere. - backbone.notification_channels = [ + the_backbone.notification_channels = [ wmgr_consumer_descriptor, capp_consumer_descriptor, back_consumer_descriptor, @@ -172,9 +170,9 @@ def test_eventconsumer_eventpublisher_integration( mock_worker_mgr.send(event_1) # simulate the app updating a model a few times - event_2 = OnWriteFeatureStore(backbone.descriptor, "key-1") - event_3 = OnWriteFeatureStore(backbone.descriptor, "key-2") - event_4 = OnWriteFeatureStore(backbone.descriptor, "key-1") + event_2 = OnWriteFeatureStore(the_backbone.descriptor, "key-1") + event_3 = OnWriteFeatureStore(the_backbone.descriptor, "key-2") + event_4 = OnWriteFeatureStore(the_backbone.descriptor, "key-1") mock_client_app.send(event_2) mock_client_app.send(event_3) @@ -194,7 +192,7 @@ def test_eventconsumer_eventpublisher_integration( def test_backbone_wait_for_no_keys( - storage_for_dragon_fs_with_req_queue: t.Any, monkeypatch: pytest.MonkeyPatch + the_backbone: BackboneFeatureStore, monkeypatch: pytest.MonkeyPatch ) -> None: """Verify that asking the backbone to wait for a value succeeds immediately and does not cause a wait to occur if the supplied key @@ -203,15 +201,12 @@ def test_backbone_wait_for_no_keys( :param storage_for_dragon_fs: the storage engine to use, prepopulated with """ # set a very low timeout to confirm that it does not wait - storage = storage_for_dragon_fs_with_req_queue - - backbone = BackboneFeatureStore(storage) with monkeypatch.context() as ctx: # all keys should be found and the timeout should never be checked. 
ctx.setattr(bbtime, "sleep", mock.MagicMock()) - values = backbone.wait_for([]) + values = the_backbone.wait_for([]) assert len(values) == 0 # confirm that no wait occurred @@ -219,7 +214,7 @@ def test_backbone_wait_for_no_keys( def test_backbone_wait_for_prepopulated( - storage_for_dragon_fs_with_req_queue: t.Any, monkeypatch: pytest.MonkeyPatch + the_backbone: BackboneFeatureStore, monkeypatch: pytest.MonkeyPatch ) -> None: """Verify that asking the backbone to wait for a value succeed immediately and do not cause a wait to occur if the data exists @@ -227,15 +222,12 @@ def test_backbone_wait_for_prepopulated( :param storage_for_dragon_fs: the storage engine to use, prepopulated with """ # set a very low timeout to confirm that it does not wait - storage = storage_for_dragon_fs_with_req_queue - - backbone = BackboneFeatureStore(storage) with monkeypatch.context() as ctx: # all keys should be found and the timeout should never be checked. ctx.setattr(bbtime, "sleep", mock.MagicMock()) - values = backbone.wait_for([BackboneFeatureStore.MLI_WORKER_QUEUE]) + values = the_backbone.wait_for([BackboneFeatureStore.MLI_WORKER_QUEUE], 0.1) # confirm that wait_for with one key returns one value assert len(values) == 1 @@ -248,7 +240,7 @@ def test_backbone_wait_for_prepopulated( def test_backbone_wait_for_prepopulated_dupe( - storage_for_dragon_fs_with_req_queue: t.Any, monkeypatch: pytest.MonkeyPatch + the_backbone: BackboneFeatureStore, monkeypatch: pytest.MonkeyPatch ) -> None: """Verify that asking the backbone to wait for keys that are duplicated results in a single value being returned for each key @@ -256,19 +248,17 @@ def test_backbone_wait_for_prepopulated_dupe( :param storage_for_dragon_fs: the storage engine to use, prepopulated with """ # set a very low timeout to confirm that it does not wait - storage = storage_for_dragon_fs_with_req_queue - backbone = BackboneFeatureStore(storage) key1, key2 = "key-1", "key-2" value1, value2 = "i-am-value-1", "i-am-value-2" - backbone[key1] = value1 - backbone[key2] = value2 + the_backbone[key1] = value1 + the_backbone[key2] = value2 with monkeypatch.context() as ctx: # all keys should be found and the timeout should never be checked. 
ctx.setattr(bbtime, "sleep", mock.MagicMock()) - values = backbone.wait_for([key1, key2, key1]) # key1 is duplicated + values = the_backbone.wait_for([key1, key2, key1]) # key1 is duplicated # confirm that wait_for with one key returns one value assert len(values) == 2 @@ -294,10 +284,43 @@ def set_value_after_delay( logger.debug(f"set_value_after_delay wrote `{value} to backbone[`{key}`]") -@pytest.mark.skip(reason="Using mp on build agent is not working correctly") -@pytest.mark.parametrize("delay", [0, 1, 2, 4, 8]) +@pytest.mark.parametrize( + "delay", + [ + pytest.param( + 0, + marks=pytest.mark.skip( + "Must use entrypoint instead of mp.Process to run on build agent" + ), + ), + pytest.param( + 1, + marks=pytest.mark.skip( + "Must use entrypoint instead of mp.Process to run on build agent" + ), + ), + pytest.param( + 2, + marks=pytest.mark.skip( + "Must use entrypoint instead of mp.Process to run on build agent" + ), + ), + pytest.param( + 4, + marks=pytest.mark.skip( + "Must use entrypoint instead of mp.Process to run on build agent" + ), + ), + pytest.param( + 8, + marks=pytest.mark.skip( + "Must use entrypoint instead of mp.Process to run on build agent" + ), + ), + ], +) def test_backbone_wait_for_partial_prepopulated( - storage_for_dragon_fs_with_mock_req_queue: t.Any, delay: float + the_backbone: BackboneFeatureStore, delay: float ) -> None: """Verify that when data is not all in the backbone, the `wait_for` operation continues to poll until it finds everything it needs @@ -308,19 +331,17 @@ def test_backbone_wait_for_partial_prepopulated( """ # set a very low timeout to confirm that it does not wait wait_timeout = 10 - storage = storage_for_dragon_fs_with_mock_req_queue - backbone = BackboneFeatureStore(storage) key, value = str(uuid.uuid4()), str(random.random() * 10) logger.debug(f"Starting process to write {key} after {delay}s") p = mp.Process( - target=set_value_after_delay, args=(backbone.descriptor, key, value, delay) + target=set_value_after_delay, args=(the_backbone.descriptor, key, value, delay) ) p.start() p2 = mp.Process( - target=backbone.wait_for, + target=the_backbone.wait_for, args=([BackboneFeatureStore.MLI_WORKER_QUEUE, key],), kwargs={"timeout": wait_timeout}, ) @@ -330,7 +351,9 @@ def test_backbone_wait_for_partial_prepopulated( p2.join() # both values should be written at this time - ret_vals = backbone.wait_for([key, BackboneFeatureStore.MLI_WORKER_QUEUE, key], 0.1) + ret_vals = the_backbone.wait_for( + [key, BackboneFeatureStore.MLI_WORKER_QUEUE, key], 0.1 + ) # confirm that wait_for with two keys returns two values assert len(ret_vals) == 2, "values should contain values for both awaited keys" @@ -343,10 +366,43 @@ def test_backbone_wait_for_partial_prepopulated( assert ret_vals[key] == value, "verify order of values " -@pytest.mark.skip(reason="Using mp on build agent is not working correctly") -@pytest.mark.parametrize("num_keys", [0, 1, 3, 7, 11]) +@pytest.mark.parametrize( + "num_keys", + [ + pytest.param( + 0, + marks=pytest.mark.skip( + "Must use entrypoint instead of mp.Process to run on build agent" + ), + ), + pytest.param( + 1, + marks=pytest.mark.skip( + "Must use entrypoint instead of mp.Process to run on build agent" + ), + ), + pytest.param( + 3, + marks=pytest.mark.skip( + "Must use entrypoint instead of mp.Process to run on build agent" + ), + ), + pytest.param( + 7, + marks=pytest.mark.skip( + "Must use entrypoint instead of mp.Process to run on build agent" + ), + ), + pytest.param( + 11, + marks=pytest.mark.skip( + "Must use entrypoint 
instead of mp.Process to run on build agent" + ), + ), + ], +) def test_backbone_wait_for_multikey( - storage_for_dragon_fs_with_req_queue: t.Any, + the_backbone: BackboneFeatureStore, num_keys: int, test_dir: str, ) -> None: @@ -358,8 +414,6 @@ def test_backbone_wait_for_multikey( """ # maximum delay allowed for setter processes max_delay = 5 - storage = storage_for_dragon_fs_with_req_queue - backbone = BackboneFeatureStore(storage) extra_keys = [str(uuid.uuid4()) for _ in range(num_keys)] extra_values = [str(uuid.uuid4()) for _ in range(num_keys)] @@ -371,13 +425,14 @@ def test_backbone_wait_for_multikey( assert delay < max_delay, "write delay exceeds test timeout" logger.debug(f"Delaying {key} write by {delay} seconds") p = mp.Process( - target=set_value_after_delay, args=(backbone.descriptor, key, value, delay) + target=set_value_after_delay, + args=(the_backbone.descriptor, key, value, delay), ) p.start() processes.append(p) p2 = mp.Process( - target=backbone.wait_for, + target=the_backbone.wait_for, args=(extra_keys,), kwargs={"timeout": max_delay * 2}, ) @@ -390,7 +445,7 @@ def test_backbone_wait_for_multikey( # use without a wait to verify all values are written num_keys = len(extra_keys) - actual_values = backbone.wait_for(extra_keys, timeout=0.01) + actual_values = the_backbone.wait_for(extra_keys, timeout=0.01) assert len(extra_keys) == num_keys # confirm that wait_for returns all the expected values diff --git a/tests/dragon/test_featurestore_base.py b/tests/dragon/test_featurestore_base.py index 59a30a3e8..1fa2bf5b4 100644 --- a/tests/dragon/test_featurestore_base.py +++ b/tests/dragon/test_featurestore_base.py @@ -452,12 +452,16 @@ def test_eventpublisher_serialize_failure( event = OnCreateConsumer(target_descriptor, filters=[]) # patch the __bytes__ implementation to cause pickling to fail during send - patch.setattr(event, "__bytes__", lambda x: b"abc") + def bad_bytes(self) -> bytes: + return b"abc" + + # this patch causes an attribute error when event pickling is attempted + patch.setattr(event, "__bytes__", bad_bytes) backbone.notification_channels = (target_descriptor,) # send a message into the channel - with pytest.raises(ValueError) as ex: + with pytest.raises(AttributeError) as ex: publisher.send(event) assert "serialize" in ex.value.args[0] @@ -729,12 +733,12 @@ def test_eventconsumer_batch_timeout( with pytest.raises(ValueError) as ex: # try to create a consumer w/a max recv size of 0 - EventConsumer( + consumer = EventConsumer( channel, backbone, filters=[EventCategory.FEATURE_STORE_WRITTEN], - batch_timeout=invalid_timeout, ) + consumer.recv(batch_timeout=invalid_timeout) assert "positive" in ex.value.args[0] diff --git a/tests/dragon/test_featurestore_integration.py b/tests/dragon/test_featurestore_integration.py index 091610592..fd93f9cfe 100644 --- a/tests/dragon/test_featurestore_integration.py +++ b/tests/dragon/test_featurestore_integration.py @@ -60,11 +60,18 @@ @pytest.fixture(scope="module") def storage_for_dragon_fs() -> t.Dict[str, str]: - return dragon_ddict.DDict() + """Fixture to instantiate a dragon distributed dictionary. + + NOTE: using module scoped fixtures drastically improves test run-time""" + return dragon_ddict.DDict(1, 2, total_mem=2 * 1024**3) @pytest.fixture(scope="module") def the_worker_channel() -> DragonCommChannel: + """Fixture to create a valid descriptor for a worker channel + that can be attached to. 
+
+    NOTE: using module scoped fixtures drastically improves test run-time"""
     wmgr_channel_ = create_local()
     wmgr_channel = DragonCommChannel(wmgr_channel_)
     return wmgr_channel
@@ -72,6 +79,10 @@ def the_worker_channel() -> DragonCommChannel:

 @pytest.fixture(scope="module")
 def the_backbone(storage_for_dragon_fs: t.Any) -> BackboneFeatureStore:
+    """Fixture to create a distributed dragon dictionary and wrap it
+    in a BackboneFeatureStore.
+
+    NOTE: using module scoped fixtures drastically improves test run-time"""
     return BackboneFeatureStore(storage_for_dragon_fs, allow_reserved_writes=True)


@@ -226,15 +237,36 @@
 @pytest.mark.parametrize(
     "buffer_size",
     [
-        pytest.param(-1, id="replace negative, default to 500"),
-        pytest.param(0, id="replace zero, default to 500"),
-        pytest.param(1, id="non-zero buffer size: 1"),
-        pytest.param(550, id="larger than default: 550"),
-        pytest.param(800, id="much larger than default: 800"),
+        pytest.param(
+            -1,
+            id="replace negative, default to 500",
+            marks=pytest.mark.skip("create_local issue w/MPI must be mitigated"),
+        ),
+        pytest.param(
+            0,
+            id="replace zero, default to 500",
+            marks=pytest.mark.skip("create_local issue w/MPI must be mitigated"),
+        ),
+        pytest.param(
+            1,
+            id="non-zero buffer size: 1",
+            marks=pytest.mark.skip("create_local issue w/MPI must be mitigated"),
+        ),
+        # pytest.param(500, id="maximum size edge case: 500"),
+        pytest.param(
+            550,
+            id="larger than default: 550",
+            marks=pytest.mark.skip("create_local issue w/MPI must be mitigated"),
+        ),
+        pytest.param(
+            800,
+            id="much larger than default: 800",
+            marks=pytest.mark.skip("create_local issue w/MPI must be mitigated"),
+        ),
         pytest.param(
             1000,
             id="very large buffer: 1000, unreliable in dragon-v0.10",
-            marks=pytest.mark.skip,
+            marks=pytest.mark.skip("create_local issue w/MPI must be mitigated"),
         ),
     ],
 )
From 1f4e6e302cf8a761ed90312df342b026832cb6a2 Mon Sep 17 00:00:00 2001
From: Chris McBride
Date: Thu, 26 Sep 2024 13:31:55 -0500
Subject: [PATCH 14/40] more docstrings standard fixes

---
 .../storage/backbone_feature_store.py         | 69 +++++++++++--------
 1 file changed, 41 insertions(+), 28 deletions(-)

diff --git a/smartsim/_core/mli/infrastructure/storage/backbone_feature_store.py b/smartsim/_core/mli/infrastructure/storage/backbone_feature_store.py
index 1542f3811..e5f54724c 100644
--- a/smartsim/_core/mli/infrastructure/storage/backbone_feature_store.py
+++ b/smartsim/_core/mli/infrastructure/storage/backbone_feature_store.py
@@ -24,7 +24,6 @@
 # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

-import base64
 import enum
 import itertools
 import os
@@ -52,16 +51,6 @@
 logger = get_logger(__name__)


-def byte_descriptor_to_string(descriptor: bytes) -> str:
-    return base64.b64encode(descriptor).decode("utf-8")
-
-
-def string_descriptor_to_byte(descriptor: str) -> bytes:
-    return base64.b64decode(descriptor.encode("utf-8"))
-
-
-# todo: did i create an arms race where a developer just grabs the backbone
-# and passes it wherever they need a FeatureStore? 
class BackboneFeatureStore(DragonFeatureStore): """A DragonFeatureStore wrapper with utility methods for accessing shared information stored in the MLI backbone feature store.""" @@ -184,11 +173,12 @@ def from_writable_descriptor( cls, descriptor: str, ) -> "BackboneFeatureStore": - """A factory method that creates an instance from a descriptor string + """A factory method that creates an instance from a descriptor string. :param descriptor: The descriptor that uniquely identifies the resource :returns: An attached DragonFeatureStore - :raises SmartSimError: if attachment to DragonFeatureStore fails""" + :raises SmartSimError: if attachment to DragonFeatureStore fails + """ try: return BackboneFeatureStore(dragon_ddict.DDict.attach(descriptor), True) except Exception as ex: @@ -199,11 +189,12 @@ def from_writable_descriptor( def _check_wait_timeout( self, start_time: float, timeout: float, indicators: t.Dict[str, bool] ) -> None: - """Perform timeout verification + """Perform timeout verification. :param start_time: the start time to use for elapsed calculation :param timeout: the timeout (in seconds) - :param indicators: latest retrieval status for requested keys""" + :param indicators: latest retrieval status for requested keys + """ elapsed = time.time() - start_time if timeout and elapsed > timeout: raise SmartSimError( @@ -214,10 +205,10 @@ def wait_for( self, keys: t.List[str], timeout: float = _DEFAULT_WAIT_TIMEOUT ) -> t.Dict[str, t.Union[str, bytes, None]]: """Perform a blocking wait until all specified keys have been found - in the backbone + in the backbone. :param keys: The required collection of keys to retrieve - :param timeout: The maximum wait time in seconds. Overrides class level setting + :param timeout: The maximum wait time in seconds """ if timeout < 0: timeout = self._DEFAULT_WAIT_TIMEOUT @@ -255,7 +246,10 @@ def wait_for( def get_env(self) -> t.Dict[str, str]: """Returns a dictionary populated with environment variables necessary to - connect a process to the existing backbone instance.""" + connect a process to the existing backbone instance. + + :returns: The dictionary populated with env vars + """ return {self.MLI_BACKBONE: self.descriptor} @@ -263,7 +257,9 @@ class EventCategory(str, enum.Enum): """Predefined event types raised by SmartSim backend.""" CONSUMER_CREATED: str = "consumer-created" + """Event category for an event raised when a new consumer is created""" FEATURE_STORE_WRITTEN: str = "feature-store-written" + """Event category for an event raised when a feature store key is written""" @dataclass @@ -350,10 +346,11 @@ class EventProducer(t.Protocol): """Core API of a class that publishes events.""" def send(self, event: EventBase, timeout: float = 0.001) -> int: - """The send operation. + """Send an event using the configured comm channel. :param event: The event to send :param timeout: Maximum time to wait (in seconds) for messages to send + :returns: The number of messages that were sent """ @@ -366,15 +363,24 @@ def __init__( backbone: BackboneFeatureStore, channel: t.Optional[CommChannelBase], ) -> None: - """Initialize the instance""" + """Initialize the instance. + + :param backbone: The backbone feature store to use + :param channel: The comm channel to send events on + """ self._backbone = backbone self._channel: t.Optional[CommChannelBase] = channel def send(self, event: EventBase, timeout: float = 0.001) -> int: - """The send operation""" + """Send an event using the configured comm channel. 
:param event: The event to send
+        :param timeout: Maximum time to wait (in seconds) for messages to send
+        :returns: The number of message copies that were sent
+        :raises SmartSimError: If the comm channel is not configured
+        """
         if self._channel is None:
-            # self._channel = self._channel_factory(event)
-            raise Exception("No channel to send on")
+            raise SmartSimError("No channel to send on")

         num_sent = 0
         logger.debug(f"Sending {event} to {self._channel.descriptor}")
@@ -431,8 +437,8 @@ def _save_to_buffer(self, event: EventBase) -> None:
         """Places the event in the buffer to be sent once a consumer list
         is available.

-        :param event: The event to serialize and buffer
-        :raises ValueError: If the event cannot be serialized
+        :param event: The event to buffer
+        :raises ValueError: If the event cannot be buffered
         """
         try:
             self._event_buffer.append(event)
@@ -590,7 +596,7 @@ def __init__(

     @property
     def descriptor(self) -> str:
-        """The descriptor of the underlying comm channel where events are received
+        """The descriptor of the underlying comm channel.

         :returns: The comm channel descriptor"""
         return self._comm_channel.descriptor
@@ -670,7 +676,7 @@ def recv(
         return events_received

     def register(self) -> None:
-        """Send an event to register this consumer as a listener"""
+        """Send an event to register this consumer as a listener."""
         descriptor = self._comm_channel.descriptor

         event = OnCreateConsumer(descriptor, self._global_filters)
@@ -690,7 +696,14 @@ def register(self) -> None:
         logger.warning("Unable to register. No registrar channel found.")

     def listen_once(self, timeout: float = 0.001) -> None:
-        """Function to handle incoming events"""
+        """Receives messages for the consumer a single time.
+
+        NOTE: Executes a single batch-retrieval to receive the maximum
+        number of messages available under batch timeout. To continually
+        listen, use `listen` in a non-blocking thread/process
+
+        :param timeout: Maximum time to wait (in seconds) for messages to send
+        """
         logger.debug(f"Starting event listener with {timeout} second timeout")
         logger.debug("Awaiting new messages")
From 9c8d127fef516240c638d70055316e83650556dd Mon Sep 17 00:00:00 2001
From: Chris McBride
Date: Thu, 26 Sep 2024 16:55:10 -0500
Subject: [PATCH 15/40] reduce default worker connect timeout, fix test
 timeout issue due to measurement error

---
 ex/high_throughput_inference/mock_app.py      |  2 +-
 .../storage/backbone_feature_store.py         |  7 ++---
 smartsim/protoclient.py                       | 17 +++++++++---
 tests/dragon/test_protoclient.py              | 26 ++++++++++---------
 4 files changed, 32 insertions(+), 20 deletions(-)

diff --git a/ex/high_throughput_inference/mock_app.py b/ex/high_throughput_inference/mock_app.py
index 2886bd5f9..f4db1bc1e 100644
--- a/ex/high_throughput_inference/mock_app.py
+++ b/ex/high_throughput_inference/mock_app.py
@@ -105,7 +105,7 @@ def name(self) -> str:

     resnet = ResNetWrapper("resnet50", f"resnet50.{args.device}.pt")

-    client = ProtoClient(timing_on=True, wait_timeout=0)
+    client = ProtoClient(timing_on=True)
     client.set_model(resnet.name, resnet.model)

     if CHECK_RESULTS_AND_MAKE_ALL_SLOWER:
diff --git a/smartsim/_core/mli/infrastructure/storage/backbone_feature_store.py b/smartsim/_core/mli/infrastructure/storage/backbone_feature_store.py
index e5f54724c..9fcf490e4 100644
--- a/smartsim/_core/mli/infrastructure/storage/backbone_feature_store.py
+++ b/smartsim/_core/mli/infrastructure/storage/backbone_feature_store.py
@@ -695,19 +695,20 @@ def register(self) -> None:
         else:
             logger.warning("Unable to register. 
No registrar channel found.")

-    def listen_once(self, timeout: float = 0.001) -> None:
+    def listen_once(self, timeout: float = 0.001, batch_timeout: float = 1.0) -> None:
         """Receives messages for the consumer a single time.

         NOTE: Executes a single batch-retrieval to receive the maximum
         number of messages available under batch timeout. To continually
         listen, use `listen` in a non-blocking thread/process

-        :param timeout: Maximum time to wait (in seconds) for messages to send
+        :param timeout: Maximum time to wait (in seconds) for a message to arrive
+        :param batch_timeout: Maximum time to wait (in seconds) for a batch to arrive
         """
         logger.debug(f"Starting event listener with {timeout} second timeout")
         logger.debug("Awaiting new messages")

-        incoming_messages = self.recv(timeout=timeout)
+        incoming_messages = self.recv(timeout=timeout, batch_timeout=batch_timeout)

         if not incoming_messages:
             logger.debug("Consumer received empty message list.")
diff --git a/smartsim/protoclient.py b/smartsim/protoclient.py
index c248300ca..a84a8a261 100644
--- a/smartsim/protoclient.py
+++ b/smartsim/protoclient.py
@@ -74,7 +74,7 @@ class ProtoClient:
     """Proof of concept implementation of a client enabling user applications
     to interact with MLI resources."""

-    _DEFAULT_BACKBONE_TIMEOUT = 30.0
+    _DEFAULT_BACKBONE_TIMEOUT = 1.0
     """A default timeout period applied to connection attempts with the
     backbone feature store."""

@@ -140,7 +140,11 @@ def _create_broadcaster(self) -> EventProducer:
         )
         return broadcaster

-    def __init__(self, timing_on: bool, wait_timeout: float = 0) -> None:
+    def __init__(
+        self,
+        timing_on: bool,
+        backbone_timeout: float = _DEFAULT_BACKBONE_TIMEOUT,
+    ) -> None:
         """Initialize the client instance.

         :param timing_on: Flag indicating if timing information should be
@@ -157,7 +161,12 @@ def __init__(self, timing_on: bool, wait_timeout: float = 0) -> None:
         else:
             rank = 0

-        self._backbone_timeout = wait_timeout
+        if backbone_timeout <= 0:
+            raise ValueError(
+                f"Invalid backbone timeout provided: {backbone_timeout}. "
+                "The value must be greater than zero."
+            )
+        self._backbone_timeout = max(backbone_timeout, 0.1)

         connect_to_infrastructure()

@@ -184,7 +193,7 @@ def backbone_timeout(self) -> float:
         from the backbone feature store. 
:returns: A float indicating the number of seconds to allow""" - return self._backbone_timeout or self._DEFAULT_BACKBONE_TIMEOUT + return self._backbone_timeout def _add_label_to_timings(self, label: str) -> None: """Adds a new label into the timing dictionary to prepare for diff --git a/tests/dragon/test_protoclient.py b/tests/dragon/test_protoclient.py index c758ce971..6fb44ed3d 100644 --- a/tests/dragon/test_protoclient.py +++ b/tests/dragon/test_protoclient.py @@ -108,7 +108,7 @@ def the_worker_queue(the_backbone: BackboneFeatureStore) -> DragonFLIChannel: @pytest.mark.parametrize( - "wait_timeout, exp_wait_max", + "backbone_timeout, exp_wait_max", [ # aggregate the 1+1+1 into 3 on remaining parameters pytest.param(0.5, 1 + 1 + 1, id="0.5s wait, 3 cycle steps"), @@ -117,7 +117,7 @@ def the_worker_queue(the_backbone: BackboneFeatureStore) -> DragonFLIChannel: ], ) def test_protoclient_timeout( - wait_timeout: float, + backbone_timeout: float, exp_wait_max: float, the_backbone: BackboneFeatureStore, monkeypatch: pytest.MonkeyPatch, @@ -134,21 +134,23 @@ def test_protoclient_timeout( # NOTE: exp_wait_time maps to the cycled backoff of [0.1, 0.2, 0.4, 0.8] # with leeway added (by allowing 1s each for the 0.1 and 0.5 steps) - start_time = time.time() + with monkeypatch.context() as ctx, pytest.raises(SmartSimError) as ex: + start_time = time.time() ctx.setenv(BackboneFeatureStore.MLI_BACKBONE, the_backbone.descriptor) - ProtoClient(False, wait_timeout=wait_timeout) - - end_time = time.time() - elapsed = end_time - start_time + ProtoClient(timing_on=False, backbone_timeout=backbone_timeout) + elapsed = time.time() - start_time + logger.info(f"ProtoClient timeout occurred in {elapsed} seconds") - # todo: revisit. should this trigger any wait if the backbone is set above? - # confirm that we met our timeout - # assert elapsed > wait_timeout, f"below configured timeout {wait_timeout}" + # todo: should this trigger any wait if the backbone is set above? 
+ # confirm that we met our timeout + assert ( + elapsed >= backbone_timeout + ), f"below configured timeout {backbone_timeout}" - # confirm that the total wait time is aligned with the sleep cycle - assert elapsed < exp_wait_max, f"above expected max wait {exp_wait_max}" + # confirm that the total wait time is aligned with the sleep cycle + assert elapsed < exp_wait_max, f"above expected max wait {exp_wait_max}" def test_protoclient_initialization_no_backbone( From 7442eb113607e5f1d2537fd47750759206271d9e Mon Sep 17 00:00:00 2001 From: Chris McBride Date: Thu, 26 Sep 2024 17:08:34 -0500 Subject: [PATCH 16/40] use constants in tests for env var strings, docstrings, remove commented code --- tests/dragon/test_error_handling.py | 27 +++++++++++++++++++-------- tests/dragon/test_featurestore.py | 18 ++++++++++-------- 2 files changed, 29 insertions(+), 16 deletions(-) diff --git a/tests/dragon/test_error_handling.py b/tests/dragon/test_error_handling.py index 1e659168b..5c04faf0e 100644 --- a/tests/dragon/test_error_handling.py +++ b/tests/dragon/test_error_handling.py @@ -49,6 +49,9 @@ exception_handler, ) from smartsim._core.mli.infrastructure.environment_loader import EnvironmentConfigLoader +from smartsim._core.mli.infrastructure.storage.backbone_feature_store import ( + BackboneFeatureStore, +) from smartsim._core.mli.infrastructure.storage.dragon_feature_store import ( DragonFeatureStore, ) @@ -115,9 +118,11 @@ def setup_worker_manager_model_bytes( ): integrated_worker_type = IntegratedTorchWorker - monkeypatch.setenv("_SMARTSIM_REQUEST_QUEUE", the_worker_channel.descriptor) + monkeypatch.setenv( + BackboneFeatureStore.MLI_WORKER_QUEUE, the_worker_channel.descriptor + ) # Put backbone descriptor into env var for the `EnvironmentConfigLoader` - monkeypatch.setenv("_SMARTSIM_INFRA_BACKBONE", backbone_descriptor) + monkeypatch.setenv(BackboneFeatureStore.MLI_BACKBONE, backbone_descriptor) config_loader = EnvironmentConfigLoader( featurestore_factory=DragonFeatureStore.from_descriptor, @@ -171,9 +176,11 @@ def setup_worker_manager_model_key( ): integrated_worker_type = IntegratedTorchWorker - monkeypatch.setenv("_SMARTSIM_REQUEST_QUEUE", the_worker_channel.descriptor) + monkeypatch.setenv( + BackboneFeatureStore.MLI_WORKER_QUEUE, the_worker_channel.descriptor + ) # Put backbone descriptor into env var for the `EnvironmentConfigLoader` - monkeypatch.setenv("_SMARTSIM_INFRA_BACKBONE", backbone_descriptor) + monkeypatch.setenv(BackboneFeatureStore.MLI_BACKBONE, backbone_descriptor) config_loader = EnvironmentConfigLoader( featurestore_factory=DragonFeatureStore.from_descriptor, @@ -225,9 +232,11 @@ def setup_request_dispatcher_model_bytes( ): integrated_worker_type = IntegratedTorchWorker - monkeypatch.setenv("_SMARTSIM_REQUEST_QUEUE", the_worker_channel.descriptor) + monkeypatch.setenv( + BackboneFeatureStore.MLI_WORKER_QUEUE, the_worker_channel.descriptor + ) # Put backbone descriptor into env var for the `EnvironmentConfigLoader` - monkeypatch.setenv("_SMARTSIM_INFRA_BACKBONE", backbone_descriptor) + monkeypatch.setenv(BackboneFeatureStore.MLI_BACKBONE, backbone_descriptor) config_loader = EnvironmentConfigLoader( featurestore_factory=DragonFeatureStore.from_descriptor, @@ -266,9 +275,11 @@ def setup_request_dispatcher_model_key( ): integrated_worker_type = IntegratedTorchWorker - monkeypatch.setenv("_SMARTSIM_REQUEST_QUEUE", the_worker_channel.descriptor) + monkeypatch.setenv( + BackboneFeatureStore.MLI_WORKER_QUEUE, the_worker_channel.descriptor + ) # Put backbone descriptor into env var 
for the `EnvironmentConfigLoader` - monkeypatch.setenv("_SMARTSIM_INFRA_BACKBONE", backbone_descriptor) + monkeypatch.setenv(BackboneFeatureStore.MLI_BACKBONE, backbone_descriptor) config_loader = EnvironmentConfigLoader( featurestore_factory=DragonFeatureStore.from_descriptor, diff --git a/tests/dragon/test_featurestore.py b/tests/dragon/test_featurestore.py index 32e1c3a82..3e99762c9 100644 --- a/tests/dragon/test_featurestore.py +++ b/tests/dragon/test_featurestore.py @@ -73,7 +73,8 @@ def storage_for_dragon_fs() -> t.Dict[str, str]: """Fixture to instantiate a dragon distributed dictionary. - NOTE: using module scoped fixtures drastically improves test run-time""" + NOTE: using module scoped fixtures drastically improves test run-time + """ return dragon_ddict.DDict(1, 2, total_mem=2 * 1024**3) @@ -82,10 +83,8 @@ def the_worker_channel() -> DragonFLIChannel: """Fixture to create a valid descriptor for a worker channel that can be attached to. - NOTE: using module scoped fixtures drastically improves test run-time""" - # wmgr_channel_ = create_local() - # wmgr_channel = DragonCommChannel(wmgr_channel_) - # return wmgr_channel + NOTE: using module scoped fixtures drastically improves test run-time + """ channel_ = create_local() fli_ = fli.FLInterface(main_ch=channel_, manager_ch=None) comm_channel = DragonFLIChannel(fli_, True) @@ -99,7 +98,8 @@ def the_backbone( """Fixture to create a distributed dragon dictionary and wrap it in a BackboneFeatureStore. - NOTE: using module scoped fixtures drastically improves test run-time""" + NOTE: using module scoped fixtures drastically improves test run-time + """ backbone = BackboneFeatureStore(storage_for_dragon_fs, allow_reserved_writes=True) backbone[BackboneFeatureStore.MLI_WORKER_QUEUE] = the_worker_channel.descriptor @@ -117,7 +117,8 @@ def test_eventconsumer_eventpublisher_integration( :param storage_for_dragon_fs: the dragon storage engine to use :param test_dir: pytest fixture automatically generating unique working - directories for individual test outputs""" + directories for individual test outputs + """ # verify ability to write and read from ddict the_backbone["test_dir"] = test_dir @@ -276,7 +277,8 @@ def set_value_after_delay( :param descriptor: the backbone feature store descriptor to attach to :param key: the key to write to - :param value: a value to write to the key""" + :param value: a value to write to the key + """ time.sleep(delay) backbone = BackboneFeatureStore.from_descriptor(descriptor) From da20b5f4b927ab56d45639c161b59624f33f0831 Mon Sep 17 00:00:00 2001 From: Chris McBride Date: Fri, 27 Sep 2024 10:37:02 -0500 Subject: [PATCH 17/40] docstring formatting in tests --- tests/dragon/test_dragon_backend.py | 2 +- tests/dragon/test_environment_loader.py | 10 +-- tests/dragon/test_error_handling.py | 3 +- tests/dragon/test_featurestore.py | 29 +++--- tests/dragon/test_featurestore_base.py | 88 +++++++++++-------- tests/dragon/test_featurestore_integration.py | 15 ++-- tests/dragon/test_protoclient.py | 23 +++-- tests/dragon/test_worker_manager.py | 16 +++- 8 files changed, 113 insertions(+), 73 deletions(-) diff --git a/tests/dragon/test_dragon_backend.py b/tests/dragon/test_dragon_backend.py index dc2aceeaa..8a48e0026 100644 --- a/tests/dragon/test_dragon_backend.py +++ b/tests/dragon/test_dragon_backend.py @@ -55,7 +55,7 @@ def test_dragonbackend_listener_boostrapping(monkeypatch: pytest.MonkeyPatch): """Verify that the dragon backend registration channel correctly registers new consumers in the backbone and begins 
sending events - to the new consumers""" + to the new consumers.""" backend = DragonBackend(pid=9999) diff --git a/tests/dragon/test_environment_loader.py b/tests/dragon/test_environment_loader.py index 47e75109a..08a0c0135 100644 --- a/tests/dragon/test_environment_loader.py +++ b/tests/dragon/test_environment_loader.py @@ -53,7 +53,7 @@ ], ) def test_environment_loader_attach_fli(content: bytes, monkeypatch: pytest.MonkeyPatch): - """A descriptor can be stored, loaded, and reattached""" + """A descriptor can be stored, loaded, and reattached.""" chan = create_local() queue = FLInterface(main_ch=chan) monkeypatch.setenv( @@ -76,7 +76,7 @@ def test_environment_loader_attach_fli(content: bytes, monkeypatch: pytest.Monke def test_environment_loader_serialize_fli(monkeypatch: pytest.MonkeyPatch): """The serialized descriptors of a loaded and unloaded - queue are the same""" + queue are the same.""" chan = create_local() queue = FLInterface(main_ch=chan) monkeypatch.setenv( @@ -93,7 +93,7 @@ def test_environment_loader_serialize_fli(monkeypatch: pytest.MonkeyPatch): def test_environment_loader_flifails(monkeypatch: pytest.MonkeyPatch): - """An incorrect serialized descriptor will fails to attach""" + """An incorrect serialized descriptor will fails to attach.""" monkeypatch.setenv("_SMARTSIM_REQUEST_QUEUE", "randomstring") @@ -109,7 +109,7 @@ def test_environment_loader_flifails(monkeypatch: pytest.MonkeyPatch): def test_environment_loader_backbone_load_dfs(monkeypatch: pytest.MonkeyPatch): """Verify the dragon feature store is loaded correctly by the - EnvironmentConfigLoader to demonstrate featurestore_factory correctness""" + EnvironmentConfigLoader to demonstrate featurestore_factory correctness.""" feature_store = DragonFeatureStore(DDict()) monkeypatch.setenv("_SMARTSIM_INFRA_BACKBONE", feature_store.descriptor) @@ -127,7 +127,7 @@ def test_environment_loader_backbone_load_dfs(monkeypatch: pytest.MonkeyPatch): def test_environment_variables_not_set(monkeypatch: pytest.MonkeyPatch): """EnvironmentConfigLoader getters return None when environment - variables are not set""" + variables are not set.""" with monkeypatch.context() as patch: patch.setenv("_SMARTSIM_INFRA_BACKBONE", "") patch.setenv("_SMARTSIM_REQUEST_QUEUE", "") diff --git a/tests/dragon/test_error_handling.py b/tests/dragon/test_error_handling.py index 5c04faf0e..6f1e74dca 100644 --- a/tests/dragon/test_error_handling.py +++ b/tests/dragon/test_error_handling.py @@ -86,7 +86,8 @@ def the_worker_channel() -> DragonFLIChannel: """Fixture to create a valid descriptor for a worker channel that can be attached to. - NOTE: using module scoped fixtures drastically improves test run-time""" + NOTE: using module scoped fixtures drastically improves test run-time + """ channel_ = create_local() fli_ = FLInterface(main_ch=channel_, manager_ch=None) comm_channel = DragonFLIChannel(fli_, True) diff --git a/tests/dragon/test_featurestore.py b/tests/dragon/test_featurestore.py index 3e99762c9..ea62fbbeb 100644 --- a/tests/dragon/test_featurestore.py +++ b/tests/dragon/test_featurestore.py @@ -81,7 +81,7 @@ def storage_for_dragon_fs() -> t.Dict[str, str]: @pytest.fixture(scope="module") def the_worker_channel() -> DragonFLIChannel: """Fixture to create a valid descriptor for a worker channel - that can be attached to. + that can be attached to. Does not modify environment vars. 
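For illustration, the constant-over-raw-string pattern these fixtures adopt can be sketched in isolation. The string values below are assumptions mirroring the raw names used elsewhere in the suite; the real constants live on `BackboneFeatureStore`:

    import pytest

    class MLIKeys:
        # assumed values; the library defines these on BackboneFeatureStore
        MLI_BACKBONE = "_SMARTSIM_INFRA_BACKBONE"
        MLI_WORKER_QUEUE = "_SMARTSIM_REQUEST_QUEUE"

    def test_env_wiring(monkeypatch: pytest.MonkeyPatch) -> None:
        # referencing constants keeps tests in sync if the env var names change
        monkeypatch.setenv(MLIKeys.MLI_BACKBONE, "mock-backbone-descriptor")
        monkeypatch.setenv(MLIKeys.MLI_WORKER_QUEUE, "mock-queue-descriptor")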
NOTE: using module scoped fixtures drastically improves test run-time """ @@ -98,6 +98,8 @@ def the_backbone( """Fixture to create a distributed dragon dictionary and wrap it in a BackboneFeatureStore. + :param storage_for_dragon_fs: the dragon storage engine to use + :param the_worker_channel: a pre-configured worker channel NOTE: using module scoped fixtures drastically improves test run-time """ @@ -113,9 +115,9 @@ def test_eventconsumer_eventpublisher_integration( """Verify that the publisher and consumer integrate as expected when multiple publishers and consumers are sending simultaneously. This test closely tracks the test in tests/test_featurestore.py also named - test_eventconsumer_eventpublisher_integration but requires dragon entities + test_eventconsumer_eventpublisher_integration but requires dragon entities. - :param storage_for_dragon_fs: the dragon storage engine to use + :param the_backbone: the dragon storage engine to use :param test_dir: pytest fixture automatically generating unique working directories for individual test outputs """ @@ -197,9 +199,9 @@ def test_backbone_wait_for_no_keys( ) -> None: """Verify that asking the backbone to wait for a value succeeds immediately and does not cause a wait to occur if the supplied key - list is empty + list is empty. - :param storage_for_dragon_fs: the storage engine to use, prepopulated with + :param the_backbone: the storage engine to use, prepopulated with """ # set a very low timeout to confirm that it does not wait @@ -218,9 +220,9 @@ def test_backbone_wait_for_prepopulated( the_backbone: BackboneFeatureStore, monkeypatch: pytest.MonkeyPatch ) -> None: """Verify that asking the backbone to wait for a value succeed - immediately and do not cause a wait to occur if the data exists + immediately and do not cause a wait to occur if the data exists. - :param storage_for_dragon_fs: the storage engine to use, prepopulated with + :param the_backbone: the storage engine to use, prepopulated with """ # set a very low timeout to confirm that it does not wait @@ -244,9 +246,9 @@ def test_backbone_wait_for_prepopulated_dupe( the_backbone: BackboneFeatureStore, monkeypatch: pytest.MonkeyPatch ) -> None: """Verify that asking the backbone to wait for keys that are duplicated - results in a single value being returned for each key + results in a single value being returned for each key. - :param storage_for_dragon_fs: the storage engine to use, prepopulated with + :param the_backbone: the storage engine to use, prepopulated with """ # set a very low timeout to confirm that it does not wait @@ -278,6 +280,7 @@ def set_value_after_delay( :param descriptor: the backbone feature store descriptor to attach to :param key: the key to write to :param value: a value to write to the key + :param delay: amount of delay to apply before writing the key """ time.sleep(delay) @@ -325,9 +328,9 @@ def test_backbone_wait_for_partial_prepopulated( the_backbone: BackboneFeatureStore, delay: float ) -> None: """Verify that when data is not all in the backbone, the `wait_for` operation - continues to poll until it finds everything it needs + continues to poll until it finds everything it needs. 
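The `wait_for` contract exercised by these cases amounts to polling with a cycled backoff until every requested key appears or the timeout elapses. A minimal sketch of that contract, not the library implementation:

    import time
    import typing as t

    def wait_for(
        storage: t.Mapping[str, str], keys: t.Iterable[str], timeout: float
    ) -> t.Dict[str, str]:
        """Poll until all keys exist; duplicated keys yield a single value each."""
        pending = set(keys)                      # dedupe requested keys
        found: t.Dict[str, str] = {}
        deadline = time.time() + timeout
        backoff = [0.1, 0.2, 0.4, 0.8]           # the cycle the tests budget for
        while pending:
            for key in [k for k in pending if k in storage]:
                found[key] = storage[key]
                pending.discard(key)
            if pending:
                if time.time() >= deadline:
                    raise TimeoutError(f"keys never appeared: {pending}")
                time.sleep(backoff[0])
                backoff = backoff[1:] + backoff[:1]  # rotate through the cycle
        return found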
- :param storage_for_dragon_fs: the storage engine to use, prepopulated with + :param the_backbone: the storage engine to use, prepopulated with :param delay: the number of seconds the second process will wait before setting the target value in the backbone featurestore """ @@ -409,9 +412,9 @@ def test_backbone_wait_for_multikey( test_dir: str, ) -> None: """Verify that asking the backbone to wait for multiple keys results - in that number of values being returned + in that number of values being returned. - :param storage_for_dragon_fs: the storage engine to use, prepopulated with + :param the_backbone: the storage engine to use, prepopulated with :param num_keys: the number of extra keys to set & request in the backbone """ # maximum delay allowed for setter processes diff --git a/tests/dragon/test_featurestore_base.py b/tests/dragon/test_featurestore_base.py index 1fa2bf5b4..84594e3c2 100644 --- a/tests/dragon/test_featurestore_base.py +++ b/tests/dragon/test_featurestore_base.py @@ -67,12 +67,12 @@ def storage_for_dragon_fs_with_req_queue() -> t.Dict[str, str]: def boom(*args, **kwargs) -> None: """Helper function that blows up when used to mock up - some other function""" + some other function.""" raise Exception(f"you shall not pass! {args}, {kwargs}") def test_event_uid() -> None: - """Verify that all events include a unique identifier""" + """Verify that all events include a unique identifier.""" uids: t.Set[str] = set() num_iters = 1000 @@ -90,7 +90,7 @@ def test_event_uid() -> None: def test_mli_reserved_keys_conversion() -> None: """Verify that conversion from a string to an enum member - works as expected""" + works as expected.""" for reserved_key in ReservedKeys: # iterate through all keys and verify `from_string` works @@ -103,7 +103,7 @@ def test_mli_reserved_keys_conversion() -> None: def test_mli_reserved_keys_writes() -> None: """Verify that attempts to write to reserved keys are blocked from a - standard DragonFeatureStore but enabled with the BackboneFeatureStore""" + standard DragonFeatureStore but enabled with the BackboneFeatureStore.""" mock_storage = {} dfs = DragonFeatureStore(mock_storage) @@ -132,10 +132,11 @@ def test_mli_reserved_keys_writes() -> None: def test_mli_consumers_read_by_key() -> None: - """Verify that the value returned from the mli consumers - method is written to the correct key and reads are - allowed via standard dragon feature store. - NOTE: should reserved reads also be blocked""" + """Verify that the value returned from the mli consumers method is written + to the correct key and reads are allowed via standard dragon feature store. 
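A toy model of the reserved-key guard under test, with placeholder key strings and a stand-in exception type in place of the library's own:

    class ToyStore:
        """Toy stand-in for the reserved-key guard; keys here are placeholders."""

        _RESERVED = {"_RESERVED_QUEUE_KEY", "_RESERVED_CONSUMERS_KEY"}

        def __init__(self, allow_reserved_writes: bool = False) -> None:
            self._data: dict = {}
            self._allow = allow_reserved_writes

        def __setitem__(self, key: str, value: str) -> None:
            if key in self._RESERVED and not self._allow:
                # the real stores raise a library-specific error here
                raise PermissionError(f"{key} is reserved")
            self._data[key] = value

    backbone = ToyStore(allow_reserved_writes=True)  # backbone-style store
    backbone["_RESERVED_QUEUE_KEY"] = "descriptor"   # permitted
    standard = ToyStore()
    # standard["_RESERVED_QUEUE_KEY"] = "x" would raise PermissionError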
+ + NOTE: should reserved reads also be blocked + """ mock_storage = {} dfs = DragonFeatureStore(mock_storage) @@ -154,7 +155,7 @@ def test_mli_consumers_read_by_key() -> None: def test_mli_consumers_read_by_backbone() -> None: """Verify that the backbone reads the correct location - when using the backbone feature store API instead of mapping API""" + when using the backbone feature store API instead of mapping API.""" mock_storage = {} backbone = BackboneFeatureStore(mock_storage, allow_reserved_writes=True) @@ -168,7 +169,7 @@ def test_mli_consumers_read_by_backbone() -> None: def test_mli_consumers_write_by_backbone() -> None: """Verify that the backbone writes the correct location - when using the backbone feature store API instead of mapping API""" + when using the backbone feature store API instead of mapping API.""" mock_storage = {} backbone = BackboneFeatureStore(mock_storage, allow_reserved_writes=True) @@ -182,10 +183,11 @@ def test_mli_consumers_write_by_backbone() -> None: def test_eventpublisher_broadcast_no_factory(test_dir: str) -> None: """Verify that a broadcast operation without any registered subscribers - succeeds without raising Exceptions + succeeds without raising Exceptions. :param test_dir: pytest fixture automatically generating unique working - directories for individual test outputs""" + directories for individual test outputs + """ storage_path = pathlib.Path(test_dir) / "features" mock_storage = {} consumer_descriptor = storage_path / "test-consumer" @@ -217,10 +219,11 @@ def test_eventpublisher_broadcast_no_factory(test_dir: str) -> None: def test_eventpublisher_broadcast_to_empty_consumer_list(test_dir: str) -> None: """Verify that a broadcast operation without any registered subscribers - succeeds without raising Exceptions + succeeds without raising Exceptions. :param test_dir: pytest fixture automatically generating unique working - directories for individual test outputs""" + directories for individual test outputs + """ storage_path = pathlib.Path(test_dir) / "features" mock_storage = {} @@ -249,10 +252,11 @@ def test_eventpublisher_broadcast_to_empty_consumer_list(test_dir: str) -> None: def test_eventpublisher_broadcast_without_channel_factory(test_dir: str) -> None: """Verify that a broadcast operation reports an error if no channel - factory was supplied for constructing the consumer channels + factory was supplied for constructing the consumer channels. :param test_dir: pytest fixture automatically generating unique working - directories for individual test outputs""" + directories for individual test outputs + """ storage_path = pathlib.Path(test_dir) / "features" mock_storage = {} @@ -277,10 +281,11 @@ def test_eventpublisher_broadcast_without_channel_factory(test_dir: str) -> None def test_eventpublisher_broadcast_empties_buffer(test_dir: str) -> None: """Verify that a successful broadcast clears messages from the event - buffer when a new message is sent and consumers are registered + buffer when a new message is sent and consumers are registered. 
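The broadcaster behavior pinned down by these cases (buffer while no consumers are registered, report the total sent, clear the buffer on a successful send) can be sketched as follows; the class and its interface are illustrative, not the library API:

    import typing as t

    class ToyBroadcaster:
        """Illustrative model of the broadcast contract, not the library API."""

        def __init__(self, channel_factory: t.Callable[[str], t.Any]) -> None:
            self._factory = channel_factory
            self._buffer: t.List[bytes] = []

        def send(self, event: bytes, consumers: t.List[str]) -> int:
            self._buffer.append(event)    # held until consumers are registered
            if not consumers:
                return 0
            num_sent = 0
            for descriptor in consumers:
                channel = self._factory(descriptor)  # factory builds channels
                for message in self._buffer:
                    channel.send(message)
                    num_sent += 1
            self._buffer.clear()          # a successful broadcast empties it
            return num_sent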
:param test_dir: pytest fixture automatically generating unique working - directories for individual test outputs""" + directories for individual test outputs + """ storage_path = pathlib.Path(test_dir) / "features" mock_storage = {} @@ -363,10 +368,11 @@ def test_eventpublisher_broadcast_returns_total_sent( def test_eventpublisher_prune_unused_consumer(test_dir: str) -> None: - """Verify that any unused consumers are pruned each time a new event is sent + """Verify that any unused consumers are pruned each time a new event is sent. :param test_dir: pytest fixture automatically generating unique working - directories for individual test outputs""" + directories for individual test outputs + """ storage_path = pathlib.Path(test_dir) / "features" mock_storage = {} @@ -429,12 +435,13 @@ def test_eventpublisher_prune_unused_consumer(test_dir: str) -> None: def test_eventpublisher_serialize_failure( test_dir: str, monkeypatch: pytest.MonkeyPatch ) -> None: - """Verify that errors during message serialization are raised to the caller + """Verify that errors during message serialization are raised to the caller. :param test_dir: pytest fixture automatically generating unique working directories for individual test outputs :param monkeypatch: pytest fixture for modifying behavior of existing code - with mock implementations""" + with mock implementations + """ storage_path = pathlib.Path(test_dir) / "features" storage_path.mkdir(parents=True, exist_ok=True) @@ -470,12 +477,13 @@ def bad_bytes(self) -> bytes: def test_eventpublisher_factory_failure( test_dir: str, monkeypatch: pytest.MonkeyPatch ) -> None: - """Verify that errors during channel construction are raised to the caller + """Verify that errors during channel construction are raised to the caller. :param test_dir: pytest fixture automatically generating unique working directories for individual test outputs :param monkeypatch: pytest fixture for modifying behavior of existing code - with mock implementations""" + with mock implementations + """ storage_path = pathlib.Path(test_dir) / "features" storage_path.mkdir(parents=True, exist_ok=True) @@ -504,12 +512,13 @@ def boom(descriptor: str) -> None: def test_eventpublisher_failure(test_dir: str, monkeypatch: pytest.MonkeyPatch) -> None: """Verify that unexpected errors during message send are caught and wrapped in a - SmartSimError so they are not propagated directly to the caller + SmartSimError so they are not propagated directly to the caller. :param test_dir: pytest fixture automatically generating unique working directories for individual test outputs :param monkeypatch: pytest fixture for modifying behavior of existing code - with mock implementations""" + with mock implementations + """ storage_path = pathlib.Path(test_dir) / "features" storage_path.mkdir(parents=True, exist_ok=True) @@ -544,10 +553,11 @@ def boom(self) -> None: def test_eventconsumer_receive(test_dir: str) -> None: - """Verify that a consumer retrieves a message from the given channel + """Verify that a consumer retrieves a message from the given channel. 
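The consumer side reduces to a drain loop; a sketch assuming a channel object whose `recv` returns `None` once no messages remain:

    import typing as t

    def drain(channel: t.Any, batch_timeout: float = 0.1) -> t.List[bytes]:
        """Collect available messages, skipping (not failing on) empty payloads."""
        received: t.List[bytes] = []
        while (message := channel.recv(timeout=batch_timeout)) is not None:
            if not message:              # an empty message is ignored
                continue
            received.append(message)
        return received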
:param test_dir: pytest fixture automatically generating unique working - directories for individual test outputs""" + directories for individual test outputs + """ storage_path = pathlib.Path(test_dir) / "features" storage_path.mkdir(parents=True, exist_ok=True) @@ -575,12 +585,13 @@ def test_eventconsumer_receive(test_dir: str) -> None: @pytest.mark.parametrize("num_sent", [0, 1, 2, 4, 8, 16]) def test_eventconsumer_receive_multi(test_dir: str, num_sent: int) -> None: - """Verify that a consumer retrieves multiple message from the given channel + """Verify that a consumer retrieves multiple message from the given channel. :param test_dir: pytest fixture automatically generating unique working directories for individual test outputs :param num_sent: parameterized value used to vary the number of events - that are enqueued and validations are checked at multiple queue sizes""" + that are enqueued and validations are checked at multiple queue sizes + """ storage_path = pathlib.Path(test_dir) / "features" storage_path.mkdir(parents=True, exist_ok=True) @@ -605,10 +616,11 @@ def test_eventconsumer_receive_multi(test_dir: str, num_sent: int) -> None: def test_eventconsumer_receive_empty(test_dir: str) -> None: """Verify that a consumer receiving an empty message ignores the - message and continues processing + message and continues processing. :param test_dir: pytest fixture automatically generating unique working - directories for individual test outputs""" + directories for individual test outputs + """ storage_path = pathlib.Path(test_dir) / "features" storage_path.mkdir(parents=True, exist_ok=True) @@ -636,7 +648,8 @@ def test_eventconsumer_eventpublisher_integration(test_dir: str) -> None: multiple publishers and consumers are sending simultaneously. :param test_dir: pytest fixture automatically generating unique working - directories for individual test outputs""" + directories for individual test outputs + """ storage_path = pathlib.Path(test_dir) / "features" storage_path.mkdir(parents=True, exist_ok=True) @@ -722,7 +735,8 @@ def test_eventconsumer_batch_timeout( :param invalid_timeout: any invalid timeout that should fail validation :param test_dir: pytest fixture automatically generating unique working - directories for individual test outputs""" + directories for individual test outputs + """ storage_path = pathlib.Path(test_dir) / "features" storage_path.mkdir(parents=True, exist_ok=True) @@ -758,8 +772,12 @@ def test_eventconsumer_batch_timeout( def test_backbone_wait_timeout(wait_timeout: float, exp_wait_max: float) -> None: """Verify that attempts to attach to the worker queue from the protoclient timeout in an appropriate amount of time. Note: due to the backoff, we verify - the elapsed time is less than the 15s of a cycle of waits + the elapsed time is less than the 15s of a cycle of waits. + :param wait_timeout: Maximum amount of time (in seconds) to allow the backbone + to wait for the requested value to exist + :param exp_wait_max: Maximum amount of time (in seconds) to set as the upper + bound to allow the delays with backoff to occur :param storage_for_dragon_fs: the dragon storage engine to use """ diff --git a/tests/dragon/test_featurestore_integration.py b/tests/dragon/test_featurestore_integration.py index fd93f9cfe..fb86ad7cd 100644 --- a/tests/dragon/test_featurestore_integration.py +++ b/tests/dragon/test_featurestore_integration.py @@ -62,7 +62,8 @@ def storage_for_dragon_fs() -> t.Dict[str, str]: """Fixture to instantiate a dragon distributed dictionary. 
- NOTE: using module scoped fixtures drastically improves test run-time""" + NOTE: using module scoped fixtures drastically improves test run-time + """ return dragon_ddict.DDict(1, 2, total_mem=2 * 1024**3) @@ -71,7 +72,8 @@ def the_worker_channel() -> DragonCommChannel: """Fixture to create a valid descriptor for a worker channel that can be attached to. - NOTE: using module scoped fixtures drastically improves test run-time""" + NOTE: using module scoped fixtures drastically improves test run-time + """ wmgr_channel_ = create_local() wmgr_channel = DragonCommChannel(wmgr_channel_) return wmgr_channel @@ -82,7 +84,8 @@ def the_backbone(storage_for_dragon_fs: t.Any) -> BackboneFeatureStore: """Fixture to create a distributed dragon dictionary and wrap it in a BackboneFeatureStore. - NOTE: using module scoped fixtures drastically improves test run-time""" + NOTE: using module scoped fixtures drastically improves test run-time + """ return BackboneFeatureStore(storage_for_dragon_fs, allow_reserved_writes=True) @@ -96,7 +99,8 @@ def test_eventconsumer_eventpublisher_integration( :param storage_for_dragon_fs: the dragon storage engine to use :param test_dir: pytest fixture automatically generating unique working - directories for individual test outputs""" + directories for individual test outputs + """ mock_storage = storage_for_dragon_fs backbone = BackboneFeatureStore(mock_storage, allow_reserved_writes=True) @@ -278,7 +282,8 @@ def test_channel_buffer_size( until a configured maximum value is exceeded. :param buffer_size: the maximum number of messages allowed in a channel buffer - :param storage_for_dragon_fs: the dragon storage engine to use""" + :param storage_for_dragon_fs: the dragon storage engine to use + """ mock_storage = storage_for_dragon_fs backbone = BackboneFeatureStore(mock_storage, allow_reserved_writes=True) diff --git a/tests/dragon/test_protoclient.py b/tests/dragon/test_protoclient.py index 6fb44ed3d..86becf71e 100644 --- a/tests/dragon/test_protoclient.py +++ b/tests/dragon/test_protoclient.py @@ -73,7 +73,7 @@ def storage_for_dragon_fs() -> t.Dict[str, str]: def the_backbone(storage_for_dragon_fs) -> BackboneFeatureStore: """Fixture that creates a dragon backbone feature store. - :param storage_for_dragon_fs: + :param storage_for_dragon_fs: the distributed dictionary to use in backbone :returns: The backbone feature store :returns: The attached `BackboneFeatureStore` """ @@ -124,13 +124,14 @@ def test_protoclient_timeout( ): """Verify that attempts to attach to the worker queue from the protoclient timeout in an appropriate amount of time. Note: due to the backoff, we verify - the elapsed time is less than the 15s of a cycle of waits + the elapsed time is less than the 15s of a cycle of waits. 

-    :param wait_timeout: a timeout for use when configuring a proto client
+    :param backbone_timeout: a timeout for use when configuring a proto client
     :param exp_wait_max: a ceiling for the expected time spent waiting for
         the timeout
     :param the_backbone: a pre-initialized backbone featurestore for setting up
-        the environment variable required by the client"""
+        the environment variable required by the client
+    """
 
     # NOTE: exp_wait_time maps to the cycled backoff of [0.1, 0.2, 0.4, 0.8]
     # with leeway added (by allowing 1s each for the 0.1 and 0.5 steps)
@@ -179,7 +180,7 @@ def test_protoclient_initialization(
     monkeypatch: pytest.MonkeyPatch,
 ):
     """Verify that attempting to start the client with required env vars results
-    in a fully initialized client
+    in a fully initialized client.
 
     :param the_backbone: a pre-initialized backbone featurestore
     :param the_worker_queue: an FLI channel the client will retrieve
@@ -227,11 +228,13 @@ def test_protoclient_write_model(
     monkeypatch: pytest.MonkeyPatch,
 ):
     """Verify that writing a model using the client causes the model data to be
-    written to a feature store
+    written to a feature store.
 
     :param the_backbone: a pre-initialized backbone featurestore
-    :param the_worker_queue: an FLI channel the client will retrieve
-        from the backbone"""
+    :param the_worker_queue: the worker queue fixture, passed to ensure
+        the worker queue environment (the queue descriptor registered
+        in the backbone) is correctly configured
+    """
 
     with monkeypatch.context() as ctx:
         # we won't actually send here
@@ -262,13 +265,15 @@ def test_protoclient_write_model_notification_sent(
     num_listeners: int,
     num_model_updates: int,
 ):
-    """Verify that writing a model sends a key-written event
+    """Verify that writing a model sends a key-written event.
 
     :param the_backbone: a pre-initialized backbone featurestore
     :param the_worker_queue: an FLI channel the client will retrieve
         from the backbone
     :param num_listeners: vary the number of registered listeners
         to verify that the event is broadcast to everyone
+    :param num_model_updates: vary the number of model updates to verify
+        the broadcast counts messages sent correctly
     """
 
     # we won't actually send here, but it won't try without registered listeners
diff --git a/tests/dragon/test_worker_manager.py b/tests/dragon/test_worker_manager.py
index 69d962940..132bb2110 100644
--- a/tests/dragon/test_worker_manager.py
+++ b/tests/dragon/test_worker_manager.py
@@ -136,7 +136,14 @@ def mock_messages(
     comm_channel_root_dir: pathlib.Path,
     kill_queue: mp.Queue,
 ) -> None:
-    """Mock event producer for triggering the inference pipeline"""
+    """Mock event producer for triggering the inference pipeline.
+
+    :param feature_store_root_dir: Path to a directory where a
+        FileSystemFeatureStore can read & write results
+    :param comm_channel_root_dir: Path to a directory where a
+        FileSystemCommChannel can read & write messages
+    :param kill_queue: Queue used by unit test to stop mock_message process
+    """
     feature_store_root_dir.mkdir(parents=True, exist_ok=True)
     comm_channel_root_dir.mkdir(parents=True, exist_ok=True)
 
@@ -203,7 +210,7 @@ def mock_messages(
 
 def mock_mli_infrastructure_mgr() -> None:
     """Create resources normally instanatiated by the infrastructure
-    management portion of the DragonBackend
+    management portion of the DragonBackend. 
""" config_loader = EnvironmentConfigLoader( featurestore_factory=DragonFeatureStore.from_descriptor, @@ -228,8 +235,9 @@ def mock_mli_infrastructure_mgr() -> None: def prepare_environment(test_dir: str) -> pathlib.Path: """Cleanup prior outputs to run demo repeatedly. - :param tes_dir: the directory to prepare - :returns: The path to the log file""" + :param test_dir: the directory to prepare + :returns: The path to the log file + """ path = pathlib.Path(f"{test_dir}/workermanager.log") logging.basicConfig(filename=path.absolute(), level=logging.DEBUG) return path From f5ba5a69a5ed671dc50fca916d27113b5cd6d722 Mon Sep 17 00:00:00 2001 From: Chris McBride Date: Fri, 27 Sep 2024 12:00:39 -0500 Subject: [PATCH 18/40] docstrings --- .../_core/launcher/dragon/dragonBackend.py | 2 -- smartsim/protoclient.py | 3 +-- tests/dragon/test_error_handling.py | 5 +---- tests/dragon/test_featurestore.py | 11 ++--------- tests/dragon/test_featurestore_base.py | 5 +---- tests/dragon/test_featurestore_integration.py | 18 ++++++------------ 6 files changed, 11 insertions(+), 33 deletions(-) diff --git a/smartsim/_core/launcher/dragon/dragonBackend.py b/smartsim/_core/launcher/dragon/dragonBackend.py index 6dc61516e..1d8c71e7d 100644 --- a/smartsim/_core/launcher/dragon/dragonBackend.py +++ b/smartsim/_core/launcher/dragon/dragonBackend.py @@ -81,8 +81,6 @@ logger = get_logger(__name__) -# TODO: create ticket for follow-up task to replace defunct -# dragon_group_state.Running() & .Error() class DragonStatus(str, Enum): ERROR = "Error" RUNNING = "Running" diff --git a/smartsim/protoclient.py b/smartsim/protoclient.py index a84a8a261..7f6d6f412 100644 --- a/smartsim/protoclient.py +++ b/smartsim/protoclient.py @@ -91,7 +91,6 @@ def _attach_to_backbone() -> BackboneFeatureStore: :returns: The attached backbone featurestore """ - # todo: ensure this env var from config loader or constant descriptor = os.environ.get(BackboneFeatureStore.MLI_BACKBONE, None) if descriptor is None or not descriptor: raise SmartSimError( @@ -154,7 +153,7 @@ def __init__( :raises: SmartSimError if unable to attach to a backbone featurestore """ if MPI is not None: - # todo: determine a way to make MPI work in the test environment + # TODO: determine a way to make MPI work in the test environment # - consider catching the import exception and defaulting rank to 0 comm = MPI.COMM_WORLD rank: int = comm.Get_rank() diff --git a/tests/dragon/test_error_handling.py b/tests/dragon/test_error_handling.py index 6f1e74dca..7d2c4cb3c 100644 --- a/tests/dragon/test_error_handling.py +++ b/tests/dragon/test_error_handling.py @@ -84,10 +84,7 @@ @pytest.fixture(scope="module") def the_worker_channel() -> DragonFLIChannel: """Fixture to create a valid descriptor for a worker channel - that can be attached to. - - NOTE: using module scoped fixtures drastically improves test run-time - """ + that can be attached to.""" channel_ = create_local() fli_ = FLInterface(main_ch=channel_, manager_ch=None) comm_channel = DragonFLIChannel(fli_, True) diff --git a/tests/dragon/test_featurestore.py b/tests/dragon/test_featurestore.py index ea62fbbeb..e815e0dd9 100644 --- a/tests/dragon/test_featurestore.py +++ b/tests/dragon/test_featurestore.py @@ -71,20 +71,14 @@ @pytest.fixture(scope="module") def storage_for_dragon_fs() -> t.Dict[str, str]: - """Fixture to instantiate a dragon distributed dictionary. 
- - NOTE: using module scoped fixtures drastically improves test run-time - """ + """Fixture to instantiate a dragon distributed dictionary.""" return dragon_ddict.DDict(1, 2, total_mem=2 * 1024**3) @pytest.fixture(scope="module") def the_worker_channel() -> DragonFLIChannel: """Fixture to create a valid descriptor for a worker channel - that can be attached to. Does not modify environment vars. - - NOTE: using module scoped fixtures drastically improves test run-time - """ + that can be attached to. Does not modify environment vars.""" channel_ = create_local() fli_ = fli.FLInterface(main_ch=channel_, manager_ch=None) comm_channel = DragonFLIChannel(fli_, True) @@ -100,7 +94,6 @@ def the_backbone( :param storage_for_dragon_fs: the dragon storage engine to use :param the_worker_channel: a pre-configured worker channel - NOTE: using module scoped fixtures drastically improves test run-time """ backbone = BackboneFeatureStore(storage_for_dragon_fs, allow_reserved_writes=True) diff --git a/tests/dragon/test_featurestore_base.py b/tests/dragon/test_featurestore_base.py index 84594e3c2..2e032213b 100644 --- a/tests/dragon/test_featurestore_base.py +++ b/tests/dragon/test_featurestore_base.py @@ -133,10 +133,7 @@ def test_mli_reserved_keys_writes() -> None: def test_mli_consumers_read_by_key() -> None: """Verify that the value returned from the mli consumers method is written - to the correct key and reads are allowed via standard dragon feature store. - - NOTE: should reserved reads also be blocked - """ + to the correct key and reads are allowed via standard dragon feature store.""" mock_storage = {} dfs = DragonFeatureStore(mock_storage) diff --git a/tests/dragon/test_featurestore_integration.py b/tests/dragon/test_featurestore_integration.py index fb86ad7cd..470193597 100644 --- a/tests/dragon/test_featurestore_integration.py +++ b/tests/dragon/test_featurestore_integration.py @@ -60,20 +60,14 @@ @pytest.fixture(scope="module") def storage_for_dragon_fs() -> t.Dict[str, str]: - """Fixture to instantiate a dragon distributed dictionary. - - NOTE: using module scoped fixtures drastically improves test run-time - """ + """Fixture to instantiate a dragon distributed dictionary.""" return dragon_ddict.DDict(1, 2, total_mem=2 * 1024**3) @pytest.fixture(scope="module") def the_worker_channel() -> DragonCommChannel: """Fixture to create a valid descriptor for a worker channel - that can be attached to. - - NOTE: using module scoped fixtures drastically improves test run-time - """ + that can be attached to.""" wmgr_channel_ = create_local() wmgr_channel = DragonCommChannel(wmgr_channel_) return wmgr_channel @@ -84,7 +78,7 @@ def the_backbone(storage_for_dragon_fs: t.Any) -> BackboneFeatureStore: """Fixture to create a distributed dragon dictionary and wrap it in a BackboneFeatureStore. - NOTE: using module scoped fixtures drastically improves test run-time + :param storage_for_dragon_fs: The dragon storage engine to use """ return BackboneFeatureStore(storage_for_dragon_fs, allow_reserved_writes=True) @@ -95,10 +89,10 @@ def test_eventconsumer_eventpublisher_integration( """Verify that the publisher and consumer integrate as expected when multiple publishers and consumers are sending simultaneously. This test closely tracks the test in tests/test_featurestore.py also named - test_eventconsumer_eventpublisher_integration but requires dragon entities + test_eventconsumer_eventpublisher_integration but requires dragon entities. 
- :param storage_for_dragon_fs: the dragon storage engine to use - :param test_dir: pytest fixture automatically generating unique working + :param storage_for_dragon_fs: The dragon storage engine to use + :param test_dir: Automatically generated unique working directories for individual test outputs """ From d68975423d245e7ed7430f7e2c873bad67e689e4 Mon Sep 17 00:00:00 2001 From: Chris McBride Date: Fri, 27 Sep 2024 15:20:30 -0500 Subject: [PATCH 19/40] parameterize ddict creation, add single ddict touchpoint util module, use fixtures for pytest ddict creation --- .../_core/launcher/dragon/dragonBackend.py | 18 ++- .../storage/dragon_feature_store.py | 9 +- .../mli/infrastructure/storage/dragon_util.py | 100 ++++++++++++++ tests/dragon/test_dragon_backend.py | 5 - tests/dragon/test_dragon_ddict_utils.py | 123 ++++++++++++++++++ tests/dragon/test_environment_loader.py | 15 ++- tests/dragon/test_error_handling.py | 16 ++- tests/dragon/test_featurestore.py | 15 ++- tests/dragon/test_featurestore_base.py | 6 - tests/dragon/test_featurestore_integration.py | 31 +++-- tests/dragon/test_protoclient.py | 12 +- tests/dragon/test_reply_building.py | 1 - tests/dragon/test_request_dispatcher.py | 12 +- tests/dragon/test_worker_manager.py | 14 +- 14 files changed, 310 insertions(+), 67 deletions(-) create mode 100644 smartsim/_core/mli/infrastructure/storage/dragon_util.py create mode 100644 tests/dragon/test_dragon_ddict_utils.py diff --git a/smartsim/_core/launcher/dragon/dragonBackend.py b/smartsim/_core/launcher/dragon/dragonBackend.py index 1d8c71e7d..0c172365a 100644 --- a/smartsim/_core/launcher/dragon/dragonBackend.py +++ b/smartsim/_core/launcher/dragon/dragonBackend.py @@ -37,7 +37,7 @@ # pylint: disable=import-error,C0302,R0915 # isort: off -import dragon.data.ddict.ddict as dragon_ddict + import dragon.infrastructure.connection as dragon_connection import dragon.infrastructure.policy as dragon_policy import dragon.infrastructure.process_desc as dragon_process_desc @@ -56,6 +56,7 @@ EventConsumer, OnCreateConsumer, ) +from smartsim._core.mli.infrastructure.storage.dragon_util import create_ddict # pylint: enable=import-error # isort: on @@ -157,6 +158,10 @@ class DragonBackend: by threads spawned by it. 
""" + _DEFAULT_NUM_MGR_PER_NODE = 2 + _DEFAULT_MEM_PER_NODE = 256 * 1024**2 + """The default memory capacity to allocate for a feaure store node (in megabytes)""" + def __init__(self, pid: int) -> None: self._pid = pid """PID of dragon executable which launched this server""" @@ -553,11 +558,12 @@ def _create_backbone(self) -> BackboneFeatureStore: :returns: The descriptor of the backbone feature store """ if self._backbone is None: - logger.info("Creating backbone storage DDict") - backbone_storage = dragon_ddict.DDict( - n_nodes=len(self._hosts), total_mem=len(self._hosts) * 1024**3 - ) # todo: parametrize - logger.info("Created backbone storage DDict") + backbone_storage = create_ddict( + len(self._hosts), + self._DEFAULT_NUM_MGR_PER_NODE, + self._DEFAULT_MEM_PER_NODE, + ) + self._backbone = BackboneFeatureStore( backbone_storage, allow_reserved_writes=True ) diff --git a/smartsim/_core/mli/infrastructure/storage/dragon_feature_store.py b/smartsim/_core/mli/infrastructure/storage/dragon_feature_store.py index 4eeeac32f..ecc232f21 100644 --- a/smartsim/_core/mli/infrastructure/storage/dragon_feature_store.py +++ b/smartsim/_core/mli/infrastructure/storage/dragon_feature_store.py @@ -32,6 +32,10 @@ # isort: on +from smartsim._core.mli.infrastructure.storage.dragon_util import ( + ddict_to_descriptor, + descriptor_to_ddict, +) from smartsim._core.mli.infrastructure.storage.feature_store import FeatureStore from smartsim.error import SmartSimError from smartsim.log import get_logger @@ -48,7 +52,7 @@ def __init__(self, storage: "dragon_ddict.DDict") -> None: :param storage: A distributed dictionary to be used as the underlying storage mechanism of the feature store""" if isinstance(storage, dragon_ddict.DDict): - descriptor = str(storage.serialize()) + descriptor = ddict_to_descriptor(storage) else: descriptor = "not-set" @@ -99,7 +103,8 @@ def from_descriptor( """ try: logger.debug(f"Attaching to FeatureStore with descriptor: {descriptor}") - return cls(dragon_ddict.DDict.attach(descriptor)) + storage = descriptor_to_ddict(descriptor) + return cls(storage) except Exception as ex: raise SmartSimError( f"Error creating dragon feature store from descriptor: {descriptor}" diff --git a/smartsim/_core/mli/infrastructure/storage/dragon_util.py b/smartsim/_core/mli/infrastructure/storage/dragon_util.py new file mode 100644 index 000000000..fda89bba5 --- /dev/null +++ b/smartsim/_core/mli/infrastructure/storage/dragon_util.py @@ -0,0 +1,100 @@ +# BSD 2-Clause License +# +# Copyright (c) 2021-2024, Hewlett Packard Enterprise +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+# pylint: disable=import-error
+# isort: off
+import dragon.data.ddict.ddict as dragon_ddict
+
+# isort: on
+
+from smartsim.log import get_logger
+
+logger = get_logger(__name__)
+
+
+def ddict_to_descriptor(ddict: dragon_ddict.DDict) -> str:
+    """Convert a DDict to a descriptor string.
+
+    :param ddict: The dragon dictionary to convert
+    :returns: The descriptor string
+    """
+    if ddict is None:
+        raise ValueError("DDict is not available to create a descriptor")
+
+    # unlike other dragon objects, the dictionary serializes to a string
+    # instead of bytes
+    return str(ddict.serialize())
+
+
+def descriptor_to_ddict(descriptor: str) -> dragon_ddict.DDict:
+    """Attach to an existing DDict instance given
+    its string-encoded descriptor.
+
+    :param descriptor: The descriptor of a dictionary to attach to
+    :returns: The attached dragon dictionary"""
+    return dragon_ddict.DDict.attach(descriptor)
+
+
+def create_ddict(
+    num_nodes: int, mgr_per_node: int, mem_per_node: int
+) -> dragon_ddict.DDict:
+    """Create a distributed dragon dictionary.
+
+    :param num_nodes: The number of distributed nodes to distribute the dictionary to.
+    At least one node is required.
+    :param mgr_per_node: The number of manager processes per node
+    :param mem_per_node: The amount of memory (in bytes) to allocate per node. Total
+    memory available will be calculated as `num_nodes * mem_per_node`
+
+    :returns: The instantiated dragon dictionary
+    :raises ValueError: If invalid num_nodes is supplied
+    :raises ValueError: If invalid mem_per_node is supplied
+    :raises ValueError: If invalid mgr_per_node is supplied
+    """
+    if num_nodes < 1:
+        raise ValueError("A dragon dictionary must have at least 1 node")
+
+    if mgr_per_node < 1:
+        raise ValueError("A dragon dict requires at least 1 manager per node")
+
+    if mem_per_node < dragon_ddict.DDICT_MIN_SIZE:
+        raise ValueError(
+            "A dragon dictionary requires at least "
+            f"{dragon_ddict.DDICT_MIN_SIZE / 1024**2} MB"
+        )
+
+    mem_total = num_nodes * mem_per_node
+
+    logger.debug(
+        f"Creating dragon dictionary with {num_nodes} nodes, {mem_total} bytes of memory"
+    )
+
+    distributed_dict = dragon_ddict.DDict(num_nodes, mgr_per_node, total_mem=mem_total)
+    logger.debug(
+        "Successfully created dragon dictionary with "
+        f"{num_nodes} nodes, {mem_total} bytes total memory"
+    )
+    return distributed_dict
diff --git a/tests/dragon/test_dragon_backend.py b/tests/dragon/test_dragon_backend.py
index 8a48e0026..b56a92c5b 100644
--- a/tests/dragon/test_dragon_backend.py
+++ b/tests/dragon/test_dragon_backend.py
@@ -25,16 +25,11 @@
 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
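Taken together, the helpers in the new `dragon_util` module support a simple round trip; a usage sketch assuming a dragon runtime is available:

    from smartsim._core.mli.infrastructure.storage.dragon_util import (
        create_ddict,
        ddict_to_descriptor,
        descriptor_to_ddict,
    )

    # one node, two managers, 32 MB expressed in bytes
    storage = create_ddict(1, 2, 32 * 1024**2)
    descriptor = ddict_to_descriptor(storage)  # string-safe handle for env vars

    # a second process can attach with nothing but the descriptor string
    attached = descriptor_to_ddict(descriptor)
    assert ddict_to_descriptor(attached) == descriptor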
import os -import typing as t -import unittest.mock as mock import pytest dragon = pytest.importorskip("dragon") -from dragon.channels import Channel -from dragon.data.ddict.ddict import DDict -from dragon.fli import DragonFLIError, FLInterface from smartsim._core.launcher.dragon.dragonBackend import DragonBackend from smartsim._core.mli.comm.channel.dragon_channel import DragonCommChannel diff --git a/tests/dragon/test_dragon_ddict_utils.py b/tests/dragon/test_dragon_ddict_utils.py new file mode 100644 index 000000000..0df33e7a7 --- /dev/null +++ b/tests/dragon/test_dragon_ddict_utils.py @@ -0,0 +1,123 @@ +# BSD 2-Clause License +# +# Copyright (c) 2021-2024, Hewlett Packard Enterprise +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import pytest + +dragon = pytest.importorskip("dragon") + +# isort: off +import dragon.data.ddict.ddict as dragon_ddict + +# isort: on + +from smartsim._core.mli.infrastructure.storage import dragon_util +from smartsim.log import get_logger + +# The tests in this file belong to the dragon group +pytestmark = pytest.mark.dragon +logger = get_logger(__name__) + + +@pytest.fixture(scope="module") +def the_storage() -> dragon_ddict.DDict: + """Verify that a descriptor is created.""" + return dragon_util.create_ddict(1, 2, 3 * 1024**2) + + +@pytest.mark.parametrize( + "num_nodes, num_managers, mem_per_node", + [ + pytest.param(1, 1, 3 * 1024**2, id="3MB, Bare minimum allocation"), + pytest.param(2, 2, 128 * 1024**2, id="128 MB allocation, 2 nodes, 2 mgr"), + pytest.param(2, 1, 512 * 1024**2, id="512 MB allocation, 2 nodes, 1 mgr"), + ], +) +def test_dragon_storage_util_create_ddict( + num_nodes: int, + num_managers: int, + mem_per_node: int, +): + """Verify that a dragon dictionary is successfully created. 
+
+    :param num_nodes: Number of ddict nodes to attempt to create
+    :param num_managers: Number of managers per node to request
+    :param mem_per_node: Memory to allocate per node
+    """
+    ddict = dragon_util.create_ddict(num_nodes, num_managers, mem_per_node)
+
+    assert ddict is not None
+
+
+@pytest.mark.parametrize(
+    "num_nodes, num_managers, mem_per_node",
+    [
+        pytest.param(-1, 1, 3 * 1024**2, id="Negative Node Count"),
+        pytest.param(0, 1, 3 * 1024**2, id="Invalid Node Count"),
+        pytest.param(1, -1, 3 * 1024**2, id="Negative Mgr Count"),
+        pytest.param(1, 0, 3 * 1024**2, id="Invalid Mgr Count"),
+        pytest.param(1, 1, -3 * 1024**2, id="Negative Mem Per Node"),
+        pytest.param(1, 1, (3 * 1024**2) - 1, id="Invalid Mem Per Node"),
+        pytest.param(1, 1, 0 * 1024**2, id="No Mem Per Node"),
+    ],
+)
+def test_dragon_storage_util_create_ddict_validators(
+    num_nodes: int,
+    num_managers: int,
+    mem_per_node: int,
+):
+    """Verify that invalid arguments to `create_ddict` are rejected.
+
+    :param num_nodes: Number of ddict nodes to attempt to create
+    :param num_managers: Number of managers per node to request
+    :param mem_per_node: Memory to allocate per node
+    """
+    with pytest.raises(ValueError):
+        dragon_util.create_ddict(num_nodes, num_managers, mem_per_node)
+
+
+def test_dragon_storage_util_get_ddict_descriptor(the_storage: dragon_ddict.DDict):
+    """Verify that a descriptor is created.
+
+    :param the_storage: A pre-allocated ddict
+    """
+    value = dragon_util.ddict_to_descriptor(the_storage)
+
+    assert isinstance(value, str)
+    assert len(value) > 0
+
+
+def test_dragon_storage_util_get_ddict_from_descriptor(the_storage: dragon_ddict.DDict):
+    """Verify that a ddict can be attached from a descriptor.
+
+    :param the_storage: A pre-allocated ddict
+    """
+    descriptor = dragon_util.ddict_to_descriptor(the_storage)
+
+    value = dragon_util.descriptor_to_ddict(descriptor)
+
+    assert value is not None
+    assert isinstance(value, dragon_ddict.DDict)
+    assert dragon_util.ddict_to_descriptor(value) == descriptor
diff --git a/tests/dragon/test_environment_loader.py b/tests/dragon/test_environment_loader.py
index 08a0c0135..9dd0255fe 100644
--- a/tests/dragon/test_environment_loader.py
+++ b/tests/dragon/test_environment_loader.py
@@ -28,8 +28,8 @@
 
 dragon = pytest.importorskip("dragon")
 
+import dragon.data.ddict.ddict as dragon_ddict
 import dragon.utils as du
-from dragon.data.ddict.ddict import DDict
 from dragon.fli import FLInterface
 
 from smartsim._core.mli.comm.channel.dragon_channel import DragonCommChannel
@@ -39,12 +39,19 @@
 from smartsim._core.mli.infrastructure.storage.dragon_feature_store import (
     DragonFeatureStore,
 )
+from smartsim._core.mli.infrastructure.storage.dragon_util import create_ddict
 from smartsim.error.errors import SmartSimError
 
 # The tests in this file belong to the dragon group
 pytestmark = pytest.mark.dragon
 
 
+@pytest.fixture(scope="module")
+def the_storage() -> dragon_ddict.DDict:
+    """Fixture to instantiate a dragon distributed dictionary."""
+    return create_ddict(1, 2, 4 * 1024**2)
+
+
 @pytest.mark.parametrize(
     "content",
     [
@@ -107,10 +114,12 @@ def test_environment_loader_flifails(monkeypatch: pytest.MonkeyPatch):
         config.get_queue()
 
 
-def test_environment_loader_backbone_load_dfs(monkeypatch: pytest.MonkeyPatch):
+def test_environment_loader_backbone_load_dfs(
+    monkeypatch: pytest.MonkeyPatch, the_storage: dragon_ddict.DDict
+):
     """Verify the dragon feature store is loaded correctly by the
     EnvironmentConfigLoader to demonstrate featurestore_factory correctness."""
-    feature_store = 
DragonFeatureStore(DDict()) + feature_store = DragonFeatureStore(the_storage) monkeypatch.setenv("_SMARTSIM_INFRA_BACKBONE", feature_store.descriptor) config = EnvironmentConfigLoader( diff --git a/tests/dragon/test_error_handling.py b/tests/dragon/test_error_handling.py index 7d2c4cb3c..4f511a9c3 100644 --- a/tests/dragon/test_error_handling.py +++ b/tests/dragon/test_error_handling.py @@ -29,6 +29,8 @@ import pytest +from smartsim._core.mli.infrastructure.storage.dragon_util import create_ddict + dragon = pytest.importorskip("dragon") import multiprocessing as mp @@ -81,6 +83,12 @@ pytestmark = pytest.mark.dragon +@pytest.fixture(scope="module") +def the_storage() -> DDict: + """Fixture to instantiate a dragon distributed dictionary.""" + return create_ddict(1, 2, 4 * 1024**2) + + @pytest.fixture(scope="module") def the_worker_channel() -> DragonFLIChannel: """Fixture to create a valid descriptor for a worker channel @@ -92,17 +100,17 @@ def the_worker_channel() -> DragonFLIChannel: @pytest.fixture(scope="module") -def backbone_descriptor() -> str: +def backbone_descriptor(the_storage) -> str: # create a shared backbone featurestore - feature_store = DragonFeatureStore(DDict()) + feature_store = DragonFeatureStore(the_storage) return feature_store.descriptor @pytest.fixture(scope="module") -def app_feature_store() -> FeatureStore: +def app_feature_store(the_storage) -> FeatureStore: # create a standalone feature store to mimic a user application putting # data into an application-owned resource (app should not access backbone) - app_fs = DragonFeatureStore(DDict()) + app_fs = DragonFeatureStore(the_storage) return app_fs diff --git a/tests/dragon/test_featurestore.py b/tests/dragon/test_featurestore.py index e815e0dd9..35720fa9d 100644 --- a/tests/dragon/test_featurestore.py +++ b/tests/dragon/test_featurestore.py @@ -34,6 +34,8 @@ import pytest +from smartsim._core.mli.infrastructure.storage.dragon_util import create_ddict + dragon = pytest.importorskip("dragon") from smartsim._core.mli.comm.channel.dragon_channel import DragonCommChannel @@ -50,7 +52,6 @@ from smartsim._core.mli.infrastructure.storage.backbone_feature_store import ( time as bbtime, ) -from smartsim._core.mli.infrastructure.storage.dragon_feature_store import dragon_ddict from smartsim.log import get_logger logger = get_logger(__name__) @@ -70,9 +71,9 @@ @pytest.fixture(scope="module") -def storage_for_dragon_fs() -> t.Dict[str, str]: +def the_storage() -> t.Dict[str, str]: """Fixture to instantiate a dragon distributed dictionary.""" - return dragon_ddict.DDict(1, 2, total_mem=2 * 1024**3) + return create_ddict(1, 2, 4 * 1024**2) @pytest.fixture(scope="module") @@ -87,16 +88,16 @@ def the_worker_channel() -> DragonFLIChannel: @pytest.fixture(scope="module") def the_backbone( - storage_for_dragon_fs: t.Any, the_worker_channel: DragonFLIChannel + the_storage: t.Any, the_worker_channel: DragonFLIChannel ) -> BackboneFeatureStore: """Fixture to create a distributed dragon dictionary and wrap it in a BackboneFeatureStore. 
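The bootstrap this fixture performs (registration shown just below) is the same handshake a client later relies on; in outline, assuming the `wait_for` helper accepts a key list and a timeout as the tests above suggest:

    # producer side: advertise the worker queue descriptor in the backbone
    backbone[BackboneFeatureStore.MLI_WORKER_QUEUE] = the_worker_channel.descriptor

    # consumer side: block until the key is published, then attach to the queue
    values = backbone.wait_for([BackboneFeatureStore.MLI_WORKER_QUEUE], 5.0)
    queue_descriptor = values[BackboneFeatureStore.MLI_WORKER_QUEUE]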
- :param storage_for_dragon_fs: the dragon storage engine to use - :param the_worker_channel: a pre-configured worker channel + :param the_storage: The dragon storage engine to use + :param the_worker_channel: Pre-configured worker channel """ - backbone = BackboneFeatureStore(storage_for_dragon_fs, allow_reserved_writes=True) + backbone = BackboneFeatureStore(the_storage, allow_reserved_writes=True) backbone[BackboneFeatureStore.MLI_WORKER_QUEUE] = the_worker_channel.descriptor return backbone diff --git a/tests/dragon/test_featurestore_base.py b/tests/dragon/test_featurestore_base.py index 2e032213b..2278a0036 100644 --- a/tests/dragon/test_featurestore_base.py +++ b/tests/dragon/test_featurestore_base.py @@ -59,12 +59,6 @@ RANDOMLY_SET_KEY = "_SOMETHING_ELSE" -@pytest.fixture -def storage_for_dragon_fs_with_req_queue() -> t.Dict[str, str]: - storage = {WORK_QUEUE_KEY: "12345", RANDOMLY_SET_KEY: "67890"} - return storage - - def boom(*args, **kwargs) -> None: """Helper function that blows up when used to mock up some other function.""" diff --git a/tests/dragon/test_featurestore_integration.py b/tests/dragon/test_featurestore_integration.py index 470193597..e4d6bb9eb 100644 --- a/tests/dragon/test_featurestore_integration.py +++ b/tests/dragon/test_featurestore_integration.py @@ -43,7 +43,10 @@ OnCreateConsumer, OnWriteFeatureStore, ) -from smartsim._core.mli.infrastructure.storage.dragon_feature_store import dragon_ddict +from smartsim._core.mli.infrastructure.storage.dragon_util import ( + create_ddict, + dragon_ddict, +) # isort: off from dragon.channels import Channel @@ -59,9 +62,9 @@ @pytest.fixture(scope="module") -def storage_for_dragon_fs() -> t.Dict[str, str]: +def the_storage() -> dragon_ddict.DDict: """Fixture to instantiate a dragon distributed dictionary.""" - return dragon_ddict.DDict(1, 2, total_mem=2 * 1024**3) + return create_ddict(1, 2, 32 * 1024**2) @pytest.fixture(scope="module") @@ -74,29 +77,29 @@ def the_worker_channel() -> DragonCommChannel: @pytest.fixture(scope="module") -def the_backbone(storage_for_dragon_fs: t.Any) -> BackboneFeatureStore: +def the_backbone(the_storage: t.Any) -> BackboneFeatureStore: """Fixture to create a distributed dragon dictionary and wrap it in a BackboneFeatureStore. - :param storage_for_dragon_fs: The dragon storage engine to use + :param the_storage: The dragon storage engine to use """ - return BackboneFeatureStore(storage_for_dragon_fs, allow_reserved_writes=True) + return BackboneFeatureStore(the_storage, allow_reserved_writes=True) def test_eventconsumer_eventpublisher_integration( - storage_for_dragon_fs: t.Any, test_dir: str + the_storage: t.Any, test_dir: str ) -> None: """Verify that the publisher and consumer integrate as expected when multiple publishers and consumers are sending simultaneously. This test closely tracks the test in tests/test_featurestore.py also named test_eventconsumer_eventpublisher_integration but requires dragon entities. 
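The shape of the integration scenario (a write followed by one notification per registered listener) can be modeled without any dragon machinery; all names here are illustrative:

    import typing as t

    def notify_on_write(
        store: t.MutableMapping[str, bytes],
        key: str,
        value: bytes,
        listeners: t.List[t.Callable[[str], None]],
    ) -> int:
        """Write a value, then deliver one notification per registered listener."""
        store[key] = value
        for callback in listeners:
            callback(key)
        return len(listeners)

    received: t.List[str] = []
    sent = notify_on_write({}, "my-model", b"model-bytes", [received.append] * 3)
    assert sent == 3 and received == ["my-model"] * 3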
- :param storage_for_dragon_fs: The dragon storage engine to use + :param the_storage: The dragon storage engine to use :param test_dir: Automatically generated unique working directories for individual test outputs """ - mock_storage = storage_for_dragon_fs + mock_storage = the_storage backbone = BackboneFeatureStore(mock_storage, allow_reserved_writes=True) # verify ability to write and read from ddict @@ -190,7 +193,7 @@ def test_eventconsumer_max_dequeue( :param num_events: Total number of events to raise in the test :param batch_timeout: Maximum wait time (in seconds) for a message to be sent :param max_batches_expected: Maximum number of receives that should occur - :param storage_for_dragon_fs: Dragon storage engine to use + :param the_storage: Dragon storage engine to use """ # create some consumers to receive messages @@ -270,16 +273,16 @@ def test_eventconsumer_max_dequeue( ) def test_channel_buffer_size( buffer_size: int, - storage_for_dragon_fs: t.Any, + the_storage: t.Any, ) -> None: """Verify that a channel used by an EventBroadcaster can buffer messages until a configured maximum value is exceeded. - :param buffer_size: the maximum number of messages allowed in a channel buffer - :param storage_for_dragon_fs: the dragon storage engine to use + :param buffer_size: Maximum number of messages allowed in a channel buffer + :param the_storage: The dragon storage engine to use """ - mock_storage = storage_for_dragon_fs + mock_storage = the_storage backbone = BackboneFeatureStore(mock_storage, allow_reserved_writes=True) wmgr_channel_ = create_local(buffer_size) # <--- vary buffer size diff --git a/tests/dragon/test_protoclient.py b/tests/dragon/test_protoclient.py index 86becf71e..2e6d1dcc5 100644 --- a/tests/dragon/test_protoclient.py +++ b/tests/dragon/test_protoclient.py @@ -42,13 +42,13 @@ EventBroadcaster, OnWriteFeatureStore, ) -from smartsim._core.mli.infrastructure.storage.dragon_feature_store import dragon_ddict +from smartsim._core.mli.infrastructure.storage.dragon_util import create_ddict from smartsim.error.errors import SmartSimError from smartsim.log import get_logger # isort: off from dragon import fli -from dragon.channels import Channel +from dragon.data.ddict.ddict import DDict # from ..ex..high_throughput_inference.mock_app import ProtoClient from smartsim.protoclient import ProtoClient @@ -61,16 +61,16 @@ @pytest.fixture(scope="module") -def storage_for_dragon_fs() -> t.Dict[str, str]: +def the_storage() -> DDict: """Fixture that creates a dragon distributed dictionary. :returns: The attached distributed dictionary """ - return dragon_ddict.DDict(1, 2, 4 * 1024**2) + return create_ddict(1, 2, 32 * 1024**2) @pytest.fixture(scope="module") -def the_backbone(storage_for_dragon_fs) -> BackboneFeatureStore: +def the_backbone(the_storage) -> BackboneFeatureStore: """Fixture that creates a dragon backbone feature store. 
:param storage_for_dragon_fs: the distributed dictionary to use in backbone @@ -78,7 +78,7 @@ def the_backbone(storage_for_dragon_fs) -> BackboneFeatureStore: :returns: The attached `BackboneFeatureStore` """ - return BackboneFeatureStore(storage_for_dragon_fs, allow_reserved_writes=True) + return BackboneFeatureStore(the_storage, allow_reserved_writes=True) @pytest.fixture(scope="module") diff --git a/tests/dragon/test_reply_building.py b/tests/dragon/test_reply_building.py index 063200dd6..48493b3c4 100644 --- a/tests/dragon/test_reply_building.py +++ b/tests/dragon/test_reply_building.py @@ -31,7 +31,6 @@ dragon = pytest.importorskip("dragon") from smartsim._core.mli.infrastructure.control.worker_manager import build_failure_reply -from smartsim._core.mli.infrastructure.worker.worker import InferenceReply if t.TYPE_CHECKING: from smartsim._core.mli.mli_schemas.response.response_capnp import Status diff --git a/tests/dragon/test_request_dispatcher.py b/tests/dragon/test_request_dispatcher.py index b6be86177..82f41e3db 100644 --- a/tests/dragon/test_request_dispatcher.py +++ b/tests/dragon/test_request_dispatcher.py @@ -68,6 +68,7 @@ from smartsim._core.mli.infrastructure.storage.dragon_feature_store import ( DragonFeatureStore, ) +from smartsim._core.mli.infrastructure.storage.dragon_util import create_ddict from smartsim._core.mli.infrastructure.worker.torch_worker import TorchWorker from smartsim.log import get_logger @@ -85,9 +86,15 @@ pass +@pytest.fixture(scope="module") +def the_storage() -> DDict: + """Fixture to instantiate a dragon distributed dictionary.""" + return create_ddict(1, 2, 4 * 1024**2) + + @pytest.mark.parametrize("num_iterations", [4]) def test_request_dispatcher( - msg_pump_factory: _MsgPumpFactory, num_iterations: int + msg_pump_factory: _MsgPumpFactory, num_iterations: int, the_storage: DDict ) -> None: """Test the request dispatcher batching and queueing system @@ -99,8 +106,7 @@ def test_request_dispatcher( to_worker_fli = fli.FLInterface(main_ch=to_worker_channel, manager_ch=None) to_worker_fli_comm_ch = DragonFLIChannel(to_worker_fli, sender_supplied=True) - ddict = DDict(1, 2, 4 * 1024**2) - backbone_fs = BackboneFeatureStore(ddict, allow_reserved_writes=True) + backbone_fs = BackboneFeatureStore(the_storage, allow_reserved_writes=True) # NOTE: env vars should be set prior to instantiating EnvironmentConfigLoader # or test environment may be unable to send messages w/queue diff --git a/tests/dragon/test_worker_manager.py b/tests/dragon/test_worker_manager.py index 132bb2110..a2df57f3b 100644 --- a/tests/dragon/test_worker_manager.py +++ b/tests/dragon/test_worker_manager.py @@ -31,6 +31,8 @@ import pytest +from smartsim._core.mli.infrastructure.storage.dragon_util import create_ddict + torch = pytest.importorskip("torch") dragon = pytest.importorskip("dragon") @@ -39,7 +41,6 @@ from smartsim._core.mli.infrastructure.storage.backbone_feature_store import ( BackboneFeatureStore, ) -from smartsim._core.mli.mli_schemas.tensor.tensor_capnp import OutputDescriptor try: mp.set_start_method("dragon") @@ -48,10 +49,8 @@ import os -import dragon.channels as dch import torch.nn as nn from dragon import fli -from dragon.data.ddict.ddict import DDict from smartsim._core.mli.comm.channel.dragon_fli import DragonFLIChannel from smartsim._core.mli.comm.channel.dragon_util import create_local @@ -256,14 +255,9 @@ def test_worker_manager(prepare_environment: pathlib.Path) -> None: mgr_per_node = 1 num_nodes = 2 - mem_per_node = 1024**3 - total_mem = num_nodes * mem_per_node 
+ mem_per_node = 128 * 1024**2 - storage = DDict( - managers_per_node=mgr_per_node, - n_nodes=num_nodes, - total_mem=total_mem, - ) + storage = create_ddict(num_nodes, mgr_per_node, mem_per_node) backbone = BackboneFeatureStore(storage, allow_reserved_writes=True) to_worker_channel = create_local() From 7ddcd7c7bca5e5bc02454c846a53cb6aaed2f5a4 Mon Sep 17 00:00:00 2001 From: Chris McBride Date: Fri, 27 Sep 2024 15:44:40 -0500 Subject: [PATCH 20/40] remove completed todos, fix docstrings, remove obsolete/commented code --- .../storage/dragon_feature_store.py | 2 - tests/dragon/test_dragon_ddict_utils.py | 2 +- tests/dragon/test_protoclient.py | 1 - tests/mli/test_default_torch_worker.py | 206 ------------------ 4 files changed, 1 insertion(+), 210 deletions(-) delete mode 100644 tests/mli/test_default_torch_worker.py diff --git a/smartsim/_core/mli/infrastructure/storage/dragon_feature_store.py b/smartsim/_core/mli/infrastructure/storage/dragon_feature_store.py index ecc232f21..7c640bab6 100644 --- a/smartsim/_core/mli/infrastructure/storage/dragon_feature_store.py +++ b/smartsim/_core/mli/infrastructure/storage/dragon_feature_store.py @@ -56,8 +56,6 @@ def __init__(self, storage: "dragon_ddict.DDict") -> None: else: descriptor = "not-set" - # todo: follow up and ensure this descriptor is also encoded/decoded - # in a string-safe way here & in `from_descriptor` super().__init__(descriptor) self._storage: t.Dict[str, t.Union[str, bytes]] = storage diff --git a/tests/dragon/test_dragon_ddict_utils.py b/tests/dragon/test_dragon_ddict_utils.py index 0df33e7a7..d2240abc1 100644 --- a/tests/dragon/test_dragon_ddict_utils.py +++ b/tests/dragon/test_dragon_ddict_utils.py @@ -43,7 +43,7 @@ @pytest.fixture(scope="module") def the_storage() -> dragon_ddict.DDict: - """Verify that a descriptor is created.""" + """Fixture to instantiate a dragon distributed dictionary.""" return dragon_util.create_ddict(1, 2, 3 * 1024**2) diff --git a/tests/dragon/test_protoclient.py b/tests/dragon/test_protoclient.py index 2e6d1dcc5..b871de267 100644 --- a/tests/dragon/test_protoclient.py +++ b/tests/dragon/test_protoclient.py @@ -144,7 +144,6 @@ def test_protoclient_timeout( elapsed = time.time() - start_time logger.info(f"ProtoClient timeout occurred in {elapsed} seconds") - # todo: should this trigger any wait if the backbone is set above? # confirm that we met our timeout assert ( elapsed >= backbone_timeout diff --git a/tests/mli/test_default_torch_worker.py b/tests/mli/test_default_torch_worker.py deleted file mode 100644 index b2ec6c3dc..000000000 --- a/tests/mli/test_default_torch_worker.py +++ /dev/null @@ -1,206 +0,0 @@ -# # BSD 2-Clause License -# # -# # Copyright (c) 2021-2024, Hewlett Packard Enterprise -# # All rights reserved. -# # -# # Redistribution and use in source and binary forms, with or without -# # modification, are permitted provided that the following conditions are met: -# # -# # 1. Redistributions of source code must retain the above copyright notice, this -# # list of conditions and the following disclaimer. -# # -# # 2. Redistributions in binary form must reproduce the above copyright notice, -# # this list of conditions and the following disclaimer in the documentation -# # and/or other materials provided with the distribution. -# # -# # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -# # AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -# # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -# # DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE -# # FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -# # DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -# # SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -# # CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -# # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -# import io -# import pathlib -# import typing as t - -# import pytest -# import torch - -# from smartsim._core.mli.infrastructure.worker.integratedtorchworker import ( -# IntegratedTorchWorker, -# ) -# import smartsim.error as sse -# from smartsim._core.mli.infrastructure import MemoryFeatureStore -# from smartsim._core.mli.infrastructure.worker.worker import ( -# ExecuteResult, -# FetchInputResult, -# FetchModelResult, -# InferenceRequest, -# TransformInputResult, -# LoadModelResult, -# ) -# from smartsim._core.utils import installed_redisai_backends - -# # The tests in this file belong to the group_a group -# pytestmark = pytest.mark.group_b - -# # retrieved from pytest fixtures -# is_dragon = pytest.test_launcher == "dragon" -# torch_available = "torch" in installed_redisai_backends() - - -# @pytest.fixture -# def persist_torch_model(test_dir: str) -> pathlib.Path: -# test_path = pathlib.Path(test_dir) -# model_path = test_path / "basic.pt" - -# model = torch.nn.Linear(2, 1) -# torch.save(model, model_path) - -# return model_path - - -# # def test_deserialize() -> None: -# # """Verify that serialized requests are properly deserialized to -# # and converted to the internal representation used by ML workers""" -# # worker = SampleTorchWorker -# # buffer = io.BytesIO() - -# # exp_model_key = "model-key" -# # msg = InferenceRequest(model_key=exp_model_key) -# # pickle.dump(msg, buffer) - -# # deserialized: InferenceRequest = worker.deserialize(buffer.getvalue()) - -# # assert deserialized.model_key == exp_model_key -# # # assert deserialized.backend == exp_backend - - -# @pytest.mark.skipif(not torch_available, reason="Torch backend is not installed") -# def test_load_model_from_disk(persist_torch_model: pathlib.Path) -> None: -# """Verify that a model can be loaded using a FileSystemFeatureStore""" -# worker = IntegratedTorchWorker -# request = InferenceRequest(raw_model=persist_torch_model.read_bytes()) - -# fetch_result = FetchModelResult(persist_torch_model.read_bytes()) -# load_result = worker.load_model(request, fetch_result) - -# input = torch.randn(2) -# pred = load_result.model(input) - -# assert pred - - -# @pytest.mark.skipif(not torch_available, reason="Torch backend is not installed") -# def test_transform_input() -> None: -# """Verify that the default input transform operation is a no-op copy""" -# rows, cols = 1, 4 -# num_values = 7 -# tensors = [torch.randn((rows, cols)) for _ in range(num_values)] - -# request = InferenceRequest() - -# inputs: t.List[bytes] = [] -# for tensor in tensors: -# buffer = io.BytesIO() -# torch.save(tensor, buffer) -# inputs.append(buffer.getvalue()) - -# fetch_result = FetchInputResult(inputs) -# worker = IntegratedTorchWorker -# result = worker.transform_input(request, fetch_result) -# transformed: t.Collection[torch.Tensor] = result.transformed - -# assert len(transformed) == num_values - -# for output, expected in zip(transformed, tensors): -# assert output.shape == expected.shape -# assert 
output.equal(expected) - -# transformed = list(transformed) - -# original: torch.Tensor = tensors[0] -# assert transformed[0].equal(original) - -# # verify a copy was made -# transformed[0] = 2 * transformed[0] -# assert transformed[0].equal(2 * original) - - -# @pytest.mark.skipif(not torch_available, reason="Torch backend is not installed") -# def test_execute_model(persist_torch_model: pathlib.Path) -> None: -# """Verify that a model executes corrrectly via the worker""" - -# # put model bytes into memory -# model_name = "test-key" -# feature_store = MemoryFeatureStore() -# feature_store[model_name] = persist_torch_model.read_bytes() - -# worker = IntegratedTorchWorker -# request = InferenceRequest(model_key=model_name) -# fetch_result = FetchModelResult(persist_torch_model.read_bytes()) -# load_result = worker.load_model(request, fetch_result) - -# value = torch.randn(2) -# transform_result = TransformInputResult([value]) - -# execute_result = worker.execute(request, load_result, transform_result) - -# assert execute_result.predictions is not None - - -# @pytest.mark.skipif(not torch_available, reason="Torch backend is not installed") -# def test_execute_missing_model(persist_torch_model: pathlib.Path) -> None: -# """Verify that a executing a model with an invalid key fails cleanly""" - -# # use key that references an un-set model value -# model_name = "test-key" -# feature_store = MemoryFeatureStore() -# feature_store[model_name] = persist_torch_model.read_bytes() - -# worker = IntegratedTorchWorker -# request = InferenceRequest(input_keys=[model_name]) - -# load_result = LoadModelResult(None) -# transform_result = TransformInputResult( -# [torch.randn(2), torch.randn(2), torch.randn(2)] -# ) - -# with pytest.raises(sse.SmartSimError) as ex: -# worker.execute(request, load_result, transform_result) - -# assert "Model must be loaded" in ex.value.args[0] - - -# @pytest.mark.skipif(not torch_available, reason="Torch backend is not installed") -# def test_transform_output() -> None: -# """Verify that the default output transform operation is a no-op copy""" -# rows, cols = 1, 4 -# num_values = 7 -# inputs = [torch.randn((rows, cols)) for _ in range(num_values)] -# exp_outputs = [torch.Tensor(tensor) for tensor in inputs] - -# worker = SampleTorchWorker -# request = InferenceRequest() -# exec_result = ExecuteResult(inputs) - -# result = worker.transform_output(request, exec_result) - -# assert len(result.outputs) == num_values - -# for output, expected in zip(result.outputs, exp_outputs): -# assert output.shape == expected.shape -# assert output.equal(expected) - -# transformed = list(result.outputs) - -# # verify a copy was made -# original: torch.Tensor = inputs[0] -# transformed[0] = 2 * transformed[0] - -# assert transformed[0].equal(2 * original) From 762937c8017961a66ce33a95eb0aaaf60dd3501d Mon Sep 17 00:00:00 2001 From: Chris McBride Date: Mon, 30 Sep 2024 22:46:14 -0500 Subject: [PATCH 21/40] extract notify listener from dragon backend, fix dragon import order, fix --- smartsim/_core/entrypoints/service.py | 48 ++- .../_core/launcher/dragon/dragonBackend.py | 109 +++--- .../infrastructure/control/event_listener.py | 318 ++++++++++++++++++ .../storage/backbone_feature_store.py | 44 ++- .../storage/dragon_feature_store.py | 13 +- .../infrastructure/storage/feature_store.py | 4 +- tests/dragon/test_dragon_backend.py | 294 +++++++++++++--- tests/dragon/test_error_handling.py | 3 +- tests/dragon/test_featurestore.py | 3 +- tests/dragon/test_worker_manager.py | 10 +- 10 files changed, 
704 insertions(+), 142 deletions(-) create mode 100644 smartsim/_core/mli/infrastructure/control/event_listener.py diff --git a/smartsim/_core/entrypoints/service.py b/smartsim/_core/entrypoints/service.py index 6b4ef74b6..27d541312 100644 --- a/smartsim/_core/entrypoints/service.py +++ b/smartsim/_core/entrypoints/service.py @@ -40,14 +40,22 @@ class Service(ABC): hooks for status changes""" def __init__( - self, as_service: bool = False, cooldown: int = 0, loop_delay: int = 0 + self, + as_service: bool = False, + cooldown: int = 0, + loop_delay: int = 0, + health_check_frequency: float = 0, ) -> None: """Initialize the ServiceHost + :param as_service: Determines if the host will run until shutdown criteria are met or as a run-once instance :param cooldown: Period of time to allow service to run before automatic shutdown, in seconds. A non-zero, positive integer. - :param loop_delay: delay between iterations of the event loop""" + :param loop_delay: Delay between iterations of the event loop (in seconds) + :param health_check_frequency: Delay between calls to a + health check handler (in seconds) + """ self._as_service = as_service """If the service should run until shutdown function returns True""" self._cooldown = abs(cooldown) @@ -55,6 +63,11 @@ def __init__( before shutdown""" self._loop_delay = abs(loop_delay) """Forced delay between iterations of the event loop""" + self._health_check_frequency = health_check_frequency + """The time (in seconds) between desired health checks. A health check + frequency of zero will never trigger the health check.""" + self._last_health_check = time.time() + """The timestamp of the latest health check""" @abstractmethod def _on_iteration(self) -> None: @@ -76,6 +89,11 @@ def _on_shutdown(self) -> None: the main event loop during automatic shutdown.""" logger.debug(f"Shutting down {self.__class__.__name__}") + def _on_health_check(self) -> None: + """Empty hook method for use by subclasses. Invoked based on the + value of `self._health_check_frequency`.""" + logger.debug(f"Performing health check for {self.__class__.__name__}") + def _on_cooldown_elapsed(self) -> None: """Empty hook method for use by subclasses. Called on every event loop iteration immediately upon exceeding the cooldown period""" @@ -98,13 +116,30 @@ def execute(self) -> None: """The main event loop of a service host. Evaluates shutdown criteria and combines with a cooldown period to allow automatic service termination. 
Responsible for executing calls to subclass implementation of `_on_iteration`""" - self._on_start() + + try: + self._on_start() + except Exception: + logger.exception("Unable to start service.") + return running = True cooldown_start: t.Optional[datetime.datetime] = None while running: - self._on_iteration() + try: + self._on_iteration() + except Exception: + running = False + logger.exception( + "Failure in event loop resulted in service termination" + ) + + if self._health_check_frequency > 0: + hc_elapsed = time.time() - self._last_health_check + if hc_elapsed >= self._health_check_frequency: + self._on_health_check() + self._last_health_check = time.time() # allow immediate shutdown if not set to run as a service if not self._as_service: @@ -133,4 +168,7 @@ def execute(self) -> None: self._on_delay() time.sleep(self._loop_delay) - self._on_shutdown() + try: + self._on_shutdown() + except Exception: + logger.exception("Service shutdown may not have completed.") diff --git a/smartsim/_core/launcher/dragon/dragonBackend.py b/smartsim/_core/launcher/dragon/dragonBackend.py index 0c172365a..fa28f8690 100644 --- a/smartsim/_core/launcher/dragon/dragonBackend.py +++ b/smartsim/_core/launcher/dragon/dragonBackend.py @@ -27,6 +27,7 @@ import functools import itertools import os +import socket import time import typing as t from dataclasses import dataclass, field @@ -47,16 +48,15 @@ import dragon.native.machine as dragon_machine from smartsim._core.launcher.dragon.pqueue import NodePrioritizer, PrioritizerFilter -from smartsim._core.mli.comm.channel.dragon_channel import DragonCommChannel -from smartsim._core.mli.comm.channel.dragon_util import create_local +from smartsim._core.mli.infrastructure.control.event_listener import ( + ConsumerRegistrationListener, +) from smartsim._core.mli.infrastructure.storage.backbone_feature_store import ( BackboneFeatureStore, - EventBase, EventCategory, - EventConsumer, - OnCreateConsumer, ) from smartsim._core.mli.infrastructure.storage.dragon_util import create_ddict +from smartsim.error.errors import SmartSimError # pylint: enable=import-error # isort: on @@ -199,11 +199,9 @@ def __init__(self, pid: int) -> None: """Time in seconds needed by the server to complete shutdown""" self._backbone: t.Optional[BackboneFeatureStore] = None """The backbone feature store""" - self._event_consumer: t.Optional[EventConsumer] = None - """A consumer registered to listen for new consumers and update the shared - consumer registrations list""" + self._listener: t.Optional[dragon_process.Process] = None + """The standalone process executing the event consumer""" - """An event consumer for receiving events from MLI resources""" self._nodes: t.List["dragon_machine.Node"] = [] """Node capability information for hosts in the allocation""" self._hosts: t.List[str] = [] @@ -573,20 +571,6 @@ def _create_backbone(self) -> BackboneFeatureStore: return self._backbone - def _on_consumer_created(self, event: EventBase) -> None: - """Event handler for updating the backbone when new event consumers - are registered. 
-
-        :param event: The event that was received
-        """
-        if isinstance(event, OnCreateConsumer) and self._backbone is not None:
-            notify_list = set(self._backbone.notification_channels)
-            notify_list.add(event.descriptor)
-            self._backbone.notification_channels = list(notify_list)
-            return
-
-        logger.warning(f"Unhandled event received: {event}")
-
     @staticmethod
     def _initialize_cooldown() -> int:
         """Load environment configuration and determine the correct cooldown
@@ -601,47 +585,38 @@
             else 5
         )
 
-    def _create_eventing(self, backbone: BackboneFeatureStore) -> EventConsumer:
-        """
-        Create an event publisher and event consumer for communicating with
-        other MLI resources.
-
-        :param backbone: The backbone feature store used by the MLI backend.
-
-        NOTE: the backbone must be initialized before connecting to eventing clients.
-
-        :returns: The newly created EventConsumer instance
-        """
-
-        if self._event_consumer is None:
-            logger.info("Creating event consumer")
-            dragon_channel = create_local(500)
-            event_channel = DragonCommChannel(dragon_channel)
-            consumer = EventConsumer(
-                event_channel,
-                backbone,
-                [EventCategory.CONSUMER_CREATED],
-                name="BackendConsumerRegistrar",
-                event_handler=self._on_consumer_created,
-            )
-
-            self._event_consumer = consumer
-            backbone[BackboneFeatureStore.MLI_BACKEND_CONSUMER] = consumer.descriptor
-            logger.info(f"Backend consumer `{consumer.name}` created.")
-
-        return self._event_consumer
-
-    def listen_to_registrations(self, timeout: float = 0.001) -> None:
-        """Execute the listener for registration events.
+    def start_event_listener(
+        self, cpu_affinity: list[int], gpu_affinity: list[int]
+    ) -> dragon_process.Process:
+        """Start a standalone process hosting a `ConsumerRegistrationListener`
+        that maintains the list of registered event consumers.
+
+        :param cpu_affinity: The CPU affinity to use for the listener process
+        :param gpu_affinity: The GPU affinity to use for the listener process
+        :returns: The dragon process running the listener
+        :raises SmartSimError: If the backbone is not yet created
+        """
+        if self._backbone is None:
+            raise SmartSimError("Backbone feature store is not available")
 
-    :param timeout: Maximum time to wait (in seconds) for a new event"""
-        if self._event_consumer is not None:
-            self._event_consumer.listen_once(timeout)
+        service = ConsumerRegistrationListener(
+            self._backbone, 1.0, 2.0, [EventCategory.CONSUMER_CREATED], True
+        )
 
-    @staticmethod
-    def _start_eventing_listeners() -> None:
-        # todo: start external listener entrypoint
-        ...
+ options = dragon_process_desc.ProcessOptions(make_inf_channels=True) + local_policy = dragon_policy.Policy( + placement=dragon_policy.Policy.Placement.HOST_NAME, + host_name=socket.gethostname(), + cpu_affinity=cpu_affinity, + gpu_affinity=gpu_affinity, + ) + process = dragon_process.Process( + target=service.execute, + args=[], + cwd=os.getcwd(), + env={ + **os.environ, + **(self._backbone.get_env() if self._backbone is not None else {}), + }, + policy=local_policy, + options=options, + stderr=dragon_process.Popen.STDOUT, + stdout=dragon_process.Popen.STDOUT, + ) + process.start() + return process @staticmethod def create_run_policy( @@ -684,8 +659,6 @@ def create_run_policy( def _start_steps(self) -> None: self._heartbeat() - backbone = self._create_backbone() - self._create_eventing(backbone) with self._queue_lock: started = [] @@ -713,7 +686,7 @@ def _start_steps(self) -> None: env={ **request.current_env, **request.env, - **backbone.get_env(), + **(self._backbone.get_env() if self._backbone else {}), }, stdout=dragon_process.Popen.PIPE, stderr=dragon_process.Popen.PIPE, @@ -869,8 +842,7 @@ def _should_print_status(self) -> bool: def _update(self) -> None: """Trigger all update queries and update local state database""" - backbone = self._create_backbone() - self._create_eventing(backbone) + self._create_backbone() self._stop_steps() self._start_steps() @@ -879,6 +851,9 @@ def _update(self) -> None: def _kill_all_running_jobs(self) -> None: with self._queue_lock: + if self._listener and self._listener.is_alive: + self._listener.kill() + for step_id, group_info in self._group_infos.items(): if group_info.status not in TERMINAL_STATUSES: self._stop_requests.append(DragonStopRequest(step_id=step_id)) diff --git a/smartsim/_core/mli/infrastructure/control/event_listener.py b/smartsim/_core/mli/infrastructure/control/event_listener.py new file mode 100644 index 000000000..03d7b1ceb --- /dev/null +++ b/smartsim/_core/mli/infrastructure/control/event_listener.py @@ -0,0 +1,318 @@ +# BSD 2-Clause License +# +# Copyright (c) 2021-2024, Hewlett Packard Enterprise +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
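+
+# A minimal usage sketch for `DragonBackend.start_event_listener` from the
+# preceding diff, assuming the backend has already created its backbone;
+# empty affinity lists accept the default local placement:
+#
+#   proc = backend.start_event_listener(cpu_affinity=[], gpu_affinity=[])
+#   # the listener publishes its channel descriptor once start-up completes
+#   backend._backbone.wait_for([BackboneFeatureStore.MLI_BACKEND_CONSUMER],
+#                              timeout=30)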
+
+# isort: off
+# pylint: disable=import-error
+# pylint: disable=unused-import
+import dragon

+# from dragon.globalservices.api_setup import connect_to_infrastructure


+# pylint: enable=unused-import
+# pylint: enable=import-error
+# isort: on
+
+import argparse
+import multiprocessing as mp
+import os
+import sys
+import typing as t
+
+from smartsim._core.entrypoints.service import Service
+from smartsim._core.mli.comm.channel.dragon_channel import DragonCommChannel
+from smartsim._core.mli.comm.channel.dragon_util import create_local
+from smartsim._core.mli.infrastructure.storage.backbone_feature_store import (
+    BackboneFeatureStore,
+    EventBase,
+    EventCategory,
+    EventConsumer,
+    OnCreateConsumer,
+)
+from smartsim.error.errors import SmartSimError
+from smartsim.log import get_logger
+
+logger = get_logger(__name__)
+
+
+class ConsumerRegistrationListener(Service):
+    """A long-running service that listens for events of a specific type
+    and executes the appropriate event handler."""
+
+    def __init__(
+        self,
+        backbone: BackboneFeatureStore,
+        timeout: float,
+        batch_timeout: float,
+        event_filters: t.List[EventCategory],
+        as_service: bool = False,
+        cooldown: int = 0,
+        health_check_frequency: float = 60.0,
+    ) -> None:
+        """Initialize the ConsumerRegistrationListener.
+
+        :param backbone: The backbone feature store
+        :param timeout: Maximum time (in seconds) to allow a single recv request to wait
+        :param batch_timeout: Maximum time (in seconds) to allow a batch of receives to
+        continue to build
+        :param event_filters: Filters specifying the message types to handle
+        :param as_service: Specifies run-once or run-until-complete behavior of service
+        :param cooldown: Number of seconds to wait before shutting down after
+        shutdown criteria are met
+        :param health_check_frequency: Time (in seconds) between health check invocations
+        """
+        super().__init__(
+            as_service, cooldown, health_check_frequency=health_check_frequency
+        )
+
+        self._timeout = timeout
+        """Maximum time (in seconds) to allow a single recv request to wait"""
+
+        self._batch_timeout = batch_timeout
+        """Maximum time (in seconds) to allow a batch of receives to
+        continue to build"""
+
+        self._filters = event_filters
+        """Filters specifying the message types to handle"""
+
+        self._consumer: t.Optional[EventConsumer] = None
+        """The event consumer that handles receiving events"""
+
+        self._backbone = backbone
+        """A standalone, system-created feature store used to share internal
+        information among MLI components"""
+
+    def _on_start(self) -> None:
+        """Called on initial entry into Service `execute` event loop before
+        `_on_iteration` is invoked."""
+        super()._on_start()
+        self._create_eventing()
+
+    def _on_shutdown(self) -> None:
+        """Release dragon resources. Called immediately after exiting
+        the main event loop during automatic shutdown."""
+        super()._on_shutdown()
+
+        # unregister this listener in the backbone
+        self._backbone.pop(BackboneFeatureStore.MLI_BACKEND_CONSUMER)
+
+    def _on_iteration(self) -> None:
+        """Listen for a single batch of events and dispatch them
+        to the registered event handler."""
+
+        if self._consumer is None:
+            logger.info("Unable to listen. No consumer available.")
+            return
+
+        self._consumer.listen_once(self._timeout, self._batch_timeout)
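+
+    # A sketch of the lifecycle the inherited `Service.execute` loop drives,
+    # assuming constructor arguments like those documented above:
+    #
+    #   listener = ConsumerRegistrationListener(
+    #       backbone, timeout=1.0, batch_timeout=2.0,
+    #       event_filters=[EventCategory.CONSUMER_CREATED], as_service=True,
+    #   )
+    #   listener.execute()  # _on_start, repeated _on_iteration calls (plus
+    #                       # periodic _on_health_check), then _on_shutdown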
+
+    def _can_shutdown(self) -> bool:
+        """Determines if the event consumer is ready to stop listening.
+
+        :returns: True when criteria to shutdown the service are met, False otherwise
+        """
+
+        if self._backbone is None:
+            logger.info("Listener must shutdown: no backbone attached")
+            return True
+
+        if self._consumer is None:
+            logger.info("Listener must shutdown: no consumer channel created")
+            return True
+
+        if not self._consumer.listening:
+            logger.info("Listener can shutdown: consumer is not listening")
+            return True
+
+        return False
+
+    def _on_event_received(self, event: EventBase) -> None:
+        """Event handler for updating the backbone when new event consumers
+        are registered.
+
+        :param event: The event that was received
+        """
+        if self._backbone is None:
+            logger.info("Unable to handle event. Backbone is missing.")
+
+        if not isinstance(event, OnCreateConsumer):
+            logger.info(
+                "Consumer registration listener received an "
+                f"unexpected event: {event=}"
+            )
+            return
+
+        notify_list = set(self._backbone.notification_channels)
+        notify_list.add(event.descriptor)
+        self._backbone.notification_channels = list(notify_list)
+
+    def _on_health_check(self) -> None:
+        """Check if this consumer has been replaced by a new listener
+        and automatically trigger a shutdown. Invoked based on the
+        value of `self._health_check_frequency`."""
+        super()._on_health_check()
+
+        try:
+            logger.debug("Retrieving registered listener descriptor")
+            descriptor = self._backbone[BackboneFeatureStore.MLI_BACKEND_CONSUMER]
+        except KeyError:
+            descriptor = None
+            if self._consumer:
+                self._consumer.listening = False
+
+        if self._consumer and descriptor != self._consumer.descriptor:
+            logger.warning(
+                "This listener is no longer registered. It "
+                "will automatically shut down."
+            )
+            self._consumer.listening = False
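+
+    # A sketch of the replacement detection the health check performs,
+    # assuming a running listener and a writable backbone:
+    #
+    #   backbone[BackboneFeatureStore.MLI_BACKEND_CONSUMER] = new_descriptor
+    #   # on its next health check the original listener sees a descriptor
+    #   # other than its own, stops listening, and `_can_shutdown` ends it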
+
+    def _publish_consumer(self) -> None:
+        """Publish the consumer descriptor to the backbone."""
+        if self._consumer is None:
+            logger.warning("No consumer descriptor available to publisher")
+            return
+
+        self._backbone[BackboneFeatureStore.MLI_BACKEND_CONSUMER] = (
+            self._consumer.descriptor
+        )
+
+    def _create_eventing(self) -> EventConsumer:
+        """
+        Create an event publisher and event consumer for communicating with
+        other MLI resources.
+
+        NOTE: `self._backbone` must be initialized before connecting eventing clients.
+
+        :returns: The newly created EventConsumer instance
+        """
+
+        if self._consumer:
+            return self._consumer
+
+        logger.info("Creating event consumer")
+
+        dragon_channel = create_local(500)
+        event_channel = DragonCommChannel(dragon_channel)
+
+        if not event_channel.descriptor:
+            raise SmartSimError(
+                "Unable to generate the descriptor for the event channel"
+            )
+
+        self._consumer = EventConsumer(
+            event_channel,
+            self._backbone,
+            self._filters,
+            name="BackendConsumerRegistrar",
+            event_handler=self._on_event_received,
+        )
+        self._publish_consumer()
+
+        logger.info(
+            f"Backend consumer `{self._consumer.name}` created: "
+            f"{self._consumer.descriptor}"
+        )
+
+        return self._consumer
+
+
+def _create_parser() -> argparse.ArgumentParser:
+    """
+    Create an argument parser that contains the arguments
+    required to start the listener as a new process:
+
+    --timeout
+    --batch_timeout
+    --categories
+
+    :returns: A configured parser
+    """
+    arg_parser = argparse.ArgumentParser(prog="ConsumerRegistrarEventListener")
+
+    category_default = EventCategory.CONSUMER_CREATED
+
+    arg_parser.add_argument("--timeout", type=float, default=1.0)
+    arg_parser.add_argument("--batch_timeout", type=float, default=1.0)
+    arg_parser.add_argument("--categories", type=str, default=category_default)
+
+    return arg_parser
+
+
+def _connect_backbone() -> t.Optional[BackboneFeatureStore]:
+    """
+    Load the backbone by retrieving the descriptor from environment variables.
+
+    :returns: The backbone feature store, or `None` when no descriptor
+        is set in the environment
+    """
+    descriptor = os.environ.get(BackboneFeatureStore.MLI_BACKBONE, "")
+
+    if not descriptor:
+        return None
+
+    logger.info(f"Listener backbone descriptor: {descriptor}")
+
+    # `from_writable_descriptor` ensures we can update the backbone
+    return BackboneFeatureStore.from_writable_descriptor(descriptor)
+
+
+if __name__ == "__main__":
+    mp.set_start_method("dragon")
+
+    parser = _create_parser()
+
+    args = parser.parse_args()
+    user_filters: t.List[EventCategory] = list(args.categories.split(","))
+
+    backbone_fs = _connect_backbone()
+
+    if backbone_fs is None:
+        logger.error(
+            "Unable to attach to the backbone without the "
+            f"`{BackboneFeatureStore.MLI_BACKBONE}` environment variable."
+        )
+        sys.exit(1)
+
+    logger.debug(f"Listener attached to backbone: {backbone_fs.descriptor}")
+
+    listener = ConsumerRegistrationListener(
+        backbone_fs,
+        float(args.timeout),
+        float(args.batch_timeout),
+        user_filters,
+        as_service=True,
+    )
+
+    logger.info(f"Listener created: {listener}")
+
+    try:
+        listener.execute()
+        sys.exit(0)
+    except Exception:
+        logger.exception("An error occurred in the event listener")
+        sys.exit(1)
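+
+# An invocation sketch for this entrypoint; it assumes the environment
+# variable named by `BackboneFeatureStore.MLI_BACKBONE` was exported by the
+# backend, and the launcher command shown is illustrative:
+#
+#   dragon python -m smartsim._core.mli.infrastructure.control.event_listener \
+#       --timeout 1.0 --batch_timeout 1.0 --categories consumer-created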
{listener}") + + try: + listener.execute() + sys.exit(0) + except Exception: + logger.exception("An error occurred in the event listener") + sys.exit(1) diff --git a/smartsim/_core/mli/infrastructure/storage/backbone_feature_store.py b/smartsim/_core/mli/infrastructure/storage/backbone_feature_store.py index 9fcf490e4..ffeb917a9 100644 --- a/smartsim/_core/mli/infrastructure/storage/backbone_feature_store.py +++ b/smartsim/_core/mli/infrastructure/storage/backbone_feature_store.py @@ -76,8 +76,7 @@ def __init__( super().__init__(storage) self._enable_reserved_writes = allow_reserved_writes - if self._CREATED_ON not in self: - self._record_creation_data() + self._record_creation_data() @property def wait_timeout(self) -> float: @@ -114,7 +113,9 @@ def notification_channels(self, values: t.Sequence[str]) -> None: :param values: The list of channel descriptors to save """ - self[self.MLI_NOTIFY_CONSUMERS] = ",".join([str(value) for value in values]) + self[self.MLI_NOTIFY_CONSUMERS] = ",".join( + [str(value) for value in values if value] + ) @property def backend_channel(self) -> t.Optional[str]: @@ -198,7 +199,8 @@ def _check_wait_timeout( elapsed = time.time() - start_time if timeout and elapsed > timeout: raise SmartSimError( - f"Backbone {self.descriptor=} timeout retrieving all keys: {indicators}" + f"Backbone {self.descriptor=} timeout after {elapsed} " + f"seconds retrieving keys: {indicators}" ) def wait_for( @@ -260,6 +262,8 @@ class EventCategory(str, enum.Enum): """Event category for an event raised when a new consumer is created""" FEATURE_STORE_WRITTEN: str = "feature-store-written" """Event category for an event raised when a feature store key is written""" + SHUTDOWN: str = "shutdown" + """Event category for an event that should trigger the listener to shutdown""" @dataclass @@ -288,6 +292,14 @@ def __str__(self) -> str: return f"{self.uid}|{self.category}" +class OnShutdownRequested(EventBase): + """Publish this event to trigger the listener to shutdown.""" + + def __init__(self) -> None: + """Initialize the OnShutdownRequest event.""" + super().__init__(EventCategory.SHUTDOWN, str(uuid.uuid4())) + + class OnCreateConsumer(EventBase): """Publish this event when a new event consumer registration is required.""" @@ -593,6 +605,7 @@ def __init__( self._global_filters = filters or [] self._name = name self._event_handler = event_handler + self.listening = True @property def descriptor(self) -> str: @@ -696,7 +709,10 @@ def register(self) -> None: logger.warning("Unable to register. No registrar channel found.") def listen_once(self, timeout: float = 0.001, batch_timeout: float = 1.0) -> None: - """Receives messages for the consumer a single time. + """Receives messages for the consumer a single time. Delivers + all messages that pass the consumer filters. Shutdown requests + are handled by a default event handler. + NOTE: Executes a single batch-retrieval to receive the maximum number of messages available under batch timeout. 
+
+
 class OnCreateConsumer(EventBase):
     """Publish this event when a new event consumer registration is required."""
 
@@ -593,6 +605,7 @@ def __init__(
         self._global_filters = filters or []
         self._name = name
         self._event_handler = event_handler
+        self.listening = True
+        """Flag indicating the consumer should continue listening for events"""
 
     @property
     def descriptor(self) -> str:
@@ -696,7 +709,10 @@ def register(self) -> None:
             logger.warning("Unable to register. No registrar channel found.")
 
     def listen_once(self, timeout: float = 0.001, batch_timeout: float = 1.0) -> None:
-        """Receives messages for the consumer a single time.
+        """Receives messages for the consumer a single time. Delivers
+        all messages that pass the consumer filters. Shutdown requests
+        are handled by a default event handler.
+
         NOTE: Executes a single batch-retrieval to receive the maximum
         number of messages available under batch timeout. To continually
@@ -715,5 +731,23 @@ def listen_once(self, timeout: float = 0.001, batch_timeout: float = 1.0) -> None
 
         for message in incoming_messages:
             logger.debug(f"Sending event {message=} to handler.")
+            self._handle_shutdown(message)
             if self._event_handler:
                 self._event_handler(message)
+
+    def _handle_shutdown(self, event: EventBase) -> None:
+        """Handles shutdown requests sent to the consumer by setting the
+        `self.listening` property to `False`."""
+        if isinstance(event, OnShutdownRequested):
+            self.listening = False
+
+    def listen(self, timeout: float = 0.001, batch_timeout: float = 1.0) -> None:
+        """Receives messages for the consumer until a shutdown request is received.
+
+        :param timeout: Maximum time to wait (in seconds) for a message to arrive
+        :param batch_timeout: Maximum time to wait (in seconds) for a batch to arrive
+        """
+        self.listening = True
+
+        while self.listening:
+            self.listen_once(timeout, batch_timeout)
diff --git a/smartsim/_core/mli/infrastructure/storage/dragon_feature_store.py b/smartsim/_core/mli/infrastructure/storage/dragon_feature_store.py
index 7c640bab6..c8c85623f 100644
--- a/smartsim/_core/mli/infrastructure/storage/dragon_feature_store.py
+++ b/smartsim/_core/mli/infrastructure/storage/dragon_feature_store.py
@@ -68,7 +68,7 @@ def _get(self, key: str) -> t.Union[str, bytes]:
         """
         try:
             return self._storage[key]
-        except KeyError as e:
+        except dragon_ddict.DDictKeyError as e:
             raise KeyError(f"Key not found in FeatureStore: {key}") from e
 
     def _set(self, key: str, value: t.Union[str, bytes]) -> None:
@@ -88,6 +88,17 @@ def _contains(self, key: str) -> bool:
         """
         return key in self._storage
 
+    def pop(self, key: str) -> t.Union[str, bytes, None]:
+        """Remove the value from the dictionary and return it.
+
+        :param key: Dictionary key identifying the item to remove
+        :returns: The value held at the key if it exists, otherwise `None`
+        """
+        try:
+            return self._storage.pop(key)
+        except dragon_ddict.DDictKeyError:
+            return None
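+
+    # A usage sketch for `pop`, assuming an attached DragonFeatureStore `fs`:
+    #
+    #   value = fs.pop("ephemeral-key")  # the stored value, or None if absent
+    #   fs["missing-key"]                # raises KeyError for unknown keys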
+
     @classmethod
     def from_descriptor(
         cls,
diff --git a/smartsim/_core/mli/infrastructure/storage/feature_store.py b/smartsim/_core/mli/infrastructure/storage/feature_store.py
index 8c85a352d..260b1a337 100644
--- a/smartsim/_core/mli/infrastructure/storage/feature_store.py
+++ b/smartsim/_core/mli/infrastructure/storage/feature_store.py
@@ -147,8 +147,8 @@ def __getitem__(self, key: str) -> t.Union[str, bytes]:
         """
         try:
             return self._get(key)
-        except KeyError as ex:
-            raise SmartSimError(f"An unknown key was requested: {key}") from ex
+        except KeyError:
+            raise
         except Exception as ex:
             # note: explicitly avoid round-trip to check for key existence
             raise SmartSimError(
diff --git a/tests/dragon/test_dragon_backend.py b/tests/dragon/test_dragon_backend.py
index b56a92c5b..003e27e8c 100644
--- a/tests/dragon/test_dragon_backend.py
+++ b/tests/dragon/test_dragon_backend.py
@@ -25,6 +25,8 @@
 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 import os
+import time
+import uuid
 
 import pytest
 
@@ -33,12 +35,13 @@
 
 from smartsim._core.launcher.dragon.dragonBackend import DragonBackend
 from smartsim._core.mli.comm.channel.dragon_channel import DragonCommChannel
+from smartsim._core.mli.infrastructure.control.event_listener import (
+    ConsumerRegistrationListener,
+)
 from smartsim._core.mli.infrastructure.storage.backbone_feature_store import (
     BackboneFeatureStore,
-    EventBase,
-    EventBroadcaster,
-    EventConsumer,
     OnCreateConsumer,
+    OnShutdownRequested,
 )
 from smartsim.log import get_logger
 
@@ -47,71 +50,258 @@
 logger = get_logger(__name__)
 
 
-def test_dragonbackend_listener_boostrapping(monkeypatch: pytest.MonkeyPatch):
-    """Verify that the dragon backend registration channel correctly
-    registers new consumers in the backbone and begins sending events
-    to the new consumers."""
+def test_dragonbackend_start_listener():
+    """Verify the background process listening to consumer registration events
+    is up and processing messages as expected."""
+    backend = DragonBackend(pid=9999)
+
+    # We need to let the backend create the backbone to continue
+    backbone = backend._create_backbone()
+    backbone.pop(BackboneFeatureStore.MLI_BACKEND_CONSUMER)
+
+    os.environ[BackboneFeatureStore.MLI_BACKBONE] = backbone.descriptor
+
+    with pytest.raises(KeyError) as ex:
+        # we expect the value of the consumer to be empty until
+        # the listener start-up completes.
+        backbone[BackboneFeatureStore.MLI_BACKEND_CONSUMER]
+
+    assert "not found" in ex.value.args[0]
+
+    drg_process = backend.start_event_listener(cpu_affinity=[], gpu_affinity=[])
+
+    # confirm there is a process still running
+    logger.info(f"Dragon process started: {drg_process}")
+    assert drg_process is not None, "Backend was unable to start event listener"
+    assert drg_process.puid != 0, "Process unique ID is empty"
+    assert drg_process.returncode is None, "Listener terminated early"
+
+    # wait for the event listener to come up
+    try:
+        config = backbone.wait_for(
+            [BackboneFeatureStore.MLI_BACKEND_CONSUMER], timeout=30
+        )
+        # verify result was in the returned configuration map
+        assert config[BackboneFeatureStore.MLI_BACKEND_CONSUMER]
+    except Exception:
+        raise KeyError(
+            f"Unable to locate {BackboneFeatureStore.MLI_BACKEND_CONSUMER} "
+            "in the backbone"
+        )
+
+    # wait_for ensures the normal retrieval will now work, error-free
+    descriptor = backbone[BackboneFeatureStore.MLI_BACKEND_CONSUMER]
+    assert descriptor is not None
+
+    # attach to the registrar channel and simulate a consumer registration
+    comm_channel = DragonCommChannel.from_descriptor(descriptor)
+    mock_descriptor = str(uuid.uuid4())
+    event = OnCreateConsumer(mock_descriptor, [])
+
+    event_bytes = bytes(event)
+    comm_channel.send(event_bytes)
+
+    subscriber_list = []
+
+    # Give the channel time to write the message and the listener time to handle it
+    for i in range(20):
+        time.sleep(1)
+        # Retrieve the subscriber list from the backbone and verify it is updated
+        if subscriber_list := backbone.notification_channels:
+            logger.debug(f"The subscriber list was populated after {i} iterations")
+            break
+
+    assert mock_descriptor in subscriber_list
+
+    # check whether the listener terminated on its own
+    return_code = drg_process.returncode
+
+    # clean up the listener process since no shutdown event was sent
+    if return_code is None and drg_process.is_alive:
+        drg_process.kill()
+        drg_process.join()
+
+
+def test_dragonbackend_backend_consumer():
+    """Verify the listener background process updates the MLI_BACKEND_CONSUMER
+    value in the backbone."""
+    backend = DragonBackend(pid=9999)
+
+    # We need to let the backend create the backbone to continue
+    backbone = backend._create_backbone()
+    assert backbone._allow_reserved_writes
+
+    # create listener with `as_service=False` to perform a single loop iteration
+    listener = ConsumerRegistrationListener(backbone, 1.0, 1.0, [], as_service=False)
+
+    logger.debug(f"backbone loaded? {listener._backbone}")
+    logger.debug(f"listener created? {listener}")
+
+    try:
+        # call the service execute method directly to trigger
+        # the entire service lifecycle
+        listener.execute()
+
+        consumer_desc = backbone[BackboneFeatureStore.MLI_BACKEND_CONSUMER]
+        logger.debug(f"MLI_BACKEND_CONSUMER: {consumer_desc}")
+
+        assert consumer_desc
+    except Exception as ex:
+        logger.exception(
+            f"test_dragonbackend_backend_consumer - exception occurred: {ex}"
+        )
+    finally:
+        listener._on_shutdown()
+
+
+def test_dragonbackend_event_handled():
+    """Verify the event listener process updates the MLI_NOTIFY_CONSUMERS
+    value in the backbone when an event is received and again on shutdown.
+    """
     backend = DragonBackend(pid=9999)
 
-    backend._create_backbone()
-    backbone = backend._backbone
+    # We need to let the backend create the backbone to continue
+    backbone = backend._create_backbone()
 
-    def mock_event_handler(event: EventBase) -> None:
-        logger.debug(f"Handling event in mock handler: {event}")
+    # create the listener to be tested
+    listener = ConsumerRegistrationListener(backbone, 1.0, 1.0, [], as_service=False)
 
-        bb_descriptor = os.environ.get(BackboneFeatureStore.MLI_BACKBONE, None)
-        assert bb_descriptor
+    assert listener._backbone, "The listener is not attached to a backbone"
 
-        fs = BackboneFeatureStore.from_descriptor(bb_descriptor)
-        fs[event.uid] = "received"
+    try:
+        # set up the listener but don't let the service event loop start
+        listener._create_eventing()  # listener.execute()
 
-    # create the consumer and start a listener process
-    backend_consumer = backend._create_eventing(backbone)
-    registrar_descriptor = backend._event_consumer.descriptor
+        # grab the channel descriptor so we can simulate registrations
+        channel_desc = backbone[BackboneFeatureStore.MLI_BACKEND_CONSUMER]
+        comm_channel = DragonCommChannel.from_descriptor(channel_desc)
 
-    # ensure the consumer is stored to backend & published to backbone
-    assert backend._event_consumer == backend_consumer
-    assert backbone.backend_channel == registrar_descriptor
-    assert os.environ.get(BackboneFeatureStore.MLI_BACKBONE, None)
+        num_events = 5
+        events = []
+        for i in range(num_events):
+            # register some mock consumers using the backend channel
+            event = OnCreateConsumer(f"mock-consumer-descriptor-{uuid.uuid4()}", [])
+            event_bytes = bytes(event)
+            comm_channel.send(event_bytes)
+            events.append(event)
 
-    # simulate a new consumer registration
-    new_consumer_ch = DragonCommChannel.from_local()
-    new_consumer = EventConsumer(
-        new_consumer_ch,
+        # run a few iterations of the event loop in case it takes a few cycles to write
+        for i in range(20):
+            listener._on_iteration()
+            # Grab the value that should be getting updated
+            notify_consumers = set(backbone.notification_channels)
+            if len(notify_consumers) == len(events):
+                logger.info(f"Retrieved all consumers after {i} listen cycles")
+                break
+
+        # ...
and confirm that all the mock consumer descriptors are registered + assert set([e.descriptor for e in events]) == set(notify_consumers) + logger.info(f"Number of registered consumers: {len(notify_consumers)}") + + except Exception as ex: + logger.exception(f"test_dragonbackend_event_handled - exception occurred: {ex}") + finally: + # shutdown should unregister a registration listener + listener._on_shutdown() + + for i in range(10): + if "BackboneFeatureStore.MLI_BACKEND_CONSUMER" not in backbone: + logger.debug(f"The listener was removed after {i} iterations") + channel_desc = None + break + + # we should see that there is no listener registered + assert not channel_desc + + +def test_dragonbackend_shutdown_event(): + """Verify the background process shuts down when it receives a + shutdown request.""" + backend = DragonBackend(pid=9999) + + # We need to let the backend create the backbone to continue + backbone = backend._create_backbone() + + listener = ConsumerRegistrationListener(backbone, 1.0, 1.0, [], as_service=False) + + logger.debug(f"backbone loaded? {listener._backbone}") + logger.debug(f"listener created? {listener}") + + try: + # set up the listener but don't let the listener loop start + listener._create_eventing() # listener.execute() + + # grab the channel descriptor so we can publish to it + channel_desc = backbone[BackboneFeatureStore.MLI_BACKEND_CONSUMER] + comm_channel = DragonCommChannel.from_descriptor(channel_desc) + + assert listener._consumer.listening, "Listener wasn't ready to listen" + + # send a shutdown request... + event = OnShutdownRequested() + event_bytes = bytes(event) + comm_channel.send(event_bytes) + + # run iteration a few times in case it takes a few cycles to write + for _ in range(5): + listener._on_iteration() + + logger.info(f"{listener._consumer.listening=}") + + # ...and confirm the listener is now cancelled + assert not listener._consumer.listening + + except Exception as ex: + logger.exception( + f"test_dragonbackend_shutdown_event - exception occurred: {ex}" + ) + + +@pytest.mark.parametrize("health_check_frequency", [10, 20]) +def test_dragonbackend_shutdown_on_health_check(health_check_frequency: float): + """Verify that the event listener automatically shuts down when + a new listener is registered in its place. + + :param health_check_frequency: The expected frequency of service health check + invocations""" + backend = DragonBackend(pid=9999) + + # We need to let the backend create the backbone to continue + backbone = backend._create_backbone() + + listener = ConsumerRegistrationListener( backbone, + 1.0, + 1.0, [], - name="test-consumer-a", - event_handler=mock_event_handler, + as_service=True, # allow service to run long enough to health check + health_check_frequency=health_check_frequency, ) - assert new_consumer, "new_consumer construction failed" - - # send registration to registrar channel - new_consumer.register() - # the backend consumer should handle updating the notify list and the new - # consumer that just broadcast its registration should be registered... 
-
-    # backend_consumer.listen_once(timeout=2.0)
-    backend.listen_to_registrations(timeout=0.1)
+    try:
+        # set up the listener but don't let the listener loop start
+        listener._create_eventing()  # listener.execute()
+        assert listener._consumer.listening, "Listener wasn't ready to listen"
 
-    # # confirm the backend registrar consumer registerd the new listener
-    assert new_consumer_ch.descriptor in backbone.notification_channels
+        # Replace the consumer descriptor in the backbone to trigger
+        # an automatic shutdown
+        backbone[BackboneFeatureStore.MLI_BACKEND_CONSUMER] = str(uuid.uuid4())
 
-    broadcaster = EventBroadcaster(backbone, DragonCommChannel.from_descriptor)
+        # set the last health check manually to verify the duration
+        start_at = time.time()
+        listener._last_health_check = time.time()
 
-    # re-send the same thing because i'm too lazy to create a new consumer
-    broadcast_event = OnCreateConsumer(registrar_descriptor, [])
-    broadcaster.send(broadcast_event, timeout=0.1)
+        # run execute to let the service trigger health checks
+        listener.execute()
+        elapsed = time.time() - start_at
 
-    new_consumer.listen_once(timeout=0.1)
+        # confirm the frequency of the health check was honored
+        assert elapsed >= health_check_frequency
 
-    values = backbone.wait_for(
-        [broadcast_event.uid, BackboneFeatureStore.MLI_NOTIFY_CONSUMERS], 1.0
-    )
-    stored = values[broadcast_event.uid]
-    assert stored == "received", "The handler didn't update the backbone"
+        # ...and confirm the listener is now cancelled
+        assert (
+            not listener._consumer.listening
+        ), "Listener was not automatically shutdown by the health check"
 
-    # confirm that directly retrieving the value isn't different from
-    # using backbone.notification_channels helper method
-    notify_list = str(values[BackboneFeatureStore.MLI_NOTIFY_CONSUMERS]).split(",")
-    assert new_consumer.descriptor in set(notify_list)
+    except Exception as ex:
+        logger.exception(
+            f"test_dragonbackend_shutdown_on_health_check - exception occurred: {ex}"
+        )
diff --git a/tests/dragon/test_error_handling.py b/tests/dragon/test_error_handling.py
index 4f511a9c3..df370cbc4 100644
--- a/tests/dragon/test_error_handling.py
+++ b/tests/dragon/test_error_handling.py
@@ -29,8 +29,6 @@
 
 import pytest
 
-from smartsim._core.mli.infrastructure.storage.dragon_util import create_ddict
-
 dragon = pytest.importorskip("dragon")
 
 import multiprocessing as mp
@@ -57,6 +55,7 @@
 from smartsim._core.mli.infrastructure.storage.dragon_feature_store import (
     DragonFeatureStore,
 )
+from smartsim._core.mli.infrastructure.storage.dragon_util import create_ddict
 from smartsim._core.mli.infrastructure.storage.feature_store import (
     FeatureStore,
     ModelKey,
diff --git a/tests/dragon/test_featurestore.py b/tests/dragon/test_featurestore.py
index 35720fa9d..c08a8f30e 100644
--- a/tests/dragon/test_featurestore.py
+++ b/tests/dragon/test_featurestore.py
@@ -34,8 +34,6 @@
 
 import pytest
 
-from smartsim._core.mli.infrastructure.storage.dragon_util import create_ddict
-
 dragon = pytest.importorskip("dragon")
 
 from smartsim._core.mli.comm.channel.dragon_channel import DragonCommChannel
@@ -52,6 +50,7 @@
 from smartsim._core.mli.infrastructure.storage.backbone_feature_store import (
     time as bbtime,
 )
+from smartsim._core.mli.infrastructure.storage.dragon_util import create_ddict
 from smartsim.log import get_logger
 
 logger = get_logger(__name__)
diff --git a/tests/dragon/test_worker_manager.py b/tests/dragon/test_worker_manager.py
index a2df57f3b..819414eca 100644
--- a/tests/dragon/test_worker_manager.py
+++ b/tests/dragon/test_worker_manager.py
@@ -31,17 +31,11 @@
 
 import pytest
 
-from smartsim._core.mli.infrastructure.storage.dragon_util import create_ddict
-
 torch = pytest.importorskip("torch")
 dragon = pytest.importorskip("dragon")
 
 import multiprocessing as mp
 
-from smartsim._core.mli.infrastructure.storage.backbone_feature_store import (
-    BackboneFeatureStore,
-)
-
 try:
     mp.set_start_method("dragon")
 except Exception:
@@ -58,9 +52,13 @@
     EnvironmentConfigLoader,
     WorkerManager,
 )
+from smartsim._core.mli.infrastructure.storage.backbone_feature_store import (
+    BackboneFeatureStore,
+)
 from smartsim._core.mli.infrastructure.storage.dragon_feature_store import (
     DragonFeatureStore,
 )
+from smartsim._core.mli.infrastructure.storage.dragon_util import create_ddict
 from smartsim._core.mli.infrastructure.worker.torch_worker import TorchWorker
 from smartsim._core.mli.message_handler import MessageHandler
 from smartsim.log import get_logger

From 51baf611cc42d891748fa01abb7cc233abbc6508 Mon Sep 17 00:00:00 2001
From: Chris McBride
Date: Tue, 1 Oct 2024 01:01:35 -0500
Subject: [PATCH 22/40] stringification bug fix

---
 tests/dragon/test_dragon_backend.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/dragon/test_dragon_backend.py b/tests/dragon/test_dragon_backend.py
index 003e27e8c..229855bc5 100644
--- a/tests/dragon/test_dragon_backend.py
+++ b/tests/dragon/test_dragon_backend.py
@@ -203,7 +203,7 @@ def test_dragonbackend_event_handled():
         listener._on_shutdown()
 
         for i in range(10):
-            if "BackboneFeatureStore.MLI_BACKEND_CONSUMER" not in backbone:
+            if BackboneFeatureStore.MLI_BACKEND_CONSUMER not in backbone:
                 logger.debug(f"The listener was removed after {i} iterations")
                 channel_desc = None
                 break

From 3f4af8eb6c451800bd3421817bf56ccb6941b160 Mon Sep 17 00:00:00 2001
From: Chris McBride
Date: Tue, 1 Oct 2024 01:22:07 -0500
Subject: [PATCH 23/40] remove use of deprecated class

---
 .../_core/mli/infrastructure/storage/dragon_feature_store.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/smartsim/_core/mli/infrastructure/storage/dragon_feature_store.py b/smartsim/_core/mli/infrastructure/storage/dragon_feature_store.py
index c8c85623f..dc0f57ae6 100644
--- a/smartsim/_core/mli/infrastructure/storage/dragon_feature_store.py
+++ b/smartsim/_core/mli/infrastructure/storage/dragon_feature_store.py
@@ -68,7 +68,7 @@ def _get(self, key: str) -> t.Union[str, bytes]:
         """
         try:
             return self._storage[key]
-        except dragon_ddict.DDictKeyError as e:
+        except dragon_ddict.DDictError as e:
             raise KeyError(f"Key not found in FeatureStore: {key}") from e
 
     def _set(self, key: str, value: t.Union[str, bytes]) -> None:
@@ -96,7 +96,7 @@ def pop(self, key: str) -> t.Union[str, bytes, None]:
         """
         try:
             return self._storage.pop(key)
-        except dragon_ddict.DDictKeyError:
+        except dragon_ddict.DDictError:
             return None

From 979373c2d2686a97a19a4ad439c97851b5d90ae7 Mon Sep 17 00:00:00 2001
From: Chris McBride
Date: Tue, 1 Oct 2024 18:02:05 -0500
Subject: [PATCH 24/40] review changes part 1, improve dragon error handling,
 add unregister consumers, more tests

---
 smartsim/_core/_cli/scripts/dragon_install.py |   4 +-
 .../_core/launcher/dragon/dragonBackend.py    |   3 +-
 .../_core/mli/comm/channel/dragon_channel.py  |   4 +-
 smartsim/_core/mli/comm/channel/dragon_fli.py |   3 +
 .../_core/mli/comm/channel/dragon_util.py     |  36 +-
 .../infrastructure/control/event_listener.py  |  61 ++-
 .../storage/backbone_feature_store.py         | 110 ++++-
 smartsim/protoclient.py                       |   6 +-
tests/dragon/test_event_consumer.py | 408 ++++++++++++++++++ tests/dragon/test_featurestore.py | 85 ---- tests/dragon/test_featurestore_integration.py | 84 ---- tests/test_dragon_comm_utils.py | 228 ++++++++++ 12 files changed, 810 insertions(+), 222 deletions(-) create mode 100644 tests/dragon/test_event_consumer.py create mode 100644 tests/test_dragon_comm_utils.py diff --git a/smartsim/_core/_cli/scripts/dragon_install.py b/smartsim/_core/_cli/scripts/dragon_install.py index 662820fed..d9d0ef3c7 100644 --- a/smartsim/_core/_cli/scripts/dragon_install.py +++ b/smartsim/_core/_cli/scripts/dragon_install.py @@ -57,7 +57,7 @@ def __init__( def _check(self) -> None: """Perform validation of this instance - :raises: ValueError if any value fails validation""" + :raises ValueError: if any value fails validation""" if not self.repo_name or len(self.repo_name.split("/")) != 2: raise ValueError( f"Invalid dragon repository name. Example: `dragonhpc/dragon`" @@ -287,7 +287,7 @@ def retrieve_asset( :param request: details of a request for the installation of the dragon package :param asset: GitHub release asset to retrieve :returns: path to the directory containing the extracted release asset - :raises: SmartSimCLIActionCancelled if the asset cannot be downloaded or extracted + :raises SmartSimCLIActionCancelled: if the asset cannot be downloaded or extracted """ download_dir = request.working_dir / str(asset.id) diff --git a/smartsim/_core/launcher/dragon/dragonBackend.py b/smartsim/_core/launcher/dragon/dragonBackend.py index fa28f8690..f5c271518 100644 --- a/smartsim/_core/launcher/dragon/dragonBackend.py +++ b/smartsim/_core/launcher/dragon/dragonBackend.py @@ -53,7 +53,6 @@ ) from smartsim._core.mli.infrastructure.storage.backbone_feature_store import ( BackboneFeatureStore, - EventCategory, ) from smartsim._core.mli.infrastructure.storage.dragon_util import create_ddict from smartsim.error.errors import SmartSimError @@ -592,7 +591,7 @@ def start_event_listener( raise SmartSimError("Backbone feature store is not available") service = ConsumerRegistrationListener( - self._backbone, 1.0, 2.0, [EventCategory.CONSUMER_CREATED], True + self._backbone, 1.0, 2.0, as_service=True, health_check_frequency=90 ) options = dragon_process_desc.ProcessOptions(make_inf_channels=True) diff --git a/smartsim/_core/mli/comm/channel/dragon_channel.py b/smartsim/_core/mli/comm/channel/dragon_channel.py index 7534719e7..4ccf7cf7f 100644 --- a/smartsim/_core/mli/comm/channel/dragon_channel.py +++ b/smartsim/_core/mli/comm/channel/dragon_channel.py @@ -55,6 +55,7 @@ def __init__(self, channel: "dch.Channel") -> None: descriptor = drg_util.channel_to_descriptor(channel) super().__init__(descriptor) self._channel = channel + """The underlying dragon channel used by this CommChannel for communications""" @property def channel(self) -> "dch.Channel": @@ -113,9 +114,6 @@ def from_descriptor( :raises SmartSimError: If creation of comm channel fails """ try: - if isinstance(descriptor, bytes): - raise ValueError("Descriptor must be a string") - channel = drg_util.descriptor_to_channel(descriptor) return DragonCommChannel(channel) except Exception as ex: diff --git a/smartsim/_core/mli/comm/channel/dragon_fli.py b/smartsim/_core/mli/comm/channel/dragon_fli.py index 13eb58a2e..254a21c5b 100644 --- a/smartsim/_core/mli/comm/channel/dragon_fli.py +++ b/smartsim/_core/mli/comm/channel/dragon_fli.py @@ -59,9 +59,12 @@ def __init__( super().__init__(descriptor) self._fli = fli_ + """The underlying dragon FLInterface used by this 
CommChannel for communications"""
         self._channel: t.Optional["dch.Channel"] = (
             drg_util.create_local(buffer_size) if sender_supplied else None
         )
+        """The underlying dragon Channel used by a sender-side DragonFLIChannel
+        to attach to the main FLI channel"""
 
     def send(self, value: bytes, timeout: float = 0.001) -> None:
         """Send a message through the underlying communication channel.
diff --git a/smartsim/_core/mli/comm/channel/dragon_util.py b/smartsim/_core/mli/comm/channel/dragon_util.py
index 014e9c0a4..8edff31c0 100644
--- a/smartsim/_core/mli/comm/channel/dragon_util.py
+++ b/smartsim/_core/mli/comm/channel/dragon_util.py
@@ -25,6 +25,7 @@
 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 import base64
+import binascii
 import typing as t
 
 import dragon.channels as dch
@@ -53,6 +54,7 @@ def channel_to_descriptor(channel: t.Union[dch.Channel, fli.FLInterface]) -> str
 
     :param channel: The dragon channel to convert
     :returns: The descriptor string
+    :raises SmartSimError: If a dragon channel is not provided
     """
     if channel is None:
         raise SmartSimError("Channel is not available to create a descriptor")
@@ -78,9 +80,20 @@ def descriptor_to_fli(descriptor: str) -> "fli.FLInterface":
     the string-encoded descriptor.
 
     :param descriptor: The descriptor of an FLI to attach to
-    :returns: The attached dragon FLI"""
-    descriptor_ = base64.b64decode(descriptor.encode("utf-8"))
-    return fli.FLInterface.attach(descriptor_)
+    :returns: The attached dragon FLI
+    :raises ValueError: If the descriptor is empty or incorrectly formatted
+    :raises SmartSimError: If the descriptor does not address an available FLI
+    """
+    if len(descriptor) < 1:
+        raise ValueError("Descriptors may not be empty")
+
+    try:
+        encoded = descriptor.encode("utf-8")
+        descriptor_ = base64.b64decode(encoded)
+        return fli.FLInterface.attach(descriptor_)
+    except binascii.Error:
+        raise ValueError("The descriptor was not properly base64 encoded")
+    except fli.DragonFLIError:
+        raise SmartSimError("The descriptor did not address an available FLI")
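+
+# A round-trip sketch for these helpers, assuming a process-local channel:
+#
+#   ch = create_local()
+#   desc = channel_to_descriptor(ch)        # base64 text, safe to store/share
+#   attached = descriptor_to_channel(desc)  # re-attaches to the same channel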
:param descriptor: The descriptor of a channel to attach to - :returns: The attached dragon Channel""" - descriptor_ = base64.b64decode(descriptor.encode("utf-8")) - return dch.Channel.attach(descriptor_) + :returns: The attached dragon Channel + :raises ValueError: If the descriptor is empty or incorrectly formatted + :raises SmartSimError: If the descriptor does not attach to a channel""" + if len(descriptor) < 1: + raise ValueError("Descriptors may not be empty") + + try: + encoded = descriptor.encode("utf-8") + descriptor_ = base64.b64decode(encoded) + return dch.Channel.attach(descriptor_) + except binascii.Error: + raise ValueError("The descriptor was not properly base64 encoded") + except dch.ChannelError: + raise SmartSimError("The descriptor did not address an available channel") def create_local(_capacity: int = 0) -> dch.Channel: diff --git a/smartsim/_core/mli/infrastructure/control/event_listener.py b/smartsim/_core/mli/infrastructure/control/event_listener.py index 03d7b1ceb..f1b7b664e 100644 --- a/smartsim/_core/mli/infrastructure/control/event_listener.py +++ b/smartsim/_core/mli/infrastructure/control/event_listener.py @@ -51,6 +51,7 @@ EventCategory, EventConsumer, OnCreateConsumer, + OnRemoveConsumer, ) from smartsim.error.errors import SmartSimError from smartsim.log import get_logger @@ -67,7 +68,6 @@ def __init__( backbone: BackboneFeatureStore, timeout: float, batch_timeout: float, - event_filters: t.List[EventCategory], as_service: bool = False, cooldown: int = 0, health_check_frequency: float = 60.0, @@ -94,9 +94,6 @@ def __init__( """Maximum time (in seconds) to allow a batch of receives to continue to build""" - self._filters = event_filters - """Filters specifying the message types to handle""" - self._consumer: t.Optional[EventConsumer] = None """The event consumer that handles receiving events""" @@ -118,6 +115,9 @@ def _on_shutdown(self) -> None: # unregister this listener in the backbone self._backbone.pop(BackboneFeatureStore.MLI_BACKEND_CONSUMER) + # TODO: need the channel to be cleaned up + # self._consumer._comm_channel._channel.destroy() + def _on_iteration(self) -> None: """Executes calls to the machine learning worker implementation to complete the inference pipeline.""" @@ -148,6 +148,33 @@ def _can_shutdown(self) -> bool: return False + def _on_unregister(self, event: OnRemoveConsumer) -> None: + """Event handler for updating the backbone when new event consumers + are registered. + + :param event: The event that was received + """ + notify_list = set(self._backbone.notification_channels) + + # remove the descriptor specified in the event + if event.descriptor in notify_list: + logger.debug(f"Removing notify consumer: {event.descriptor}") + notify_list.remove(event.descriptor) + + # push the updated list back into the backbone + self._backbone.notification_channels = list(notify_list) + + def _on_register(self, event: OnCreateConsumer) -> None: + """Event handler for updating the backbone when new event consumers + are registered. + + :param event: The event that was received + """ + notify_list = set(self._backbone.notification_channels) + logger.debug(f"Adding notify consumer: {event.descriptor}") + notify_list.add(event.descriptor) + self._backbone.notification_channels = list(notify_list) + def _on_event_received(self, event: EventBase) -> None: """Event handler for updating the backbone when new event consumers are registered. 
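The `_on_register` and `_on_unregister` handlers added above reduce registration bookkeeping to set operations over the backbone's `notification_channels` list. A minimal, runnable sketch of that round-trip, using a hypothetical dict-backed stand-in for the backbone (illustration only, not part of the patch):

    # _StubBackbone is a hypothetical substitute for the real BackboneFeatureStore
    class _StubBackbone:
        def __init__(self):
            self.notification_channels = []

    def register(backbone, descriptor):
        # mirrors _on_register: set semantics de-duplicate repeat registrations
        notify_list = set(backbone.notification_channels)
        notify_list.add(descriptor)
        backbone.notification_channels = list(notify_list)

    def unregister(backbone, descriptor):
        # mirrors _on_unregister: only remove descriptors that are present
        notify_list = set(backbone.notification_channels)
        if descriptor in notify_list:
            notify_list.remove(descriptor)
        backbone.notification_channels = list(notify_list)

    backbone = _StubBackbone()
    register(backbone, "channel-abc")
    register(backbone, "channel-abc")  # the duplicate collapses in the set
    assert backbone.notification_channels == ["channel-abc"]
    unregister(backbone, "channel-abc")
    assert backbone.notification_channels == []

The set-based approach makes repeated registrations idempotent and unknown removals harmless, at the cost of not preserving registration order.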
@@ -157,16 +184,15 @@ def _on_event_received(self, event: EventBase) -> None:
         if self._backbone is None:
             logger.info("Unable to handle event. Backbone is missing.")
 
-        if not isinstance(event, OnCreateConsumer):
+        if isinstance(event, OnCreateConsumer):
+            self._on_register(event)
+        elif isinstance(event, OnRemoveConsumer):
+            self._on_unregister(event)
+        else:
             logger.info(
                 "Consumer registration listener received an "
                 f"unexpected event: {event=}"
             )
-            return
-
-        notify_list = set(self._backbone.notification_channels)
-        notify_list.add(event.descriptor)
-        self._backbone.notification_channels = list(notify_list)
 
     def _on_health_check(self) -> None:
         """Check if this consumer has been replaced by a new listener
@@ -190,9 +216,9 @@ def _on_health_check(self) -> None:
             self._consumer.listening = False
 
     def _publish_consumer(self) -> None:
-        """Publish the consumer descriptor to the backbone."""
+        """Publish the registrar consumer descriptor to the backbone."""
         if self._consumer is None:
-            logger.warning("No consumer descriptor available to publisher")
+            logger.warning("No registrar consumer descriptor available to publisher")
             return
 
         self._backbone[BackboneFeatureStore.MLI_BACKEND_CONSUMER] = (
@@ -227,8 +253,8 @@ def _create_eventing(self) -> EventConsumer:
         self._consumer = EventConsumer(
             event_channel,
             self._backbone,
-            self._filters,
-            name="BackendConsumerRegistrar",
+            [EventCategory.CONSUMER_CREATED, EventCategory.CONSUMER_REMOVED],
+            name="ConsumerRegistrar",
             event_handler=self._on_event_received,
         )
         self._publish_consumer()
@@ -248,17 +274,13 @@ def _create_parser() -> argparse.ArgumentParser:
 
     --timeout
     --batch_timeout
-    --categories
 
     :returns: A configured parser
     """
     arg_parser = argparse.ArgumentParser(prog="ConsumerRegistrarEventListener")
 
-    category_default = EventCategory.CONSUMER_CREATED
-
     arg_parser.add_argument("--timeout", type=float, default=1.0)
     arg_parser.add_argument("--batch_timeout", type=float, default=1.0)
-    arg_parser.add_argument("--categories", type=str, default=category_default)
 
     return arg_parser
@@ -285,9 +307,7 @@ def _connect_backbone() -> t.Optional[BackboneFeatureStore]:
     mp.set_start_method("dragon")
 
     parser = _create_parser()
     args = parser.parse_args()
-
-    user_filters: t.List[EventCategory] = list(args.categories.split(","))
 
     backbone_fs = _connect_backbone()
@@ -304,7 +324,6 @@ def _connect_backbone() -> t.Optional[BackboneFeatureStore]:
         backbone_fs,
         float(args.timeout),
         float(args.batch_timeout),
-        user_filters,
         as_service=True,
     )
diff --git a/smartsim/_core/mli/infrastructure/storage/backbone_feature_store.py b/smartsim/_core/mli/infrastructure/storage/backbone_feature_store.py
index ffeb917a9..859e767b6 100644
--- a/smartsim/_core/mli/infrastructure/storage/backbone_feature_store.py
+++ b/smartsim/_core/mli/infrastructure/storage/backbone_feature_store.py
@@ -260,6 +260,8 @@ class EventCategory(str, enum.Enum):
     CONSUMER_CREATED: str = "consumer-created"
     """Event category for an event raised when a new consumer is created"""
+    CONSUMER_REMOVED: str = "consumer-removed"
+    """Event category for an event raised when a consumer is removed"""
     FEATURE_STORE_WRITTEN: str = "feature-store-written"
     """Event category for an event raised when a feature store key is written"""
     SHUTDOWN: str = "shutdown"
@@ -327,6 +329,29 @@ def __str__(self) -> str:
         return f"{str(super())}|{self.descriptor}|{_filters}"
 
 
+class OnRemoveConsumer(EventBase):
+    """Publish this event when a consumer is shutting down and
+    should be removed from notification lists."""
+
+    descriptor: str
"""Descriptor of the comm channel exposed by the consumer""" + + def __init__(self, descriptor: str) -> None: + """Initialize the OnRemoveConsumer event. + + :param descriptor: Descriptor of the comm channel exposed by the consumer + """ + super().__init__(EventCategory.CONSUMER_REMOVED, str(uuid.uuid4())) + self.descriptor = descriptor + + def __str__(self) -> str: + """Convert the event to a string. + + :returns: A string representation of this instance + """ + return f"{str(super())}|{self.descriptor}" + + class OnWriteFeatureStore(EventBase): """Publish this event when a feature store key is written.""" @@ -582,9 +607,13 @@ def send(self, event: EventBase, timeout: float = 0.001) -> int: class EventConsumer: """Reads system events published to a communications channel.""" + _BACKBONE_WAIT_TIMEOUT = 10.0 + """Maximum time (in seconds) to wait for the backbone to register the consumer""" + def __init__( self, comm_channel: CommChannelBase, + # channel_factory: ..., backbone: BackboneFeatureStore, filters: t.Optional[t.List[EventCategory]] = None, name: t.Optional[str] = None, @@ -601,11 +630,24 @@ def __init__( :raises ValueError: If batch_timeout <= 0 """ self._comm_channel = comm_channel + """The comm channel used by the consumer to receive messages. The channel + descriptor will be published for senders to discover.""" self._backbone = backbone + """The backbone instance used to bootstrap the instance. The EventConsumer + uses the backbone to discover where it can publish its descriptor.""" self._global_filters = filters or [] + """A set of global filters to apply to incoming events. Global filters are + combined with per-call filters. Filters act as an allow-list.""" self._name = name + """User-friendly name assigned to a consumer for logging. Automatically + assigned if not provided.""" self._event_handler = event_handler + """The function that should be executed when an event + passed by the filters is received.""" self.listening = True + """Flag indicating that the consumer is currently listening for new + events. Setting this flag to `False` will cause any active calls to + `listen` to terminate.""" @property def descriptor(self) -> str: @@ -639,10 +681,15 @@ def recv( :param batch_timeout: Maximum time to wait for messages to arrive; allows multiple batches to be retrieved in one call to `send` :returns: A list of events that pass any configured filters + :raises ValueError: If a positive, non-zero value is not provided for the + timeout or batch_timeout. """ if filters is None: filters = [] + if timeout is not None and timeout <= 0: + raise ValueError("request timeout must be a non-zero, positive value") + if batch_timeout is not None and batch_timeout <= 0: raise ValueError("batch_timeout must be a non-zero, positive value") @@ -688,25 +735,45 @@ def recv( return events_received + def _send_to_registrar(self, event: EventBase) -> None: + """Send an event direct to the registrar listener.""" + registrar_key = BackboneFeatureStore.MLI_BACKEND_CONSUMER + config = self._backbone.wait_for([registrar_key], self._BACKBONE_WAIT_TIMEOUT) + registrar_descriptor = str(config.get(registrar_key, None)) + + if not registrar_descriptor: + logger.warning(f"Unable to {event.category}. 
No registrar channel found.") + return + + logger.debug(f"Sending {event.category} for {self.name}") + + registrar_channel = DragonCommChannel.from_descriptor(registrar_descriptor) + registrar_channel.send(bytes(event), timeout=1.0) + + logger.debug(f"{event.category} for {self.name} sent") + def register(self) -> None: """Send an event to register this consumer as a listener.""" descriptor = self._comm_channel.descriptor event = OnCreateConsumer(descriptor, self._global_filters) - registrar_key = BackboneFeatureStore.MLI_BACKEND_CONSUMER - config = self._backbone.wait_for([registrar_key], 2.0) + self._send_to_registrar(event) - registrar_descriptor = str(config.get(registrar_key, None)) + def unregister(self) -> None: + """Send an event to un-register this consumer as a listener.""" + descriptor = self._comm_channel.descriptor + event = OnRemoveConsumer(descriptor) - if registrar_descriptor: - logger.debug(f"Sending registration for {self.name}") + self._send_to_registrar(event) - registrar_channel = DragonCommChannel.from_descriptor(registrar_descriptor) - registrar_channel.send(bytes(event), timeout=1.0) + @staticmethod + def _on_handler_missing(event: EventBase) -> None: + """A "dead letter" event handler that is called to perform + processing on events before they're discarded. - logger.debug(f"Registration for {self.name} sent") - else: - logger.warning("Unable to register. No registrar channel found.") + :param event: The event to handle + """ + logger.warning(f"No event handler is registered. Discarding {event=}") def listen_once(self, timeout: float = 0.001, batch_timeout: float = 1.0) -> None: """Receives messages for the consumer a single time. Delivers @@ -724,30 +791,41 @@ def listen_once(self, timeout: float = 0.001, batch_timeout: float = 1.0) -> Non logger.debug(f"Starting event listener with {timeout} second timeout") logger.debug("Awaiting new messages") + if not self._event_handler: + logger.debug("Unable to handle messages. No event handler is registered.") + incoming_messages = self.recv(timeout=timeout, batch_timeout=batch_timeout) if not incoming_messages: - logger.debug("Consumer received empty message list.") + logger.debug(f"Consumer {self.name} received empty message list.") for message in incoming_messages: logger.debug(f"Sending event {message=} to handler.") self._handle_shutdown(message) + if self._event_handler: self._event_handler(message) + else: + self._on_handler_missing(message) - def _handle_shutdown(self, event: EventBase) -> None: + def _handle_shutdown(self, event: EventBase) -> bool: """Handles shutdown requests sent to the consumer by setting the - `self.listener` property to `False`.""" + `self.listener` property to `False`. + + :param event: The event to handle + :returns: A bool indicating if the event was a shutdown request + """ if isinstance(event, OnShutdownRequested): self.listening = False + return True + return False def listen(self, timeout: float = 0.001, batch_timeout: float = 1.0) -> None: - """Receives messages for the consumer until a shutdown request is received + """Receives messages for the consumer until a shutdown request is received. 
:param timeout: Maximum time to wait (in seconds) for a message to arrive - :param timeout: Maximum time to wait (in seconds) for a batch to arrive + :param batch_timeout: Maximum time to wait (in seconds) for a batch to arrive """ - self.listening = True while self.listening: self.listen_once(timeout, batch_timeout) diff --git a/smartsim/protoclient.py b/smartsim/protoclient.py index 7f6d6f412..d9cdcf594 100644 --- a/smartsim/protoclient.py +++ b/smartsim/protoclient.py @@ -108,7 +108,7 @@ def _attach_to_worker_queue(self) -> DragonFLIChannel: then attach an FLI to the given worker queue. :returns: The attached FLI channel - :raises: SmartSimError if the required configuration is not found in the + :raises SmartSimError: if the required configuration is not found in the backbone feature store """ @@ -150,7 +150,7 @@ def __init__( written to file :param wait_timeout: Maximum wait time (in seconds) allowed to attach to the worker queue - :raises: SmartSimError if unable to attach to a backbone featurestore + :raises SmartSimError: If unable to attach to a backbone featurestore """ if MPI is not None: # TODO: determine a way to make MPI work in the test environment @@ -266,7 +266,7 @@ def run_model(self, model: t.Union[bytes, str], batch: torch.Tensor) -> t.Any: :param model: The raw bytes or path to a pytorch model :param batch: The tensor batch to perform inference on :returns: The inference results - :raises: ValueError if the worker queue is not configured properly + :raises ValueError: if the worker queue is not configured properly in the environment variables """ tensors = [batch.numpy()] diff --git a/tests/dragon/test_event_consumer.py b/tests/dragon/test_event_consumer.py new file mode 100644 index 000000000..adac966ab --- /dev/null +++ b/tests/dragon/test_event_consumer.py @@ -0,0 +1,408 @@ +# BSD 2-Clause License +# +# Copyright (c) 2021-2024, Hewlett Packard Enterprise +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
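The consumer lifecycle exercised by the tests below can be summarized in a short sketch. This assumes a running dragon environment with a backbone whose registrar listener has already published its descriptor under `MLI_BACKEND_CONSUMER`; the print-based handler is a placeholder, and none of this is runnable outside such an environment:

    from smartsim._core.mli.comm.channel.dragon_channel import DragonCommChannel
    from smartsim._core.mli.comm.channel.dragon_util import create_local
    from smartsim._core.mli.infrastructure.storage.backbone_feature_store import (
        BackboneFeatureStore,
        EventCategory,
        EventConsumer,
    )

    def run_consumer(backbone: BackboneFeatureStore) -> None:
        # expose a local channel for senders to discover via the backbone
        channel = DragonCommChannel(create_local())
        consumer = EventConsumer(
            channel,
            backbone,
            filters=[EventCategory.FEATURE_STORE_WRITTEN],
            event_handler=lambda event: print(f"received {event=}"),
        )
        consumer.register()  # OnCreateConsumer sent to the registrar
        # process events until an OnShutdownRequested arrives
        consumer.listen(timeout=0.1, batch_timeout=1.0)
        consumer.unregister()  # OnRemoveConsumer prunes this channel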
+ +import time +import typing as t +from unittest import mock + +import pytest + +from smartsim._core.mli.infrastructure.control.event_listener import ( + ConsumerRegistrationListener, +) + +dragon = pytest.importorskip("dragon") + +from smartsim._core.mli.comm.channel.dragon_channel import DragonCommChannel +from smartsim._core.mli.comm.channel.dragon_fli import DragonFLIChannel +from smartsim._core.mli.comm.channel.dragon_util import create_local +from smartsim._core.mli.infrastructure.storage.backbone_feature_store import ( + BackboneFeatureStore, + EventBase, + EventBroadcaster, + EventCategory, + EventConsumer, + OnCreateConsumer, + OnRemoveConsumer, + OnShutdownRequested, + OnWriteFeatureStore, +) +from smartsim._core.mli.infrastructure.storage.backbone_feature_store import ( + time as bbtime, +) +from smartsim._core.mli.infrastructure.storage.dragon_util import create_ddict +from smartsim.log import get_logger + +logger = get_logger(__name__) + +# isort: off +from dragon import fli +from dragon.channels import Channel + +# isort: on + +if t.TYPE_CHECKING: + import conftest + + +# The tests in this file must run in a dragon environment +pytestmark = pytest.mark.dragon + + +@pytest.fixture(scope="module") +def the_storage() -> t.Dict[str, str]: + """Fixture to instantiate a dragon distributed dictionary.""" + return create_ddict(1, 2, 4 * 1024**2) + + +@pytest.fixture(scope="module") +def the_worker_channel() -> DragonFLIChannel: + """Fixture to create a valid descriptor for a worker channel + that can be attached to. Does not modify environment vars.""" + channel_ = create_local() + fli_ = fli.FLInterface(main_ch=channel_, manager_ch=None) + comm_channel = DragonFLIChannel(fli_, True) + return comm_channel + + +@pytest.fixture(scope="module") +def the_backbone( + the_storage: t.Any, the_worker_channel: DragonFLIChannel +) -> BackboneFeatureStore: + """Fixture to create a distributed dragon dictionary and wrap it + in a BackboneFeatureStore. + + :param the_storage: The dragon storage engine to use + :param the_worker_channel: Pre-configured worker channel + """ + + backbone = BackboneFeatureStore(the_storage, allow_reserved_writes=True) + backbone[BackboneFeatureStore.MLI_WORKER_QUEUE] = the_worker_channel.descriptor + + return backbone + + +def test_eventconsumer_eventpublisher_integration( + the_backbone: t.Any, test_dir: str +) -> None: + """Verify that the publisher and consumer integrate as expected when + multiple publishers and consumers are sending simultaneously. This + test closely tracks the test in tests/test_featurestore_base.py also named + test_eventconsumer_eventpublisher_integration but requires dragon entities. 
+ + :param the_backbone: The BackboneFeatureStore to use + :param test_dir: Automatically generated unique working + directories for individual test outputs + """ + + wmgr_channel = DragonCommChannel(create_local()) + capp_channel = DragonCommChannel(create_local()) + back_channel = DragonCommChannel(create_local()) + + wmgr_consumer_descriptor = wmgr_channel.descriptor + capp_consumer_descriptor = capp_channel.descriptor + back_consumer_descriptor = back_channel.descriptor + + # create some consumers to receive messages + wmgr_consumer = EventConsumer( + wmgr_channel, + the_backbone, + filters=[EventCategory.FEATURE_STORE_WRITTEN], + ) + capp_consumer = EventConsumer( + capp_channel, + the_backbone, + ) + back_consumer = EventConsumer( + back_channel, + the_backbone, + filters=[EventCategory.CONSUMER_CREATED], + ) + + # create some broadcasters to publish messages + mock_worker_mgr = EventBroadcaster( + the_backbone, + channel_factory=DragonCommChannel.from_descriptor, + ) + mock_client_app = EventBroadcaster( + the_backbone, + channel_factory=DragonCommChannel.from_descriptor, + ) + + # register all of the consumers even though the OnCreateConsumer really should + # trigger its registration. event processing is tested elsewhere. + the_backbone.notification_channels = [ + wmgr_consumer_descriptor, + capp_consumer_descriptor, + back_consumer_descriptor, + ] + + # simulate worker manager sending a notification to backend that it's alive + event_1 = OnCreateConsumer(wmgr_consumer_descriptor, filters=[]) + mock_worker_mgr.send(event_1) + + # simulate the app updating a model a few times + for key in ["key-1", "key-2", "key-1"]: + event = OnWriteFeatureStore(the_backbone.descriptor, key) + mock_client_app.send(event, timeout=0.1) + + # worker manager should only get updates about feature update + wmgr_messages = wmgr_consumer.recv() + assert len(wmgr_messages) == 3 + + # the backend should only receive messages about consumer creation + back_messages = back_consumer.recv() + assert len(back_messages) == 1 + + # hypothetical app has no filters and will get all events + app_messages = capp_consumer.recv() + assert len(app_messages) == 4 + + +@pytest.mark.parametrize( + " timeout, batch_timeout, exp_err_msg", + [(-1, 1, " timeout"), (1, -1, "batch_timeout")], +) +def test_eventconsumer_invalid_timeout( + timeout: float, + batch_timeout: float, + exp_err_msg: str, + test_dir: str, + the_backbone: BackboneFeatureStore, +) -> None: + """Verify that the event consumer raises an exception + when provided an invalid request timeout. 
+
+    :param timeout: The request timeout for the event consumer recv call
+    :param batch_timeout: The batch timeout for the event consumer recv call
+    :param exp_err_msg: A unique value from the error message that should be raised
+    :param the_backbone: The BackboneFeatureStore to use
+    :param test_dir: Automatically generated unique working
+    directories for individual test outputs
+    """
+
+    wmgr_channel = DragonCommChannel(create_local())
+
+    # create a consumer to receive messages
+    wmgr_consumer = EventConsumer(
+        wmgr_channel,
+        the_backbone,
+        filters=[EventCategory.FEATURE_STORE_WRITTEN],
+    )
+
+    # the consumer should report an error for the invalid timeout value
+    with pytest.raises(ValueError) as ex:
+        wmgr_consumer.recv(timeout=timeout, batch_timeout=batch_timeout)
+
+    assert exp_err_msg in ex.value.args[0]
+
+
+def test_eventconsumer_no_event_handler_registered(
+    the_backbone: t.Any, test_dir: str
+) -> None:
+    """Verify that a consumer discards messages received on a
+    channel when no event handler is registered.
+
+    :param the_backbone: The BackboneFeatureStore to use
+    :param test_dir: Automatically generated unique working
+    directories for individual test outputs
+    """
+
+    wmgr_channel = DragonCommChannel(create_local())
+
+    # create a consumer to receive messages
+    wmgr_consumer = EventConsumer(wmgr_channel, the_backbone, event_handler=None)
+
+    # create a broadcaster to publish messages
+    mock_worker_mgr = EventBroadcaster(
+        the_backbone,
+        channel_factory=DragonCommChannel.from_descriptor,
+    )
+
+    # manually register the consumers since we don't have a backend running
+    the_backbone.notification_channels = [wmgr_channel.descriptor]
+
+    # simulate the app updating a model a few times
+    for key in ["key-1", "key-2", "key-1"]:
+        event = OnWriteFeatureStore(the_backbone.descriptor, key)
+        mock_worker_mgr.send(event, timeout=0.1)
+
+    # run the handler and let it discard messages
+    for _ in range(15):
+        wmgr_consumer.listen_once(0.2, 2.0)
+
+    assert wmgr_consumer.listening
+
+
+def test_eventconsumer_no_event_handler_registered_shutdown(
+    the_backbone: t.Any, test_dir: str
+) -> None:
+    """Verify that a consumer without an event handler
+    registered still honors shutdown requests.
+
+    :param the_backbone: The BackboneFeatureStore to use
+    :param test_dir: Automatically generated unique working
+    directories for individual test outputs
+    """
+
+    wmgr_channel = DragonCommChannel(create_local())
+    capp_channel = DragonCommChannel(create_local())
+
+    # create a consumer to receive messages
+    wmgr_consumer = EventConsumer(wmgr_channel, the_backbone)
+
+    # create a broadcaster to publish messages
+    mock_worker_mgr = EventBroadcaster(
+        the_backbone,
+        channel_factory=DragonCommChannel.from_descriptor,
+    )
+
+    # manually register the consumers since we don't have a backend running
+    the_backbone.notification_channels = [
+        wmgr_channel.descriptor,
+        capp_channel.descriptor,
+    ]
+
+    # simulate the app updating a model a few times
+    for key in ["key-1", "key-2", "key-1"]:
+        event = OnWriteFeatureStore(the_backbone.descriptor, key)
+        mock_worker_mgr.send(event, timeout=0.1)
+
+    event = OnShutdownRequested()
+    mock_worker_mgr.send(event, timeout=0.1)
+
+    # wmgr will stop listening to messages when it is told to stop listening
+    wmgr_consumer.listen(timeout=0.1, batch_timeout=2.0)
+
+    for _ in range(15):
+        wmgr_consumer.listen_once(timeout=0.1, batch_timeout=2.0)
+
+    # confirm the messages were processed, discarded, and the shutdown was received
+    assert wmgr_consumer.listening == False
+
+
+def test_eventconsumer_registration(
+    the_backbone: t.Any, test_dir: str, monkeypatch: pytest.MonkeyPatch
+) -> None:
+    """Verify that a consumer is correctly registered in
+    the backbone after sending a registration request. Then,
+    confirm the consumer is unregistered after sending the
+    un-register request.
+
+    :param the_backbone: The BackboneFeatureStore to use
+    :param test_dir: Automatically generated unique working
+    directories for individual test outputs
+    """
+
+    with monkeypatch.context() as patch:
+        registrar = ConsumerRegistrationListener(
+            the_backbone, 1.0, 2.0, as_service=False
+        )
+
+        # NOTE: service.execute(as_service=False) will complete the service life-
+        # cycle and remove the registrar from the backbone, so mock _on_shutdown
+        disabled_shutdown = mock.MagicMock()
+        patch.setattr(registrar, "_on_shutdown", disabled_shutdown)
+
+        # initialize registrar resources
+        registrar.execute()
+
+        # create a consumer that will be registered
+        wmgr_channel = DragonCommChannel(create_local())
+        wmgr_consumer = EventConsumer(wmgr_channel, the_backbone)
+
+        registered_channels = the_backbone.notification_channels
+
+        # trigger the consumer-to-registrar handshake
+        wmgr_consumer.register()
+
+        current_registrations: t.List[str] = []
+
+        # have the registrar run a few times to pick up the msg
+        for i in range(15):
+            registrar.execute()
+            current_registrations = the_backbone.notification_channels
+            if len(current_registrations) != len(registered_channels):
+                logger.debug(f"The event was processed on iteration {i}")
+                break
+
+        # confirm the consumer is registered
+        assert wmgr_channel.descriptor in current_registrations
+
+        # copy old list so we can compare against it.
+        registered_channels = list(current_registrations)
+
+        # trigger the consumer removal
+        wmgr_consumer.unregister()
+
+        # have the registrar run a few times to pick up the msg
+        for i in range(15):
+            registrar.execute()
+            current_registrations = the_backbone.notification_channels
+            if len(current_registrations) != len(registered_channels):
+                logger.debug(f"The event was processed on iteration {i}")
+                break
+
+        # confirm the consumer is no longer registered
+        assert wmgr_channel.descriptor not in current_registrations
+
+
+def test_registrar_teardown(
+    the_backbone: t.Any, test_dir: str, monkeypatch: pytest.MonkeyPatch
+) -> None:
+    """Verify that the consumer registrar removes itself from
+    the backbone when it shuts down.
+
+    :param the_backbone: The BackboneFeatureStore to use
+    :param test_dir: Automatically generated unique working
+    directories for individual test outputs
+    """
+
+    with monkeypatch.context() as patch:
+        registrar = ConsumerRegistrationListener(
+            the_backbone, 1.0, 2.0, as_service=False
+        )
+
+        # directly initialize registrar resources to avoid service life-cycle
+        registrar._create_eventing()
+
+        # confirm the registrar is published to the backbone
+        cfg = the_backbone.wait_for([BackboneFeatureStore.MLI_BACKEND_CONSUMER], 10)
+        assert BackboneFeatureStore.MLI_BACKEND_CONSUMER in cfg
+
+        # execute the entire service lifecycle 1x
+        registrar.execute()
+
+        consumer_found = BackboneFeatureStore.MLI_BACKEND_CONSUMER in the_backbone
+
+        for i in range(15):
+            time.sleep(0.1)
+            consumer_found = BackboneFeatureStore.MLI_BACKEND_CONSUMER in the_backbone
+            if not consumer_found:
+                logger.debug(f"Registrar removed from the backbone on iteration {i}")
+                break
+
+        assert BackboneFeatureStore.MLI_BACKEND_CONSUMER not in the_backbone
diff --git a/tests/dragon/test_featurestore.py b/tests/dragon/test_featurestore.py
index c08a8f30e..e34120c98 100644
--- a/tests/dragon/test_featurestore.py
+++ b/tests/dragon/test_featurestore.py
@@ -102,91 +102,6 @@ def the_backbone(
     return backbone
 
 
-def test_eventconsumer_eventpublisher_integration(
-    the_backbone: BackboneFeatureStore, test_dir: str
-) -> None:
-    """Verify that the publisher and consumer integrate as expected when
-    multiple publishers and consumers are sending simultaneously. This
-    test closely tracks the test in tests/test_featurestore.py also named
-    test_eventconsumer_eventpublisher_integration but requires dragon entities.
- - :param the_backbone: the dragon storage engine to use - :param test_dir: pytest fixture automatically generating unique working - directories for individual test outputs - """ - - # verify ability to write and read from ddict - the_backbone["test_dir"] = test_dir - assert the_backbone["test_dir"] == test_dir - - wmgr_channel = DragonCommChannel(create_local()) - capp_channel = DragonCommChannel(create_local()) - back_channel = DragonCommChannel(create_local()) - - wmgr_consumer_descriptor = wmgr_channel.descriptor - capp_consumer_descriptor = capp_channel.descriptor - back_consumer_descriptor = back_channel.descriptor - - # create some consumers to receive messages - wmgr_consumer = EventConsumer( - wmgr_channel, - the_backbone, - filters=[EventCategory.FEATURE_STORE_WRITTEN], - ) - capp_consumer = EventConsumer( - capp_channel, - the_backbone, - ) - back_consumer = EventConsumer( - back_channel, - the_backbone, - filters=[EventCategory.CONSUMER_CREATED], - ) - - # create some broadcasters to publish messages - mock_worker_mgr = EventBroadcaster( - the_backbone, - channel_factory=DragonCommChannel.from_descriptor, - ) - mock_client_app = EventBroadcaster( - the_backbone, - channel_factory=DragonCommChannel.from_descriptor, - ) - - # register all of the consumers even though the OnCreateConsumer really should - # trigger its registration. event processing is tested elsewhere. - the_backbone.notification_channels = [ - wmgr_consumer_descriptor, - capp_consumer_descriptor, - back_consumer_descriptor, - ] - - # simulate worker manager sending a notification to backend that it's alive - event_1 = OnCreateConsumer(wmgr_consumer_descriptor, []) - mock_worker_mgr.send(event_1) - - # simulate the app updating a model a few times - event_2 = OnWriteFeatureStore(the_backbone.descriptor, "key-1") - event_3 = OnWriteFeatureStore(the_backbone.descriptor, "key-2") - event_4 = OnWriteFeatureStore(the_backbone.descriptor, "key-1") - - mock_client_app.send(event_2) - mock_client_app.send(event_3) - mock_client_app.send(event_4) - - # worker manager should only get updates about feature update - wmgr_messages = wmgr_consumer.recv() - assert len(wmgr_messages) == 3 - - # the backend should only receive messages about consumer creation - back_messages = back_consumer.recv() - assert len(back_messages) == 1 - - # hypothetical app has no filters and will get all events - app_messages = capp_consumer.recv() - assert len(app_messages) == 4 - - def test_backbone_wait_for_no_keys( the_backbone: BackboneFeatureStore, monkeypatch: pytest.MonkeyPatch ) -> None: diff --git a/tests/dragon/test_featurestore_integration.py b/tests/dragon/test_featurestore_integration.py index e4d6bb9eb..895bc6467 100644 --- a/tests/dragon/test_featurestore_integration.py +++ b/tests/dragon/test_featurestore_integration.py @@ -86,90 +86,6 @@ def the_backbone(the_storage: t.Any) -> BackboneFeatureStore: return BackboneFeatureStore(the_storage, allow_reserved_writes=True) -def test_eventconsumer_eventpublisher_integration( - the_storage: t.Any, test_dir: str -) -> None: - """Verify that the publisher and consumer integrate as expected when - multiple publishers and consumers are sending simultaneously. This - test closely tracks the test in tests/test_featurestore.py also named - test_eventconsumer_eventpublisher_integration but requires dragon entities. 
- - :param the_storage: The dragon storage engine to use - :param test_dir: Automatically generated unique working - directories for individual test outputs - """ - - mock_storage = the_storage - backbone = BackboneFeatureStore(mock_storage, allow_reserved_writes=True) - - # verify ability to write and read from ddict - backbone["test_dir"] = test_dir - assert backbone["test_dir"] == test_dir - - wmgr_channel = DragonCommChannel(create_local()) - capp_channel = DragonCommChannel(create_local()) - back_channel = DragonCommChannel(create_local()) - - wmgr_consumer_descriptor = wmgr_channel.descriptor - capp_consumer_descriptor = capp_channel.descriptor - back_consumer_descriptor = back_channel.descriptor - - # create some consumers to receive messages - wmgr_consumer = EventConsumer( - wmgr_channel, - backbone, - filters=[EventCategory.FEATURE_STORE_WRITTEN], - ) - capp_consumer = EventConsumer( - capp_channel, - backbone, - ) - back_consumer = EventConsumer( - back_channel, - backbone, - filters=[EventCategory.CONSUMER_CREATED], - ) - - # create some broadcasters to publish messages - mock_worker_mgr = EventBroadcaster( - backbone, - channel_factory=DragonCommChannel.from_descriptor, - ) - mock_client_app = EventBroadcaster( - backbone, - channel_factory=DragonCommChannel.from_descriptor, - ) - - # register all of the consumers even though the OnCreateConsumer really should - # trigger its registration. event processing is tested elsewhere. - backbone.notification_channels = [ - wmgr_consumer_descriptor, - capp_consumer_descriptor, - back_consumer_descriptor, - ] - - # simulate worker manager sending a notification to backend that it's alive - event_1 = OnCreateConsumer(wmgr_consumer_descriptor, filters=[]) - mock_worker_mgr.send(event_1) - - # simulate the app updating a model a few times - for key in ["key-1", "key-2", "key-1"]: - event = OnWriteFeatureStore(backbone.descriptor, key) - mock_client_app.send(event, timeout=0.1) - - # worker manager should only get updates about feature update - wmgr_messages = wmgr_consumer.recv() - assert len(wmgr_messages) == 3 - - # the backend should only receive messages about consumer creation - back_messages = back_consumer.recv() - assert len(back_messages) == 1 - - # hypothetical app has no filters and will get all events - app_messages = capp_consumer.recv() - assert len(app_messages) == 4 - - @pytest.mark.parametrize( "num_events, batch_timeout, max_batches_expected", [ diff --git a/tests/test_dragon_comm_utils.py b/tests/test_dragon_comm_utils.py new file mode 100644 index 000000000..06d6e19b3 --- /dev/null +++ b/tests/test_dragon_comm_utils.py @@ -0,0 +1,228 @@ +# BSD 2-Clause License +# +# Copyright (c) 2021-2024, Hewlett Packard Enterprise +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+import pathlib
+import uuid
+
+import pytest
+
+from smartsim.error.errors import SmartSimError
+
+dragon = pytest.importorskip("dragon")
+
+# isort: off
+import dragon.channels as dch
+import dragon.fli as fli
+
+# isort: on
+
+from smartsim._core.mli.comm.channel import dragon_util
+from smartsim.log import get_logger
+
+# The tests in this file belong to the dragon group
+pytestmark = pytest.mark.dragon
+logger = get_logger(__name__)
+
+
+@pytest.fixture(scope="function")
+def the_channel() -> dch.Channel:
+    """Creates a Channel attached to the local memory pool."""
+    channel = dch.Channel.make_process_local()
+    return channel
+
+
+@pytest.fixture(scope="function")
+def the_fli(the_channel) -> fli.FLInterface:
+    """Creates an FLI attached to the local memory pool."""
+    fli_ = fli.FLInterface(main_ch=the_channel, manager_ch=None)
+    return fli_
+
+
+def test_descriptor_to_channel_empty() -> None:
+    """Verify that `descriptor_to_channel` raises an exception when
+    provided with an empty descriptor."""
+    descriptor = ""
+
+    with pytest.raises(ValueError) as ex:
+        dragon_util.descriptor_to_channel(descriptor)
+
+    assert "empty" in ex.value.args[0]
+
+
+@pytest.mark.parametrize(
+    "descriptor",
+    ["a", "ab", "abc", "x1", pathlib.Path(".").absolute().as_posix()],
+)
+def test_descriptor_to_channel_b64fail(descriptor: str) -> None:
+    """Verify that `descriptor_to_channel` raises an exception when
+    provided with an incorrectly encoded descriptor.
+
+    :param descriptor: A descriptor that is not properly base64 encoded
+    """
+
+    with pytest.raises(ValueError) as ex:
+        dragon_util.descriptor_to_channel(descriptor)
+
+    assert "base64" in ex.value.args[0]
+
+
+@pytest.mark.parametrize(
+    "descriptor",
+    [str(uuid.uuid4())],
+)
+def test_descriptor_to_channel_channel_fail(descriptor: str) -> None:
+    """Verify that `descriptor_to_channel` raises an exception when a correctly
+    formatted descriptor that does not describe a real channel is passed.
+
+    :param descriptor: A well-formed descriptor that does not reference
+    an existing channel
+    """
+
+    with pytest.raises(SmartSimError) as ex:
+        dragon_util.descriptor_to_channel(descriptor)
+
+    # ensure we're receiving the right exception
+    assert "address" in ex.value.args[0]
+    assert "channel" in ex.value.args[0]
+
+
+def test_descriptor_to_channel_channel_not_available(the_channel: dch.Channel) -> None:
+    """Verify that `descriptor_to_channel` raises an exception when a channel
+    is no longer available.
+
+    :param the_channel: A dragon channel
+    """
+
+    # get a good descriptor & wipe out the channel so it can't be attached
+    descriptor = dragon_util.channel_to_descriptor(the_channel)
+    the_channel.destroy()
+
+    with pytest.raises(SmartSimError) as ex:
+        dragon_util.descriptor_to_channel(descriptor)
+
+    assert "address" in ex.value.args[0]
+
+
+def test_descriptor_to_channel_happy_path(the_channel: dch.Channel) -> None:
+    """Verify that `descriptor_to_channel` works as expected when provided
+    a valid descriptor.
+
+    :param the_channel: A dragon channel
+    """
+
+    # get a good descriptor
+    descriptor = dragon_util.channel_to_descriptor(the_channel)
+
+    reattached = dragon_util.descriptor_to_channel(descriptor)
+    assert reattached
+
+    # and just make sure creation of the descriptor is transitive
+    assert dragon_util.channel_to_descriptor(reattached) == descriptor
+
+
+def test_descriptor_to_fli_empty() -> None:
+    """Verify that `descriptor_to_fli` raises an exception when
+    provided with an empty descriptor."""
+    descriptor = ""
+
+    with pytest.raises(ValueError) as ex:
+        dragon_util.descriptor_to_fli(descriptor)
+
+    assert "empty" in ex.value.args[0]
+
+
+@pytest.mark.parametrize(
+    "descriptor",
+    ["a", "ab", "abc", "x1", pathlib.Path(".").absolute().as_posix()],
+)
+def test_descriptor_to_fli_b64fail(descriptor: str) -> None:
+    """Verify that `descriptor_to_fli` raises an exception when
+    provided with an incorrectly encoded descriptor.
+
+    :param descriptor: A descriptor that is not properly base64 encoded
+    """
+
+    with pytest.raises(ValueError) as ex:
+        dragon_util.descriptor_to_fli(descriptor)
+
+    assert "base64" in ex.value.args[0]
+
+
+@pytest.mark.parametrize(
+    "descriptor",
+    [str(uuid.uuid4())],
+)
+def test_descriptor_to_fli_fli_fail(descriptor: str) -> None:
+    """Verify that `descriptor_to_fli` raises an exception when a correctly
+    formatted descriptor that does not describe a real FLI is passed.
+
+    :param descriptor: A well-formed descriptor that does not reference
+    an existing FLI
+    """
+
+    with pytest.raises(SmartSimError) as ex:
+        dragon_util.descriptor_to_fli(descriptor)
+
+    # ensure we're receiving the right exception
+    assert "address" in ex.value.args[0]
+    assert "fli" in ex.value.args[0].lower()
+
+
+def test_descriptor_to_fli_fli_not_available(
+    the_fli: fli.FLInterface, the_channel: dch.Channel
+) -> None:
+    """Verify that `descriptor_to_fli` raises an exception when an FLI
+    is no longer available.
+
+    :param the_fli: A dragon FLInterface
+    :param the_channel: A dragon channel
+    """
+
+    # get a good descriptor & wipe out the FLI so it can't be attached
+    descriptor = dragon_util.channel_to_descriptor(the_fli)
+    the_fli.destroy()
+    the_channel.destroy()
+
+    with pytest.raises(SmartSimError) as ex:
+        dragon_util.descriptor_to_fli(descriptor)
+
+    # ensure we're receiving the right exception
+    assert "address" in ex.value.args[0]
+
+
+def test_descriptor_to_fli_happy_path(the_fli: fli.FLInterface) -> None:
+    """Verify that `descriptor_to_fli` works as expected when provided
+    a valid descriptor.
+
+    :param the_fli: A dragon FLInterface
+    """
+
+    # get a good descriptor
+    descriptor = dragon_util.channel_to_descriptor(the_fli)
+
+    reattached = dragon_util.descriptor_to_fli(descriptor)
+    assert reattached
+
+    # and just make sure creation of the descriptor is transitive
+    assert dragon_util.channel_to_descriptor(reattached) == descriptor

From 5898005662045e5245a97a3202c63b8505c5f8e7 Mon Sep 17 00:00:00 2001
From: Chris McBride
Date: Tue, 1 Oct 2024 18:16:18 -0500
Subject: [PATCH 25/40] import order follow-up

---
 smartsim/_core/mli/comm/channel/dragon_util.py          |  2 +-
 .../_core/mli/infrastructure/control/event_listener.py  | 10 +++++-----
 .../infrastructure/storage/backbone_feature_store.py    |  2 +-
 tests/dragon/test_event_consumer.py                     |  7 +++----
 4 files changed, 10 insertions(+), 11 deletions(-)

diff --git a/smartsim/_core/mli/comm/channel/dragon_util.py b/smartsim/_core/mli/comm/channel/dragon_util.py
index 8edff31c0..258d84b3a 100644
--- a/smartsim/_core/mli/comm/channel/dragon_util.py
+++ b/smartsim/_core/mli/comm/channel/dragon_util.py
@@ -54,7 +54,7 @@ def channel_to_descriptor(channel: t.Union[dch.Channel, fli.FLInterface]) -> str
 
     :param channel: The dragon channel to convert
     :returns: The descriptor string
-    :raises: SmartSimError if a dragon channel is not provided
+    :raises SmartSimError: If a dragon channel is not provided
     """
     if channel is None:
         raise SmartSimError("Channel is not available to create a descriptor")
diff --git a/smartsim/_core/mli/infrastructure/control/event_listener.py b/smartsim/_core/mli/infrastructure/control/event_listener.py
index f1b7b664e..2485f77ea 100644
--- a/smartsim/_core/mli/infrastructure/control/event_listener.py
+++ b/smartsim/_core/mli/infrastructure/control/event_listener.py
@@ -149,8 +149,8 @@ def _can_shutdown(self) -> bool:
         return False
 
     def _on_unregister(self, event: OnRemoveConsumer) -> None:
-        """Event handler for updating the backbone when new event consumers
-        are registered.
+        """Event handler for updating the backbone when event consumers
+        are un-registered.
 
         :param event: The event that was received
         """
@@ -176,8 +176,8 @@ def _on_register(self, event: OnCreateConsumer) -> None:
         self._backbone.notification_channels = list(notify_list)
 
     def _on_event_received(self, event: EventBase) -> None:
-        """Event handler for updating the backbone when new event consumers
-        are registered.
+        """Primary event handler for the listener. Distributes events to
+        type-specific handlers.
 
         :param event: The event that was received
         """
@@ -290,7 +290,7 @@ def _connect_backbone() -> t.Optional[BackboneFeatureStore]:
 
     Load the backbone by retrieving the descriptor from environment variables.
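For reference, resolving the backbone from the environment follows the same pattern used by `standalone_worker_manager.py`. A sketch, assuming the launcher exported the backbone descriptor under `BackboneFeatureStore.MLI_BACKBONE` before this process started:

    import os

    from smartsim._core.mli.infrastructure.storage.backbone_feature_store import (
        BackboneFeatureStore,
    )

    descriptor = os.environ.get(BackboneFeatureStore.MLI_BACKBONE, "")
    if not descriptor:
        # no descriptor exported; nothing to attach to
        raise RuntimeError("Backbone descriptor not found in the environment")

    backbone = BackboneFeatureStore.from_descriptor(descriptor)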
:returns: The backbone feature store - :raises: SmartSimError if a descriptor is not found + :raises SmartSimError: if a descriptor is not found """ descriptor = os.environ.get(BackboneFeatureStore.MLI_BACKBONE, "") diff --git a/smartsim/_core/mli/infrastructure/storage/backbone_feature_store.py b/smartsim/_core/mli/infrastructure/storage/backbone_feature_store.py index 859e767b6..21fdecbed 100644 --- a/smartsim/_core/mli/infrastructure/storage/backbone_feature_store.py +++ b/smartsim/_core/mli/infrastructure/storage/backbone_feature_store.py @@ -414,7 +414,7 @@ def send(self, event: EventBase, timeout: float = 0.001) -> int: :param event: The event to send :param timeout: Maximum time to wait (in seconds) for messages to send :returns: The number of message copies that were sent - :raises: SmartSimError if the comm channel is not configured + :raises SmartSimError: If the comm channel is not configured """ if self._channel is None: raise SmartSimError("No channel to send on") diff --git a/tests/dragon/test_event_consumer.py b/tests/dragon/test_event_consumer.py index adac966ab..f361e6c16 100644 --- a/tests/dragon/test_event_consumer.py +++ b/tests/dragon/test_event_consumer.py @@ -30,15 +30,14 @@ import pytest -from smartsim._core.mli.infrastructure.control.event_listener import ( - ConsumerRegistrationListener, -) - dragon = pytest.importorskip("dragon") from smartsim._core.mli.comm.channel.dragon_channel import DragonCommChannel from smartsim._core.mli.comm.channel.dragon_fli import DragonFLIChannel from smartsim._core.mli.comm.channel.dragon_util import create_local +from smartsim._core.mli.infrastructure.control.event_listener import ( + ConsumerRegistrationListener, +) from smartsim._core.mli.infrastructure.storage.backbone_feature_store import ( BackboneFeatureStore, EventBase, From af870f95e7e64bd4d7ea014b0a979422dd6157d0 Mon Sep 17 00:00:00 2001 From: Chris McBride Date: Thu, 3 Oct 2024 19:41:56 -0500 Subject: [PATCH 26/40] pr review updates --- conftest.py | 14 +- ex/high_throughput_inference/mock_app.py | 2 +- .../standalone_worker_manager.py | 2 +- smartsim/_core/_cli/scripts/dragon_install.py | 3 +- smartsim/_core/entrypoints/service.py | 26 +- .../_core/launcher/dragon/dragonBackend.py | 23 +- .../_core/launcher/dragon/dragonConnector.py | 3 + .../_core/mli/comm/channel/dragon_channel.py | 8 - smartsim/_core/mli/comm/channel/dragon_fli.py | 8 +- .../_core/mli/comm/channel/dragon_util.py | 53 +- .../_core/mli/infrastructure/comm/__init__.py | 0 .../mli/infrastructure/comm/broadcaster.py | 238 +++++++ .../_core/mli/infrastructure/comm/consumer.py | 283 ++++++++ .../_core/mli/infrastructure/comm/event.py | 162 +++++ .../_core/mli/infrastructure/comm/producer.py | 44 ++ .../{event_listener.py => listener.py} | 67 +- .../control/request_dispatcher.py | 2 +- .../mli/infrastructure/environment_loader.py | 9 +- .../storage/backbone_feature_store.py | 616 +----------------- .../storage/dragon_feature_store.py | 10 +- .../mli/infrastructure/storage/dragon_util.py | 1 + .../infrastructure/storage/feature_store.py | 2 +- .../_core/mli/infrastructure/worker/worker.py | 2 +- smartsim/_core/utils/timings.py | 8 +- smartsim/protoclient.py | 42 +- tests/dragon/test_dragon_backend.py | 178 ++--- tests/dragon/test_environment_loader.py | 18 +- tests/dragon/test_error_handling.py | 11 + tests/dragon/test_event_consumer.py | 62 +- tests/dragon/test_featurestore.py | 6 - tests/dragon/test_featurestore_base.py | 114 +++- tests/dragon/test_featurestore_integration.py | 22 +- 
tests/dragon/test_protoclient.py | 11 +- tests/dragon/test_worker_manager.py | 6 +- tests/dragon/utils/msg_pump.py | 3 +- tests/mli/test_service.py | 109 +++- tests/test_dragon_comm_utils.py | 29 + 37 files changed, 1281 insertions(+), 916 deletions(-) create mode 100644 smartsim/_core/mli/infrastructure/comm/__init__.py create mode 100644 smartsim/_core/mli/infrastructure/comm/broadcaster.py create mode 100644 smartsim/_core/mli/infrastructure/comm/consumer.py create mode 100644 smartsim/_core/mli/infrastructure/comm/event.py create mode 100644 smartsim/_core/mli/infrastructure/comm/producer.py rename smartsim/_core/mli/infrastructure/control/{event_listener.py => listener.py} (84%) diff --git a/conftest.py b/conftest.py index 098a4a0c5..7302482e6 100644 --- a/conftest.py +++ b/conftest.py @@ -93,6 +93,7 @@ test_hostlist = None has_aprun = shutil.which("aprun") is not None + def get_account() -> str: return test_account @@ -459,15 +460,10 @@ def environment_cleanup(monkeypatch: pytest.MonkeyPatch) -> None: @pytest.fixture(scope="function", autouse=True) def check_output_dir() -> None: - try: - global test_output_dirs - assert os.path.isdir(test_output_root) - assert len(os.listdir(test_output_root)) >= test_output_dirs - test_output_dirs = len(os.listdir(test_output_root)) - except Exception: - # swallow error when the tests can't clean up test dirs - # and let the next run do the job. - ... + global test_output_dirs + assert os.path.isdir(test_output_root) + assert len(os.listdir(test_output_root)) >= test_output_dirs + test_output_dirs = len(os.listdir(test_output_root)) @pytest.fixture diff --git a/ex/high_throughput_inference/mock_app.py b/ex/high_throughput_inference/mock_app.py index f4db1bc1e..876f9145a 100644 --- a/ex/high_throughput_inference/mock_app.py +++ b/ex/high_throughput_inference/mock_app.py @@ -53,7 +53,7 @@ from smartsim.log import get_logger, log_to_file from smartsim.protoclient import ProtoClient -logger = get_logger("App", "DEBUG") +logger = get_logger("App") CHECK_RESULTS_AND_MAKE_ALL_SLOWER = False diff --git a/ex/high_throughput_inference/standalone_worker_manager.py b/ex/high_throughput_inference/standalone_worker_manager.py index fdef4268a..9a3926803 100644 --- a/ex/high_throughput_inference/standalone_worker_manager.py +++ b/ex/high_throughput_inference/standalone_worker_manager.py @@ -135,7 +135,7 @@ def service_as_dragon_proc( args = parser.parse_args() connect_to_infrastructure() - ddict_str = os.environ["_SMARTSIM_INFRA_BACKBONE"] + ddict_str = os.environ[BackboneFeatureStore.MLI_BACKBONE] backbone = BackboneFeatureStore.from_descriptor(ddict_str) diff --git a/smartsim/_core/_cli/scripts/dragon_install.py b/smartsim/_core/_cli/scripts/dragon_install.py index d9d0ef3c7..b6666f7c8 100644 --- a/smartsim/_core/_cli/scripts/dragon_install.py +++ b/smartsim/_core/_cli/scripts/dragon_install.py @@ -95,14 +95,13 @@ def get_auth_token(request: DragonInstallRequest) -> t.Optional[Token]: def create_dotenv(dragon_root_dir: pathlib.Path, dragon_version: str) -> None: """Create a .env file with required environment variables for the Dragon runtime""" dragon_root = str(dragon_root_dir) - dragon_rut_dir = dragon_root dragon_inc_dir = dragon_root + "/include" dragon_lib_dir = dragon_root + "/lib" dragon_bin_dir = dragon_root + "/bin" dragon_vars = { "DRAGON_BASE_DIR": dragon_root, - "DRAGON_ROOT_DIR": dragon_rut_dir, + "DRAGON_ROOT_DIR": dragon_root, "DRAGON_INCLUDE_DIR": dragon_inc_dir, "DRAGON_LIB_DIR": dragon_lib_dir, "DRAGON_VERSION": dragon_version, diff --git 
a/smartsim/_core/entrypoints/service.py b/smartsim/_core/entrypoints/service.py
index 27d541312..497bdda2f 100644
--- a/smartsim/_core/entrypoints/service.py
+++ b/smartsim/_core/entrypoints/service.py
@@ -42,19 +42,21 @@ class Service(ABC):
     def __init__(
         self,
         as_service: bool = False,
-        cooldown: int = 0,
-        loop_delay: int = 0,
+        cooldown: float = 0,
+        loop_delay: float = 0,
         health_check_frequency: float = 0,
     ) -> None:
         """Initialize the ServiceHost
 
-        :param as_service: Determines if the host will run until shutdown criteria
-        are met or as a run-once instance
-        :param cooldown: Period of time to allow service to run before automatic
-        shutdown, in seconds. A non-zero, positive integer.
-        :param loop_delay: Delay between iterations of the event loop (in seconds)
-        :param health_check_frequency: Delay between calls to a
-        health check handler (in seconds)
+        :param as_service: Determines if the host runs continuously until
+        shutdown criteria are met, or executes the service lifecycle once and exits
+        :param cooldown: Period of time (in seconds) to allow the service to run
+        after a shutdown is permitted. Enables the service to avoid restarting if
+        new work is discovered. A value of 0 disables the cooldown.
+        :param loop_delay: Time (in seconds) between iterations of the event loop
+        :param health_check_frequency: Time (in seconds) between calls to a
+        health check handler. A value of 0 triggers the health check on every
+        iteration.
         """
         self._as_service = as_service
         """If the service should run until shutdown function returns True"""
@@ -64,8 +66,8 @@ def __init__(
         self._loop_delay = abs(loop_delay)
         """Forced delay between iterations of the event loop"""
         self._health_check_frequency = health_check_frequency
-        """The time (in seconds) between desired health checks. A health check
-        frequency of zero will never trigger the health check."""
+        """The time (in seconds) between desired health checks. Frequency of 0
+        will trigger the health check on every event loop iteration."""
         self._last_health_check = time.time()
         """The timestamp of the latest health check"""
@@ -135,7 +137,7 @@ def execute(self) -> None:
                     "Failure in event loop resulted in service termination"
                 )
 
-            if self._health_check_frequency > 0:
+            if self._health_check_frequency >= 0:
                 hc_elapsed = time.time() - self._last_health_check
                 if hc_elapsed >= self._health_check_frequency:
                     self._on_health_check()
diff --git a/smartsim/_core/launcher/dragon/dragonBackend.py b/smartsim/_core/launcher/dragon/dragonBackend.py
index f5c271518..fb33460d8 100644
--- a/smartsim/_core/launcher/dragon/dragonBackend.py
+++ b/smartsim/_core/launcher/dragon/dragonBackend.py
@@ -48,7 +48,7 @@
 import dragon.native.machine as dragon_machine
 
 from smartsim._core.launcher.dragon.pqueue import NodePrioritizer, PrioritizerFilter
-from smartsim._core.mli.infrastructure.control.event_listener import (
+from smartsim._core.mli.infrastructure.control.listener import (
     ConsumerRegistrationListener,
 )
 from smartsim._core.mli.infrastructure.storage.backbone_feature_store import (
@@ -158,6 +158,7 @@ class DragonBackend:
     """
 
     _DEFAULT_NUM_MGR_PER_NODE = 2
+    """The default number of manager processes for each feature store node"""
     _DEFAULT_MEM_PER_NODE = 256 * 1024**2
     """The default memory capacity to allocate for a feature store node
     (in megabytes)"""
@@ -550,7 +551,9 @@ def _stop_steps(self) -> None:
 
     def _create_backbone(self) -> BackboneFeatureStore:
         """
-        Create a BackboneFeatureStore if one does not exist.
+        Creates a BackboneFeatureStore if one does not exist.
Updates
+        environment variables of this process to include the backbone
+        descriptor.
 
         :returns: The descriptor of the backbone feature store
         """
@@ -587,6 +590,13 @@ def _initialize_cooldown() -> int:
     def start_event_listener(
         self, cpu_affinity: list[int], gpu_affinity: list[int]
     ) -> dragon_process.Process:
+        """Start a standalone event listener.
+
+        :param cpu_affinity: The CPU affinity for the process
+        :param gpu_affinity: The GPU affinity for the process
+        :returns: The dragon Process managing the listener process
+        :raises SmartSimError: If the backbone is not provided
+        """
         if self._backbone is None:
             raise SmartSimError("Backbone feature store is not available")
 
@@ -607,7 +617,7 @@ def start_event_listener(
             cwd=os.getcwd(),
             env={
                 **os.environ,
-                **(self._backbone.get_env() if self._backbone is not None else {}),
+                **self._backbone.get_env(),
             },
             policy=local_policy,
             options=options,
@@ -657,6 +667,7 @@ def create_run_policy(
         )
 
     def _start_steps(self) -> None:
+        """Start all new steps created since the last update."""
         self._heartbeat()
         with self._queue_lock:
@@ -821,6 +832,9 @@ def _refresh_statuses(self) -> None:
                     group_info.redir_workers = None
 
     def _update_shutdown_status(self) -> None:
+        """Query the status of running tasks and update the status
+        of any that have completed.
+        """
         self._heartbeat()
         with self._queue_lock:
             self._can_shutdown |= (
@@ -834,6 +848,9 @@ def _update_shutdown_status(self) -> None:
         )
 
     def _should_print_status(self) -> bool:
+        """Determine if status messages should be printed based on the last
+        update. Returns `True` to trigger prints, `False` otherwise.
+        """
         if self.current_time - self._last_update_time > 10:
             self._last_update_time = self.current_time
             return True
diff --git a/smartsim/_core/launcher/dragon/dragonConnector.py b/smartsim/_core/launcher/dragon/dragonConnector.py
index 9cbc55674..98670f347 100644
--- a/smartsim/_core/launcher/dragon/dragonConnector.py
+++ b/smartsim/_core/launcher/dragon/dragonConnector.py
@@ -245,6 +245,9 @@ def load_persisted_env(self) -> t.Dict[str, str]:
 
         with open(config.dragon_dotenv, encoding="utf-8") as dot_env:
             for kvp in dot_env.readlines():
+                if not kvp:
+                    continue
+
                 # skip any commented lines
                 if not kvp.startswith("#"):
                     split = kvp.strip().split("=", maxsplit=1)
diff --git a/smartsim/_core/mli/comm/channel/dragon_channel.py b/smartsim/_core/mli/comm/channel/dragon_channel.py
index 4ccf7cf7f..110f19258 100644
--- a/smartsim/_core/mli/comm/channel/dragon_channel.py
+++ b/smartsim/_core/mli/comm/channel/dragon_channel.py
@@ -35,14 +35,6 @@
 
 logger = get_logger(__name__)
 
-DEFAULT_CHANNEL_BUFFER_SIZE = 500
-"""Maximum number of messages that can be buffered. DragonCommChannel will
-raise an exception if no clients consume messages before the buffer is filled."""
-
-LAST_OFFSET = 0
-"""The last offset used to create a local channel. This is used to avoid
-unnecessary retries when creating a local channel."""
-
 
 class DragonCommChannel(cch.CommChannelBase):
     """Passes messages by writing to a Dragon channel."""
diff --git a/smartsim/_core/mli/comm/channel/dragon_fli.py b/smartsim/_core/mli/comm/channel/dragon_fli.py
index 254a21c5b..d7787f2ca 100644
--- a/smartsim/_core/mli/comm/channel/dragon_fli.py
+++ b/smartsim/_core/mli/comm/channel/dragon_fli.py
@@ -51,7 +51,7 @@ def __init__(
     ) -> None:
         """Initialize the DragonFLIChannel instance.
-        :param fli_desc: The descriptor of the FLI channel to attach
+        :param fli_: The FLInterface to use as the underlying communications channel
         :param sender_supplied: Flag indicating if the FLI uses sender-supplied streams
         :param buffer_size: Maximum number of sent messages that can be buffered
         """
@@ -79,7 +79,7 @@ def send(self, value: bytes, timeout: float = 0.001) -> None:
                 logger.debug(f"DragonFLIChannel {self.descriptor} sent message")
         except Exception as e:
             raise SmartSimError(
-                f"Error sending message: DragonFLIChannel {self.descriptor}"
+                f"Error sending via DragonFLIChannel {self.descriptor}"
             ) from e

     def recv(self, timeout: float = 0.001) -> t.List[bytes]:
@@ -99,6 +99,7 @@ def recv(self, timeout: float = 0.001) -> t.List[bytes]:
                     logger.debug(f"DragonFLIChannel {self.descriptor} received message")
             except fli.FLIEOT:
                 eot = True
+                logger.debug(f"DragonFLIChannel exhausted: {self.descriptor}")
             except Exception as e:
                 raise SmartSimError(
                     f"Error receiving messages: DragonFLIChannel {self.descriptor}"
@@ -134,7 +135,8 @@ def from_descriptor(

         :param descriptor: The descriptor that uniquely identifies the resource
         :returns: An attached DragonFLIChannel
-        :raises SmartSimError: If creation of DragonFLIChanenel fails
+        :raises SmartSimError: If creation of DragonFLIChannel fails
+        :raises ValueError: If the descriptor is invalid
         """
         if not descriptor:
             raise ValueError("Invalid descriptor provided")
diff --git a/smartsim/_core/mli/comm/channel/dragon_util.py b/smartsim/_core/mli/comm/channel/dragon_util.py
index 258d84b3a..8517979ec 100644
--- a/smartsim/_core/mli/comm/channel/dragon_util.py
+++ b/smartsim/_core/mli/comm/channel/dragon_util.py
@@ -30,10 +30,7 @@

 import dragon.channels as dch
 import dragon.fli as fli
-import dragon.infrastructure.facts as df
-import dragon.infrastructure.parameters as dp
 import dragon.managed_memory as dm
-import dragon.utils as du

 from smartsim.error.errors import SmartSimError
 from smartsim.log import get_logger
@@ -54,10 +51,10 @@ def channel_to_descriptor(channel: t.Union[dch.Channel, fli.FLInterface]) -> str

     :param channel: The dragon channel to convert
     :returns: The descriptor string
-    :raises SmartSimError: If a dragon channel is not provided
+    :raises ValueError: If a dragon channel is not provided
     """
     if channel is None:
-        raise SmartSimError("Channel is not available to create a descriptor")
+        raise ValueError("Channel is not available to create a descriptor")

     serialized_ch = channel.serialize()
     return base64.b64encode(serialized_ch).decode("utf-8")
@@ -67,9 +64,11 @@ def pool_to_descriptor(pool: dm.MemoryPool) -> str:
     """Convert a dragon memory pool to a descriptor string.
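As a usage sketch for these helpers: a channel can be round-tripped through its string descriptor so a second process can attach to it. This assumes a running Dragon runtime; `create_local` is defined later in this file:

```python
# Round-trip a local Dragon channel through its base64 string descriptor.
from smartsim._core.mli.comm.channel.dragon_util import (
    channel_to_descriptor,
    create_local,
    descriptor_to_channel,
)

channel = create_local()  # backed by dch.Channel.make_process_local()
descriptor = channel_to_descriptor(channel)  # base64-encoded serialized channel

# any process holding the descriptor string can attach to the same channel
attached = descriptor_to_channel(descriptor)
```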
:param pool: The memory pool to convert - :returns: The descriptor string""" + :returns: The descriptor string + :raises ValueError: If a memory pool is not provided + """ if pool is None: - raise SmartSimError("Memory pool is not available to create a descriptor") + raise ValueError("Memory pool is not available to create a descriptor") serialized_pool = pool.serialize() return base64.b64encode(serialized_pool).decode("utf-8") @@ -82,6 +81,7 @@ def descriptor_to_fli(descriptor: str) -> "fli.FLInterface": :param descriptor: The descriptor of an FLI to attach to :returns: The attached dragon FLI :raises ValueError: If the descriptor is empty or incorrectly formatted + :raises SmartSimError: If attachment using the descriptor fails """ if len(descriptor) < 1: raise ValueError("Descriptors may not be empty") @@ -103,7 +103,8 @@ def descriptor_to_channel(descriptor: str) -> dch.Channel: :param descriptor: The descriptor of a channel to attach to :returns: The attached dragon Channel :raises ValueError: If the descriptor is empty or incorrectly formatted - :raises SmartSimError: If the descriptor does not attach to a channel""" + :raises SmartSimError: If attachment using the descriptor fails + """ if len(descriptor) < 1: raise ValueError("Descriptors may not be empty") @@ -122,43 +123,9 @@ def create_local(_capacity: int = 0) -> dch.Channel: direct calls to `dch.Channel.make_process_local()` to enable supplying a channel capacity. - :param capacity: The number of events the channel can buffer; uses the default + :param _capacity: The number of events the channel can buffer; uses the default buffer size `DEFAULT_CHANNEL_BUFFER_SIZE` when not supplied :returns: The instantiated channel - :raises SmartSimError: If unable to attach local channel """ - # current implementation has a bug wrt MPI that must be fixed. - # falling back to `make_process_local` and disabling buffer size tests - - # pool = dm.MemoryPool.attach(du.B64.str_to_bytes(dp.this_process.default_pd)) - # pool_descriptor = pool_to_descriptor(pool) - # channel: t.Optional[dch.Channel] = None - # offset = 0 - - # global LAST_OFFSET - # if LAST_OFFSET: - # offset = LAST_OFFSET - - # capacity = capacity if capacity > 0 else DEFAULT_CHANNEL_BUFFER_SIZE - - # while not channel: - # # search for an open channel ID - # offset += 1 - # channel_id = df.BASE_USER_MANAGED_CUID + offset - # try: - # channel = dch.Channel(mem_pool=pool, c_uid=channel_id, capacity=capacity) - # LAST_OFFSET = offset - # descriptor = channel_to_descriptor(channel) - # logger.debug( - # "Local channel created: " - # f"{channel_id=}, {pool_descriptor=}, {capacity=}, {descriptor=}" - # ) - # except dch.ChannelError as e: - # if offset < 100: - # logger.warning(f"Channnel id `{channel_id}` is not open. Retrying...") - # else: - # LAST_OFFSET = 0 - # logger.error(f"All attempts to attach local channel have failed") - # raise SmartSimError("Failed to attach local channel") from e channel = dch.Channel.make_process_local() return channel diff --git a/smartsim/_core/mli/infrastructure/comm/__init__.py b/smartsim/_core/mli/infrastructure/comm/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/smartsim/_core/mli/infrastructure/comm/broadcaster.py b/smartsim/_core/mli/infrastructure/comm/broadcaster.py new file mode 100644 index 000000000..d813cce12 --- /dev/null +++ b/smartsim/_core/mli/infrastructure/comm/broadcaster.py @@ -0,0 +1,238 @@ +# BSD 2-Clause License +# +# Copyright (c) 2021-2024, Hewlett Packard Enterprise +# All rights reserved. 
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# 1. Redistributions of source code must retain the above copyright notice, this
+#    list of conditions and the following disclaimer.
+#
+# 2. Redistributions in binary form must reproduce the above copyright notice,
+#    this list of conditions and the following disclaimer in the documentation
+#    and/or other materials provided with the distribution.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+import typing as t
+import uuid
+from collections import defaultdict, deque
+
+from smartsim._core.mli.comm.channel.channel import CommChannelBase
+from smartsim._core.mli.infrastructure.comm.event import EventBase
+from smartsim._core.mli.infrastructure.storage.backbone_feature_store import (
+    BackboneFeatureStore,
+)
+from smartsim.error.errors import SmartSimError
+from smartsim.log import get_logger
+
+logger = get_logger(__name__)
+
+
+class BroadcastResult(t.NamedTuple):
+    """Contains summary details about a broadcast."""
+
+    num_sent: int
+    """The total number of messages delivered across all consumers"""
+    num_failed: int
+    """The total number of messages not delivered across all consumers"""
+
+
+class EventBroadcaster:
+    """Performs fan-out publishing of system events."""
+
+    def __init__(
+        self,
+        backbone: BackboneFeatureStore,
+        channel_factory: t.Optional[t.Callable[[str], CommChannelBase]] = None,
+        name: t.Optional[str] = None,
+    ) -> None:
+        """Initialize the EventBroadcaster instance.
+
+        :param backbone: The MLI backbone feature store
+        :param channel_factory: Factory method to construct new channel instances
+        :param name: A user-friendly name for logging; auto-generated if not provided
+        """
+        self._backbone = backbone
+        """The backbone feature store used to retrieve consumer descriptors"""
+        self._channel_factory = channel_factory
+        """A factory method used to instantiate channels from descriptors"""
+        self._channel_cache: t.Dict[str, t.Optional[CommChannelBase]] = defaultdict(
+            lambda: None
+        )
+        """A mapping of instantiated channels that can be re-used. Automatically
+        calls the channel factory if a descriptor is not already in the collection"""
+        self._event_buffer: t.Deque[EventBase] = deque()
+        """A buffer for storing events when a consumer list is not found"""
+        self._descriptors: t.Set[str]
+        """Stores the most recent list of broadcast consumers. Updated automatically
+        on each broadcast"""
+        self._name = name or str(uuid.uuid4())
+        """A unique identifier assigned to the broadcaster for logging"""
+
+    @property
+    def name(self) -> str:
+        """The friendly name assigned to the broadcaster.
+
+        :returns: The broadcaster name if one is assigned, otherwise a unique
+        id assigned by the system.
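To see the broadcaster in use: it is typically paired with a channel factory so cached channels can be rebuilt from descriptors stored in the backbone. A minimal sketch, assuming `backbone` is an already-attached `BackboneFeatureStore` and using the `DragonCommChannel.from_descriptor` factory seen elsewhere in this patch:

```python
# Publish a feature-store write notification to every registered consumer.
from smartsim._core.mli.comm.channel.dragon_channel import DragonCommChannel
from smartsim._core.mli.infrastructure.comm.broadcaster import EventBroadcaster
from smartsim._core.mli.infrastructure.comm.event import OnWriteFeatureStore

broadcaster = EventBroadcaster(
    backbone,  # assumption: an attached BackboneFeatureStore
    channel_factory=DragonCommChannel.from_descriptor,
    name="example-publisher",
)

event = OnWriteFeatureStore("example-publisher", backbone.descriptor, "model-key")
num_sent = broadcaster.send(event, timeout=0.1)  # buffered if no consumers yet
```

Because unsent events stay in `_event_buffer`, a send that finds no registered consumers is not an error; the events go out on a later call once descriptors appear in the backbone.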
+ """ + return self._name + + @property + def num_buffered(self) -> int: + """Return the number of events currently buffered to send. + + :returns: Number of buffered events + """ + return len(self._event_buffer) + + def _save_to_buffer(self, event: EventBase) -> None: + """Places the event in the buffer to be sent once a consumer + list is available. + + :param event: The event to buffer + :raises ValueError: If the event cannot be buffered + """ + try: + self._event_buffer.append(event) + logger.debug(f"Buffered event {event=}") + except Exception as ex: + raise ValueError( + f"Unable to buffer event {event} in broadcaster {self.name}" + ) from ex + + def _log_broadcast_start(self) -> None: + """Logs broadcast statistics.""" + num_events = len(self._event_buffer) + num_copies = len(self._descriptors) + logger.debug( + f"Broadcast {num_events} events to {num_copies} consumers from {self.name}" + ) + + def _prune_unused_consumers(self) -> None: + """Performs maintenance on the channel cache by pruning any channel + that has been removed from the consumers list.""" + active_consumers = set(self._descriptors) + current_channels = set(self._channel_cache.keys()) + + # find any cached channels that are now unused + inactive_channels = current_channels.difference(active_consumers) + new_channels = active_consumers.difference(current_channels) + + for descriptor in inactive_channels: + self._channel_cache.pop(descriptor) + + logger.debug( + f"Pruning {len(inactive_channels)} stale consumers and" + f" found {len(new_channels)} new channels for {self.name}" + ) + + def _get_comm_channel(self, descriptor: str) -> CommChannelBase: + """Helper method to build and cache a comm channel. + + :param descriptor: The descriptor to pass to the channel factory + :returns: The instantiated channel + :raises SmartSimError: If the channel fails to attach + """ + comm_channel = self._channel_cache[descriptor] + if comm_channel is not None: + return comm_channel + + if self._channel_factory is None: + raise SmartSimError("No channel factory provided for consumers") + + try: + channel = self._channel_factory(descriptor) + self._channel_cache[descriptor] = channel + return channel + except Exception as ex: + msg = f"Unable to construct channel with descriptor: {descriptor}" + logger.error(msg, exc_info=True) + raise SmartSimError(msg) from ex + + def _get_next_event(self) -> t.Optional[EventBase]: + """Pop the next event to be sent from the queue. + + :returns: The next event to send if any events are enqueued, otherwise `None`. + """ + try: + return self._event_buffer.popleft() + except IndexError: + logger.debug(f"Broadcast buffer exhausted for {self.name}") + + return None + + def _broadcast(self, timeout: float = 0.001) -> BroadcastResult: + """Broadcasts all buffered events to registered event consumers. 
+
+        :param timeout: Maximum time to wait (in seconds) for messages to send
+        :returns: BroadcastResult containing the number of messages that were
+        successfully and unsuccessfully sent for all consumers
+        :raises SmartSimError: If a channel fails to attach
+        """
+        # allow descriptors to be empty since events are buffered
+        self._descriptors = set(x for x in self._backbone.notification_channels if x)
+        if not self._descriptors:
+            msg = f"No event consumers are registered for {self.name}"
+            logger.warning(msg)
+            return BroadcastResult(0, 0)
+
+        self._prune_unused_consumers()
+        self._log_broadcast_start()
+
+        num_listeners = len(self._descriptors)
+        num_sent = 0
+        num_failures = 0
+
+        # send each event to every consumer
+        while event := self._get_next_event():
+            logger.debug(f"Broadcasting {event=} to {num_listeners} listeners")
+            event_bytes = bytes(event)
+
+            for i, descriptor in enumerate(self._descriptors):
+                comm_channel = self._get_comm_channel(descriptor)
+
+                try:
+                    comm_channel.send(event_bytes, timeout)
+                    num_sent += 1
+                except Exception:
+                    msg = (
+                        f"Broadcast {i+1}/{num_listeners} for event {event.uid} to "
+                        f"channel {descriptor} from {self.name} failed."
+                    )
+                    logger.exception(msg)
+                    num_failures += 1
+
+        return BroadcastResult(num_sent, num_failures)
+
+    def send(self, event: EventBase, timeout: float = 0.001) -> int:
+        """Implementation of the `send` method of the `EventPublisher` protocol.
+        Publishes the supplied event to all registered broadcast consumers.
+
+        :param event: An event to publish
+        :param timeout: Maximum time to wait (in seconds) for messages to send
+        :returns: The total number of events successfully published to consumers
+        :raises ValueError: If event serialization fails
+        :raises AttributeError: If event cannot be serialized
+        :raises KeyError: If a channel fails to attach using registered descriptors
+        :raises SmartSimError: If any unexpected error occurs during send
+        """
+        try:
+            self._save_to_buffer(event)
+            result = self._broadcast(timeout)
+            return result.num_sent
+        except (KeyError, ValueError, AttributeError, SmartSimError):
+            raise
+        except Exception as ex:
+            raise SmartSimError("An unexpected failure occurred while sending") from ex
diff --git a/smartsim/_core/mli/infrastructure/comm/consumer.py b/smartsim/_core/mli/infrastructure/comm/consumer.py
new file mode 100644
index 000000000..3e03ba86c
--- /dev/null
+++ b/smartsim/_core/mli/infrastructure/comm/consumer.py
@@ -0,0 +1,283 @@
+# BSD 2-Clause License
+#
+# Copyright (c) 2021-2024, Hewlett Packard Enterprise
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# 1. Redistributions of source code must retain the above copyright notice, this
+#    list of conditions and the following disclaimer.
+#
+# 2. Redistributions in binary form must reproduce the above copyright notice,
+#    this list of conditions and the following disclaimer in the documentation
+#    and/or other materials provided with the distribution.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+import pickle
+import time
+import typing as t
+import uuid
+
+from smartsim._core.mli.comm.channel.channel import CommChannelBase
+from smartsim._core.mli.comm.channel.dragon_channel import DragonCommChannel
+from smartsim._core.mli.infrastructure.comm.event import (
+    EventBase,
+    OnCreateConsumer,
+    OnRemoveConsumer,
+    OnShutdownRequested,
+)
+from smartsim._core.mli.infrastructure.storage.backbone_feature_store import (
+    BackboneFeatureStore,
+)
+from smartsim.log import get_logger
+
+logger = get_logger(__name__)
+
+
+class EventConsumer:
+    """Reads system events published to a communications channel."""
+
+    _BACKBONE_WAIT_TIMEOUT = 10.0
+    """Maximum time (in seconds) to wait for the backbone to register the consumer"""
+
+    def __init__(
+        self,
+        comm_channel: CommChannelBase,
+        backbone: BackboneFeatureStore,
+        filters: t.Optional[t.List[str]] = None,
+        name: t.Optional[str] = None,
+        event_handler: t.Optional[t.Callable[[EventBase], None]] = None,
+    ) -> None:
+        """Initialize the EventConsumer instance.
+
+        :param comm_channel: Communications channel to listen to for events
+        :param backbone: The MLI backbone feature store
+        :param filters: A list of event types to deliver. When empty, all
+        events will be delivered
+        :param name: A user-friendly name for logging. If not provided, an
+        auto-generated GUID will be used
+        :param event_handler: An optional callback invoked for each received
+        event that passes the filters
+        """
+        self._comm_channel = comm_channel
+        """The comm channel used by the consumer to receive messages. The channel
+        descriptor will be published for senders to discover."""
+        self._backbone = backbone
+        """The backbone instance used to bootstrap the instance. The EventConsumer
+        uses the backbone to discover where it can publish its descriptor."""
+        self._global_filters = filters or []
+        """A set of global filters to apply to incoming events. Global filters are
+        combined with per-call filters. Filters act as an allow-list."""
+        self._name = name or str(uuid.uuid4())
+        """User-friendly name assigned to a consumer for logging. Automatically
+        assigned if not provided."""
+        self._event_handler = event_handler
+        """The function that should be executed when an event
+        passed by the filters is received."""
+        self.listening = True
+        """Flag indicating that the consumer is currently listening for new
+        events. Setting this flag to `False` will cause any active calls to
+        `listen` to terminate."""
+
+    @property
+    def descriptor(self) -> str:
+        """The descriptor of the underlying comm channel.
+
+        :returns: The comm channel descriptor"""
+        return self._comm_channel.descriptor
+
+    @property
+    def name(self) -> str:
+        """The friendly name assigned to the consumer.
+
+        :returns: The consumer name if one is assigned, otherwise a unique
+        id assigned by the system.
+        """
+        return self._name
+
+    def recv(
+        self,
+        filters: t.Optional[t.List[str]] = None,
+        timeout: float = 0.001,
+        batch_timeout: float = 1.0,
+    ) -> t.List[EventBase]:
+        """Receives available published event(s).
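A minimal sketch of the consumer workflow defined here: create a local channel, register with the registrar listener so broadcasts reach it, then poll for filtered events. It assumes a running Dragon runtime, an attached `backbone`, and that `DragonCommChannel` can be constructed around a raw local channel (only its `from_descriptor` factory is shown in this patch):

```python
# Receive feature-store write notifications on a dedicated channel.
from smartsim._core.mli.comm.channel.dragon_channel import DragonCommChannel
from smartsim._core.mli.comm.channel.dragon_util import create_local
from smartsim._core.mli.infrastructure.comm.consumer import EventConsumer
from smartsim._core.mli.infrastructure.comm.event import OnWriteFeatureStore

consumer = EventConsumer(
    DragonCommChannel(create_local()),  # assumption: ctor wraps a raw channel
    backbone,                           # assumption: attached BackboneFeatureStore
    filters=[OnWriteFeatureStore.FEATURE_STORE_WRITTEN],
    name="example-consumer",
)

consumer.register()  # notify the registrar so broadcasts reach this channel
for event in consumer.recv(timeout=0.01, batch_timeout=1.0):
    # the filter guarantees only OnWriteFeatureStore events arrive here
    print(f"{event.source} wrote {event.key}")
consumer.unregister()
```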
+
+        :param filters: Additional filters to add to the global filters configured
+        on the EventConsumer instance
+        :param timeout: Maximum time to wait for a single message to arrive
+        :param batch_timeout: Maximum time to wait for messages to arrive; allows
+        multiple batches to be retrieved in one call to `recv`
+        :returns: A list of events that pass any configured filters
+        :raises ValueError: If a non-positive value is provided for the
+        timeout or batch_timeout
+        """
+        if filters is None:
+            filters = []
+
+        if timeout is not None and timeout <= 0:
+            raise ValueError("request timeout must be a non-zero, positive value")
+
+        if batch_timeout is not None and batch_timeout <= 0:
+            raise ValueError("batch_timeout must be a non-zero, positive value")
+
+        filter_set = {*self._global_filters, *filters}
+        all_message_bytes: t.List[bytes] = []
+
+        # firehose as many messages as possible within the batch_timeout
+        start_at = time.time()
+        remaining = batch_timeout
+
+        batch_message_bytes = self._comm_channel.recv(timeout=timeout)
+        while batch_message_bytes:
+            # remove any empty messages that will fail to decode
+            all_message_bytes.extend(batch_message_bytes)
+            batch_message_bytes = []
+
+            # avoid getting stuck indefinitely waiting for the channel
+            elapsed = time.time() - start_at
+            remaining = batch_timeout - elapsed
+
+            if remaining > 0:
+                batch_message_bytes = self._comm_channel.recv(timeout=timeout)
+
+        events_received: t.List[EventBase] = []
+
+        # Timeout elapsed or no messages received - return the empty list
+        if not all_message_bytes:
+            return events_received
+
+        for message in all_message_bytes:
+            if not message:
+                continue
+
+            event = pickle.loads(message)
+            if not event:
+                logger.warning(f"Consumer {self.name} is unable to unpickle message")
+                continue
+
+            # skip events that don't pass a filter
+            if filter_set and event.category not in filter_set:
+                continue
+
+            events_received.append(event)
+
+        return events_received
+
+    def _send_to_registrar(self, event: EventBase) -> None:
+        """Send an event directly to the registrar listener.
+
+        :param event: The event to send
+        """
+        registrar_key = BackboneFeatureStore.MLI_REGISTRAR_CONSUMER
+        config = self._backbone.wait_for([registrar_key], self._BACKBONE_WAIT_TIMEOUT)
+        registrar_descriptor = str(config.get(registrar_key, None))
+
+        if not registrar_descriptor:
+            logger.warning(
+                f"Unable to send {event.category} from {self.name}. "
+                "No registrar channel found."
+            )
+            return
+
+        logger.debug(f"Sending {event.category} from {self.name}")
+
+        registrar_channel = DragonCommChannel.from_descriptor(registrar_descriptor)
+        registrar_channel.send(bytes(event), timeout=1.0)
+
+        logger.debug(f"{event.category} from {self.name} sent")
+
+    def register(self) -> None:
+        """Send an event to register this consumer as a listener."""
+        descriptor = self._comm_channel.descriptor
+        event = OnCreateConsumer(self.name, descriptor, self._global_filters)
+
+        self._send_to_registrar(event)
+
+    def unregister(self) -> None:
+        """Send an event to un-register this consumer as a listener."""
+        descriptor = self._comm_channel.descriptor
+        event = OnRemoveConsumer(self.name, descriptor)
+
+        self._send_to_registrar(event)
+
+    def _on_handler_missing(self, event: EventBase) -> None:
+        """A "dead letter" event handler that is called to perform
+        processing on events before they're discarded.
+
+        :param event: The event to handle
+        """
+        logger.warning(
+            "No event handler is registered in consumer "
+            f"{self.name}. Discarding {event=}"
+        )
+
+    def listen_once(self, timeout: float = 0.001, batch_timeout: float = 1.0) -> None:
+        """Receives messages for the consumer a single time. Delivers
+        all messages that pass the consumer filters. Shutdown requests
+        are handled by a default event handler.
+
+        NOTE: Executes a single batch-retrieval to receive the maximum
+        number of messages available under the batch timeout. To continually
+        listen, use `listen` in a non-blocking thread/process.
+
+        :param timeout: Maximum time to wait (in seconds) for a message to arrive
+        :param batch_timeout: Maximum time to wait (in seconds) for a batch to arrive
+        """
+        logger.info(
+            f"Consumer {self.name} listening with {timeout} second timeout"
+            f" on channel {self._comm_channel.descriptor}"
+        )
+
+        if not self._event_handler:
+            logger.info("Unable to handle messages. No event handler is registered.")
+
+        incoming_messages = self.recv(timeout=timeout, batch_timeout=batch_timeout)
+
+        if not incoming_messages:
+            logger.info(f"Consumer {self.name} received empty message list")
+
+        for message in incoming_messages:
+            logger.info(f"Consumer {self.name} is handling event {message=}")
+            self._handle_shutdown(message)
+
+            if self._event_handler:
+                self._event_handler(message)
+            else:
+                self._on_handler_missing(message)
+
+    def _handle_shutdown(self, event: EventBase) -> bool:
+        """Handles shutdown requests sent to the consumer by setting the
+        `self.listening` attribute to `False`.
+
+        :param event: The event to handle
+        :returns: A bool indicating if the event was a shutdown request
+        """
+        if isinstance(event, OnShutdownRequested):
+            logger.debug(f"Shutdown requested from: {event.source}")
+            self.listening = False
+            return True
+        return False
+
+    def listen(self, timeout: float = 0.001, batch_timeout: float = 1.0) -> None:
+        """Receives messages for the consumer until a shutdown request is received.
+
+        :param timeout: Maximum time to wait (in seconds) for a message to arrive
+        :param batch_timeout: Maximum time to wait (in seconds) for a batch to arrive
+        """
+        logger.debug(f"Consumer {self.name} is now listening for events.")
+
+        while self.listening:
+            self.listen_once(timeout, batch_timeout)
+
+        logger.debug(f"Consumer {self.name} is no longer listening.")
diff --git a/smartsim/_core/mli/infrastructure/comm/event.py b/smartsim/_core/mli/infrastructure/comm/event.py
new file mode 100644
index 000000000..ccef9f9b8
--- /dev/null
+++ b/smartsim/_core/mli/infrastructure/comm/event.py
@@ -0,0 +1,162 @@
+# BSD 2-Clause License
+#
+# Copyright (c) 2021-2024, Hewlett Packard Enterprise
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# 1. Redistributions of source code must retain the above copyright notice, this
+#    list of conditions and the following disclaimer.
+#
+# 2. Redistributions in binary form must reproduce the above copyright notice,
+#    this list of conditions and the following disclaimer in the documentation
+#    and/or other materials provided with the distribution.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+import pickle
+import typing as t
+import uuid
+from dataclasses import dataclass, field
+
+from smartsim.log import get_logger
+
+logger = get_logger(__name__)
+
+
+@dataclass
+class EventBase:
+    """Core API for an event."""
+
+    category: str
+    """Unique category name for an event class"""
+    source: str
+    """A unique identifier for the publisher of the event"""
+    uid: str = field(default_factory=lambda: str(uuid.uuid4()))
+    """A unique identifier for this event"""
+
+    def __bytes__(self) -> bytes:
+        """Default conversion to bytes for an event required to publish
+        messages using byte-oriented communication channels.
+
+        :returns: This entity encoded as bytes"""
+        return pickle.dumps(self)
+
+    def __str__(self) -> str:
+        """Convert the event to a string.
+
+        :returns: A string representation of this instance"""
+        return f"{self.uid}|{self.category}"
+
+
+class OnShutdownRequested(EventBase):
+    """Publish this event to trigger the listener to shutdown."""
+
+    SHUTDOWN: t.ClassVar[str] = "consumer-unregister"
+    """Unique category name for an event raised to request a listener shutdown"""
+
+    def __init__(self, source: str) -> None:
+        """Initialize the event instance.
+
+        :param source: A unique identifier for the publisher of the event
+        """
+        super().__init__(self.SHUTDOWN, source)
+
+
+class OnCreateConsumer(EventBase):
+    """Publish this event when a new event consumer registration is required."""
+
+    descriptor: str
+    """Descriptor of the comm channel exposed by the consumer"""
+    filters: t.List[str] = field(default_factory=list)
+    """The collection of filters indicating messages of interest to this consumer"""
+
+    CONSUMER_CREATED: t.ClassVar[str] = "consumer-created"
+    """Unique category name for an event raised when a new consumer is registered"""
+
+    def __init__(self, source: str, descriptor: str, filters: t.Sequence[str]) -> None:
+        """Initialize the event instance.
+
+        :param source: A unique identifier for the publisher of the event
+        :param descriptor: Descriptor of the comm channel exposed by the consumer
+        :param filters: Collection of filters indicating messages of interest
+        """
+        super().__init__(self.CONSUMER_CREATED, source)
+        self.descriptor = descriptor
+        self.filters = list(filters)
+
+    def __str__(self) -> str:
+        """Convert the event to a string.
+
+        :returns: A string representation of this instance
+        """
+        _filters = ",".join(self.filters)
+        return f"{str(super())}|{self.descriptor}|{_filters}"
+
+
+class OnRemoveConsumer(EventBase):
+    """Publish this event when a consumer is shutting down and
+    should be removed from notification lists."""
+
+    descriptor: str
+    """Descriptor of the comm channel exposed by the consumer"""
+
+    CONSUMER_REMOVED: t.ClassVar[str] = "consumer-removed"
+    """Unique category name for an event raised when a consumer is unregistered"""
+
+    def __init__(self, source: str, descriptor: str) -> None:
+        """Initialize the OnRemoveConsumer event.
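Because `EventBase.__bytes__` pickles the whole object, the concrete event subclass survives transport and a consumer can recover it directly. A short round-trip sketch (the descriptor string is a hypothetical placeholder):

```python
# Events serialize via pickle, so the concrete subclass survives transport.
import pickle

from smartsim._core.mli.infrastructure.comm.event import OnCreateConsumer

event = OnCreateConsumer(
    "publisher-1",              # source
    "some-channel-descriptor",  # hypothetical channel descriptor
    filters=[OnCreateConsumer.CONSUMER_CREATED],
)

payload = bytes(event)            # what a broadcaster writes to a channel
restored = pickle.loads(payload)  # what a consumer reads back

assert isinstance(restored, OnCreateConsumer)
assert restored.descriptor == "some-channel-descriptor"
assert str(restored) == str(event)
```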
+ + :param source: A unique identifier for the publisher of the event + :param descriptor: Descriptor of the comm channel exposed by the consumer + """ + super().__init__(self.CONSUMER_REMOVED, source) + self.descriptor = descriptor + + def __str__(self) -> str: + """Convert the event to a string. + + :returns: A string representation of this instance + """ + return f"{str(super())}|{self.descriptor}" + + +class OnWriteFeatureStore(EventBase): + """Publish this event when a feature store key is written.""" + + descriptor: str + """The descriptor of the feature store where the write occurred""" + key: str + """The key identifying where the write occurred""" + + FEATURE_STORE_WRITTEN: str = "feature-store-written" + """Event category for an event raised when a feature store key is written""" + + def __init__(self, source: str, descriptor: str, key: str) -> None: + """Initialize the OnWriteFeatureStore event. + + :param source: A unique identifier for the publisher of the event + :param descriptor: The descriptor of the feature store where the write occurred + :param key: The key identifying where the write occurred + """ + super().__init__(self.FEATURE_STORE_WRITTEN, source) + self.descriptor = descriptor + self.key = key + + def __str__(self) -> str: + """Convert the event to a string. + + :returns: A string representation of this instance + """ + return f"{str(super())}|{self.descriptor}|{self.key}" diff --git a/smartsim/_core/mli/infrastructure/comm/producer.py b/smartsim/_core/mli/infrastructure/comm/producer.py new file mode 100644 index 000000000..2d8a7c14a --- /dev/null +++ b/smartsim/_core/mli/infrastructure/comm/producer.py @@ -0,0 +1,44 @@ +# BSD 2-Clause License +# +# Copyright (c) 2021-2024, Hewlett Packard Enterprise +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import typing as t + +from smartsim._core.mli.infrastructure.comm.event import EventBase +from smartsim.log import get_logger + +logger = get_logger(__name__) + + +class EventProducer(t.Protocol): + """Core API of a class that publishes events.""" + + def send(self, event: EventBase, timeout: float = 0.001) -> int: + """Send an event using the configured comm channel. 
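Since `EventProducer` is a `typing.Protocol`, `EventBroadcaster` (and any test double) satisfies it structurally; no inheritance is required. A minimal sketch of a hypothetical no-op producer that type-checks against the protocol:

```python
# Structural typing: anything with a matching `send` is an EventProducer.
from smartsim._core.mli.infrastructure.comm.event import EventBase
from smartsim._core.mli.infrastructure.comm.producer import EventProducer


class NullProducer:
    """Hypothetical stand-in that drops events; handy in unit tests."""

    def send(self, event: EventBase, timeout: float = 0.001) -> int:
        return 0  # report nothing sent


producer: EventProducer = NullProducer()  # accepted by static type checkers
```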
+ + :param event: The event to send + :param timeout: Maximum time to wait (in seconds) for messages to send + :returns: The number of messages that were sent + """ diff --git a/smartsim/_core/mli/infrastructure/control/event_listener.py b/smartsim/_core/mli/infrastructure/control/listener.py similarity index 84% rename from smartsim/_core/mli/infrastructure/control/event_listener.py rename to smartsim/_core/mli/infrastructure/control/listener.py index 2485f77ea..b5c529615 100644 --- a/smartsim/_core/mli/infrastructure/control/event_listener.py +++ b/smartsim/_core/mli/infrastructure/control/listener.py @@ -27,11 +27,9 @@ # isort: off # pylint: disable=import-error # pylint: disable=unused-import +import socket import dragon -# from dragon.globalservices.api_setup import connect_to_infrastructure - - # pylint: enable=unused-import # pylint: enable=import-error # isort: on @@ -45,13 +43,15 @@ from smartsim._core.entrypoints.service import Service from smartsim._core.mli.comm.channel.dragon_channel import DragonCommChannel from smartsim._core.mli.comm.channel.dragon_util import create_local -from smartsim._core.mli.infrastructure.storage.backbone_feature_store import ( - BackboneFeatureStore, +from smartsim._core.mli.infrastructure.comm.consumer import EventConsumer +from smartsim._core.mli.infrastructure.comm.event import ( EventBase, - EventCategory, - EventConsumer, OnCreateConsumer, OnRemoveConsumer, + OnShutdownRequested, +) +from smartsim._core.mli.infrastructure.storage.backbone_feature_store import ( + BackboneFeatureStore, ) from smartsim.error.errors import SmartSimError from smartsim.log import get_logger @@ -60,8 +60,9 @@ class ConsumerRegistrationListener(Service): - """A long-running service that listens for events of a specific type - and executes the appropriate event handler.""" + """A long-running service that manages the list of consumers receiving + events that are broadcast. 
It hosts handlers for adding and removing consumers + """ def __init__( self, @@ -78,7 +79,6 @@ def __init__( :param timeout: Maximum time (in seconds) to allow a single recv request to wait :param batch_timeout: Maximum time (in seconds) to allow a batch of receives to continue to build - :param filters: Filters specifying the message types to handle :param as_service: Specifies run-once or run-until-complete behavior of service :param cooldown: Number of seconds to wait before shutting down after shutdown criteria are met @@ -86,17 +86,13 @@ def __init__( super().__init__( as_service, cooldown, health_check_frequency=health_check_frequency ) - self._timeout = timeout """ Maximum time (in seconds) to allow a single recv request to wait""" - self._batch_timeout = batch_timeout """Maximum time (in seconds) to allow a batch of receives to continue to build""" - self._consumer: t.Optional[EventConsumer] = None """The event consumer that handles receiving events""" - self._backbone = backbone """A standalone, system-created feature store used to share internal information among MLI components""" @@ -112,8 +108,20 @@ def _on_shutdown(self) -> None: the main event loop during automatic shutdown.""" super()._on_shutdown() - # unregister this listener in the backbone - self._backbone.pop(BackboneFeatureStore.MLI_BACKEND_CONSUMER) + if not self._consumer: + return + + # remove descriptor for this listener from the backbone if it's there + if registered_consumer := self._backbone.backend_channel: + # if there is a descriptor in the backbone and it's still this listener + if registered_consumer == self._consumer.descriptor: + logger.info( + f"Listener clearing backend consumer {self._consumer.name} " + "from backbone" + ) + + # unregister this listener in the backbone + self._backbone.pop(BackboneFeatureStore.MLI_REGISTRAR_CONSUMER) # TODO: need the channel to be cleaned up # self._consumer._comm_channel._channel.destroy() @@ -135,15 +143,18 @@ def _can_shutdown(self) -> bool: """ if self._backbone is None: - logger.info("Listener must shutdown: no backbone attached") + logger.info("Listener must shutdown. No backbone attached") return True if self._consumer is None: - logger.info("Listener must shutdown: no consumer channel created") + logger.info("Listener must shutdown. No consumer channel created") return True if not self._consumer.listening: - logger.info("Listener can shutdown: consumer is not listening") + logger.info( + f"Listener can shutdown. Consumer `{self._consumer.name}` " + "is not listening" + ) return True return False @@ -202,7 +213,7 @@ def _on_health_check(self) -> None: try: logger.debug("Retrieving registered listener descriptor") - descriptor = self._backbone[BackboneFeatureStore.MLI_BACKEND_CONSUMER] + descriptor = self._backbone[BackboneFeatureStore.MLI_REGISTRAR_CONSUMER] except KeyError: descriptor = None if self._consumer: @@ -210,8 +221,8 @@ def _on_health_check(self) -> None: if self._consumer and descriptor != self._consumer.descriptor: logger.warning( - "This listener is no longer registered. It " - "will automatically shut down." + f"Consumer `{self._consumer.name}` for `ConsumerRegistrationListener` " + "is no longer registered. It will automatically shut down." 
) self._consumer.listening = False @@ -221,7 +232,8 @@ def _publish_consumer(self) -> None: logger.warning("No registrar consumer descriptor available to publisher") return - self._backbone[BackboneFeatureStore.MLI_BACKEND_CONSUMER] = ( + logger.debug(f"Publishing {self._consumer.descriptor} to backbone") + self._backbone[BackboneFeatureStore.MLI_REGISTRAR_CONSUMER] = ( self._consumer.descriptor ) @@ -235,6 +247,7 @@ def _create_eventing(self) -> EventConsumer: NOTE: the backbone must be initialized before connecting eventing clients. :returns: The newly created EventConsumer instance + :raises SmartSimError: If a listener channel cannot be created """ if self._consumer: @@ -253,8 +266,12 @@ def _create_eventing(self) -> EventConsumer: self._consumer = EventConsumer( event_channel, self._backbone, - [EventCategory.CONSUMER_CREATED, EventCategory.CONSUMER_REMOVED], - name="ConsumerRegistrar", + [ + OnCreateConsumer.CONSUMER_CREATED, + OnRemoveConsumer.CONSUMER_REMOVED, + OnShutdownRequested.SHUTDOWN, + ], + name=f"ConsumerRegistrar.{socket.gethostname()}", event_handler=self._on_event_received, ) self._publish_consumer() diff --git a/smartsim/_core/mli/infrastructure/control/request_dispatcher.py b/smartsim/_core/mli/infrastructure/control/request_dispatcher.py index d14755f53..b0f931cb3 100644 --- a/smartsim/_core/mli/infrastructure/control/request_dispatcher.py +++ b/smartsim/_core/mli/infrastructure/control/request_dispatcher.py @@ -146,7 +146,7 @@ def ready(self) -> bool: return False timed_out = False - if self._batch_timeout > 0: + if self._batch_timeout >= 0: timed_out = self._elapsed_time >= self._batch_timeout if self.full(): diff --git a/smartsim/_core/mli/infrastructure/environment_loader.py b/smartsim/_core/mli/infrastructure/environment_loader.py index 2c89184d8..5ba0fccc2 100644 --- a/smartsim/_core/mli/infrastructure/environment_loader.py +++ b/smartsim/_core/mli/infrastructure/environment_loader.py @@ -39,6 +39,11 @@ class EnvironmentConfigLoader: Facilitates the loading of a FeatureStore and Queue into the WorkerManager. """ + REQUEST_QUEUE_ENV_VAR = "_SMARTSIM_REQUEST_QUEUE" + """The environment variable that holds the request queue descriptor""" + BACKBONE_ENV_VAR = "_SMARTSIM_INFRA_BACKBONE" + """The environment variable that holds the backbone descriptor""" + def __init__( self, featurestore_factory: t.Callable[[str], FeatureStore], @@ -76,7 +81,7 @@ def get_backbone(self) -> t.Optional[FeatureStore]: :returns: The attached feature store via `_SMARTSIM_INFRA_BACKBONE` """ - descriptor = os.getenv("_SMARTSIM_INFRA_BACKBONE", "") + descriptor = os.getenv(self.BACKBONE_ENV_VAR, "") if not descriptor: logger.warning("No backbone descriptor is configured") @@ -97,7 +102,7 @@ def get_queue(self) -> t.Optional[CommChannelBase]: :returns: The attached queue specified via `_SMARTSIM_REQUEST_QUEUE` """ - descriptor = os.getenv("_SMARTSIM_REQUEST_QUEUE", "") + descriptor = os.getenv(self.REQUEST_QUEUE_ENV_VAR, "") if not descriptor: logger.warning("No queue descriptor is configured") diff --git a/smartsim/_core/mli/infrastructure/storage/backbone_feature_store.py b/smartsim/_core/mli/infrastructure/storage/backbone_feature_store.py index 21fdecbed..b12d7b11b 100644 --- a/smartsim/_core/mli/infrastructure/storage/backbone_feature_store.py +++ b/smartsim/_core/mli/infrastructure/storage/backbone_feature_store.py @@ -24,15 +24,10 @@ # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
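With the descriptors promoted to class-level constants on `EnvironmentConfigLoader`, call sites can reference `BACKBONE_ENV_VAR` and `REQUEST_QUEUE_ENV_VAR` instead of repeating the literals. A bootstrap sketch follows; only `featurestore_factory` is visible in this hunk, so the `callback_factory` and `queue_factory` parameters and the `from_descriptor` factories are assumptions based on the pattern used throughout this patch:

```python
# Attach to MLI infrastructure using the environment-provided descriptors.
from smartsim._core.mli.comm.channel.dragon_channel import DragonCommChannel
from smartsim._core.mli.comm.channel.dragon_fli import DragonFLIChannel
from smartsim._core.mli.infrastructure.environment_loader import (
    EnvironmentConfigLoader,
)
from smartsim._core.mli.infrastructure.storage.dragon_feature_store import (
    DragonFeatureStore,
)

loader = EnvironmentConfigLoader(
    featurestore_factory=DragonFeatureStore.from_descriptor,
    callback_factory=DragonCommChannel.from_descriptor,  # assumed parameter
    queue_factory=DragonFLIChannel.from_descriptor,      # assumed parameter
)

backbone = loader.get_backbone()  # reads EnvironmentConfigLoader.BACKBONE_ENV_VAR
queue = loader.get_queue()        # reads EnvironmentConfigLoader.REQUEST_QUEUE_ENV_VAR
```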
-import enum
 import itertools
 import os
-import pickle
 import time
 import typing as t
-import uuid
-from collections import defaultdict, deque
-from dataclasses import dataclass

 # pylint: disable=import-error
 # isort: off
@@ -40,8 +35,6 @@

 # isort: on

-from smartsim._core.mli.comm.channel.channel import CommChannelBase
-from smartsim._core.mli.comm.channel.dragon_channel import DragonCommChannel
 from smartsim._core.mli.infrastructure.storage.dragon_feature_store import (
     DragonFeatureStore,
 )
@@ -56,15 +49,23 @@ class BackboneFeatureStore(DragonFeatureStore):
     information stored in the MLI backbone feature store."""

     MLI_NOTIFY_CONSUMERS = "_SMARTSIM_MLI_NOTIFY_CONSUMERS"
-    MLI_BACKEND_CONSUMER = "_SMARTIM_MLI_BACKEND_CONSUMER"
+    """Unique key used in the backbone to locate the consumer list"""
+    MLI_REGISTRAR_CONSUMER = "_SMARTSIM_MLI_REGISTRAR_CONSUMER"
+    """Unique key used in the backbone to locate the registration consumer"""
     MLI_WORKER_QUEUE = "_SMARTSIM_REQUEST_QUEUE"
+    """Unique key used in the backbone to locate the MLI work queue"""
     MLI_BACKBONE = "_SMARTSIM_INFRA_BACKBONE"
+    """Unique key used in the backbone to locate the backbone feature store"""
     _CREATED_ON = "creation"
+    """Unique key used in the backbone to locate the creation date of the
+    feature store"""
     _DEFAULT_WAIT_TIMEOUT = 1.0
+    """The default wait time (in seconds) for blocking requests to
+    the feature store"""

     def __init__(
         self,
-        storage: "dragon_ddict.DDict",
+        storage: dragon_ddict.DDict,
         allow_reserved_writes: bool = False,
     ) -> None:
         """Initialize the DragonFeatureStore instance.
@@ -119,24 +120,23 @@ def notification_channels(self, values: t.Sequence[str]) -> None:

     @property
     def backend_channel(self) -> t.Optional[str]:
-        """Retrieve the channel descriptor exposed by the MLI backend for events.
+        """Retrieve the channel descriptor used to register event consumers.

         :returns: The channel descriptor"""
-        if self.MLI_BACKEND_CONSUMER in self:
-            return str(self[self.MLI_BACKEND_CONSUMER])
+        if self.MLI_REGISTRAR_CONSUMER in self:
+            return str(self[self.MLI_REGISTRAR_CONSUMER])
         return None

     @backend_channel.setter
     def backend_channel(self, value: str) -> None:
-        """Set the channel exposed by the MLI backend for events.
+        """Set the channel used to register event consumers.

         :param value: The stringified channel descriptor"""
-        self[self.MLI_BACKEND_CONSUMER] = value
+        self[self.MLI_REGISTRAR_CONSUMER] = value

     @property
     def worker_queue(self) -> t.Optional[str]:
-        """Retrieve the channel descriptor exposed by the MLI
-        backend to send work to an MLI worker manager instance.
+        """Retrieve the channel descriptor used to send work to MLI worker managers.

         :returns: The channel descriptor, if found. Otherwise, `None`"""
         if self.MLI_WORKER_QUEUE in self:
@@ -145,8 +145,7 @@ def worker_queue(self) -> t.Optional[str]:

     @worker_queue.setter
     def worker_queue(self, value: str) -> None:
-        """Set the channel descriptor exposed by the MLI
-        backend to send work to an MLI worker manager instance.
+        """Set the channel descriptor used to send work to MLI worker managers.
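The backbone keys above are exposed through properties, so components negotiate endpoints without hard-coding key strings. A sketch of the resulting handshake, assuming `backbone` is an attached `BackboneFeatureStore` and `queue_descriptor` was produced elsewhere:

```python
# Bootstrap via the backbone: one side publishes a descriptor, the other waits.
from smartsim._core.mli.infrastructure.storage.backbone_feature_store import (
    BackboneFeatureStore,
)

# producer side (e.g. the worker manager entrypoint): publish the queue
backbone.worker_queue = queue_descriptor  # writes MLI_WORKER_QUEUE

# consumer side: block up to 10 seconds for the key to appear
config = backbone.wait_for([BackboneFeatureStore.MLI_WORKER_QUEUE], timeout=10.0)
descriptor = str(config[BackboneFeatureStore.MLI_WORKER_QUEUE])
```

`wait_for` raises `SmartSimError` if the timeout elapses first, so callers get a clear failure instead of polling loops scattered through the codebase.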
:param value: The channel descriptor""" self[self.MLI_WORKER_QUEUE] = value @@ -195,6 +194,8 @@ def _check_wait_timeout( :param start_time: the start time to use for elapsed calculation :param timeout: the timeout (in seconds) :param indicators: latest retrieval status for requested keys + :raises SmartSimError: If the timeout elapses before all values are + retrieved """ elapsed = time.time() - start_time if timeout and elapsed > timeout: @@ -211,6 +212,9 @@ def wait_for( :param keys: The required collection of keys to retrieve :param timeout: The maximum wait time in seconds + :returns: Dictionary containing the keys and values requested + :raises SmartSimError: If the timeout elapses without retrieving + all requested keys """ if timeout < 0: timeout = self._DEFAULT_WAIT_TIMEOUT @@ -253,579 +257,3 @@ def get_env(self) -> t.Dict[str, str]: :returns: The dictionary populated with env vars """ return {self.MLI_BACKBONE: self.descriptor} - - -class EventCategory(str, enum.Enum): - """Predefined event types raised by SmartSim backend.""" - - CONSUMER_CREATED: str = "consumer-created" - """Event category for an event raised when a new consumer is created""" - CONSUMER_REMOVED: str = "consumer-removed" - """Event category for an event raised when a new consumer is created""" - FEATURE_STORE_WRITTEN: str = "feature-store-written" - """Event category for an event raised when a feature store key is written""" - SHUTDOWN: str = "shutdown" - """Event category for an event that should trigger the listener to shutdown""" - - -@dataclass -class EventBase: - """Core API for an event.""" - - # todo: shift eventing code to: infrastructure / event / event.py - category: EventCategory - """The event category for this event; may be used for addressing, - prioritization, or filtering of events by a event publisher/consumer""" - - uid: str - """A unique identifier for this event""" - - def __bytes__(self) -> bytes: - """Default conversion to bytes for an event required to publish - messages using byte-oriented communication channels. - - :returns: This entity encoded as bytes""" - return pickle.dumps(self) - - def __str__(self) -> str: - """Convert the event to a string. - - :returns: A string representation of this instance""" - return f"{self.uid}|{self.category}" - - -class OnShutdownRequested(EventBase): - """Publish this event to trigger the listener to shutdown.""" - - def __init__(self) -> None: - """Initialize the OnShutdownRequest event.""" - super().__init__(EventCategory.SHUTDOWN, str(uuid.uuid4())) - - -class OnCreateConsumer(EventBase): - """Publish this event when a new event consumer registration is required.""" - - descriptor: str - """Descriptor of the comm channel exposed by the consumer""" - filters: t.List[EventCategory] - """The collection of filters indicating messages of interest to this consumer""" - - def __init__(self, descriptor: str, filters: t.Sequence[EventCategory]) -> None: - """Initialize the OnCreateConsumer event. - - :param descriptor: Descriptor of the comm channel exposed by the consumer - :param descriptor: Collection of filters indicating messages of interest - """ - super().__init__(EventCategory.CONSUMER_CREATED, str(uuid.uuid4())) - self.descriptor = descriptor - self.filters = list(filters) - - def __str__(self) -> str: - """Convert the event to a string. 
- - :returns: A string representation of this instance - """ - _filters = ",".join(self.filters) - return f"{str(super())}|{self.descriptor}|{_filters}" - - -class OnRemoveConsumer(EventBase): - """Publish this event when a consumer is shutting down and - should be removed from notification lists.""" - - descriptor: str - """Descriptor of the comm channel exposed by the consumer""" - - def __init__(self, descriptor: str) -> None: - """Initialize the OnRemoveConsumer event. - - :param descriptor: Descriptor of the comm channel exposed by the consumer - """ - super().__init__(EventCategory.CONSUMER_REMOVED, str(uuid.uuid4())) - self.descriptor = descriptor - - def __str__(self) -> str: - """Convert the event to a string. - - :returns: A string representation of this instance - """ - return f"{str(super())}|{self.descriptor}" - - -class OnWriteFeatureStore(EventBase): - """Publish this event when a feature store key is written.""" - - descriptor: str - """The descriptor of the feature store where the write occurred""" - - key: str - """The key identifying where the write occurred""" - - def __init__(self, descriptor: str, key: str) -> None: - """Initialize the OnWriteFeatureStore event. - - :param descriptor: The descriptor of the feature store where the write occurred - :param key: The key identifying where the write occurred - """ - super().__init__(EventCategory.FEATURE_STORE_WRITTEN, str(uuid.uuid4())) - self.descriptor = descriptor - self.key = key - - def __str__(self) -> str: - """Convert the event to a string. - - :returns: A string representation of this instance - """ - return f"{str(super())}|{self.descriptor}|{self.key}" - - -class EventProducer(t.Protocol): - """Core API of a class that publishes events.""" - - def send(self, event: EventBase, timeout: float = 0.001) -> int: - """Send an event using the configured comm channel. - - :param event: The event to send - :param timeout: Maximum time to wait (in seconds) for messages to send - :returns: The number of messages that were sent - """ - - -class EventSender: - """An event publisher that performs publishing of system events to a - single endpoint""" - - def __init__( - self, - backbone: BackboneFeatureStore, - channel: t.Optional[CommChannelBase], - ) -> None: - """Initialize the instance. - - :param backbone: The backbone feature store to use - :param channel: The comm channel to send events on - """ - self._backbone = backbone - self._channel: t.Optional[CommChannelBase] = channel - - def send(self, event: EventBase, timeout: float = 0.001) -> int: - """Send an event using the configured comm channel. - - :param event: The event to send - :param timeout: Maximum time to wait (in seconds) for messages to send - :returns: The number of message copies that were sent - :raises SmartSimError: If the comm channel is not configured - """ - if self._channel is None: - raise SmartSimError("No channel to send on") - num_sent = 0 - - logger.debug(f"Sending {event} to {self._channel.descriptor}") - - try: - event_bytes = bytes(event) - self._channel.send(event_bytes, timeout) - num_sent += 1 - except Exception as ex: - raise SmartSimError(f"Failed broadcast to channel: {self._channel}") from ex - - return num_sent - - -class EventBroadcaster: - """Performs fan-out publishing of system events.""" - - def __init__( - self, - backbone: BackboneFeatureStore, - channel_factory: t.Optional[t.Callable[[str], CommChannelBase]] = None, - ) -> None: - """Initialize the EventPublisher instance. 
- - :param backbone: The MLI backbone feature store - :param channel_factory: Factory method to construct new channel instances - """ - self._backbone = backbone - """The backbone feature store used to retrieve consumer descriptors""" - self._channel_factory = channel_factory - """A factory method used to instantiate channels from descriptors""" - self._channel_cache: t.Dict[str, t.Optional[CommChannelBase]] = defaultdict( - lambda: None - ) - """A mapping of instantiated channels that can be re-used. Automatically - calls the channel factory if a descriptor is not already in the collection""" - self._event_buffer: t.Deque[EventBase] = deque() - """A buffer for storing events when a consumer list is not found""" - self._descriptors: t.Set[str] - """Stores the most recent list of broadcast consumers. Updated automatically - on each broadcast""" - self._uid = str(uuid.uuid4()) - """A unique identifer assigned to the broadcaster for logging""" - - @property - def num_buffered(self) -> int: - """Return the number of events currently buffered to send. - - :returns: Number of buffered events - """ - return len(self._event_buffer) - - def _save_to_buffer(self, event: EventBase) -> None: - """Places the event in the buffer to be sent once a consumer - list is available. - - :param event: The event to buffer - :raises ValueError: If the event cannot be buffered - """ - try: - self._event_buffer.append(event) - logger.debug(f"Buffered event {event=}") - except Exception as ex: - raise ValueError(f"Unable to serialize event from {self._uid}") from ex - - def _log_broadcast_start(self) -> None: - """Logs broadcast statistics.""" - num_events = len(self._event_buffer) - num_copies = len(self._descriptors) - logger.debug( - f"Broadcast {num_events} events to {num_copies} consumers from {self._uid}" - ) - - def _prune_unused_consumers(self) -> None: - """Performs maintenance on the channel cache by pruning any channel - that has been removed from the consumers list.""" - active_consumers = set(self._descriptors) - current_channels = set(self._channel_cache.keys()) - - # find any cached channels that are now unused - inactive_channels = current_channels.difference(active_consumers) - new_channels = active_consumers.difference(current_channels) - - for descriptor in inactive_channels: - self._channel_cache.pop(descriptor) - - logger.debug( - f"Pruning {len(inactive_channels)} stale consumers and" - f" found {len(new_channels)} new channels for {self._uid}" - ) - - def _get_comm_channel(self, descriptor: str) -> CommChannelBase: - """Helper method to build and cache a comm channel. - - :param descriptor: The descriptor to pass to the channel factory - :returns: The instantiated channel - :raises SmartSimError: If the channel fails to attach - """ - comm_channel = self._channel_cache[descriptor] - if comm_channel is not None: - return comm_channel - - if self._channel_factory is None: - raise SmartSimError("No channel factory provided for consumers") - - try: - channel = self._channel_factory(descriptor) - self._channel_cache[descriptor] = channel - return channel - except Exception as ex: - msg = f"Unable to construct channel with descriptor: {descriptor}" - logger.error(msg, exc_info=True) - raise SmartSimError(msg) from ex - - def _get_next_event(self) -> t.Optional[EventBase]: - """Pop the next event to be sent from the queue. - - :returns: The next event to send if any events are enqueued, otherwise `None`. 
- """ - try: - return self._event_buffer.popleft() - except IndexError: - logger.debug(f"Broadcast buffer exhausted for {self._uid}") - - return None - - def _broadcast(self, timeout: float = 0.001) -> int: - """Broadcasts all buffered events to registered event consumers. - - :param timeout: Maximum time to wait (in seconds) for messages to send - :returns: The number of events broadcasted to consumers - :raises SmartSimError: If the channel fails to attach - :raises SmartSimError: If broadcasting fails - """ - # allow descriptors to be empty since events are buffered - self._descriptors = set(x for x in self._backbone.notification_channels if x) - if not self._descriptors: - logger.warning(f"No event consumers are registered for {self._uid}") - return 0 - - self._prune_unused_consumers() - self._log_broadcast_start() - - num_sent = 0 - num_listeners = len(self._descriptors) - - # send each event to every consumer - while event := self._get_next_event(): - logger.debug(f"Broadcasting {event=} to {num_listeners} listeners") - event_bytes = bytes(event) - - for i, descriptor in enumerate(self._descriptors): - comm_channel = self._get_comm_channel(descriptor) - - try: - comm_channel.send(event_bytes, timeout) - num_sent += 1 - except Exception as ex: - raise SmartSimError( - f"Broadcast {i+1}/{num_listeners} for event {event.uid} to " - f"channel {descriptor} from {self._uid} failed." - ) from ex - - return num_sent - - def send(self, event: EventBase, timeout: float = 0.001) -> int: - """Implementation of `send` method of the `EventPublisher` protocol. Publishes - the supplied event to all registered broadcast consumers. - - :param event: An event to publish - :param timeout: Maximum time to wait (in seconds) for messages to send - :returns: The number of events successfully published - :raises ValueError: If event serialization fails - :raises AttributeError: If event cannot be serialized - :raises KeyError: If channel fails to attach using registered descriptors - :raises SmartSimError: If any unexpected error occurs during send - """ - try: - self._save_to_buffer(event) - return self._broadcast(timeout) - except (KeyError, ValueError, AttributeError, SmartSimError): - raise - except Exception as ex: - raise SmartSimError("An unexpected failure occurred while sending") from ex - - -class EventConsumer: - """Reads system events published to a communications channel.""" - - _BACKBONE_WAIT_TIMEOUT = 10.0 - """Maximum time (in seconds) to wait for the backbone to register the consumer""" - - def __init__( - self, - comm_channel: CommChannelBase, - # channel_factory: ..., - backbone: BackboneFeatureStore, - filters: t.Optional[t.List[EventCategory]] = None, - name: t.Optional[str] = None, - event_handler: t.Optional[t.Callable[[EventBase], None]] = None, - ) -> None: - """Initialize the EventConsumer instance. - - :param comm_channel: Communications channel to listen to for events - :param backbone: The MLI backbone feature store - :param filters: A list of event types to deliver. when empty, all - events will be delivered - :param name: A user-friendly name for logging. If not provided, an - auto-generated GUID will be used - :raises ValueError: If batch_timeout <= 0 - """ - self._comm_channel = comm_channel - """The comm channel used by the consumer to receive messages. The channel - descriptor will be published for senders to discover.""" - self._backbone = backbone - """The backbone instance used to bootstrap the instance. 
The EventConsumer
-        uses the backbone to discover where it can publish its descriptor."""
-        self._global_filters = filters or []
-        """A set of global filters to apply to incoming events. Global filters are
-        combined with per-call filters. Filters act as an allow-list."""
-        self._name = name
-        """User-friendly name assigned to a consumer for logging. Automatically
-        assigned if not provided."""
-        self._event_handler = event_handler
-        """The function that should be executed when an event
-        passed by the filters is received."""
-        self.listening = True
-        """Flag indicating that the consumer is currently listening for new
-        events. Setting this flag to `False` will cause any active calls to
-        `listen` to terminate."""
-
-    @property
-    def descriptor(self) -> str:
-        """The descriptor of the underlying comm channel.
-
-        :returns: The comm channel descriptor"""
-        return self._comm_channel.descriptor
-
-    @property
-    def name(self) -> str:
-        """The friendly name assigned to the consumer.
-
-        :returns: The consumer name if one is assigned, otherwise a unique
-        id assigned by the system.
-        """
-        if self._name is None:
-            self._name = str(uuid.uuid4())
-        return self._name
-
-    def recv(
-        self,
-        filters: t.Optional[t.List[EventCategory]] = None,
-        timeout: float = 0.001,
-        batch_timeout: float = 1.0,
-    ) -> t.List[EventBase]:
-        """Receives available published event(s).
-
-        :param filters: Additional filters to add to the global filters configured
-        on the EventConsumer instance
-        :param timeout: Maximum time to wait for a single message to arrive
-        :param batch_timeout: Maximum time to wait for messages to arrive; allows
-        multiple batches to be retrieved in one call to `recv`
-        :returns: A list of events that pass any configured filters
-        :raises ValueError: If a positive, non-zero value is not provided for the
-        timeout or batch_timeout.
- """ - if filters is None: - filters = [] - - if timeout is not None and timeout <= 0: - raise ValueError("request timeout must be a non-zero, positive value") - - if batch_timeout is not None and batch_timeout <= 0: - raise ValueError("batch_timeout must be a non-zero, positive value") - - filter_set = {*self._global_filters, *filters} - all_message_bytes: t.List[bytes] = [] - - # firehose as many messages as possible within the batch_timeout - start_at = time.time() - remaining = batch_timeout - - batch_message_bytes = self._comm_channel.recv(timeout=timeout) - while batch_message_bytes: - # remove any empty messages that will fail to decode - all_message_bytes.extend(batch_message_bytes) - batch_message_bytes = [] - - # avoid getting stuck indefinitely waiting for the channel - elapsed = time.time() - start_at - remaining = batch_timeout - elapsed - - if remaining > 0: - batch_message_bytes = self._comm_channel.recv(timeout=timeout) - - events_received: t.List[EventBase] = [] - - # Timeout elapsed or no messages received - return the empty list - if not all_message_bytes: - return events_received - - for message in all_message_bytes: - if not message or message is None: - continue - - event = pickle.loads(message) - if not event: - logger.warning("Unable to unpickle message") - - # skip events that don't pass a filter - if filter_set and event.category not in filter_set: - continue - - events_received.append(event) - - return events_received - - def _send_to_registrar(self, event: EventBase) -> None: - """Send an event direct to the registrar listener.""" - registrar_key = BackboneFeatureStore.MLI_BACKEND_CONSUMER - config = self._backbone.wait_for([registrar_key], self._BACKBONE_WAIT_TIMEOUT) - registrar_descriptor = str(config.get(registrar_key, None)) - - if not registrar_descriptor: - logger.warning(f"Unable to {event.category}. No registrar channel found.") - return - - logger.debug(f"Sending {event.category} for {self.name}") - - registrar_channel = DragonCommChannel.from_descriptor(registrar_descriptor) - registrar_channel.send(bytes(event), timeout=1.0) - - logger.debug(f"{event.category} for {self.name} sent") - - def register(self) -> None: - """Send an event to register this consumer as a listener.""" - descriptor = self._comm_channel.descriptor - event = OnCreateConsumer(descriptor, self._global_filters) - - self._send_to_registrar(event) - - def unregister(self) -> None: - """Send an event to un-register this consumer as a listener.""" - descriptor = self._comm_channel.descriptor - event = OnRemoveConsumer(descriptor) - - self._send_to_registrar(event) - - @staticmethod - def _on_handler_missing(event: EventBase) -> None: - """A "dead letter" event handler that is called to perform - processing on events before they're discarded. - - :param event: The event to handle - """ - logger.warning(f"No event handler is registered. Discarding {event=}") - - def listen_once(self, timeout: float = 0.001, batch_timeout: float = 1.0) -> None: - """Receives messages for the consumer a single time. Delivers - all messages that pass the consumer filters. Shutdown requests - are handled by a default event handler. - - - NOTE: Executes a single batch-retrieval to receive the maximum - number of messages available under batch timeout. 
To continually
-        listen, use `listen` in a non-blocking thread/process
-
-        :param timeout: Maximum time to wait (in seconds) for a message to arrive
-        :param batch_timeout: Maximum time to wait (in seconds) for a batch to arrive
-        """
-        logger.debug(f"Starting event listener with {timeout} second timeout")
-        logger.debug("Awaiting new messages")
-
-        if not self._event_handler:
-            logger.debug("Unable to handle messages. No event handler is registered.")
-
-        incoming_messages = self.recv(timeout=timeout, batch_timeout=batch_timeout)
-
-        if not incoming_messages:
-            logger.debug(f"Consumer {self.name} received empty message list.")
-
-        for message in incoming_messages:
-            logger.debug(f"Sending event {message=} to handler.")
-            self._handle_shutdown(message)
-
-            if self._event_handler:
-                self._event_handler(message)
-            else:
-                self._on_handler_missing(message)
-
-    def _handle_shutdown(self, event: EventBase) -> bool:
-        """Handles shutdown requests sent to the consumer by setting the
-        `self.listening` attribute to `False`.
-
-        :param event: The event to handle
-        :returns: A bool indicating if the event was a shutdown request
-        """
-        if isinstance(event, OnShutdownRequested):
-            self.listening = False
-            return True
-        return False
-
-    def listen(self, timeout: float = 0.001, batch_timeout: float = 1.0) -> None:
-        """Receives messages for the consumer until a shutdown request is received.
-
-        :param timeout: Maximum time to wait (in seconds) for a message to arrive
-        :param batch_timeout: Maximum time to wait (in seconds) for a batch to arrive
-        """
-
-        while self.listening:
-            self.listen_once(timeout, batch_timeout)
diff --git a/smartsim/_core/mli/infrastructure/storage/dragon_feature_store.py b/smartsim/_core/mli/infrastructure/storage/dragon_feature_store.py
index dc0f57ae6..24f2221c8 100644
--- a/smartsim/_core/mli/infrastructure/storage/dragon_feature_store.py
+++ b/smartsim/_core/mli/infrastructure/storage/dragon_feature_store.py
@@ -51,13 +51,19 @@ def __init__(self, storage: "dragon_ddict.DDict") -> None:
 
         :param storage: A distributed dictionary to be used as the underlying storage
         mechanism of the feature store"""
+        if storage is None:
+            raise ValueError(
+                "Storage is required when instantiating a DragonFeatureStore."
+            )
+
+        descriptor = ""
         if isinstance(storage, dragon_ddict.DDict):
             descriptor = ddict_to_descriptor(storage)
-        else:
-            descriptor = "not-set"
 
         super().__init__(descriptor)
         self._storage: t.Dict[str, t.Union[str, bytes]] = storage
+        """The underlying storage mechanism of the DragonFeatureStore; a
+        distributed, in-memory key-value store"""
 
     def _get(self, key: str) -> t.Union[str, bytes]:
         """Retrieve a value from the underlying storage mechanism.
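With the guard above, a DragonFeatureStore can no longer be constructed without a
backing store: a missing store raises ValueError immediately, and the descriptor is
derived from the attached DDict. A minimal sketch of the resulting contract follows;
the create_ddict(1, 1, 32 * 1024 * 1024) sizing arguments are illustrative
assumptions, not values taken from this patch:

    from smartsim._core.mli.infrastructure.storage.dragon_feature_store import (
        DragonFeatureStore,
    )
    from smartsim._core.mli.infrastructure.storage.dragon_util import create_ddict

    # attach a feature store to a live dragon distributed dictionary;
    # the descriptor is computed from the ddict via ddict_to_descriptor
    storage = create_ddict(1, 1, 32 * 1024 * 1024)
    feature_store = DragonFeatureStore(storage)
    feature_store["my-model"] = b"serialized-model-bytes"

    # the new guard clause rejects a missing backing store up front
    try:
        DragonFeatureStore(None)
    except ValueError as ex:
        print(ex)  # "Storage is required when instantiating a DragonFeatureStore."

The test_dragon_feature_store_invalid_storage case added in
tests/dragon/test_error_handling.py below exercises exactly this guard.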
diff --git a/smartsim/_core/mli/infrastructure/storage/dragon_util.py b/smartsim/_core/mli/infrastructure/storage/dragon_util.py index fda89bba5..50d15664c 100644 --- a/smartsim/_core/mli/infrastructure/storage/dragon_util.py +++ b/smartsim/_core/mli/infrastructure/storage/dragon_util.py @@ -40,6 +40,7 @@ def ddict_to_descriptor(ddict: dragon_ddict.DDict) -> str: :param ddict: The dragon dictionary to convert :returns: The descriptor string + :raises ValueError: If a ddict is not provided """ if ddict is None: raise ValueError("DDict is not available to create a descriptor") diff --git a/smartsim/_core/mli/infrastructure/storage/feature_store.py b/smartsim/_core/mli/infrastructure/storage/feature_store.py index 260b1a337..ebca07ed4 100644 --- a/smartsim/_core/mli/infrastructure/storage/feature_store.py +++ b/smartsim/_core/mli/infrastructure/storage/feature_store.py @@ -43,7 +43,7 @@ class ReservedKeys(str, enum.Enum): """Storage location for the list of registered consumers that will receive events from an EventBroadcaster""" - MLI_BACKEND_CONSUMER = "_SMARTIM_MLI_BACKEND_CONSUMER" + MLI_REGISTRAR_CONSUMER = "_SMARTIM_MLI_REGISTRAR_CONSUMER" """Storage location for the channel used to send messages directly to the MLI backend""" diff --git a/smartsim/_core/mli/infrastructure/worker/worker.py b/smartsim/_core/mli/infrastructure/worker/worker.py index ac1a14866..018703271 100644 --- a/smartsim/_core/mli/infrastructure/worker/worker.py +++ b/smartsim/_core/mli/infrastructure/worker/worker.py @@ -111,7 +111,7 @@ def has_model_key(self) -> bool: @property def has_raw_inputs(self) -> bool: - """Check if the InferenceRequest contains raw_outputs. + """Check if the InferenceRequest contains raw_inputs. :returns: True if raw_outputs is not None and is not an empty list, False otherwise diff --git a/smartsim/_core/utils/timings.py b/smartsim/_core/utils/timings.py index 114db88d9..f99950739 100644 --- a/smartsim/_core/utils/timings.py +++ b/smartsim/_core/utils/timings.py @@ -145,10 +145,12 @@ def max_length(self) -> int: return max(len(value) for value in self._timings.values()) def print_timings(self, to_file: bool = False) -> None: - """Print all timing information + """Print timing information to standard output. If `to_file` + is `True`, also write results to a file. 
- :param to_file: flag indicating if timing should be written to stdout - or to the timing file""" + :param to_file: If `True`, also saves timing information + to the files `timings.npy` and `timings.txt` + """ print(" ".join(self._timings.keys())) try: value_array = np.array(list(self._timings.values()), dtype=float) diff --git a/smartsim/protoclient.py b/smartsim/protoclient.py index d9cdcf594..46598a817 100644 --- a/smartsim/protoclient.py +++ b/smartsim/protoclient.py @@ -51,11 +51,10 @@ from smartsim._core.mli.comm.channel.dragon_channel import DragonCommChannel from smartsim._core.mli.comm.channel.dragon_fli import DragonFLIChannel from smartsim._core.mli.comm.channel.dragon_util import create_local +from smartsim._core.mli.infrastructure.comm.broadcaster import EventBroadcaster +from smartsim._core.mli.infrastructure.comm.event import OnWriteFeatureStore from smartsim._core.mli.infrastructure.storage.backbone_feature_store import ( BackboneFeatureStore, - EventBroadcaster, - EventProducer, - OnWriteFeatureStore, ) from smartsim._core.mli.message_handler import MessageHandler from smartsim._core.utils.timings import PerfTimer @@ -82,6 +81,10 @@ class ProtoClient: """A default number of events to be buffered in the work queue before triggering QueueFull exceptions.""" + _EVENT_SOURCE = "proto-client" + """A user-friendly name for this class instance to identify + the client as the publisher of an event.""" + @staticmethod def _attach_to_backbone() -> BackboneFeatureStore: """Use the supplied environment variables to attach @@ -90,6 +93,8 @@ def _attach_to_backbone() -> BackboneFeatureStore: environment variable. :returns: The attached backbone featurestore + :raises SmartSimError: If the backbone descriptor is not contained + in the appropriate environment variable """ descriptor = os.environ.get(BackboneFeatureStore.MLI_BACKBONE, None) if descriptor is None or not descriptor: @@ -128,11 +133,11 @@ def _attach_to_worker_queue(self) -> DragonFLIChannel: return DragonFLIChannel.from_descriptor(descriptor) - def _create_broadcaster(self) -> EventProducer: - """Create an event publisher that will broadcast updates to - other MLI components. This publisher + def _create_broadcaster(self) -> EventBroadcaster: + """Create an EventBroadcaster that broadcasts events to + all MLI components registered to consume them. - :returns: the event publisher instance + :returns: An EventBroadcaster instance """ broadcaster = EventBroadcaster( self._backbone, DragonCommChannel.from_descriptor @@ -147,10 +152,11 @@ def __init__( """Initialize the client instance. :param timing_on: Flag indicating if timing information should be - written to file - :param wait_timeout: Maximum wait time (in seconds) allowed to attach to the - worker queue + written to file + :param backbone_timeout: Maximum wait time (in seconds) allowed to attach to the + worker queue :raises SmartSimError: If unable to attach to a backbone featurestore + :raises ValueError: If an invalid backbone timeout is specified """ if MPI is not None: # TODO: determine a way to make MPI work in the test environment @@ -215,8 +221,8 @@ def _format_number(number: t.Union[numbers.Number, float]) -> str: def start_timings(self, batch_size: numbers.Number) -> None: """Configure the client to begin storing timing information. 
- :param bach_size: The size of batches to generate as inputs - to the model + :param batch_size: The size of batches to generate as inputs + to the model """ if self._timing_on: self._add_label_to_timings("batch_size") @@ -245,10 +251,11 @@ def measure_time(self, label: str) -> None: self._interm = time.perf_counter() def print_timings(self, to_file: bool = False) -> None: - """Print timing information to standard output. + """Print timing information to standard output. If `to_file` + is `True`, also write results to a file. :param to_file: If `True`, also saves timing information - to the files `timings.npy` and `timings.txt` + to the files `timings.npy` and `timings.txt` """ print(" ".join(self._timings.keys())) @@ -261,7 +268,7 @@ def print_timings(self, to_file: bool = False) -> None: numpy.savetxt("timings.txt", value_array) def run_model(self, model: t.Union[bytes, str], batch: torch.Tensor) -> t.Any: - """Execute a bach of inference requests with the supplied ML model. + """Execute a batch of inference requests with the supplied ML model. :param model: The raw bytes or path to a pytorch model :param batch: The tensor batch to perform inference on @@ -305,7 +312,6 @@ def run_model(self, model: t.Union[bytes, str], batch: torch.Tensor) -> t.Any: self.perf_timer.measure_time("send_request") for tensor in tensors: to_sendh.send_bytes(tensor.tobytes()) # TODO NOT FAST ENOUGH!!! - # to_sendh.send_bytes(bytes(tensor.data)) logger.info(f"Message size: {len(request_bytes)} bytes") self.perf_timer.measure_time("send_tensors") @@ -314,7 +320,7 @@ def run_model(self, model: t.Union[bytes, str], batch: torch.Tensor) -> t.Any: self.perf_timer.measure_time("receive_response") response = MessageHandler.deserialize_response(resp) self.perf_timer.measure_time("deserialize_response") - # list of data blobs? + # recv depending on the len(response.result.descriptors)? 
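+            # a hedged sketch of what descriptor-count-driven receiving could
+            # look like; the one-blob-per-descriptor assumption is illustrative
+            # and not implemented by this patch:
+            #
+            #   data_blobs = [
+            #       from_recvh.recv_bytes(timeout=None)
+            #       for _ in response.result.descriptors
+            #   ]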
data_blob: bytes = from_recvh.recv_bytes(timeout=None) self.perf_timer.measure_time("receive_tensor") @@ -338,5 +344,5 @@ def set_model(self, key: str, model: bytes) -> None: self._backbone[key] = model # notify components of a change in the data at this key - event = OnWriteFeatureStore(self._backbone.descriptor, key) + event = OnWriteFeatureStore(self._EVENT_SOURCE, self._backbone.descriptor, key) self._publisher.send(event) diff --git a/tests/dragon/test_dragon_backend.py b/tests/dragon/test_dragon_backend.py index 229855bc5..2b2ef50f9 100644 --- a/tests/dragon/test_dragon_backend.py +++ b/tests/dragon/test_dragon_backend.py @@ -35,13 +35,15 @@ from smartsim._core.launcher.dragon.dragonBackend import DragonBackend from smartsim._core.mli.comm.channel.dragon_channel import DragonCommChannel -from smartsim._core.mli.infrastructure.control.event_listener import ( +from smartsim._core.mli.infrastructure.comm.event import ( + OnCreateConsumer, + OnShutdownRequested, +) +from smartsim._core.mli.infrastructure.control.listener import ( ConsumerRegistrationListener, ) from smartsim._core.mli.infrastructure.storage.backbone_feature_store import ( BackboneFeatureStore, - OnCreateConsumer, - OnShutdownRequested, ) from smartsim.log import get_logger @@ -50,25 +52,30 @@ logger = get_logger(__name__) -def test_dragonbackend_start_listener(): +@pytest.fixture(scope="module") +def the_backend() -> DragonBackend: + return DragonBackend(pid=9999) + + +def test_dragonbackend_start_listener(the_backend: DragonBackend): """Verify the background process listening to consumer registration events is up and processing messages as expected.""" - backend = DragonBackend(pid=9999) # We need to let the backend create the backbone to continue - backbone = backend._create_backbone() - backbone.pop(BackboneFeatureStore.MLI_BACKEND_CONSUMER) + backbone = the_backend._create_backbone() + backbone.pop(BackboneFeatureStore.MLI_NOTIFY_CONSUMERS) + backbone.pop(BackboneFeatureStore.MLI_REGISTRAR_CONSUMER) os.environ[BackboneFeatureStore.MLI_BACKBONE] = backbone.descriptor with pytest.raises(KeyError) as ex: # we expect the value of the consumer to be empty until # the listener start-up completes. 
- backbone[BackboneFeatureStore.MLI_BACKEND_CONSUMER] + backbone[BackboneFeatureStore.MLI_REGISTRAR_CONSUMER] assert "not found" in ex.value.args[0] - drg_process = backend.start_event_listener(cpu_affinity=[], gpu_affinity=[]) + drg_process = the_backend.start_event_listener(cpu_affinity=[], gpu_affinity=[]) # # confirm there is a process still running logger.info(f"Dragon process started: {drg_process}") @@ -79,24 +86,24 @@ def test_dragonbackend_start_listener(): # wait for the event listener to come up try: config = backbone.wait_for( - [BackboneFeatureStore.MLI_BACKEND_CONSUMER], timeout=30 + [BackboneFeatureStore.MLI_REGISTRAR_CONSUMER], timeout=30 ) # verify result was in the returned configuration map - assert config[BackboneFeatureStore.MLI_BACKEND_CONSUMER] + assert config[BackboneFeatureStore.MLI_REGISTRAR_CONSUMER] except Exception: raise KeyError( - f"Unable to locate {BackboneFeatureStore.MLI_BACKEND_CONSUMER}" + f"Unable to locate {BackboneFeatureStore.MLI_REGISTRAR_CONSUMER}" "in the backbone" ) # wait_for ensures the normal retrieval will now work, error-free - descriptor = backbone[BackboneFeatureStore.MLI_BACKEND_CONSUMER] + descriptor = backbone[BackboneFeatureStore.MLI_REGISTRAR_CONSUMER] assert descriptor is not None # register a new listener channel comm_channel = DragonCommChannel.from_descriptor(descriptor) mock_descriptor = str(uuid.uuid4()) - event = OnCreateConsumer(mock_descriptor, []) + event = OnCreateConsumer("test_dragonbackend_start_listener", mock_descriptor, []) event_bytes = bytes(event) comm_channel.send(event_bytes) @@ -122,17 +129,19 @@ def test_dragonbackend_start_listener(): drg_process.join() -def test_dragonbackend_backend_consumer(): - """Verify the listener background process updates the MLI_BACKEND_CONSUMER +def test_dragonbackend_backend_consumer(the_backend: DragonBackend): + """Verify the listener background process updates the appropriate value in the backbone.""" - backend = DragonBackend(pid=9999) # We need to let the backend create the backbone to continue - backbone = backend._create_backbone() + backbone = the_backend._create_backbone() + backbone.pop(BackboneFeatureStore.MLI_NOTIFY_CONSUMERS) + backbone.pop(BackboneFeatureStore.MLI_REGISTRAR_CONSUMER) + assert backbone._allow_reserved_writes # create listener with `as_service=False` to perform a single loop iteration - listener = ConsumerRegistrationListener(backbone, 1.0, 1.0, [], as_service=False) + listener = ConsumerRegistrationListener(backbone, 1.0, 1.0, as_service=False) logger.debug(f"backbone loaded? {listener._backbone}") logger.debug(f"listener created? {listener}") @@ -142,8 +151,8 @@ def test_dragonbackend_backend_consumer(): # the entire service lifecycle listener.execute() - consumer_desc = backbone[BackboneFeatureStore.MLI_BACKEND_CONSUMER] - logger.debug(f"MLI_BACKEND_CONSUMER: {consumer_desc}") + consumer_desc = backbone[BackboneFeatureStore.MLI_REGISTRAR_CONSUMER] + logger.debug(f"MLI_REGISTRAR_CONSUMER: {consumer_desc}") assert consumer_desc except Exception as ex: @@ -152,17 +161,17 @@ def test_dragonbackend_backend_consumer(): listener._on_shutdown() -def test_dragonbackend_event_handled(): - """Verify the event listener process updates the MLI_NOTIFY_CONSUMERS +def test_dragonbackend_event_handled(the_backend: DragonBackend): + """Verify the event listener process updates the appropriate value in the backbone when an event is received and again on shutdown. 
""" - backend = DragonBackend(pid=9999) - # We need to let the backend create the backbone to continue - backbone = backend._create_backbone() + backbone = the_backend._create_backbone() + backbone.pop(BackboneFeatureStore.MLI_NOTIFY_CONSUMERS) + backbone.pop(BackboneFeatureStore.MLI_REGISTRAR_CONSUMER) # create the listener to be tested - listener = ConsumerRegistrationListener(backbone, 1.0, 1.0, [], as_service=False) + listener = ConsumerRegistrationListener(backbone, 1.0, 1.0, as_service=False) assert listener._backbone, "The listener is not attached to a backbone" @@ -171,14 +180,18 @@ def test_dragonbackend_event_handled(): listener._create_eventing() # listener.execute() # grab the channel descriptor so we can simulate registrations - channel_desc = backbone[BackboneFeatureStore.MLI_BACKEND_CONSUMER] + channel_desc = backbone[BackboneFeatureStore.MLI_REGISTRAR_CONSUMER] comm_channel = DragonCommChannel.from_descriptor(channel_desc) num_events = 5 events = [] for i in range(num_events): # register some mock consumers using the backend channel - event = OnCreateConsumer(f"mock-consumer-descriptor-{uuid.uuid4()}", []) + event = OnCreateConsumer( + "test_dragonbackend_event_handled", + f"mock-consumer-descriptor-{uuid.uuid4()}", + [], + ) event_bytes = bytes(event) comm_channel.send(event_bytes) events.append(event) @@ -198,110 +211,97 @@ def test_dragonbackend_event_handled(): except Exception as ex: logger.exception(f"test_dragonbackend_event_handled - exception occurred: {ex}") + assert False finally: # shutdown should unregister a registration listener listener._on_shutdown() for i in range(10): - if BackboneFeatureStore.MLI_BACKEND_CONSUMER not in backbone: + if BackboneFeatureStore.MLI_REGISTRAR_CONSUMER not in backbone: logger.debug(f"The listener was removed after {i} iterations") channel_desc = None break # we should see that there is no listener registered - assert not channel_desc + assert not channel_desc, "Listener shutdown failed to clean up the backbone" -def test_dragonbackend_shutdown_event(): +def test_dragonbackend_shutdown_event(the_backend: DragonBackend): """Verify the background process shuts down when it receives a shutdown request.""" - backend = DragonBackend(pid=9999) # We need to let the backend create the backbone to continue - backbone = backend._create_backbone() + backbone = the_backend._create_backbone() + backbone.pop(BackboneFeatureStore.MLI_NOTIFY_CONSUMERS) + backbone.pop(BackboneFeatureStore.MLI_REGISTRAR_CONSUMER) - listener = ConsumerRegistrationListener(backbone, 1.0, 1.0, [], as_service=False) + listener = ConsumerRegistrationListener(backbone, 1.0, 1.0, as_service=True) - logger.debug(f"backbone loaded? {listener._backbone}") - logger.debug(f"listener created? 
{listener}") + # set up the listener but don't let the listener loop start + listener._create_eventing() # listener.execute() - try: - # set up the listener but don't let the listener loop start - listener._create_eventing() # listener.execute() + # grab the channel descriptor so we can publish to it + channel_desc = backbone[BackboneFeatureStore.MLI_REGISTRAR_CONSUMER] + comm_channel = DragonCommChannel.from_descriptor(channel_desc) - # grab the channel descriptor so we can publish to it - channel_desc = backbone[BackboneFeatureStore.MLI_BACKEND_CONSUMER] - comm_channel = DragonCommChannel.from_descriptor(channel_desc) + assert listener._consumer.listening, "Listener isn't ready to listen" - assert listener._consumer.listening, "Listener wasn't ready to listen" - - # send a shutdown request... - event = OnShutdownRequested() - event_bytes = bytes(event) - comm_channel.send(event_bytes) - - # run iteration a few times in case it takes a few cycles to write - for _ in range(5): - listener._on_iteration() - - logger.info(f"{listener._consumer.listening=}") + # send a shutdown request... + event = OnShutdownRequested("test_dragonbackend_shutdown_event") + event_bytes = bytes(event) + comm_channel.send(event_bytes, 0.1) - # ...and confirm the listener is now cancelled - assert not listener._consumer.listening + # execute should encounter the shutdown and exit + listener.execute() - except Exception as ex: - logger.exception( - f"test_dragonbackend_shutdown_event - exception occurred: {ex}" - ) + # ...and confirm the listener is now cancelled + assert not listener._consumer.listening @pytest.mark.parametrize("health_check_frequency", [10, 20]) -def test_dragonbackend_shutdown_on_health_check(health_check_frequency: float): +def test_dragonbackend_shutdown_on_health_check( + the_backend: DragonBackend, + health_check_frequency: float, +): """Verify that the event listener automatically shuts down when a new listener is registered in its place. 
:param health_check_frequency: The expected frequency of service health check invocations""" - backend = DragonBackend(pid=9999) # We need to let the backend create the backbone to continue - backbone = backend._create_backbone() + backbone = the_backend._create_backbone() + backbone.pop(BackboneFeatureStore.MLI_NOTIFY_CONSUMERS) + backbone.pop(BackboneFeatureStore.MLI_REGISTRAR_CONSUMER) listener = ConsumerRegistrationListener( backbone, 1.0, 1.0, - [], as_service=True, # allow service to run long enough to health check health_check_frequency=health_check_frequency, ) - try: - # set up the listener but don't let the listener loop start - listener._create_eventing() # listener.execute() - assert listener._consumer.listening, "Listener wasn't ready to listen" - - # Replace the consumer descriptor in the backbone to trigger - # an automatic shutdown - backbone[BackboneFeatureStore.MLI_BACKEND_CONSUMER] = str(uuid.uuid4()) + # set up the listener but don't let the listener loop start + listener._create_eventing() # listener.execute() + assert listener._consumer.listening, "Listener wasn't ready to listen" - # set the last health check manually to verify the duration - start_at = time.time() - listener._last_health_check = time.time() + # Replace the consumer descriptor in the backbone to trigger + # an automatic shutdown + backbone[BackboneFeatureStore.MLI_REGISTRAR_CONSUMER] = str(uuid.uuid4()) - # run execute to let the service trigger health checks - listener.execute() - elapsed = time.time() - start_at + # set the last health check manually to verify the duration + start_at = time.time() + listener._last_health_check = time.time() - # confirm the frequency of the health check was honored - assert elapsed >= health_check_frequency + # run execute to let the service trigger health checks + listener.execute() + elapsed = time.time() - start_at - # ...and confirm the listener is now cancelled - assert ( - not listener._consumer.listening - ), "Listener was not automatically shutdown by the health check" + # confirm the frequency of the health check was honored + assert elapsed >= health_check_frequency - except Exception as ex: - logger.exception( - f"test_dragonbackend_shutdown_event - exception occurred: {ex}" - ) + # ...and confirm the listener is now cancelled + assert ( + not listener._consumer.listening + ), "Listener was not automatically shutdown by the health check" diff --git a/tests/dragon/test_environment_loader.py b/tests/dragon/test_environment_loader.py index 9dd0255fe..e9f6004d1 100644 --- a/tests/dragon/test_environment_loader.py +++ b/tests/dragon/test_environment_loader.py @@ -36,7 +36,7 @@ from smartsim._core.mli.comm.channel.dragon_fli import DragonFLIChannel from smartsim._core.mli.comm.channel.dragon_util import create_local from smartsim._core.mli.infrastructure.environment_loader import EnvironmentConfigLoader -from smartsim._core.mli.infrastructure.storage.dragon_feature_store import ( +from smartsim._core.mli.infrastructure.storage.backbone_feature_store import ( DragonFeatureStore, ) from smartsim._core.mli.infrastructure.storage.dragon_util import create_ddict @@ -64,7 +64,8 @@ def test_environment_loader_attach_fli(content: bytes, monkeypatch: pytest.Monke chan = create_local() queue = FLInterface(main_ch=chan) monkeypatch.setenv( - "_SMARTSIM_REQUEST_QUEUE", du.B64.bytes_to_str(queue.serialize()) + EnvironmentConfigLoader.REQUEST_QUEUE_ENV_VAR, + du.B64.bytes_to_str(queue.serialize()), ) config = EnvironmentConfigLoader( @@ -87,7 +88,8 @@ def 
test_environment_loader_serialize_fli(monkeypatch: pytest.MonkeyPatch):
     chan = create_local()
     queue = FLInterface(main_ch=chan)
     monkeypatch.setenv(
-        "_SMARTSIM_REQUEST_QUEUE", du.B64.bytes_to_str(queue.serialize())
+        EnvironmentConfigLoader.REQUEST_QUEUE_ENV_VAR,
+        du.B64.bytes_to_str(queue.serialize()),
     )
 
     config = EnvironmentConfigLoader(
@@ -102,7 +104,7 @@ def test_environment_loader_serialize_fli(monkeypatch: pytest.MonkeyPatch):
 
 def test_environment_loader_flifails(monkeypatch: pytest.MonkeyPatch):
     """An incorrect serialized descriptor fails to attach."""
-    monkeypatch.setenv("_SMARTSIM_REQUEST_QUEUE", "randomstring")
+    monkeypatch.setenv(EnvironmentConfigLoader.REQUEST_QUEUE_ENV_VAR, "randomstring")
 
     config = EnvironmentConfigLoader(
         featurestore_factory=DragonFeatureStore.from_descriptor,
@@ -120,7 +122,9 @@ def test_environment_loader_backbone_load_dfs(
     """Verify the dragon feature store is loaded correctly by the
     EnvironmentConfigLoader to demonstrate featurestore_factory correctness."""
     feature_store = DragonFeatureStore(the_storage)
-    monkeypatch.setenv("_SMARTSIM_INFRA_BACKBONE", feature_store.descriptor)
+    monkeypatch.setenv(
+        EnvironmentConfigLoader.BACKBONE_ENV_VAR, feature_store.descriptor
+    )
 
     config = EnvironmentConfigLoader(
         featurestore_factory=DragonFeatureStore.from_descriptor,
@@ -138,8 +142,8 @@ def test_environment_variables_not_set(monkeypatch: pytest.MonkeyPatch):
     """EnvironmentConfigLoader getters return None when environment
     variables are not set."""
     with monkeypatch.context() as patch:
-        patch.setenv("_SMARTSIM_INFRA_BACKBONE", "")
-        patch.setenv("_SMARTSIM_REQUEST_QUEUE", "")
+        patch.setenv(EnvironmentConfigLoader.BACKBONE_ENV_VAR, "")
+        patch.setenv(EnvironmentConfigLoader.REQUEST_QUEUE_ENV_VAR, "")
 
         config = EnvironmentConfigLoader(
             featurestore_factory=DragonFeatureStore.from_descriptor,
diff --git a/tests/dragon/test_error_handling.py b/tests/dragon/test_error_handling.py
index df370cbc4..a7ba7e7f2 100644
--- a/tests/dragon/test_error_handling.py
+++ b/tests/dragon/test_error_handling.py
@@ -523,3 +523,14 @@ def test_exception_handling_helper(monkeypatch: pytest.MonkeyPatch) -> None:
 
     mock_reply_fn.assert_called_once()
     mock_reply_fn.assert_called_with("fail", "Failure while fetching the model.")
+
+
+def test_dragon_feature_store_invalid_storage():
+    """Verify that attempting to create a DragonFeatureStore without storage fails."""
+    storage = None
+
+    with pytest.raises(ValueError) as ex:
+        DragonFeatureStore(storage)
+
+    assert "storage" in ex.value.args[0].lower()
+    assert "required" in ex.value.args[0].lower()
diff --git a/tests/dragon/test_event_consumer.py b/tests/dragon/test_event_consumer.py
index f361e6c16..bda8f33cd 100644
--- a/tests/dragon/test_event_consumer.py
+++ b/tests/dragon/test_event_consumer.py
@@ -35,22 +35,18 @@
 from smartsim._core.mli.comm.channel.dragon_channel import DragonCommChannel
 from smartsim._core.mli.comm.channel.dragon_fli import DragonFLIChannel
 from smartsim._core.mli.comm.channel.dragon_util import create_local
-from smartsim._core.mli.infrastructure.control.event_listener import (
-    ConsumerRegistrationListener,
-)
-from smartsim._core.mli.infrastructure.storage.backbone_feature_store import (
-    BackboneFeatureStore,
-    EventBase,
-    EventBroadcaster,
-    EventCategory,
-    EventConsumer,
+from smartsim._core.mli.infrastructure.comm.broadcaster import EventBroadcaster
+from smartsim._core.mli.infrastructure.comm.consumer import EventConsumer
+from smartsim._core.mli.infrastructure.comm.event import (
     OnCreateConsumer,
-
OnRemoveConsumer, OnShutdownRequested, OnWriteFeatureStore, ) +from smartsim._core.mli.infrastructure.control.listener import ( + ConsumerRegistrationListener, +) from smartsim._core.mli.infrastructure.storage.backbone_feature_store import ( - time as bbtime, + BackboneFeatureStore, ) from smartsim._core.mli.infrastructure.storage.dragon_util import create_ddict from smartsim.log import get_logger @@ -129,7 +125,7 @@ def test_eventconsumer_eventpublisher_integration( wmgr_consumer = EventConsumer( wmgr_channel, the_backbone, - filters=[EventCategory.FEATURE_STORE_WRITTEN], + filters=[OnWriteFeatureStore.FEATURE_STORE_WRITTEN], ) capp_consumer = EventConsumer( capp_channel, @@ -138,7 +134,7 @@ def test_eventconsumer_eventpublisher_integration( back_consumer = EventConsumer( back_channel, the_backbone, - filters=[EventCategory.CONSUMER_CREATED], + filters=[OnCreateConsumer.CONSUMER_CREATED], ) # create some broadcasters to publish messages @@ -160,12 +156,20 @@ def test_eventconsumer_eventpublisher_integration( ] # simulate worker manager sending a notification to backend that it's alive - event_1 = OnCreateConsumer(wmgr_consumer_descriptor, filters=[]) + event_1 = OnCreateConsumer( + "test_eventconsumer_eventpublisher_integration", + wmgr_consumer_descriptor, + filters=[], + ) mock_worker_mgr.send(event_1) # simulate the app updating a model a few times for key in ["key-1", "key-2", "key-1"]: - event = OnWriteFeatureStore(the_backbone.descriptor, key) + event = OnWriteFeatureStore( + "test_eventconsumer_eventpublisher_integration", + the_backbone.descriptor, + key, + ) mock_client_app.send(event, timeout=0.1) # worker manager should only get updates about feature update @@ -209,7 +213,7 @@ def test_eventconsumer_invalid_timeout( wmgr_consumer = EventConsumer( wmgr_channel, the_backbone, - filters=[EventCategory.FEATURE_STORE_WRITTEN], + filters=[OnWriteFeatureStore.FEATURE_STORE_WRITTEN], ) # the consumer should report an error for the invalid timeout value @@ -246,7 +250,11 @@ def test_eventconsumer_no_event_handler_registered( # simulate the app updating a model a few times for key in ["key-1", "key-2", "key-1"]: - event = OnWriteFeatureStore(the_backbone.descriptor, key) + event = OnWriteFeatureStore( + "test_eventconsumer_no_event_handler_registered", + the_backbone.descriptor, + key, + ) mock_worker_mgr.send(event, timeout=0.1) # run the handler and let it discard messages @@ -287,10 +295,16 @@ def test_eventconsumer_no_event_handler_registered_shutdown( # simulate the app updating a model a few times for key in ["key-1", "key-2", "key-1"]: - event = OnWriteFeatureStore(the_backbone.descriptor, key) + event = OnWriteFeatureStore( + "test_eventconsumer_no_event_handler_registered_shutdown", + the_backbone.descriptor, + key, + ) mock_worker_mgr.send(event, timeout=0.1) - event = OnShutdownRequested() + event = OnShutdownRequested( + "test_eventconsumer_no_event_handler_registered_shutdown" + ) mock_worker_mgr.send(event, timeout=0.1) # wmgr will stop listening to messages when it is told to stop listening @@ -389,19 +403,19 @@ def test_registrar_teardown( registrar._create_eventing() # confirm the registrar is published to the backbone - cfg = the_backbone.wait_for([BackboneFeatureStore.MLI_BACKEND_CONSUMER], 10) - assert BackboneFeatureStore.MLI_BACKEND_CONSUMER in cfg + cfg = the_backbone.wait_for([BackboneFeatureStore.MLI_REGISTRAR_CONSUMER], 10) + assert BackboneFeatureStore.MLI_REGISTRAR_CONSUMER in cfg # execute the entire service lifecycle 1x registrar.execute() - consumer_found = 
BackboneFeatureStore.MLI_BACKEND_CONSUMER in the_backbone + consumer_found = BackboneFeatureStore.MLI_REGISTRAR_CONSUMER in the_backbone for i in range(15): time.sleep(0.1) - consumer_found = BackboneFeatureStore.MLI_BACKEND_CONSUMER in the_backbone + consumer_found = BackboneFeatureStore.MLI_REGISTRAR_CONSUMER in the_backbone if not consumer_found: logger.debug(f"Registrar removed from the backbone on iteration {i}") break - assert BackboneFeatureStore.MLI_BACKEND_CONSUMER not in the_backbone + assert BackboneFeatureStore.MLI_REGISTRAR_CONSUMER not in the_backbone diff --git a/tests/dragon/test_featurestore.py b/tests/dragon/test_featurestore.py index e34120c98..9156979ed 100644 --- a/tests/dragon/test_featurestore.py +++ b/tests/dragon/test_featurestore.py @@ -36,16 +36,10 @@ dragon = pytest.importorskip("dragon") -from smartsim._core.mli.comm.channel.dragon_channel import DragonCommChannel from smartsim._core.mli.comm.channel.dragon_fli import DragonFLIChannel from smartsim._core.mli.comm.channel.dragon_util import create_local from smartsim._core.mli.infrastructure.storage.backbone_feature_store import ( BackboneFeatureStore, - EventBroadcaster, - EventCategory, - EventConsumer, - OnCreateConsumer, - OnWriteFeatureStore, ) from smartsim._core.mli.infrastructure.storage.backbone_feature_store import ( time as bbtime, diff --git a/tests/dragon/test_featurestore_base.py b/tests/dragon/test_featurestore_base.py index 2278a0036..4cadfd8f3 100644 --- a/tests/dragon/test_featurestore_base.py +++ b/tests/dragon/test_featurestore_base.py @@ -31,14 +31,15 @@ dragon = pytest.importorskip("dragon") -from smartsim._core.mli.infrastructure.storage.backbone_feature_store import ( - BackboneFeatureStore, - EventBroadcaster, - EventCategory, - EventConsumer, +from smartsim._core.mli.infrastructure.comm.broadcaster import EventBroadcaster +from smartsim._core.mli.infrastructure.comm.consumer import EventConsumer +from smartsim._core.mli.infrastructure.comm.event import ( OnCreateConsumer, OnWriteFeatureStore, ) +from smartsim._core.mli.infrastructure.storage.backbone_feature_store import ( + BackboneFeatureStore, +) from smartsim._core.mli.infrastructure.storage.dragon_feature_store import ( DragonFeatureStore, ) @@ -55,9 +56,6 @@ # The tests in this file belong to the dragon group pytestmark = pytest.mark.dragon -WORK_QUEUE_KEY = "_SMARTSIM_REQUEST_QUEUE" -RANDOMLY_SET_KEY = "_SOMETHING_ELSE" - def boom(*args, **kwargs) -> None: """Helper function that blows up when used to mock up @@ -72,8 +70,8 @@ def test_event_uid() -> None: # generate a bunch of events and keep track all the IDs for i in range(num_iters): - event_a = OnCreateConsumer(str(i), filters=[]) - event_b = OnWriteFeatureStore(str(i), "key") + event_a = OnCreateConsumer("test_event_uid", str(i), filters=[]) + event_b = OnWriteFeatureStore("test_event_uid", str(i), filters=[]) uids.add(event_a.uid) uids.add(event_b.uid) @@ -186,7 +184,9 @@ def test_eventpublisher_broadcast_no_factory(test_dir: str) -> None: # NOTE: we're not putting any consumers into the backbone here! 
backbone = BackboneFeatureStore(mock_storage) - event = OnCreateConsumer(consumer_descriptor, filters=[]) + event = OnCreateConsumer( + "test_eventpublisher_broadcast_no_factory", consumer_descriptor, filters=[] + ) publisher = EventBroadcaster(backbone) num_receivers = 0 @@ -194,7 +194,9 @@ def test_eventpublisher_broadcast_no_factory(test_dir: str) -> None: # publishing this event without any known consumers registered should succeed # but report that it didn't have anybody to send the event to consumer_descriptor = storage_path / f"test-consumer" - event = OnCreateConsumer(consumer_descriptor, filters=[]) + event = OnCreateConsumer( + "test_eventpublisher_broadcast_no_factory", consumer_descriptor, filters=[] + ) num_receivers += publisher.send(event) @@ -225,7 +227,11 @@ def test_eventpublisher_broadcast_to_empty_consumer_list(test_dir: str) -> None: backbone = BackboneFeatureStore(mock_storage, allow_reserved_writes=True) backbone.notification_channels = [] - event = OnCreateConsumer(consumer_descriptor, filters=[]) + event = OnCreateConsumer( + "test_eventpublisher_broadcast_to_empty_consumer_list", + consumer_descriptor, + filters=[], + ) publisher = EventBroadcaster( backbone, channel_factory=FileSystemCommChannel.from_descriptor ) @@ -258,7 +264,11 @@ def test_eventpublisher_broadcast_without_channel_factory(test_dir: str) -> None backbone = BackboneFeatureStore(mock_storage, allow_reserved_writes=True) backbone.notification_channels = [consumer_descriptor] - event = OnCreateConsumer(consumer_descriptor, filters=[]) + event = OnCreateConsumer( + "test_eventpublisher_broadcast_without_channel_factory", + consumer_descriptor, + filters=[], + ) publisher = EventBroadcaster( backbone, # channel_factory=FileSystemCommChannel.from_descriptor # <--- not supplied @@ -293,11 +303,17 @@ def test_eventpublisher_broadcast_empties_buffer(test_dir: str) -> None: # mock building up some buffered events num_buffered_events = 14 for i in range(num_buffered_events): - event = OnCreateConsumer(storage_path / f"test-consumer-{str(i)}", []) + event = OnCreateConsumer( + "test_eventpublisher_broadcast_empties_buffer", + storage_path / f"test-consumer-{str(i)}", + [], + ) publisher._event_buffer.append(bytes(event)) event0 = OnCreateConsumer( - storage_path / f"test-consumer-{str(num_buffered_events + 1)}", [] + "test_eventpublisher_broadcast_empties_buffer", + storage_path / f"test-consumer-{str(num_buffered_events + 1)}", + [], ) num_receivers = publisher.send(event0) @@ -344,13 +360,21 @@ def test_eventpublisher_broadcast_returns_total_sent( # mock building up some buffered events for i in range(num_buffered): - event = OnCreateConsumer(storage_path / f"test-consumer-{str(i)}", []) + event = OnCreateConsumer( + "test_eventpublisher_broadcast_returns_total_sent", + storage_path / f"test-consumer-{str(i)}", + [], + ) publisher._event_buffer.append(bytes(event)) assert publisher.num_buffered == num_buffered # this event will trigger clearing anything already in buffer - event0 = OnCreateConsumer(storage_path / f"test-consumer-{num_buffered}", []) + event0 = OnCreateConsumer( + "test_eventpublisher_broadcast_returns_total_sent", + storage_path / f"test-consumer-{num_buffered}", + [], + ) # num_receivers should contain a number that computes w/all consumers and all events num_receivers = publisher.send(event0) @@ -376,7 +400,11 @@ def test_eventpublisher_prune_unused_consumer(test_dir: str) -> None: backbone, channel_factory=FileSystemCommChannel.from_descriptor ) - event = 
OnCreateConsumer(consumer_descriptor, filters=[])
+    event = OnCreateConsumer(
+        "test_eventpublisher_prune_unused_consumer",
+        consumer_descriptor,
+        filters=[],
+    )
 
     # the only registered consumer is in the event, expect no pruning
     backbone.notification_channels = (consumer_descriptor,)
@@ -390,7 +418,9 @@ def test_eventpublisher_prune_unused_consumer(test_dir: str) -> None:
 
     # ... and remove the old descriptor from the backbone when it's looked up
     backbone.notification_channels = (consumer_descriptor2,)
 
-    event = OnCreateConsumer(consumer_descriptor2, filters=[])
+    event = OnCreateConsumer(
+        "test_eventpublisher_prune_unused_consumer", consumer_descriptor2, filters=[]
+    )
 
     publisher.send(event)
@@ -447,7 +477,9 @@ def test_eventpublisher_serialize_failure(
     )
 
     with monkeypatch.context() as patch:
-        event = OnCreateConsumer(target_descriptor, filters=[])
+        event = OnCreateConsumer(
+            "test_eventpublisher_serialize_failure", target_descriptor, filters=[]
+        )
 
         # patch the __bytes__ implementation to cause pickling to fail during send
         def bad_bytes(self) -> bytes:
@@ -490,7 +522,9 @@ def boom(descriptor: str) -> None:
     publisher = EventBroadcaster(backbone, channel_factory=boom)
 
     with monkeypatch.context() as patch:
-        event = OnCreateConsumer(target_descriptor, filters=[])
+        event = OnCreateConsumer(
+            "test_eventpublisher_factory_failure", target_descriptor, filters=[]
+        )
 
         backbone.notification_channels = (target_descriptor,)
@@ -527,7 +561,9 @@ def boom(self) -> None:
         raise Exception("That was unexpected...")
 
     with monkeypatch.context() as patch:
-        event = OnCreateConsumer(target_descriptor, filters=[])
+        event = OnCreateConsumer(
+            "test_eventpublisher_failure", target_descriptor, filters=[]
+        )
 
         # patch the _broadcast implementation to cause send to fail after
         # after the event has been pickled
@@ -559,7 +595,9 @@ def test_eventconsumer_receive(test_dir: str) -> None:
     backbone = BackboneFeatureStore(mock_storage)
     comm_channel = FileSystemCommChannel.from_descriptor(target_descriptor)
 
-    event = OnCreateConsumer(target_descriptor, filters=[])
+    event = OnCreateConsumer(
+        "test_eventconsumer_receive", target_descriptor, filters=[]
+    )
 
     # simulate a sent event by writing directly to the input comm channel
     comm_channel.send(bytes(event))
@@ -596,7 +634,9 @@ def test_eventconsumer_receive_multi(test_dir: str, num_sent: int) -> None:
 
     # simulate multiple sent events by writing directly to the input comm channel
     for _ in range(num_sent):
-        event = OnCreateConsumer(target_descriptor, filters=[])
+        event = OnCreateConsumer(
+            "test_eventconsumer_receive_multi", target_descriptor, filters=[]
+        )
         comm_channel.send(bytes(event))
 
     consumer = EventConsumer(comm_channel, backbone)
@@ -660,7 +700,7 @@ def test_eventconsumer_eventpublisher_integration(test_dir: str) -> None:
     wmgr_consumer = EventConsumer(
         wmgr_channel,
         backbone,
-        filters=[EventCategory.FEATURE_STORE_WRITTEN],
+        filters=[OnWriteFeatureStore.FEATURE_STORE_WRITTEN],
    )
     capp_consumer = EventConsumer(
         capp_channel,
         backbone,
     )
     back_consumer = EventConsumer(
         back_channel,
         backbone,
-        filters=[EventCategory.CONSUMER_CREATED],
+        filters=[OnCreateConsumer.CONSUMER_CREATED],
     )
 
     # create some broadcasters to publish messages
@@ -691,13 +731,23 @@ def test_eventconsumer_eventpublisher_integration(test_dir: str) -> None:
     ]
 
     # simulate worker manager sending a notification to backend that it's alive
-    event_1 =
OnCreateConsumer( + "test_eventconsumer_eventpublisher_integration", + wmgr_consumer_descriptor, + filters=[], + ) mock_worker_mgr.send(event_1) # simulate the app updating a model a few times - event_2 = OnWriteFeatureStore(mock_fs_descriptor, "key-1") - event_3 = OnWriteFeatureStore(mock_fs_descriptor, "key-2") - event_4 = OnWriteFeatureStore(mock_fs_descriptor, "key-1") + event_2 = OnWriteFeatureStore( + "test_eventconsumer_eventpublisher_integration", mock_fs_descriptor, "key-1" + ) + event_3 = OnWriteFeatureStore( + "test_eventconsumer_eventpublisher_integration", mock_fs_descriptor, "key-2" + ) + event_4 = OnWriteFeatureStore( + "test_eventconsumer_eventpublisher_integration", mock_fs_descriptor, "key-1" + ) mock_client_app.send(event_2) mock_client_app.send(event_3) @@ -741,7 +791,7 @@ def test_eventconsumer_batch_timeout( consumer = EventConsumer( channel, backbone, - filters=[EventCategory.FEATURE_STORE_WRITTEN], + filters=[OnWriteFeatureStore.FEATURE_STORE_WRITTEN], ) consumer.recv(batch_timeout=invalid_timeout) diff --git a/tests/dragon/test_featurestore_integration.py b/tests/dragon/test_featurestore_integration.py index 895bc6467..69de23495 100644 --- a/tests/dragon/test_featurestore_integration.py +++ b/tests/dragon/test_featurestore_integration.py @@ -30,18 +30,16 @@ dragon = pytest.importorskip("dragon") -from smartsim._core.mli.comm.channel.dragon_channel import ( +from smartsim._core.mli.comm.channel.dragon_channel import DragonCommChannel +from smartsim._core.mli.comm.channel.dragon_util import ( DEFAULT_CHANNEL_BUFFER_SIZE, - DragonCommChannel, + create_local, ) -from smartsim._core.mli.comm.channel.dragon_util import create_local +from smartsim._core.mli.infrastructure.comm.broadcaster import EventBroadcaster +from smartsim._core.mli.infrastructure.comm.consumer import EventConsumer +from smartsim._core.mli.infrastructure.comm.event import OnWriteFeatureStore from smartsim._core.mli.infrastructure.storage.backbone_feature_store import ( BackboneFeatureStore, - EventBroadcaster, - EventCategory, - EventConsumer, - OnCreateConsumer, - OnWriteFeatureStore, ) from smartsim._core.mli.infrastructure.storage.dragon_util import ( create_ddict, @@ -131,7 +129,9 @@ def test_eventconsumer_max_dequeue( # simulate the app updating a model a lot of times for key in (f"key-{i}" for i in range(num_events)): - event = OnWriteFeatureStore(the_backbone.descriptor, key) + event = OnWriteFeatureStore( + "test_eventconsumer_max_dequeue", the_backbone.descriptor, key + ) mock_client_app.send(event, timeout=0.01) num_dequeued = 0 @@ -223,7 +223,9 @@ def test_channel_buffer_size( # simulate the app updating a model a lot of times for key in (f"key-{i}" for i in range(buffer_size)): - event = OnWriteFeatureStore(backbone.descriptor, key) + event = OnWriteFeatureStore( + "test_channel_buffer_size", backbone.descriptor, key + ) mock_client_app.send(event, timeout=0.01) # adding 1 more over the configured buffer size should report the error diff --git a/tests/dragon/test_protoclient.py b/tests/dragon/test_protoclient.py index b871de267..bc4a69612 100644 --- a/tests/dragon/test_protoclient.py +++ b/tests/dragon/test_protoclient.py @@ -37,10 +37,10 @@ from smartsim._core.mli.comm.channel.dragon_channel import DragonCommChannel from smartsim._core.mli.comm.channel.dragon_fli import DragonFLIChannel from smartsim._core.mli.comm.channel.dragon_util import create_local +from smartsim._core.mli.infrastructure.comm.broadcaster import EventBroadcaster +from smartsim._core.mli.infrastructure.comm.event 
import OnWriteFeatureStore from smartsim._core.mli.infrastructure.storage.backbone_feature_store import ( BackboneFeatureStore, - EventBroadcaster, - OnWriteFeatureStore, ) from smartsim._core.mli.infrastructure.storage.dragon_util import create_ddict from smartsim.error.errors import SmartSimError @@ -281,7 +281,7 @@ def test_protoclient_write_model_notification_sent( the_backbone[BackboneFeatureStore.MLI_BACKBONE] = the_backbone.descriptor the_backbone[BackboneFeatureStore.MLI_WORKER_QUEUE] = the_worker_queue.descriptor the_backbone[BackboneFeatureStore.MLI_NOTIFY_CONSUMERS] = ",".join(listeners) - the_backbone[BackboneFeatureStore.MLI_BACKEND_CONSUMER] = None + the_backbone[BackboneFeatureStore.MLI_REGISTRAR_CONSUMER] = None with monkeypatch.context() as ctx: ctx.setenv(BackboneFeatureStore.MLI_BACKBONE, the_backbone.descriptor) @@ -323,6 +323,9 @@ def test_protoclient_write_model_notification_sent( ), "Expected default timeout on call to `publisher.send`, " # confirm the correct event was raised - event = t.cast(OnWriteFeatureStore, pickle.loads(event_bytes)) + event = t.cast( + OnWriteFeatureStore, + pickle.loads(event_bytes), + ) assert event.descriptor == the_backbone.descriptor assert event.key == model_key diff --git a/tests/dragon/test_worker_manager.py b/tests/dragon/test_worker_manager.py index 819414eca..3372bc1ad 100644 --- a/tests/dragon/test_worker_manager.py +++ b/tests/dragon/test_worker_manager.py @@ -265,8 +265,10 @@ def test_worker_manager(prepare_environment: pathlib.Path) -> None: # NOTE: env vars must be set prior to instantiating EnvironmentConfigLoader # or test environment may be unable to send messages w/queue - os.environ["_SMARTSIM_REQUEST_QUEUE"] = to_worker_fli_comm_channel.descriptor - os.environ["_SMARTSIM_INFRA_BACKBONE"] = backbone.descriptor + os.environ[BackboneFeatureStore.MLI_WORKER_QUEUE] = ( + to_worker_fli_comm_channel.descriptor + ) + os.environ[BackboneFeatureStore.MLI_BACKBONE] = backbone.descriptor config_loader = EnvironmentConfigLoader( featurestore_factory=DragonFeatureStore.from_descriptor, diff --git a/tests/dragon/utils/msg_pump.py b/tests/dragon/utils/msg_pump.py index e54cdf7fd..c658f2f26 100644 --- a/tests/dragon/utils/msg_pump.py +++ b/tests/dragon/utils/msg_pump.py @@ -96,7 +96,8 @@ def persist_model_file(model_path: pathlib.Path) -> pathlib.Path: """Create a simple torch model and persist to disk for testing purposes. 
- TODO: remove once unit tests are in place""" + :returns: Path to the model file + """ # test_path = pathlib.Path(work_dir) if not model_path.parent.exists(): model_path.parent.mkdir(parents=True, exist_ok=True) diff --git a/tests/mli/test_service.py b/tests/mli/test_service.py index 617738f94..3635f6ff7 100644 --- a/tests/mli/test_service.py +++ b/tests/mli/test_service.py @@ -27,6 +27,7 @@ import datetime import multiprocessing as mp import pathlib +import time import typing as t from asyncore import loop @@ -47,23 +48,37 @@ class SimpleService(Service): def __init__( self, log: t.List[str], - quit_after: int = 0, + quit_after: int = -1, as_service: bool = False, - cooldown: int = 0, - loop_delay: int = 0, + cooldown: float = 0, + loop_delay: float = 0, + hc_freq: float = -1, + run_for: float = 0, ) -> None: - super().__init__(as_service, cooldown, loop_delay) + super().__init__(as_service, cooldown, loop_delay, hc_freq) self._log = log self._quit_after = quit_after - self.num_iterations = 0 self.num_starts = 0 self.num_shutdowns = 0 + self.num_health_checks = 0 self.num_cooldowns = 0 - self.num_can_shutdown = 0 self.num_delays = 0 + self.num_iterations = 0 + self.num_can_shutdown = 0 + self.run_for = run_for + self.start_time = time.time() - def _on_iteration(self) -> None: - self.num_iterations += 1 + @property + def runtime(self) -> float: + return time.time() - self.start_time + + def _can_shutdown(self) -> bool: + self.num_can_shutdown += 1 + + if self._quit_after > -1 and self.num_iterations >= self._quit_after: + return True + if self.run_for > 0: + return self.runtime >= self.run_for def _on_start(self) -> None: self.num_starts += 1 @@ -71,16 +86,17 @@ def _on_start(self) -> None: def _on_shutdown(self) -> None: self.num_shutdowns += 1 + def _on_health_check(self) -> None: + self.num_health_checks += 1 + def _on_cooldown_elapsed(self) -> None: self.num_cooldowns += 1 def _on_delay(self) -> None: self.num_delays += 1 - def _can_shutdown(self) -> bool: - self.num_can_shutdown += 1 - if self._quit_after == 0: - return True + def _on_iteration(self) -> None: + self.num_iterations += 1 return self.num_iterations >= self._quit_after @@ -134,6 +150,7 @@ def test_service_run_until_can_shutdown(num_iterations: int) -> None: # no matter what, it should always execute the _on_iteration method assert service.num_iterations == 1 else: + # the shutdown check follows on_iteration. 
there will be one last call
         assert service.num_iterations == num_iterations
 
     assert service.num_starts == 1
@@ -203,3 +220,71 @@ def test_service_delay(delay: int, num_iterations: int) -> None:
     assert duration_in_seconds <= expected_duration
     assert service.num_cooldowns == 0
     assert service.num_shutdowns == 1
+
+
+@pytest.mark.parametrize(
+    "health_check_freq, run_for",
+    [
+        pytest.param(1, 5.5, id="1s freq, 10x"),
+        pytest.param(5, 10.5, id="5s freq, 2x"),
+        pytest.param(0.1, 5.1, id="0.1s freq, 50x"),
+    ],
+)
+def test_service_health_check_freq(health_check_freq: float, run_for: float) -> None:
+    """Verify that the health check frequency is honored
+
+    :param health_check_freq: The desired frequency of the health check
+    :param run_for: A fixed duration to allow the service to run
+    """
+    activity_log: t.List[str] = []
+
+    service = SimpleService(
+        activity_log,
+        quit_after=-1,
+        as_service=True,
+        cooldown=0,
+        hc_freq=health_check_freq,
+        run_for=run_for,
+    )
+
+    ts0 = datetime.datetime.now()
+    service.execute()
+    ts1 = datetime.datetime.now()
+
+    # the expected number of health checks for the elapsed runtime and frequency
+    expected_hc_count = run_for // health_check_freq
+
+    # allow some wiggle room for frequency comparison
+    assert expected_hc_count - 1 <= service.num_health_checks <= expected_hc_count + 1
+
+    assert service.num_cooldowns == 0
+    assert service.num_shutdowns == 1
+
+
+def test_service_health_check_freq_unbound() -> None:
+    """Verify that a health check frequency of zero is treated as
+    "always on" and is called each loop iteration
+
+    :param health_check_freq: The desired frequency of the health check
+    :param run_for: A fixed duration to allow the service to run
+    """
+    health_check_freq: float = 0.0
+    run_for: float = 5
+
+    activity_log: t.List[str] = []
+
+    service = SimpleService(
+        activity_log,
+        quit_after=-1,
+        as_service=True,
+        cooldown=0,
+        hc_freq=health_check_freq,
+        run_for=run_for,
+    )
+
+    service.execute()
+
+    # allow some wiggle room for frequency comparison
+    assert service.num_health_checks == service.num_iterations
+    assert service.num_cooldowns == 0
+    assert service.num_shutdowns == 1
diff --git a/tests/test_dragon_comm_utils.py b/tests/test_dragon_comm_utils.py
index 06d6e19b3..a6f9c206a 100644
--- a/tests/test_dragon_comm_utils.py
+++ b/tests/test_dragon_comm_utils.py
@@ -24,6 +24,7 @@
 # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+import base64
 import pathlib
 import uuid
 
@@ -35,6 +36,8 @@
 
 # isort: off
 import dragon.channels as dch
+import dragon.infrastructure.parameters as dp
+import dragon.managed_memory as dm
 import dragon.fli as fli
 
 # isort: on
@@ -47,6 +50,16 @@
 logger = get_logger(__name__)
 
 
+@pytest.fixture(scope="function")
+def the_pool() -> dm.MemoryPool:
+    """Attaches to this process's default memory pool."""
+    raw_pool_descriptor = dp.this_process.default_pd
+    descriptor_ = base64.b64decode(raw_pool_descriptor)
+
+    pool = dm.MemoryPool.attach(descriptor_)
+    return pool
+
+
 @pytest.fixture(scope="function")
 def the_channel() -> dch.Channel:
     """Creates a Channel attached to the local memory pool."""
@@ -226,3 +239,19 @@ def test_descriptor_to_fli_happy_path(the_fli: dch.Channel) -> None:
 
     # and just make sure creation of the descriptor is transitive
     assert dragon_util.channel_to_descriptor(reattached) == descriptor
+
+
+def test_pool_to_descriptor_empty() -> None:
+    """Verify that `pool_to_descriptor` raises an exception when
+    provided with a null pool."""
+
+    with pytest.raises(ValueError):
+        dragon_util.pool_to_descriptor(None)
+
+
+def test_pool_to_descriptor_happy_path(the_pool) -> None:
+    """Verify that `pool_to_descriptor` creates a descriptor
+    when supplied with a valid memory pool."""
+
+    descriptor = dragon_util.pool_to_descriptor(the_pool)
+    assert descriptor
From 27ab4f985a59868066d71e428e591ea344910f5a Mon Sep 17 00:00:00 2001
From: Chris McBride
Date: Thu, 3 Oct 2024 20:43:53 -0500
Subject: [PATCH 27/40] fix extra arg copypasta

---
 tests/dragon/test_featurestore_base.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/dragon/test_featurestore_base.py b/tests/dragon/test_featurestore_base.py
index 4cadfd8f3..440b1f543 100644
--- a/tests/dragon/test_featurestore_base.py
+++ b/tests/dragon/test_featurestore_base.py
@@ -71,7 +71,7 @@ def test_event_uid() -> None:
     # generate a bunch of events and keep track all the IDs
     for i in range(num_iters):
         event_a = OnCreateConsumer("test_event_uid", str(i), filters=[])
-        event_b = OnWriteFeatureStore("test_event_uid", str(i), filters=[])
+        event_b = OnWriteFeatureStore("test_event_uid", str(i))
         uids.add(event_a.uid)
         uids.add(event_b.uid)
From fac30bde4b2b890c5b6c3446e67f9e718467490c Mon Sep 17 00:00:00 2001
From: Chris McBride
Date: Fri, 4 Oct 2024 11:21:59 -0500
Subject: [PATCH 28/40] test bugs

---
 smartsim/_core/mli/infrastructure/worker/worker.py | 2 +-
 tests/dragon/test_featurestore_base.py | 2 +-
 tests/dragon/test_featurestore_integration.py | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/smartsim/_core/mli/infrastructure/worker/worker.py b/smartsim/_core/mli/infrastructure/worker/worker.py
index 018703271..f1718f053 100644
--- a/smartsim/_core/mli/infrastructure/worker/worker.py
+++ b/smartsim/_core/mli/infrastructure/worker/worker.py
@@ -489,7 +489,7 @@ def fetch_model(
             feature_store = feature_stores[fsd]
             raw_bytes: bytes = t.cast(bytes, feature_store[key])
             return FetchModelResult(raw_bytes)
-        except FileNotFoundError as ex:
+        except (FileNotFoundError, KeyError) as ex:
             logger.exception(ex)
             raise SmartSimError(f"Model could not be retrieved with key {key}") from ex
 
diff --git a/tests/dragon/test_featurestore_base.py b/tests/dragon/test_featurestore_base.py
index 440b1f543..6daceb906 100644
--- a/tests/dragon/test_featurestore_base.py
+++ b/tests/dragon/test_featurestore_base.py
@@ -71,7 +71,7 @@ def test_event_uid() -> None:
     # generate a bunch of events and keep track all the IDs
     for i in range(num_iters):
         event_a =
OnCreateConsumer("test_event_uid", str(i), filters=[]) - event_b = OnWriteFeatureStore("test_event_uid", str(i)) + event_b = OnWriteFeatureStore("test_event_uid", "test_event_uid", str(i)) uids.add(event_a.uid) uids.add(event_b.uid) diff --git a/tests/dragon/test_featurestore_integration.py b/tests/dragon/test_featurestore_integration.py index 69de23495..e9fa3d5dd 100644 --- a/tests/dragon/test_featurestore_integration.py +++ b/tests/dragon/test_featurestore_integration.py @@ -114,7 +114,7 @@ def test_eventconsumer_max_dequeue( wmgr_consumer = EventConsumer( the_worker_channel, the_backbone, - filters=[EventCategory.FEATURE_STORE_WRITTEN], + filters=[OnWriteFeatureStore.FEATURE_STORE_WRITTEN], ) # create a broadcaster to publish messages From a1cf7ff61b36e12af28d4c28aae52576bd387481 Mon Sep 17 00:00:00 2001 From: Chris McBride Date: Mon, 7 Oct 2024 14:02:31 -0500 Subject: [PATCH 29/40] Remove from_sender_supplied_descriptor factory on FLI channel --- .../standalone_worker_manager.py | 2 +- smartsim/_core/mli/comm/channel/dragon_fli.py | 38 +++++-------------- tests/dragon/test_environment_loader.py | 2 +- tests/dragon/test_error_handling.py | 10 ++--- tests/dragon/test_event_consumer.py | 2 +- tests/dragon/test_featurestore.py | 2 +- tests/dragon/test_protoclient.py | 2 +- tests/dragon/test_request_dispatcher.py | 4 +- tests/dragon/test_worker_manager.py | 8 ++-- tests/dragon/utils/msg_pump.py | 4 +- 10 files changed, 26 insertions(+), 48 deletions(-) diff --git a/ex/high_throughput_inference/standalone_worker_manager.py b/ex/high_throughput_inference/standalone_worker_manager.py index 9a3926803..b4527bc5d 100644 --- a/ex/high_throughput_inference/standalone_worker_manager.py +++ b/ex/high_throughput_inference/standalone_worker_manager.py @@ -141,7 +141,7 @@ def service_as_dragon_proc( to_worker_channel = create_local() to_worker_fli = fli.FLInterface(main_ch=to_worker_channel, manager_ch=None) - to_worker_fli_comm_ch = DragonFLIChannel(to_worker_fli, True) + to_worker_fli_comm_ch = DragonFLIChannel(to_worker_fli) backbone.worker_queue = to_worker_fli_comm_ch.descriptor diff --git a/smartsim/_core/mli/comm/channel/dragon_fli.py b/smartsim/_core/mli/comm/channel/dragon_fli.py index d7787f2ca..aa9be8897 100644 --- a/smartsim/_core/mli/comm/channel/dragon_fli.py +++ b/smartsim/_core/mli/comm/channel/dragon_fli.py @@ -46,8 +46,7 @@ class DragonFLIChannel(cch.CommChannelBase): def __init__( self, fli_: fli.FLInterface, - sender_supplied: bool = True, - buffer_size: int = 0, + buffer_size: int = drg_util.DEFAULT_CHANNEL_BUFFER_SIZE, ) -> None: """Initialize the DragonFLIChannel instance. @@ -60,11 +59,11 @@ def __init__( self._fli = fli_ """The underlying dragon FLInterface used by this CommChannel for communications""" - self._channel: t.Optional["dch.Channel"] = ( - drg_util.create_local(buffer_size) if sender_supplied else None - ) + self._channel: t.Optional["dch.Channel"] = None """The underlying dragon Channel used by a sender-side DragonFLIChannel to attach to the main FLI channel""" + self._buffer_size: int = buffer_size + """Maximum number of messages that can be buffered before sending""" def send(self, value: bytes, timeout: float = 0.001) -> None: """Send a message through the underlying communication channel. 
@@ -74,10 +73,14 @@ def send(self, value: bytes, timeout: float = 0.001) -> None: :raises SmartSimError: If sending message fails """ try: + if self._channel is None: + self._channel = drg_util.create_local(self._buffer_size) + with self._fli.sendh(timeout=None, stream_channel=self._channel) as sendh: sendh.send_bytes(value, timeout=timeout) logger.debug(f"DragonFLIChannel {self.descriptor} sent message") except Exception as e: + self._channel = None raise SmartSimError( f"Error sending via DragonFLIChannel {self.descriptor}" ) from e @@ -106,26 +109,6 @@ def recv(self, timeout: float = 0.001) -> t.List[bytes]: ) from e return messages - @classmethod - def from_sender_supplied_descriptor( - cls, - descriptor: str, - ) -> "DragonFLIChannel": - """A factory method that creates an instance from a descriptor string - - :param descriptor: the descriptor of the main FLI channel to attach - :returns: An attached DragonFLIChannel""" - try: - return DragonFLIChannel( - fli_=drg_util.descriptor_to_fli(descriptor), - sender_supplied=True, - ) - except: - logger.error( - f"Error while creating sender supplied DragonFLIChannel: {descriptor}" - ) - raise - @classmethod def from_descriptor( cls, @@ -142,10 +125,7 @@ def from_descriptor( raise ValueError("Invalid descriptor provided") try: - return DragonFLIChannel( - fli_=drg_util.descriptor_to_fli(descriptor), - sender_supplied=False, - ) + return DragonFLIChannel(fli_=drg_util.descriptor_to_fli(descriptor)) except Exception as e: raise SmartSimError( f"Error while creating DragonFLIChannel: {descriptor}" diff --git a/tests/dragon/test_environment_loader.py b/tests/dragon/test_environment_loader.py index e9f6004d1..aed1b0ae4 100644 --- a/tests/dragon/test_environment_loader.py +++ b/tests/dragon/test_environment_loader.py @@ -71,7 +71,7 @@ def test_environment_loader_attach_fli(content: bytes, monkeypatch: pytest.Monke config = EnvironmentConfigLoader( featurestore_factory=DragonFeatureStore.from_descriptor, callback_factory=DragonCommChannel.from_descriptor, - queue_factory=DragonFLIChannel.from_sender_supplied_descriptor, + queue_factory=DragonFLIChannel.from_descriptor, ) config_queue = config.get_queue() diff --git a/tests/dragon/test_error_handling.py b/tests/dragon/test_error_handling.py index a7ba7e7f2..8421999a1 100644 --- a/tests/dragon/test_error_handling.py +++ b/tests/dragon/test_error_handling.py @@ -94,7 +94,7 @@ def the_worker_channel() -> DragonFLIChannel: that can be attached to.""" channel_ = create_local() fli_ = FLInterface(main_ch=channel_, manager_ch=None) - comm_channel = DragonFLIChannel(fli_, True) + comm_channel = DragonFLIChannel(fli_) return comm_channel @@ -132,7 +132,7 @@ def setup_worker_manager_model_bytes( config_loader = EnvironmentConfigLoader( featurestore_factory=DragonFeatureStore.from_descriptor, callback_factory=FileSystemCommChannel.from_descriptor, - queue_factory=DragonFLIChannel.from_sender_supplied_descriptor, + queue_factory=DragonFLIChannel.from_descriptor, ) dispatcher_task_queue: mp.Queue[RequestBatch] = mp.Queue(maxsize=0) @@ -190,7 +190,7 @@ def setup_worker_manager_model_key( config_loader = EnvironmentConfigLoader( featurestore_factory=DragonFeatureStore.from_descriptor, callback_factory=FileSystemCommChannel.from_descriptor, - queue_factory=DragonFLIChannel.from_sender_supplied_descriptor, + queue_factory=DragonFLIChannel.from_descriptor, ) dispatcher_task_queue: mp.Queue[RequestBatch] = mp.Queue(maxsize=0) @@ -246,7 +246,7 @@ def setup_request_dispatcher_model_bytes( config_loader = 
EnvironmentConfigLoader( featurestore_factory=DragonFeatureStore.from_descriptor, callback_factory=FileSystemCommChannel.from_descriptor, - queue_factory=DragonFLIChannel.from_sender_supplied_descriptor, + queue_factory=DragonFLIChannel.from_descriptor, ) request_dispatcher = RequestDispatcher( @@ -289,7 +289,7 @@ def setup_request_dispatcher_model_key( config_loader = EnvironmentConfigLoader( featurestore_factory=DragonFeatureStore.from_descriptor, callback_factory=FileSystemCommChannel.from_descriptor, - queue_factory=DragonFLIChannel.from_sender_supplied_descriptor, + queue_factory=DragonFLIChannel.from_descriptor, ) request_dispatcher = RequestDispatcher( diff --git a/tests/dragon/test_event_consumer.py b/tests/dragon/test_event_consumer.py index bda8f33cd..8c752c372 100644 --- a/tests/dragon/test_event_consumer.py +++ b/tests/dragon/test_event_consumer.py @@ -79,7 +79,7 @@ def the_worker_channel() -> DragonFLIChannel: that can be attached to. Does not modify environment vars.""" channel_ = create_local() fli_ = fli.FLInterface(main_ch=channel_, manager_ch=None) - comm_channel = DragonFLIChannel(fli_, True) + comm_channel = DragonFLIChannel(fli_) return comm_channel diff --git a/tests/dragon/test_featurestore.py b/tests/dragon/test_featurestore.py index 9156979ed..a97accd64 100644 --- a/tests/dragon/test_featurestore.py +++ b/tests/dragon/test_featurestore.py @@ -75,7 +75,7 @@ def the_worker_channel() -> DragonFLIChannel: that can be attached to. Does not modify environment vars.""" channel_ = create_local() fli_ = fli.FLInterface(main_ch=channel_, manager_ch=None) - comm_channel = DragonFLIChannel(fli_, True) + comm_channel = DragonFLIChannel(fli_) return comm_channel diff --git a/tests/dragon/test_protoclient.py b/tests/dragon/test_protoclient.py index bc4a69612..6885acc96 100644 --- a/tests/dragon/test_protoclient.py +++ b/tests/dragon/test_protoclient.py @@ -94,7 +94,7 @@ def the_worker_queue(the_backbone: BackboneFeatureStore) -> DragonFLIChannel: # create the FLI to_worker_channel = create_local() fli_ = fli.FLInterface(main_ch=to_worker_channel, manager_ch=None) - comm_channel = DragonFLIChannel(fli_, True) + comm_channel = DragonFLIChannel(fli_) # store the descriptor in the backbone the_backbone.worker_queue = comm_channel.descriptor diff --git a/tests/dragon/test_request_dispatcher.py b/tests/dragon/test_request_dispatcher.py index 82f41e3db..b8b725f79 100644 --- a/tests/dragon/test_request_dispatcher.py +++ b/tests/dragon/test_request_dispatcher.py @@ -104,7 +104,7 @@ def test_request_dispatcher( to_worker_channel = create_local() to_worker_fli = fli.FLInterface(main_ch=to_worker_channel, manager_ch=None) - to_worker_fli_comm_ch = DragonFLIChannel(to_worker_fli, sender_supplied=True) + to_worker_fli_comm_ch = DragonFLIChannel(to_worker_fli) backbone_fs = BackboneFeatureStore(the_storage, allow_reserved_writes=True) @@ -116,7 +116,7 @@ def test_request_dispatcher( config_loader = EnvironmentConfigLoader( featurestore_factory=DragonFeatureStore.from_descriptor, callback_factory=DragonCommChannel.from_descriptor, - queue_factory=DragonFLIChannel.from_sender_supplied_descriptor, + queue_factory=DragonFLIChannel.from_descriptor, ) request_dispatcher = RequestDispatcher( diff --git a/tests/dragon/test_worker_manager.py b/tests/dragon/test_worker_manager.py index 3372bc1ad..4047a731f 100644 --- a/tests/dragon/test_worker_manager.py +++ b/tests/dragon/test_worker_manager.py @@ -149,7 +149,7 @@ def mock_messages( config_loader = EnvironmentConfigLoader( 
featurestore_factory=DragonFeatureStore.from_descriptor, callback_factory=FileSystemCommChannel.from_descriptor, - queue_factory=DragonFLIChannel.from_sender_supplied_descriptor, + queue_factory=DragonFLIChannel.from_descriptor, ) backbone = config_loader.get_backbone() @@ -212,7 +212,7 @@ def mock_mli_infrastructure_mgr() -> None: config_loader = EnvironmentConfigLoader( featurestore_factory=DragonFeatureStore.from_descriptor, callback_factory=FileSystemCommChannel.from_descriptor, - queue_factory=DragonFLIChannel.from_sender_supplied_descriptor, + queue_factory=DragonFLIChannel.from_descriptor, ) integrated_worker = TorchWorker @@ -261,7 +261,7 @@ def test_worker_manager(prepare_environment: pathlib.Path) -> None: to_worker_channel = create_local() to_worker_fli = fli.FLInterface(main_ch=to_worker_channel, manager_ch=None) - to_worker_fli_comm_channel = DragonFLIChannel(to_worker_fli, sender_supplied=True) + to_worker_fli_comm_channel = DragonFLIChannel(to_worker_fli) # NOTE: env vars must be set prior to instantiating EnvironmentConfigLoader # or test environment may be unable to send messages w/queue @@ -273,7 +273,7 @@ def test_worker_manager(prepare_environment: pathlib.Path) -> None: config_loader = EnvironmentConfigLoader( featurestore_factory=DragonFeatureStore.from_descriptor, callback_factory=FileSystemCommChannel.from_descriptor, - queue_factory=DragonFLIChannel.from_sender_supplied_descriptor, + queue_factory=DragonFLIChannel.from_descriptor, ) integrated_worker_type = TorchWorker diff --git a/tests/dragon/utils/msg_pump.py b/tests/dragon/utils/msg_pump.py index c658f2f26..835bccd2b 100644 --- a/tests/dragon/utils/msg_pump.py +++ b/tests/dragon/utils/msg_pump.py @@ -122,9 +122,7 @@ def mock_messages( offset = 2 * parent_iteration feature_store = BackboneFeatureStore.from_descriptor(fs_descriptor) - request_dispatcher_queue = DragonFLIChannel.from_sender_supplied_descriptor( - dispatch_fli_descriptor - ) + request_dispatcher_queue = DragonFLIChannel.from_descriptor(dispatch_fli_descriptor) for iteration_number in range(2): logged_iteration = offset + iteration_number From 124a195100d357936e370e05f2971a917f2a8079 Mon Sep 17 00:00:00 2001 From: Chris McBride Date: Mon, 7 Oct 2024 14:47:58 -0500 Subject: [PATCH 30/40] re-home protoclient into mli subpackage --- ex/high_throughput_inference/mock_app.py | 2 +- smartsim/_core/mli/client/__init__.py | 0 smartsim/{ => _core/mli/client}/protoclient.py | 0 tests/dragon/test_protoclient.py | 2 +- 4 files changed, 2 insertions(+), 2 deletions(-) create mode 100644 smartsim/_core/mli/client/__init__.py rename smartsim/{ => _core/mli/client}/protoclient.py (100%) diff --git a/ex/high_throughput_inference/mock_app.py b/ex/high_throughput_inference/mock_app.py index 876f9145a..c3b3eaaf4 100644 --- a/ex/high_throughput_inference/mock_app.py +++ b/ex/high_throughput_inference/mock_app.py @@ -51,7 +51,7 @@ from collections import OrderedDict from smartsim.log import get_logger, log_to_file -from smartsim.protoclient import ProtoClient +from smartsim._core.mli.client.protoclient import ProtoClient logger = get_logger("App") diff --git a/smartsim/_core/mli/client/__init__.py b/smartsim/_core/mli/client/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/smartsim/protoclient.py b/smartsim/_core/mli/client/protoclient.py similarity index 100% rename from smartsim/protoclient.py rename to smartsim/_core/mli/client/protoclient.py diff --git a/tests/dragon/test_protoclient.py b/tests/dragon/test_protoclient.py index 
6885acc96..b02859f51 100644 --- a/tests/dragon/test_protoclient.py +++ b/tests/dragon/test_protoclient.py @@ -51,7 +51,7 @@ from dragon.data.ddict.ddict import DDict # from ..ex..high_throughput_inference.mock_app import ProtoClient -from smartsim.protoclient import ProtoClient +from smartsim._core.mli.client.protoclient import ProtoClient # The tests in this file belong to the dragon group From b2c4cb7f3ee53d79ca77906661931f4acb01d27f Mon Sep 17 00:00:00 2001 From: Chris McBride Date: Mon, 7 Oct 2024 16:53:05 -0500 Subject: [PATCH 31/40] additional missed docstrings --- smartsim/_core/entrypoints/service.py | 10 +-- .../_core/launcher/dragon/dragonBackend.py | 4 +- .../_core/launcher/dragon/dragonConnector.py | 74 ++++++++++++++++++- .../mli/infrastructure/comm/broadcaster.py | 5 +- .../_core/mli/infrastructure/comm/consumer.py | 2 - 5 files changed, 82 insertions(+), 13 deletions(-) diff --git a/smartsim/_core/entrypoints/service.py b/smartsim/_core/entrypoints/service.py index 497bdda2f..12b0cdd4d 100644 --- a/smartsim/_core/entrypoints/service.py +++ b/smartsim/_core/entrypoints/service.py @@ -35,9 +35,9 @@ class Service(ABC): - """Base contract for standalone entrypoint scripts. Defines API for entrypoint - behaviors (event loop, automatic shutdown, cooldown) as well as simple - hooks for status changes""" + """Core API for standalone entrypoint scripts. Makes use of overridable hook + methods to modify behaviors (event loop, automatic shutdown, cooldown) as + well as simple hooks for status changes""" def __init__( self, @@ -46,7 +46,7 @@ def __init__( loop_delay: float = 0, health_check_frequency: float = 0, ) -> None: - """Initialize the ServiceHost + """Initialize the Service :param as_service: Determines if the host runs continuously until shutdown criteria are met, or executes the service lifecycle once and exits @@ -83,7 +83,7 @@ def _can_shutdown(self) -> bool: def _on_start(self) -> None: """Empty hook method for use by subclasses. Called on initial entry into - ServiceHost `execute` event loop before `_on_iteration` is invoked.""" + Service `execute` event loop before `_on_iteration` is invoked.""" logger.debug(f"Starting {self.__class__.__name__}") def _on_shutdown(self) -> None: diff --git a/smartsim/_core/launcher/dragon/dragonBackend.py b/smartsim/_core/launcher/dragon/dragonBackend.py index fb33460d8..45d646bf5 100644 --- a/smartsim/_core/launcher/dragon/dragonBackend.py +++ b/smartsim/_core/launcher/dragon/dragonBackend.py @@ -100,7 +100,7 @@ class ProcessGroupInfo: return_codes: t.Optional[t.List[int]] = None """List of return codes of completed processes""" hosts: t.List[str] = field(default_factory=list) - """List of hosts on which the Process Group """ + """List of hosts on which the Process Group should be executed""" redir_workers: t.Optional[dragon_process_group.ProcessGroup] = None """Workers used to redirect stdout and stderr to file""" @@ -593,7 +593,7 @@ def start_event_listener( """Start a standalone event listener. 
:param cpu_affinity: The CPU affinity for the process
-        :param gpu_affinity: The CPU affinity for the process
+        :param gpu_affinity: The GPU affinity for the process
         :returns: The dragon Process managing the process
         :raises SmartSimError: If the backbone is not provided
         """
diff --git a/smartsim/_core/launcher/dragon/dragonConnector.py b/smartsim/_core/launcher/dragon/dragonConnector.py
index 98670f347..1144b7764 100644
--- a/smartsim/_core/launcher/dragon/dragonConnector.py
+++ b/smartsim/_core/launcher/dragon/dragonConnector.py
@@ -71,17 +71,23 @@ class DragonConnector:
 
     def __init__(self) -> None:
         self._context: zmq.Context[t.Any] = zmq.Context.instance()
+        """ZeroMQ context used to share configuration across requests"""
         self._context.setsockopt(zmq.REQ_CORRELATE, 1)
         self._context.setsockopt(zmq.REQ_RELAXED, 1)
         self._authenticator: t.Optional[zmq.auth.thread.ThreadAuthenticator] = None
+        """ZeroMQ authenticator used to secure queue access"""
         config = get_config()
         self._reset_timeout(config.dragon_server_timeout)
         self._dragon_head_socket: t.Optional[zmq.Socket[t.Any]] = None
+        """ZeroMQ socket exposing the connection to the DragonBackend"""
         self._dragon_head_process: t.Optional[subprocess.Popen[bytes]] = None
+        """A handle to the process executing the DragonBackend"""
         # Returned by dragon head, useful if shutdown is to be requested
         # but process was started by another connector
         self._dragon_head_pid: t.Optional[int] = None
+        """Process ID of the process executing the DragonBackend"""
         self._dragon_server_path = config.dragon_server_path
+        """Path to a dragon installation"""
         logger.debug(f"Dragon Server path was set to {self._dragon_server_path}")
         self._env_vars: t.Dict[str, str] = {}
         if self._dragon_server_path is None:
@@ -95,7 +101,7 @@ def __init__(self) -> None:
 
     @property
     def is_connected(self) -> bool:
-        """Whether the Connector established a connection to the server
+        """Whether the Connector established a connection to the server.
 
         :return: True if connected
         """
@@ -104,12 +110,18 @@ def is_connected(self) -> bool:
     @property
     def can_monitor(self) -> bool:
         """Whether the Connector knows the PID of the dragon server head process
-        and can monitor its status
+        and can monitor its status.
 
         :return: True if the server can be monitored"""
         return self._dragon_head_pid is not None
 
     def _handshake(self, address: str) -> None:
+        """Perform the handshake process with the DragonBackend and
+        confirm two-way communication is established.
+
+        :param address: The address of the head node socket to initiate a
+        handshake with
+        """
         self._dragon_head_socket = dragonSockets.get_secure_socket(
             self._context, zmq.REQ, False
         )
@@ -132,6 +144,11 @@ def _handshake(self, address: str) -> None:
         ) from e
 
     def _reset_timeout(self, timeout: int = get_config().dragon_server_timeout) -> None:
+        """Reset the timeout applied to the ZMQ context. If an authenticator is
+        enabled, also update the authenticator timeouts.
+
+        :param timeout: The timeout value to apply to ZMQ sockets
+        """
         self._context.setsockopt(zmq.SNDTIMEO, value=timeout)
         self._context.setsockopt(zmq.RCVTIMEO, value=timeout)
         if self._authenticator is not None and self._authenticator.thread is not None:
@@ -183,11 +200,19 @@ def _get_new_authenticator(
 
     @staticmethod
     def _get_dragon_log_level() -> str:
+        """Maps the log level from SmartSim to a valid log level
+        for a dragon process.
+
+        :returns: The dragon log level string
+        """
         smartsim_to_dragon = defaultdict(lambda: "NONE")
         smartsim_to_dragon["developer"] = "INFO"
         return smartsim_to_dragon.get(get_config().log_level, "NONE")
 
     def _connect_to_existing_server(self, path: Path) -> None:
+        """Connects to an existing DragonBackend using address information from
+        a persisted dragon log file.
+        """
         config = get_config()
         dragon_config_log = path / config.dragon_log_filename
@@ -217,6 +242,11 @@ def _connect_to_existing_server(self, path: Path) -> None:
         return
 
     def _start_connector_socket(self, socket_addr: str) -> zmq.Socket[t.Any]:
+        """Instantiate the ZMQ socket to be used by the connector.
+
+        :param socket_addr: The socket address the connector should bind to
+        :returns: The bound socket
+        """
         config = get_config()
         connector_socket: t.Optional[zmq.Socket[t.Any]] = None
         self._reset_timeout(config.dragon_server_startup_timeout)
@@ -423,6 +453,15 @@ def send_request(self, request: DragonRequest, flags: int = 0) -> DragonResponse
     def _parse_launched_dragon_server_info_from_iterable(
         stream: t.Iterable[str], num_dragon_envs: t.Optional[int] = None
     ) -> t.List[t.Dict[str, str]]:
+        """Parses dragon backend connection information from a stream.
+
+        :param stream: The stream to inspect. Usually the stdout of the
+        DragonBackend process
+        :param num_dragon_envs: The expected number of dragon environments
+        to parse from the stream.
+        :returns: A list of dictionaries, one per environment, containing
+        the parsed server information
+        """
         lines = (line.strip() for line in stream)
         lines = (line for line in lines if line)
         tokenized = (line.split(maxsplit=1) for line in lines)
@@ -449,6 +488,15 @@ def _parse_launched_dragon_server_info_from_files(
         file_paths: t.List[t.Union[str, "os.PathLike[str]"]],
         num_dragon_envs: t.Optional[int] = None,
    ) -> t.List[t.Dict[str, str]]:
+        """Read known log files into a stream and parse dragon server configuration
+        from the stream.
+
+        :param file_paths: Paths to files containing dragon server configuration
+        :param num_dragon_envs: The expected number of dragon environments to be found
+        in the files
+        :returns: The parsed server configuration, one item per
+        discovered dragon environment
+        """
         with fileinput.FileInput(file_paths) as ifstream:
             dragon_envs = cls._parse_launched_dragon_server_info_from_iterable(
                 ifstream, num_dragon_envs
@@ -463,6 +511,15 @@ def _send_req_with_socket(
         socket: zmq.Socket[t.Any],
         request: DragonRequest,
         send_flags: int = 0,
         recv_flags: int = 0,
     ) -> DragonResponse:
+        """Sends a synchronous request through a ZMQ socket.
+
+        :param socket: Socket to send on
+        :param request: The request to send
+        :param send_flags: Configuration to apply to the send operation
+        :param recv_flags: Configuration to apply to the recv operation; used to
+        allow the receiver to immediately respond to the sent request.
+        :returns: The response from the target
+        """
         client = dragonSockets.as_client(socket)
         with DRG_LOCK:
             logger.debug(f"Sending {type(request).__name__}: {request}")
@@ -474,6 +531,13 @@ def _send_req_with_socket(
 
 
 def _assert_schema_type(obj: object, typ: t.Type[_SchemaT], /) -> _SchemaT:
+    """Verify that objects can be sent as messages acceptable to the target.
+
+    :param obj: The message to test
+    :param typ: The type that is acceptable
+    :returns: The original `obj` if it is of the requested type
+    :raises TypeError: If the object fails the test and is not
+    an instance of the desired type"""
     if not isinstance(obj, typ):
         raise TypeError(f"Expected schema of type `{typ}`, but got {type(obj)}")
     return obj
@@ -525,6 +589,12 @@ def _dragon_cleanup(
 
 
 def _resolve_dragon_path(fallback: t.Union[str, "os.PathLike[str]"]) -> Path:
+    """Determine the applicable dragon server path for the connector.
+
+    :param fallback: A default dragon server path to use if one is not
+    found in the runtime configuration
+    :returns: The path to the dragon libraries
+    """
     dragon_server_path = get_config().dragon_server_path or os.path.join(
         fallback, ".smartsim", "dragon"
     )
diff --git a/smartsim/_core/mli/infrastructure/comm/broadcaster.py b/smartsim/_core/mli/infrastructure/comm/broadcaster.py
index d813cce12..cd8c45745 100644
--- a/smartsim/_core/mli/infrastructure/comm/broadcaster.py
+++ b/smartsim/_core/mli/infrastructure/comm/broadcaster.py
@@ -61,6 +61,8 @@ def __init__(
 
         :param backbone: The MLI backbone feature store
         :param channel_factory: Factory method to construct new channel instances
+        :param name: A unique identifier assigned to the broadcaster for logging. If
+        not provided, the system will auto-assign one.
         """
         self._backbone = backbone
         """The backbone feature store used to retrieve consumer descriptors"""
@@ -178,8 +180,7 @@ def _broadcast(self, timeout: float = 0.001) -> BroadcastResult:
 
         :param timeout: Maximum time to wait (in seconds) for messages to send
         :returns: BroadcastResult containing the number of messages that were
         successfully and unsuccessfully sent for all consumers
-        :raises SmartSimError: If the channel fails to attach
-        :raises SmartSimError: If broadcasting fails
+        :raises SmartSimError: If the channel fails to attach or broadcasting fails
         """
         # allow descriptors to be empty since events are buffered
         self._descriptors = set(x for x in self._backbone.notification_channels if x)
diff --git a/smartsim/_core/mli/infrastructure/comm/consumer.py b/smartsim/_core/mli/infrastructure/comm/consumer.py
index 3e03ba86c..08b5c4785 100644
--- a/smartsim/_core/mli/infrastructure/comm/consumer.py
+++ b/smartsim/_core/mli/infrastructure/comm/consumer.py
@@ -54,7 +54,6 @@ class EventConsumer:
     def __init__(
         self,
         comm_channel: CommChannelBase,
-        # channel_factory: ...,
         backbone: BackboneFeatureStore,
         filters: t.Optional[t.List[str]] = None,
         name: t.Optional[str] = None,
@@ -68,7 +67,6 @@ def __init__(
         events will be delivered
         :param name: A user-friendly name for logging. If not provided, an
         auto-generated GUID will be used
-        :raises ValueError: If batch_timeout <= 0
         """
         self._comm_channel = comm_channel
         """The comm channel used by the consumer to receive messages.
The channel
From 0c495f6a81347d47c785da028fc2fa3f4b213b1a Mon Sep 17 00:00:00 2001
From: Chris McBride
Date: Tue, 8 Oct 2024 11:46:36 -0500
Subject: [PATCH 32/40] improve service param docstrings, avoid separate
 var/param descriptions

---
 smartsim/_core/entrypoints/service.py | 35 +++++++++++++++++----------
 1 file changed, 22 insertions(+), 13 deletions(-)

diff --git a/smartsim/_core/entrypoints/service.py b/smartsim/_core/entrypoints/service.py
index 12b0cdd4d..719c2a60f 100644
--- a/smartsim/_core/entrypoints/service.py
+++ b/smartsim/_core/entrypoints/service.py
@@ -48,26 +48,35 @@ def __init__(
     ) -> None:
         """Initialize the Service
 
-        :param as_service: Determines if the host runs continuously until
-        shutdown criteria are met, or executes the service lifecycle once and exits
+        :param as_service: Determines lifetime of the service. When `True`, calling
+        `execute` runs the service continuously until shutdown criteria are met.
+        Otherwise, `execute` performs a single pass through the service lifecycle and
+        automatically exits (regardless of the result of `_can_shutdown`).
         :param cooldown: Period of time (in seconds) to allow the service to run
-        after a shutdown is permitted. Enables the service to avoid restarting if
-        new work is discovered. A value of 0 disables the cooldown.
-        :param loop_delay: Time (in seconds) between iterations of the event loop
+        after a shutdown is permitted. Enables the service to avoid restarting if
+        new work is discovered. A value of 0 disables the cooldown.
+        :param loop_delay: Duration (in seconds) of a forced delay between
+        iterations of the event loop
         :param health_check_frequency: Time (in seconds) between calls to a
-        health check handler. A value of 0 triggers the health check on every
-        iteration.
+        health check handler. A value of 0 triggers the health check on every
+        iteration.
         """
         self._as_service = as_service
-        """If the service should run until shutdown function returns True"""
+        """Determines lifetime of the service. When `True`, calling
+        `execute` runs the service continuously until shutdown criteria are met.
+        Otherwise, `execute` performs a single pass through the service lifecycle and
+        automatically exits (regardless of the result of `_can_shutdown`)."""
         self._cooldown = abs(cooldown)
-        """Duration of a cooldown period between requests to the service
-        before shutdown"""
+        """Period of time (in seconds) to allow the service to run
+        after a shutdown is permitted. Enables the service to avoid restarting if
+        new work is discovered. A value of 0 disables the cooldown."""
         self._loop_delay = abs(loop_delay)
-        """Forced delay between iterations of the event loop"""
+        """Duration (in seconds) of a forced delay between
+        iterations of the event loop"""
         self._health_check_frequency = health_check_frequency
-        """The time (in seconds) between desired health checks. Frequency of 0
-        will trigger the health check on every event loop iteration."""
+        """Time (in seconds) between calls to a
+        health check handler.
A value of 0 triggers the health check on every
+        iteration."""
         self._last_health_check = time.time()
         """The timestamp of the latest health check"""
From 1fc59e4954fa2192406623f34f101345d355ba9b Mon Sep 17 00:00:00 2001
From: Chris McBride
Date: Tue, 8 Oct 2024 12:21:53 -0500
Subject: [PATCH 33/40] docstring fixes

---
 smartsim/_core/launcher/dragon/dragonBackend.py | 4 ++--
 smartsim/_core/mli/infrastructure/comm/broadcaster.py | 4 ++--
 smartsim/_core/mli/infrastructure/control/listener.py | 2 --
 .../mli/infrastructure/control/request_dispatcher.py | 1 -
 smartsim/_core/mli/infrastructure/worker/worker.py | 11 +++++++----
 smartsim/_core/mli/message_handler.py | 3 +++
 6 files changed, 14 insertions(+), 11 deletions(-)

diff --git a/smartsim/_core/launcher/dragon/dragonBackend.py b/smartsim/_core/launcher/dragon/dragonBackend.py
index 45d646bf5..9f0473d0a 100644
--- a/smartsim/_core/launcher/dragon/dragonBackend.py
+++ b/smartsim/_core/launcher/dragon/dragonBackend.py
@@ -160,7 +160,7 @@ class DragonBackend:
     _DEFAULT_NUM_MGR_PER_NODE = 2
     """The default number of manager processes for each feature store node"""
     _DEFAULT_MEM_PER_NODE = 256 * 1024**2
-    """The default memory capacity to allocate for a feaure store node (in megabytes)"""
+    """The default memory capacity (in bytes) to allocate for a feature store node"""
 
     def __init__(self, pid: int) -> None:
         self._pid = pid
@@ -555,7 +555,7 @@ def _create_backbone(self) -> BackboneFeatureStore:
         environment variables of this process to include the backbone
         descriptor.
 
-        :returns: The descriptor of the backbone feature store
+        :returns: The backbone feature store
         """
         if self._backbone is None:
             backbone_storage = create_ddict(
diff --git a/smartsim/_core/mli/infrastructure/comm/broadcaster.py b/smartsim/_core/mli/infrastructure/comm/broadcaster.py
index cd8c45745..56dcf549f 100644
--- a/smartsim/_core/mli/infrastructure/comm/broadcaster.py
+++ b/smartsim/_core/mli/infrastructure/comm/broadcaster.py
@@ -61,6 +61,8 @@ def __init__(
 
         :param backbone: The MLI backbone feature store
         :param channel_factory: Factory method to construct new channel instances
-        :param name: A unique identifier assigned to the broadcaster for logging. If
-        not provided, the system will auto-assign one.
+        :param name: A user-friendly name for logging. If not provided, an
+        auto-generated GUID will be used
         """
         self._backbone = backbone
         """The backbone feature store used to retrieve consumer descriptors"""
diff --git a/smartsim/_core/mli/infrastructure/control/listener.py b/smartsim/_core/mli/infrastructure/control/listener.py
index b5c529615..56a7b12d3 100644
--- a/smartsim/_core/mli/infrastructure/control/listener.py
+++ b/smartsim/_core/mli/infrastructure/control/listener.py
@@ -242,8 +242,6 @@ def _create_eventing(self) -> EventConsumer:
         Create an event publisher and event consumer for communicating
         with other MLI resources.
 
-        :param backbone: The backbone feature store used by the MLI backend.
-        NOTE: the backbone must be initialized before connecting eventing clients.
:returns: The newly created EventConsumer instance diff --git a/smartsim/_core/mli/infrastructure/control/request_dispatcher.py b/smartsim/_core/mli/infrastructure/control/request_dispatcher.py index b0f931cb3..3cc8f88da 100644 --- a/smartsim/_core/mli/infrastructure/control/request_dispatcher.py +++ b/smartsim/_core/mli/infrastructure/control/request_dispatcher.py @@ -227,7 +227,6 @@ def __init__( :param config_loader: Object to load configuration from environment :param worker_type: Type of worker to instantiate to batch inputs :param mem_pool_size: Size of the memory pool used to allocate tensors - :raises SmartSimError: If config_loaded.get_queue() does not return a channel """ super().__init__(as_service=True, cooldown=1) self._queues: dict[str, list[BatchQueue]] = {} diff --git a/smartsim/_core/mli/infrastructure/worker/worker.py b/smartsim/_core/mli/infrastructure/worker/worker.py index f1718f053..9556b8e43 100644 --- a/smartsim/_core/mli/infrastructure/worker/worker.py +++ b/smartsim/_core/mli/infrastructure/worker/worker.py @@ -200,6 +200,7 @@ def __init__(self, model: t.Any) -> None: :param model: The loaded model """ self.model = model + """The loaded model (e.g. a TensorFlow, PyTorch, ONNX, etc. model)""" class TransformInputResult: @@ -549,7 +550,7 @@ def place_output( feature store. :param request: The request that triggered the pipeline - :param execute_result: Results from inference + :param transform_result: Transformed version of the inference result :param feature_stores: Available feature stores used for persistence :returns: A collection of keys that were placed in the feature store :raises ValueError: If a feature store is not provided @@ -579,10 +580,12 @@ class MachineLearningWorkerBase(MachineLearningWorkerCore, ABC): def load_model( batch: RequestBatch, fetch_result: FetchModelResult, device: str ) -> LoadModelResult: - """Given a loaded MachineLearningModel, ensure it is loaded into - device memory. + """Given the raw bytes of an ML model that were fetched, ensure + it is loaded into device memory. :param request: The request that triggered the pipeline + :param fetch_result: The result of a fetch-model operation; contains + the raw bytes of the ML model. :param device: The device on which the model must be placed :returns: LoadModelResult wrapping the model loaded for the request :raises ValueError: If model reference object is not found @@ -599,7 +602,7 @@ def transform_input( """Given a collection of data, perform a transformation on the data and put the raw tensor data on a MemoryPool allocation. - :param request: The request that triggered the pipeline + :param batch: The request that triggered the pipeline :param fetch_result: Raw outputs from fetching inputs out of a feature store :param mem_pool: The memory pool used to access batched input tensors :returns: The transformed inputs wrapped in a TransformInputResult diff --git a/smartsim/_core/mli/message_handler.py b/smartsim/_core/mli/message_handler.py index d7324e4a4..2511e9d25 100644 --- a/smartsim/_core/mli/message_handler.py +++ b/smartsim/_core/mli/message_handler.py @@ -35,6 +35,9 @@ class MessageHandler: + """Utility methods for transforming capnproto messages to and from + internal representations. 
+ """ @staticmethod def build_tensor_descriptor( order: "tensor_capnp.Order", From b31612864694d7af3736c96765c8e1db0f27a46e Mon Sep 17 00:00:00 2001 From: Chris McBride Date: Tue, 8 Oct 2024 12:29:56 -0500 Subject: [PATCH 34/40] increase default memory per backbone node --- smartsim/_core/launcher/dragon/dragonBackend.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/smartsim/_core/launcher/dragon/dragonBackend.py b/smartsim/_core/launcher/dragon/dragonBackend.py index 9f0473d0a..5e0129914 100644 --- a/smartsim/_core/launcher/dragon/dragonBackend.py +++ b/smartsim/_core/launcher/dragon/dragonBackend.py @@ -159,7 +159,7 @@ class DragonBackend: _DEFAULT_NUM_MGR_PER_NODE = 2 """The default number of manager processes for each feature store node""" - _DEFAULT_MEM_PER_NODE = 256 * 1024**2 + _DEFAULT_MEM_PER_NODE = 512 * 1024**2 """The default memory capacity (in bytes) to allocate for a feaure store node""" def __init__(self, pid: int) -> None: From 2ed47a4129c7c1d66096783dd9bb6c04bedf1692 Mon Sep 17 00:00:00 2001 From: Chris McBride Date: Tue, 8 Oct 2024 13:02:23 -0500 Subject: [PATCH 35/40] fix fixture usage bug (worker queue preloaded into backbone) --- tests/dragon/test_protoclient.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/tests/dragon/test_protoclient.py b/tests/dragon/test_protoclient.py index b02859f51..fff5fac47 100644 --- a/tests/dragon/test_protoclient.py +++ b/tests/dragon/test_protoclient.py @@ -138,6 +138,10 @@ def test_protoclient_timeout( with monkeypatch.context() as ctx, pytest.raises(SmartSimError) as ex: start_time = time.time() + # remove the worker queue value from the backbone if it exists + # to ensure the timeout occurs + the_backbone.pop(BackboneFeatureStore.MLI_WORKER_QUEUE) + ctx.setenv(BackboneFeatureStore.MLI_BACKBONE, the_backbone.descriptor) ProtoClient(timing_on=False, backbone_timeout=backbone_timeout) From 8ccebb53690d9060f544fe1feb0d3524f7c85dc1 Mon Sep 17 00:00:00 2001 From: Chris McBride Date: Tue, 8 Oct 2024 13:54:41 -0500 Subject: [PATCH 36/40] extract dragon fixtures into dragon conftest.py --- conftest.py | 78 -------- tests/dragon/conftest.py | 172 ++++++++++++++++++ tests/dragon/test_dragon_ddict_utils.py | 6 - tests/dragon/test_environment_loader.py | 7 - tests/dragon/test_error_handling.py | 25 --- tests/dragon/test_event_consumer.py | 35 ---- tests/dragon/test_featurestore.py | 36 ---- tests/dragon/test_featurestore_integration.py | 20 -- tests/dragon/test_protoclient.py | 22 --- tests/dragon/test_request_dispatcher.py | 8 +- 10 files changed, 173 insertions(+), 236 deletions(-) create mode 100644 tests/dragon/conftest.py diff --git a/conftest.py b/conftest.py index 7302482e6..54a47f9e2 100644 --- a/conftest.py +++ b/conftest.py @@ -1022,81 +1022,3 @@ def _prepare_db(db_config: DBConfiguration) -> PrepareDatabaseOutput: return PrepareDatabaseOutput(db, new_db) return _prepare_db - - -class MsgPumpRequest(t.NamedTuple): - """Fields required for starting a simulated inference request producer.""" - - backbone_descriptor: str - """The descriptor to use when connecting the message pump to a - backbone featurestore. - - Passed to the message pump as `--fs-descriptor` - """ - work_queue_descriptor: str - """The descriptor to use for sending work from the pump to the worker manager. - - Passed to the message pump as `--dispatch-fli-descriptor` - """ - callback_descriptor: str - """The descriptor the worker should use to returning results. 
- - Passed to the message pump as `--callback-descriptor` - """ - iteration_index: int = 1 - """If calling the message pump repeatedly, supply an iteration index to ensure - that logged messages appear unique instead of apparing to be duplicated logs. - - Passed to the message pump as `--parent-iteration` - """ - - def as_command(self) -> t.List[str]: - """Produce CLI arguments suitable for calling subprocess.Popen that - to execute the msg pump. - - NOTE: does NOT include the `[sys.executable, msg_pump_path, ...]` - portion of the necessary parameters to Popen. - - :returns: The arguments of the request formatted appropriately to - Popen the `/tests/dragon/utils/msg_pump.py`""" - return [ - "--dispatch-fli-descriptor", - self.work_queue_descriptor, - "--fs-descriptor", - self.backbone_descriptor, - "--parent-iteration", - str(self.iteration_index), - "--callback-descriptor", - self.callback_descriptor, - ] - - -@pytest.fixture(scope="session") -def msg_pump_factory() -> t.Callable[[MsgPumpRequest], subprocess.Popen]: - """A pytest fixture used to create a mock event producer capable of - feeding asynchronous inference requests to tests requiring them. - - :returns: A function that opens a subprocess running a mock message pump - """ - - def run_message_pump(request: MsgPumpRequest) -> subprocess.Popen: - """Invoke the message pump entry-point with the descriptors - from the request. - - :param request: A request containing all parameters required to - invoke the message pump entrypoint - :returns: The Popen object for the subprocess that was started""" - # /tests/dragon/utils/msg_pump.py - msg_pump_script = "tests/dragon/utils/msg_pump.py" - msg_pump_path = pathlib.Path(__file__).parent / msg_pump_script - - cmd = [sys.executable, str(msg_pump_path.absolute()), *request.as_command()] - - popen = subprocess.Popen( - args=cmd, - stdout=subprocess.PIPE, - stderr=subprocess.PIPE, - ) - return popen - - return run_message_pump diff --git a/tests/dragon/conftest.py b/tests/dragon/conftest.py new file mode 100644 index 000000000..3084a2f38 --- /dev/null +++ b/tests/dragon/conftest.py @@ -0,0 +1,172 @@ +# BSD 2-Clause License +# +# Copyright (c) 2021-2024, Hewlett Packard Enterprise +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
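The fixtures relocated into this new tests/dragon/conftest.py keep the interface they had in the root conftest.py. As a hedged usage sketch (fixture and field names are taken from the file added here; the callback descriptor placeholder, test body, and timeout are illustrative assumptions rather than values used by the real tests):

    def test_msg_pump_smoke(msg_pump_factory, the_backbone, the_worker_channel) -> None:
        request = MsgPumpRequest(
            backbone_descriptor=the_backbone.descriptor,
            work_queue_descriptor=the_worker_channel.descriptor,
            callback_descriptor="<callback-channel-descriptor>",  # placeholder
            iteration_index=1,
        )
        # spawns tests/dragon/utils/msg_pump.py with request.as_command() arguments
        msg_pump = msg_pump_factory(request)
        assert msg_pump.wait(timeout=60) == 0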
+
+from __future__ import annotations
+
+import pathlib
+import subprocess
+import sys
+import typing as t
+
+import pytest
+
+dragon = pytest.importorskip("dragon")
+
+# isort: off
+import dragon.data.ddict.ddict as dragon_ddict
+
+from dragon.channels import Channel
+from dragon.data.ddict.ddict import DDict
+from dragon.fli import FLInterface
+from dragon.mpbridge.queues import DragonQueue
+
+# isort: on
+
+from smartsim._core.mli.comm.channel.dragon_fli import DragonFLIChannel
+from smartsim._core.mli.comm.channel.dragon_util import create_local
+from smartsim._core.mli.infrastructure.storage import dragon_util
+from smartsim._core.mli.infrastructure.storage.backbone_feature_store import (
+    BackboneFeatureStore,
+)
+from smartsim._core.mli.infrastructure.storage.dragon_feature_store import (
+    DragonFeatureStore,
+)
+
+class MsgPumpRequest(t.NamedTuple):
+    """Fields required for starting a simulated inference request producer."""
+
+    backbone_descriptor: str
+    """The descriptor to use when connecting the message pump to a
+    backbone featurestore.
+
+    Passed to the message pump as `--fs-descriptor`
+    """
+    work_queue_descriptor: str
+    """The descriptor to use for sending work from the pump to the worker manager.
+
+    Passed to the message pump as `--dispatch-fli-descriptor`
+    """
+    callback_descriptor: str
+    """The descriptor the worker should use for returning results.
+
+    Passed to the message pump as `--callback-descriptor`
+    """
+    iteration_index: int = 1
+    """If calling the message pump repeatedly, supply an iteration index to ensure
+    that logged messages appear unique instead of appearing to be duplicated logs.
+
+    Passed to the message pump as `--parent-iteration`
+    """
+
+    def as_command(self) -> t.List[str]:
+        """Produce CLI arguments suitable for calling subprocess.Popen
+        to execute the msg pump.
+
+        NOTE: does NOT include the `[sys.executable, msg_pump_path, ...]`
+        portion of the necessary parameters to Popen.
+
+        :returns: The arguments of the request formatted appropriately to
+        Popen the `/tests/dragon/utils/msg_pump.py`"""
+        return [
+            "--dispatch-fli-descriptor",
+            self.work_queue_descriptor,
+            "--fs-descriptor",
+            self.backbone_descriptor,
+            "--parent-iteration",
+            str(self.iteration_index),
+            "--callback-descriptor",
+            self.callback_descriptor,
+        ]
+
+
+@pytest.fixture(scope="session")
+def msg_pump_factory() -> t.Callable[[MsgPumpRequest], subprocess.Popen]:
+    """A pytest fixture used to create a mock event producer capable of
+    feeding asynchronous inference requests to tests requiring them.
+
+    :returns: A function that opens a subprocess running a mock message pump
+    """
+
+    def run_message_pump(request: MsgPumpRequest) -> subprocess.Popen:
+        """Invoke the message pump entry-point with the descriptors
+        from the request.
+ + :param request: A request containing all parameters required to + invoke the message pump entrypoint + :returns: The Popen object for the subprocess that was started""" + # /tests/dragon/utils/msg_pump.py + msg_pump_script = "tests/dragon/utils/msg_pump.py" + msg_pump_path = pathlib.Path(__file__).parent / msg_pump_script + + cmd = [sys.executable, str(msg_pump_path.absolute()), *request.as_command()] + + popen = subprocess.Popen( + args=cmd, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + ) + return popen + + return run_message_pump + + +@pytest.fixture(scope="module") +def the_storage() -> dragon_ddict.DDict: + """Fixture to instantiate a dragon distributed dictionary.""" + return dragon_util.create_ddict(1, 2, 32 * 1024**2) + + +@pytest.fixture(scope="module") +def the_worker_channel() -> DragonFLIChannel: + """Fixture to create a valid descriptor for a worker channel + that can be attached to.""" + channel_ = create_local() + fli_ = FLInterface(main_ch=channel_, manager_ch=None) + comm_channel = DragonFLIChannel(fli_) + return comm_channel + + +@pytest.fixture(scope="module") +def the_backbone( + the_storage: t.Any, the_worker_channel: DragonFLIChannel +) -> BackboneFeatureStore: + """Fixture to create a distributed dragon dictionary and wrap it + in a BackboneFeatureStore. + + :param the_storage: The dragon storage engine to use + :param the_worker_channel: Pre-configured worker channel + """ + + backbone = BackboneFeatureStore(the_storage, allow_reserved_writes=True) + backbone[BackboneFeatureStore.MLI_WORKER_QUEUE] = the_worker_channel.descriptor + + return backbone + + +@pytest.fixture(scope="module") +def backbone_descriptor(the_backbone: BackboneFeatureStore) -> str: + # create a shared backbone featurestore + return the_backbone.descriptor diff --git a/tests/dragon/test_dragon_ddict_utils.py b/tests/dragon/test_dragon_ddict_utils.py index d2240abc1..c8bf687ef 100644 --- a/tests/dragon/test_dragon_ddict_utils.py +++ b/tests/dragon/test_dragon_ddict_utils.py @@ -41,12 +41,6 @@ logger = get_logger(__name__) -@pytest.fixture(scope="module") -def the_storage() -> dragon_ddict.DDict: - """Fixture to instantiate a dragon distributed dictionary.""" - return dragon_util.create_ddict(1, 2, 3 * 1024**2) - - @pytest.mark.parametrize( "num_nodes, num_managers, mem_per_node", [ diff --git a/tests/dragon/test_environment_loader.py b/tests/dragon/test_environment_loader.py index aed1b0ae4..07b2a45c1 100644 --- a/tests/dragon/test_environment_loader.py +++ b/tests/dragon/test_environment_loader.py @@ -39,19 +39,12 @@ from smartsim._core.mli.infrastructure.storage.backbone_feature_store import ( DragonFeatureStore, ) -from smartsim._core.mli.infrastructure.storage.dragon_util import create_ddict from smartsim.error.errors import SmartSimError # The tests in this file belong to the dragon group pytestmark = pytest.mark.dragon -@pytest.fixture(scope="module") -def the_storage() -> dragon_ddict.DDict: - """Fixture to instantiate a dragon distributed dictionary.""" - return create_ddict(1, 2, 4 * 1024**2) - - @pytest.mark.parametrize( "content", [ diff --git a/tests/dragon/test_error_handling.py b/tests/dragon/test_error_handling.py index 8421999a1..aacd47b55 100644 --- a/tests/dragon/test_error_handling.py +++ b/tests/dragon/test_error_handling.py @@ -40,7 +40,6 @@ from smartsim._core.mli.comm.channel.channel import CommChannelBase from smartsim._core.mli.comm.channel.dragon_fli import DragonFLIChannel -from smartsim._core.mli.comm.channel.dragon_util import create_local from 
smartsim._core.mli.infrastructure.control.request_dispatcher import ( RequestDispatcher, ) @@ -55,7 +54,6 @@ from smartsim._core.mli.infrastructure.storage.dragon_feature_store import ( DragonFeatureStore, ) -from smartsim._core.mli.infrastructure.storage.dragon_util import create_ddict from smartsim._core.mli.infrastructure.storage.feature_store import ( FeatureStore, ModelKey, @@ -82,29 +80,6 @@ pytestmark = pytest.mark.dragon -@pytest.fixture(scope="module") -def the_storage() -> DDict: - """Fixture to instantiate a dragon distributed dictionary.""" - return create_ddict(1, 2, 4 * 1024**2) - - -@pytest.fixture(scope="module") -def the_worker_channel() -> DragonFLIChannel: - """Fixture to create a valid descriptor for a worker channel - that can be attached to.""" - channel_ = create_local() - fli_ = FLInterface(main_ch=channel_, manager_ch=None) - comm_channel = DragonFLIChannel(fli_) - return comm_channel - - -@pytest.fixture(scope="module") -def backbone_descriptor(the_storage) -> str: - # create a shared backbone featurestore - feature_store = DragonFeatureStore(the_storage) - return feature_store.descriptor - - @pytest.fixture(scope="module") def app_feature_store(the_storage) -> FeatureStore: # create a standalone feature store to mimic a user application putting diff --git a/tests/dragon/test_event_consumer.py b/tests/dragon/test_event_consumer.py index 8c752c372..8a241bab1 100644 --- a/tests/dragon/test_event_consumer.py +++ b/tests/dragon/test_event_consumer.py @@ -33,7 +33,6 @@ dragon = pytest.importorskip("dragon") from smartsim._core.mli.comm.channel.dragon_channel import DragonCommChannel -from smartsim._core.mli.comm.channel.dragon_fli import DragonFLIChannel from smartsim._core.mli.comm.channel.dragon_util import create_local from smartsim._core.mli.infrastructure.comm.broadcaster import EventBroadcaster from smartsim._core.mli.infrastructure.comm.consumer import EventConsumer @@ -48,7 +47,6 @@ from smartsim._core.mli.infrastructure.storage.backbone_feature_store import ( BackboneFeatureStore, ) -from smartsim._core.mli.infrastructure.storage.dragon_util import create_ddict from smartsim.log import get_logger logger = get_logger(__name__) @@ -67,39 +65,6 @@ pytestmark = pytest.mark.dragon -@pytest.fixture(scope="module") -def the_storage() -> t.Dict[str, str]: - """Fixture to instantiate a dragon distributed dictionary.""" - return create_ddict(1, 2, 4 * 1024**2) - - -@pytest.fixture(scope="module") -def the_worker_channel() -> DragonFLIChannel: - """Fixture to create a valid descriptor for a worker channel - that can be attached to. Does not modify environment vars.""" - channel_ = create_local() - fli_ = fli.FLInterface(main_ch=channel_, manager_ch=None) - comm_channel = DragonFLIChannel(fli_) - return comm_channel - - -@pytest.fixture(scope="module") -def the_backbone( - the_storage: t.Any, the_worker_channel: DragonFLIChannel -) -> BackboneFeatureStore: - """Fixture to create a distributed dragon dictionary and wrap it - in a BackboneFeatureStore. 
- - :param the_storage: The dragon storage engine to use - :param the_worker_channel: Pre-configured worker channel - """ - - backbone = BackboneFeatureStore(the_storage, allow_reserved_writes=True) - backbone[BackboneFeatureStore.MLI_WORKER_QUEUE] = the_worker_channel.descriptor - - return backbone - - def test_eventconsumer_eventpublisher_integration( the_backbone: t.Any, test_dir: str ) -> None: diff --git a/tests/dragon/test_featurestore.py b/tests/dragon/test_featurestore.py index a97accd64..019dcde7a 100644 --- a/tests/dragon/test_featurestore.py +++ b/tests/dragon/test_featurestore.py @@ -36,15 +36,12 @@ dragon = pytest.importorskip("dragon") -from smartsim._core.mli.comm.channel.dragon_fli import DragonFLIChannel -from smartsim._core.mli.comm.channel.dragon_util import create_local from smartsim._core.mli.infrastructure.storage.backbone_feature_store import ( BackboneFeatureStore, ) from smartsim._core.mli.infrastructure.storage.backbone_feature_store import ( time as bbtime, ) -from smartsim._core.mli.infrastructure.storage.dragon_util import create_ddict from smartsim.log import get_logger logger = get_logger(__name__) @@ -63,39 +60,6 @@ pytestmark = pytest.mark.dragon -@pytest.fixture(scope="module") -def the_storage() -> t.Dict[str, str]: - """Fixture to instantiate a dragon distributed dictionary.""" - return create_ddict(1, 2, 4 * 1024**2) - - -@pytest.fixture(scope="module") -def the_worker_channel() -> DragonFLIChannel: - """Fixture to create a valid descriptor for a worker channel - that can be attached to. Does not modify environment vars.""" - channel_ = create_local() - fli_ = fli.FLInterface(main_ch=channel_, manager_ch=None) - comm_channel = DragonFLIChannel(fli_) - return comm_channel - - -@pytest.fixture(scope="module") -def the_backbone( - the_storage: t.Any, the_worker_channel: DragonFLIChannel -) -> BackboneFeatureStore: - """Fixture to create a distributed dragon dictionary and wrap it - in a BackboneFeatureStore. 
- - :param the_storage: The dragon storage engine to use - :param the_worker_channel: Pre-configured worker channel - """ - - backbone = BackboneFeatureStore(the_storage, allow_reserved_writes=True) - backbone[BackboneFeatureStore.MLI_WORKER_QUEUE] = the_worker_channel.descriptor - - return backbone - - def test_backbone_wait_for_no_keys( the_backbone: BackboneFeatureStore, monkeypatch: pytest.MonkeyPatch ) -> None: diff --git a/tests/dragon/test_featurestore_integration.py b/tests/dragon/test_featurestore_integration.py index e9fa3d5dd..23fdc55ab 100644 --- a/tests/dragon/test_featurestore_integration.py +++ b/tests/dragon/test_featurestore_integration.py @@ -41,10 +41,6 @@ from smartsim._core.mli.infrastructure.storage.backbone_feature_store import ( BackboneFeatureStore, ) -from smartsim._core.mli.infrastructure.storage.dragon_util import ( - create_ddict, - dragon_ddict, -) # isort: off from dragon.channels import Channel @@ -59,12 +55,6 @@ pytestmark = pytest.mark.dragon -@pytest.fixture(scope="module") -def the_storage() -> dragon_ddict.DDict: - """Fixture to instantiate a dragon distributed dictionary.""" - return create_ddict(1, 2, 32 * 1024**2) - - @pytest.fixture(scope="module") def the_worker_channel() -> DragonCommChannel: """Fixture to create a valid descriptor for a worker channel @@ -74,16 +64,6 @@ def the_worker_channel() -> DragonCommChannel: return wmgr_channel -@pytest.fixture(scope="module") -def the_backbone(the_storage: t.Any) -> BackboneFeatureStore: - """Fixture to create a distributed dragon dictionary and wrap it - in a BackboneFeatureStore. - - :param the_storage: The dragon storage engine to use - """ - return BackboneFeatureStore(the_storage, allow_reserved_writes=True) - - @pytest.mark.parametrize( "num_events, batch_timeout, max_batches_expected", [ diff --git a/tests/dragon/test_protoclient.py b/tests/dragon/test_protoclient.py index fff5fac47..f84417107 100644 --- a/tests/dragon/test_protoclient.py +++ b/tests/dragon/test_protoclient.py @@ -42,7 +42,6 @@ from smartsim._core.mli.infrastructure.storage.backbone_feature_store import ( BackboneFeatureStore, ) -from smartsim._core.mli.infrastructure.storage.dragon_util import create_ddict from smartsim.error.errors import SmartSimError from smartsim.log import get_logger @@ -60,27 +59,6 @@ logger = get_logger(__name__) -@pytest.fixture(scope="module") -def the_storage() -> DDict: - """Fixture that creates a dragon distributed dictionary. - - :returns: The attached distributed dictionary - """ - return create_ddict(1, 2, 32 * 1024**2) - - -@pytest.fixture(scope="module") -def the_backbone(the_storage) -> BackboneFeatureStore: - """Fixture that creates a dragon backbone feature store. - - :param storage_for_dragon_fs: the distributed dictionary to use in backbone - :returns: The backbone feature store - :returns: The attached `BackboneFeatureStore` - """ - - return BackboneFeatureStore(the_storage, allow_reserved_writes=True) - - @pytest.fixture(scope="module") def the_worker_queue(the_backbone: BackboneFeatureStore) -> DragonFLIChannel: """Fixture that creates a dragon FLI channel as a stand-in for the diff --git a/tests/dragon/test_request_dispatcher.py b/tests/dragon/test_request_dispatcher.py index b8b725f79..db656998a 100644 --- a/tests/dragon/test_request_dispatcher.py +++ b/tests/dragon/test_request_dispatcher.py @@ -35,7 +35,7 @@ import numpy as np import pytest -import conftest +from . 
import conftest pytest.importorskip("dragon") @@ -86,12 +86,6 @@ pass -@pytest.fixture(scope="module") -def the_storage() -> DDict: - """Fixture to instantiate a dragon distributed dictionary.""" - return create_ddict(1, 2, 4 * 1024**2) - - @pytest.mark.parametrize("num_iterations", [4]) def test_request_dispatcher( msg_pump_factory: _MsgPumpFactory, num_iterations: int, the_storage: DDict From 608d6bd75b9dc219aaca05bc6a61dbbde5a37bee Mon Sep 17 00:00:00 2001 From: Chris McBride Date: Tue, 8 Oct 2024 14:53:40 -0500 Subject: [PATCH 37/40] remove unused import --- tests/dragon/conftest.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/dragon/conftest.py b/tests/dragon/conftest.py index 3084a2f38..6ce9ad148 100644 --- a/tests/dragon/conftest.py +++ b/tests/dragon/conftest.py @@ -41,7 +41,6 @@ from dragon.channels import Channel from dragon.data.ddict.ddict import DDict from dragon.fli import FLInterface -from dragon.mpbridge.queues import DragonQueue # isort: on From 68d0d0c2155f3534804bdaf661ac6dcf76fcaa79 Mon Sep 17 00:00:00 2001 From: Chris McBride Date: Tue, 8 Oct 2024 19:50:12 -0500 Subject: [PATCH 38/40] fix send-multi with FLI after sender-supplied channel removal --- smartsim/_core/mli/comm/channel/dragon_fli.py | 30 +++++++++++++----- .../control/request_dispatcher.py | 1 + smartsim/_core/mli/message_handler.py | 1 + tests/dragon/conftest.py | 19 ++++++++---- tests/dragon/test_request_dispatcher.py | 18 +++++------ tests/dragon/utils/msg_pump.py | 31 ++++++++++--------- 6 files changed, 61 insertions(+), 39 deletions(-) diff --git a/smartsim/_core/mli/comm/channel/dragon_fli.py b/smartsim/_core/mli/comm/channel/dragon_fli.py index aa9be8897..5283ba2dd 100644 --- a/smartsim/_core/mli/comm/channel/dragon_fli.py +++ b/smartsim/_core/mli/comm/channel/dragon_fli.py @@ -26,7 +26,6 @@ # isort: off from dragon import fli -import dragon.channels as dch # isort: on @@ -59,9 +58,6 @@ def __init__( self._fli = fli_ """The underlying dragon FLInterface used by this CommChannel for communications""" - self._channel: t.Optional["dch.Channel"] = None - """The underlying dragon Channel used by a sender-side DragonFLIChannel - to attach to the main FLI channel""" self._buffer_size: int = buffer_size """Maximum number of messages that can be buffered before sending""" @@ -73,18 +69,36 @@ def send(self, value: bytes, timeout: float = 0.001) -> None: :raises SmartSimError: If sending message fails """ try: - if self._channel is None: - self._channel = drg_util.create_local(self._buffer_size) + channel = drg_util.create_local(self._buffer_size) - with self._fli.sendh(timeout=None, stream_channel=self._channel) as sendh: + with self._fli.sendh(timeout=None, stream_channel=channel) as sendh: sendh.send_bytes(value, timeout=timeout) logger.debug(f"DragonFLIChannel {self.descriptor} sent message") except Exception as e: - self._channel = None raise SmartSimError( f"Error sending via DragonFLIChannel {self.descriptor}" ) from e + def send_multiple(self, values: t.Sequence[bytes], timeout: float = 0.001) -> None: + """Send a message through the underlying communication channel. 
+ + :param values: The values to send + :param timeout: Maximum time to wait (in seconds) for messages to send + :raises SmartSimError: If sending message fails + """ + try: + channel = drg_util.create_local(self._buffer_size) + + with self._fli.sendh(timeout=None, stream_channel=channel) as sendh: + for value in values: + sendh.send_bytes(value) + logger.debug(f"DragonFLIChannel {self.descriptor} sent message") + except Exception as e: + self._channel = None + raise SmartSimError( + f"Error sending via DragonFLIChannel {self.descriptor} {e}" + ) from e + def recv(self, timeout: float = 0.001) -> t.List[bytes]: """Receives message(s) through the underlying communication channel. diff --git a/smartsim/_core/mli/infrastructure/control/request_dispatcher.py b/smartsim/_core/mli/infrastructure/control/request_dispatcher.py index 3cc8f88da..e22a2c8f6 100644 --- a/smartsim/_core/mli/infrastructure/control/request_dispatcher.py +++ b/smartsim/_core/mli/infrastructure/control/request_dispatcher.py @@ -371,6 +371,7 @@ def _on_iteration(self) -> None: None, ) + logger.debug(f"Dispatcher is processing {len(bytes_list)} messages") request_bytes = bytes_list[0] tensor_bytes_list = bytes_list[1:] self._perf_timer.start_timings() diff --git a/smartsim/_core/mli/message_handler.py b/smartsim/_core/mli/message_handler.py index 2511e9d25..e3d46a7ab 100644 --- a/smartsim/_core/mli/message_handler.py +++ b/smartsim/_core/mli/message_handler.py @@ -38,6 +38,7 @@ class MessageHandler: """Utility methods for transforming capnproto messages to and from internal representations. """ + @staticmethod def build_tensor_descriptor( order: "tensor_capnp.Order", diff --git a/tests/dragon/conftest.py b/tests/dragon/conftest.py index 6ce9ad148..6903f7b9d 100644 --- a/tests/dragon/conftest.py +++ b/tests/dragon/conftest.py @@ -50,9 +50,11 @@ from smartsim._core.mli.infrastructure.storage.backbone_feature_store import ( BackboneFeatureStore, ) -from smartsim._core.mli.infrastructure.storage.dragon_feature_store import ( - DragonFeatureStore, -) +from smartsim.log import get_logger + +logger = get_logger(__name__) +msg_pump_path = pathlib.Path(__file__).parent / "utils" / "msg_pump.py" + class MsgPumpRequest(t.NamedTuple): """Fields required for starting a simulated inference request producer.""" @@ -116,17 +118,22 @@ def run_message_pump(request: MsgPumpRequest) -> subprocess.Popen: :param request: A request containing all parameters required to invoke the message pump entrypoint :returns: The Popen object for the subprocess that was started""" - # /tests/dragon/utils/msg_pump.py - msg_pump_script = "tests/dragon/utils/msg_pump.py" - msg_pump_path = pathlib.Path(__file__).parent / msg_pump_script + assert request.backbone_descriptor + assert request.callback_descriptor + assert request.work_queue_descriptor + # /tests/dragon/utils/msg_pump.py cmd = [sys.executable, str(msg_pump_path.absolute()), *request.as_command()] + logger.debug(f"Executing msg_pump with command: {cmd}") popen = subprocess.Popen( args=cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, ) + + assert popen is not None + assert popen.returncode is None return popen return run_message_pump diff --git a/tests/dragon/test_request_dispatcher.py b/tests/dragon/test_request_dispatcher.py index db656998a..a6c4ac5dd 100644 --- a/tests/dragon/test_request_dispatcher.py +++ b/tests/dragon/test_request_dispatcher.py @@ -73,7 +73,6 @@ from smartsim.log import get_logger logger = get_logger(__name__) -mock_msg_pump_path = pathlib.Path(__file__).parent / "utils" / 
"msg_pump.py" _MsgPumpFactory = t.Callable[[conftest.MsgPumpRequest], sp.Popen] # The tests in this file belong to the dragon group @@ -129,8 +128,8 @@ def test_request_dispatcher( ) request_dispatcher._on_start() - pump_processes: t.List[sp.Popen] = [] + # put some messages into the work queue for the dispatcher to pickup for i in range(num_iterations): batch: t.Optional[RequestBatch] = None mem_allocs = [] @@ -149,18 +148,22 @@ def test_request_dispatcher( ) msg_pump = msg_pump_factory(request) - pump_processes.append(msg_pump) + + assert msg_pump is not None, "Msg Pump Process Creation Failed" + assert msg_pump.wait() == 0 time.sleep(1) - for _ in range(200): + for i in range(15): try: request_dispatcher._on_iteration() batch = request_dispatcher.task_queue.get(timeout=0.1) break except Empty: + logger.warning(f"Task queue is empty on iteration {i}") continue except Exception as exc: + logger.error(f"Task queue exception on iteration {i}") raise exc assert batch is not None @@ -219,13 +222,6 @@ def test_request_dispatcher( assert model_key not in request_dispatcher._active_queues assert model_key not in request_dispatcher._queues - msg_pump.wait() - - for msg_pump in pump_processes: - if msg_pump.returncode is not None: - continue - msg_pump.terminate() - # Try to remove the dispatcher and free the memory del request_dispatcher gc.collect() diff --git a/tests/dragon/utils/msg_pump.py b/tests/dragon/utils/msg_pump.py index 835bccd2b..4b9833b91 100644 --- a/tests/dragon/utils/msg_pump.py +++ b/tests/dragon/utils/msg_pump.py @@ -27,7 +27,7 @@ import io import logging import pathlib -import time +import sys import typing as t import pytest @@ -44,7 +44,6 @@ # isort: on -from smartsim._core.mli.comm.channel.dragon_channel import DragonCommChannel from smartsim._core.mli.comm.channel.dragon_fli import DragonFLIChannel from smartsim._core.mli.infrastructure.storage.backbone_feature_store import ( BackboneFeatureStore, @@ -124,6 +123,8 @@ def mock_messages( feature_store = BackboneFeatureStore.from_descriptor(fs_descriptor) request_dispatcher_queue = DragonFLIChannel.from_descriptor(dispatch_fli_descriptor) + feature_store[model_key] = load_model() + for iteration_number in range(2): logged_iteration = offset + iteration_number logger.debug(f"Sending mock message {logged_iteration}") @@ -164,13 +165,9 @@ def mock_messages( logger.info( f"Retrieving {iteration_number} from callback channel: {callback_descriptor}" ) - callback_channel = DragonCommChannel.from_descriptor(callback_descriptor) - # Results will be empty. The test pulls messages off the queue before they - # can be serviced by a worker. Just ensure the callback channel works. 
- results = callback_channel.recv(timeout=0.1) - logger.debug(f"Received mock message results on callback channel: {results}") - time.sleep(1) + # send the header & body together so they arrive together + request_dispatcher_queue.send_multiple([request_bytes, tensor.tobytes()]) if __name__ == "__main__": @@ -185,9 +182,15 @@ def mock_messages( args = args.parse_args() - mock_messages( - args.dispatch_fli_descriptor, - args.fs_descriptor, - args.parent_iteration, - args.callback_descriptor, - ) + try: + mock_messages( + args.dispatch_fli_descriptor, + args.fs_descriptor, + args.parent_iteration, + args.callback_descriptor, + ) + except Exception as ex: + logger.exception("The message pump did not execute properly") + sys.exit(100) + + sys.exit(0) From c28870f19e31c28ea1803ac8fc40a114ea502e2a Mon Sep 17 00:00:00 2001 From: Chris McBride Date: Wed, 9 Oct 2024 16:44:50 -0500 Subject: [PATCH 39/40] Update dispatch tests to use dragon processes --- smartsim/_core/mli/comm/channel/dragon_fli.py | 16 ++- tests/dragon/conftest.py | 123 ++++++------------ tests/dragon/test_request_dispatcher.py | 44 ++++--- tests/dragon/utils/msg_pump.py | 81 ++++++++---- 4 files changed, 130 insertions(+), 134 deletions(-) diff --git a/smartsim/_core/mli/comm/channel/dragon_fli.py b/smartsim/_core/mli/comm/channel/dragon_fli.py index 5283ba2dd..0b462af54 100644 --- a/smartsim/_core/mli/comm/channel/dragon_fli.py +++ b/smartsim/_core/mli/comm/channel/dragon_fli.py @@ -26,6 +26,7 @@ # isort: off from dragon import fli +from dragon.channels import Channel # isort: on @@ -56,6 +57,10 @@ def __init__( descriptor = drg_util.channel_to_descriptor(fli_) super().__init__(descriptor) + self._channel: t.Optional["Channel"] = None + """The underlying dragon Channel used by a sender-side DragonFLIChannel + to attach to the main FLI channel""" + self._fli = fli_ """The underlying dragon FLInterface used by this CommChannel for communications""" self._buffer_size: int = buffer_size @@ -79,7 +84,11 @@ def send(self, value: bytes, timeout: float = 0.001) -> None: f"Error sending via DragonFLIChannel {self.descriptor}" ) from e - def send_multiple(self, values: t.Sequence[bytes], timeout: float = 0.001) -> None: + def send_multiple( + self, + values: t.Sequence[bytes], + timeout: float = 0.001, + ) -> None: """Send a message through the underlying communication channel. 
:param values: The values to send @@ -87,9 +96,10 @@ def send_multiple(self, values: t.Sequence[bytes], timeout: float = 0.001) -> No :raises SmartSimError: If sending message fails """ try: - channel = drg_util.create_local(self._buffer_size) + if self._channel is None: + self._channel = drg_util.create_local(self._buffer_size) - with self._fli.sendh(timeout=None, stream_channel=channel) as sendh: + with self._fli.sendh(timeout=None, stream_channel=self._channel) as sendh: for value in values: sendh.send_bytes(value) logger.debug(f"DragonFLIChannel {self.descriptor} sent message") diff --git a/tests/dragon/conftest.py b/tests/dragon/conftest.py index 6903f7b9d..d54270017 100644 --- a/tests/dragon/conftest.py +++ b/tests/dragon/conftest.py @@ -26,7 +26,9 @@ from __future__ import annotations +import os import pathlib +import socket import subprocess import sys import typing as t @@ -37,9 +39,10 @@ # isort: off import dragon.data.ddict.ddict as dragon_ddict +import dragon.infrastructure.policy as dragon_policy +import dragon.infrastructure.process_desc as dragon_process_desc +import dragon.native.process as dragon_process -from dragon.channels import Channel -from dragon.data.ddict.ddict import DDict from dragon.fli import FLInterface # isort: on @@ -53,90 +56,6 @@ from smartsim.log import get_logger logger = get_logger(__name__) -msg_pump_path = pathlib.Path(__file__).parent / "utils" / "msg_pump.py" - - -class MsgPumpRequest(t.NamedTuple): - """Fields required for starting a simulated inference request producer.""" - - backbone_descriptor: str - """The descriptor to use when connecting the message pump to a - backbone featurestore. - - Passed to the message pump as `--fs-descriptor` - """ - work_queue_descriptor: str - """The descriptor to use for sending work from the pump to the worker manager. - - Passed to the message pump as `--dispatch-fli-descriptor` - """ - callback_descriptor: str - """The descriptor the worker should use to returning results. - - Passed to the message pump as `--callback-descriptor` - """ - iteration_index: int = 1 - """If calling the message pump repeatedly, supply an iteration index to ensure - that logged messages appear unique instead of apparing to be duplicated logs. - - Passed to the message pump as `--parent-iteration` - """ - - def as_command(self) -> t.List[str]: - """Produce CLI arguments suitable for calling subprocess.Popen that - to execute the msg pump. - - NOTE: does NOT include the `[sys.executable, msg_pump_path, ...]` - portion of the necessary parameters to Popen. - - :returns: The arguments of the request formatted appropriately to - Popen the `/tests/dragon/utils/msg_pump.py`""" - return [ - "--dispatch-fli-descriptor", - self.work_queue_descriptor, - "--fs-descriptor", - self.backbone_descriptor, - "--parent-iteration", - str(self.iteration_index), - "--callback-descriptor", - self.callback_descriptor, - ] - - -@pytest.fixture(scope="session") -def msg_pump_factory() -> t.Callable[[MsgPumpRequest], subprocess.Popen]: - """A pytest fixture used to create a mock event producer capable of - feeding asynchronous inference requests to tests requiring them. - - :returns: A function that opens a subprocess running a mock message pump - """ - - def run_message_pump(request: MsgPumpRequest) -> subprocess.Popen: - """Invoke the message pump entry-point with the descriptors - from the request. 
- - :param request: A request containing all parameters required to - invoke the message pump entrypoint - :returns: The Popen object for the subprocess that was started""" - assert request.backbone_descriptor - assert request.callback_descriptor - assert request.work_queue_descriptor - - # /tests/dragon/utils/msg_pump.py - cmd = [sys.executable, str(msg_pump_path.absolute()), *request.as_command()] - logger.debug(f"Executing msg_pump with command: {cmd}") - - popen = subprocess.Popen( - args=cmd, - stdout=subprocess.PIPE, - stderr=subprocess.PIPE, - ) - - assert popen is not None - assert popen.returncode is None - return popen - - return run_message_pump @pytest.fixture(scope="module") @@ -176,3 +95,35 @@ def the_backbone( def backbone_descriptor(the_backbone: BackboneFeatureStore) -> str: # create a shared backbone featurestore return the_backbone.descriptor + + +def function_as_dragon_proc( + entrypoint_fn: t.Callable[[t.Any], None], + args: t.List[t.Any], + cpu_affinity: t.List[int], + gpu_affinity: t.List[int], +) -> dragon_process.Process: + """Execute a function as an independent dragon process. + + :param entrypoint_fn: The function to execute + :param args: The arguments for the entrypoint function + :param cpu_affinity: The cpu affinity for the process + :param gpu_affinity: The gpu affinity for the process + :returns: The dragon process handle + """ + options = dragon_process_desc.ProcessOptions(make_inf_channels=True) + local_policy = dragon_policy.Policy( + placement=dragon_policy.Policy.Placement.HOST_NAME, + host_name=socket.gethostname(), + cpu_affinity=cpu_affinity, + gpu_affinity=gpu_affinity, + ) + return dragon_process.Process( + target=entrypoint_fn, + args=args, + cwd=os.getcwd(), + policy=local_policy, + options=options, + stderr=dragon_process.Popen.STDOUT, + stdout=dragon_process.Popen.STDOUT, + ) diff --git a/tests/dragon/test_request_dispatcher.py b/tests/dragon/test_request_dispatcher.py index a6c4ac5dd..70d73e243 100644 --- a/tests/dragon/test_request_dispatcher.py +++ b/tests/dragon/test_request_dispatcher.py @@ -26,7 +26,6 @@ import gc import os -import pathlib import subprocess as sp import time import typing as t @@ -36,6 +35,7 @@ import pytest from . 
import conftest +from .utils import msg_pump pytest.importorskip("dragon") @@ -68,12 +68,10 @@ from smartsim._core.mli.infrastructure.storage.dragon_feature_store import ( DragonFeatureStore, ) -from smartsim._core.mli.infrastructure.storage.dragon_util import create_ddict from smartsim._core.mli.infrastructure.worker.torch_worker import TorchWorker from smartsim.log import get_logger logger = get_logger(__name__) -_MsgPumpFactory = t.Callable[[conftest.MsgPumpRequest], sp.Popen] # The tests in this file belong to the dragon group pytestmark = pytest.mark.dragon @@ -87,7 +85,9 @@ @pytest.mark.parametrize("num_iterations", [4]) def test_request_dispatcher( - msg_pump_factory: _MsgPumpFactory, num_iterations: int, the_storage: DDict + num_iterations: int, + the_storage: DDict, + test_dir: str, ) -> None: """Test the request dispatcher batching and queueing system @@ -113,7 +113,7 @@ def test_request_dispatcher( ) request_dispatcher = RequestDispatcher( - batch_timeout=0, + batch_timeout=1000, batch_size=2, config_loader=config_loader, worker_type=TorchWorker, @@ -130,6 +130,8 @@ def test_request_dispatcher( request_dispatcher._on_start() # put some messages into the work queue for the dispatcher to pickup + channels = [] + processes = [] for i in range(num_iterations): batch: t.Optional[RequestBatch] = None mem_allocs = [] @@ -139,27 +141,31 @@ def test_request_dispatcher( # down when mock_messages terms but before the final response message is sent callback_channel = DragonCommChannel.from_local() - - request = conftest.MsgPumpRequest( - backbone_fs.descriptor, - worker_queue.descriptor, - callback_channel.descriptor, - i, + channels.append(callback_channel) + + process = conftest.function_as_dragon_proc( + msg_pump.mock_messages, + [ + worker_queue.descriptor, + backbone_fs.descriptor, + i, + callback_channel.descriptor, + ], + [], + [], ) + processes.append(process) + process.start() + assert process.returncode is None, "The message pump failed to start" - msg_pump = msg_pump_factory(request) - - assert msg_pump is not None, "Msg Pump Process Creation Failed" - assert msg_pump.wait() == 0 - - time.sleep(1) - + # give dragon some time to populate the message queues for i in range(15): try: request_dispatcher._on_iteration() - batch = request_dispatcher.task_queue.get(timeout=0.1) + batch = request_dispatcher.task_queue.get(timeout=1.0) break except Empty: + time.sleep(2) logger.warning(f"Task queue is empty on iteration {i}") continue except Exception as exc: diff --git a/tests/dragon/utils/msg_pump.py b/tests/dragon/utils/msg_pump.py index 4b9833b91..8d69e57c6 100644 --- a/tests/dragon/utils/msg_pump.py +++ b/tests/dragon/utils/msg_pump.py @@ -28,6 +28,7 @@ import logging import pathlib import sys +import time import typing as t import pytest @@ -109,13 +110,13 @@ def persist_model_file(model_path: pathlib.Path) -> pathlib.Path: return model_path -def mock_messages( +def _mock_messages( dispatch_fli_descriptor: str, fs_descriptor: str, parent_iteration: int, callback_descriptor: str, ) -> None: - """Mock event producer for triggering the inference pipeline""" + """Mock event producer for triggering the inference pipeline.""" model_key = "mini-model" # mock_message sends 2 messages, so we offset by 2 * (# of iterations in caller) offset = 2 * parent_iteration @@ -131,8 +132,6 @@ def mock_messages( output_key = f"output-{iteration_number}" - feature_store[model_key] = load_model() - tensor = ( (iteration_number + 1) * torch.ones((1, 2), dtype=torch.float32) ).numpy() @@ -156,18 +155,53 @@ 
def mock_messages(
         logger.info(f"Sending request {iteration_number} to request_dispatcher_queue")
         request_bytes = MessageHandler.serialize_request(request)
-        with request_dispatcher_queue._fli.sendh(
-            timeout=None, stream_channel=request_dispatcher_queue._channel
-        ) as sendh:
-            sendh.send_bytes(request_bytes)
-            sendh.send_bytes(tensor.tobytes())
-
-        logger.info(
-            f"Retrieving {iteration_number} from callback channel: {callback_descriptor}"
-        )
+
+        logger.info("Sending msg_envelope")
+
+        # cuid = request_dispatcher_queue._channel.cuid
+        # logger.info(f"\tInternal cuid: {cuid}")
 
         # send the header & body together so they arrive together
-        request_dispatcher_queue.send_multiple([request_bytes, tensor.tobytes()])
+        try:
+            request_dispatcher_queue.send_multiple([request_bytes, tensor.tobytes()])
+            logger.info(f"\tenvelope 0: {request_bytes[:5]}...")
+            logger.info(f"\tenvelope 1: {tensor.tobytes()[:5]}...")
+        except Exception:
+            logger.exception("Unable to send request envelope")
+
+    logger.info("All messages sent")
+
+    # keep the process alive for an extra 15 seconds to let the processor
+    # have access to the channels before they're destroyed
+    for _ in range(15):
+        time.sleep(1)
+
+
+def mock_messages(
+    dispatch_fli_descriptor: str,
+    fs_descriptor: str,
+    parent_iteration: int,
+    callback_descriptor: str,
+) -> int:
+    """Mock event producer for triggering the inference pipeline. Used
+    when started via multiprocessing."""
+    logger.info(f"{dispatch_fli_descriptor=}")
+    logger.info(f"{fs_descriptor=}")
+    logger.info(f"{parent_iteration=}")
+    logger.info(f"{callback_descriptor=}")
+
+    try:
+        _mock_messages(
+            dispatch_fli_descriptor,
+            fs_descriptor,
+            parent_iteration,
+            callback_descriptor,
+        )
+    except Exception:
+        logger.exception("The message pump did not execute properly")
+        return 1
+
+    return 0
 
 
 if __name__ == "__main__":
@@ -182,9 +216,15 @@ def mock_messages(
 
     args = args.parse_args()
 
-    try:
-        mock_messages(
-            args.dispatch_fli_descriptor,
-            args.fs_descriptor,
-            args.parent_iteration,
-            args.callback_descriptor,
-        )
-    except Exception as ex:
-        logger.exception("The message pump did not execute properly")
-        sys.exit(100)
-
-    sys.exit(0)
+    return_code = mock_messages(
+        args.dispatch_fli_descriptor,
+        args.fs_descriptor,
+        args.parent_iteration,
+        args.callback_descriptor,
+    )
+    sys.exit(return_code)

From 78d5598b2f7021cb94a25989180a59fd709b0c95 Mon Sep 17 00:00:00 2001
From: Chris McBride
Date: Wed, 9 Oct 2024 16:47:27 -0500
Subject: [PATCH 40/40] Use cached FLI channel on single-send

---
 smartsim/_core/mli/comm/channel/dragon_fli.py | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/smartsim/_core/mli/comm/channel/dragon_fli.py b/smartsim/_core/mli/comm/channel/dragon_fli.py
index 0b462af54..5fb0790a8 100644
--- a/smartsim/_core/mli/comm/channel/dragon_fli.py
+++ b/smartsim/_core/mli/comm/channel/dragon_fli.py
@@ -74,12 +74,14 @@ def send(self, value: bytes, timeout: float = 0.001) -> None:
         :raises SmartSimError: If sending message fails
         """
         try:
-            channel = drg_util.create_local(self._buffer_size)
+            if self._channel is None:
+                self._channel = drg_util.create_local(self._buffer_size)
 
-            with self._fli.sendh(timeout=None, stream_channel=channel) as sendh:
+            with self._fli.sendh(timeout=None, stream_channel=self._channel) as sendh:
                 sendh.send_bytes(value, timeout=timeout)
             logger.debug(f"DragonFLIChannel {self.descriptor} sent message")
         except Exception as e:
+            self._channel = None
             raise SmartSimError(
                 f"Error sending via DragonFLIChannel {self.descriptor}"
             ) from e
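
Note on the channel-caching behavior settled across patches 38-40: `DragonFLIChannel` now creates its sender-side stream channel lazily on first use, reuses it for subsequent sends, and discards it whenever a send fails so the next attempt starts from a fresh channel. The sketch below models only that lifecycle and runs without dragon installed; `FakeTransport` and `CachingSender` are illustrative stand-ins, not SmartSim or dragon APIs.

import typing as t


class FakeTransport:
    """Stand-in for an FLI send handle; fails on demand."""

    def __init__(self) -> None:
        self.sent: t.List[bytes] = []

    def send(self, value: bytes, fail: bool = False) -> None:
        if fail:
            raise RuntimeError("simulated send failure")
        self.sent.append(value)


class CachingSender:
    """Mimics DragonFLIChannel's lazy stream-channel caching."""

    def __init__(self, transport: FakeTransport) -> None:
        self._transport = transport
        self._channel: t.Optional[object] = None  # created on first send

    def send(self, value: bytes, fail: bool = False) -> None:
        try:
            if self._channel is None:
                # stands in for drg_util.create_local(self._buffer_size)
                self._channel = object()
            self._transport.send(value, fail=fail)
        except Exception:
            # drop the cached channel so the next send re-creates it
            self._channel = None
            raise


sender = CachingSender(FakeTransport())
sender.send(b"ok")                 # creates and caches the channel
first = sender._channel
sender.send(b"ok again")           # reuses the cached channel
assert sender._channel is first
try:
    sender.send(b"boom", fail=True)
except RuntimeError:
    pass
assert sender._channel is None     # cache discarded after the failure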
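
The reworked `msg_pump` and dispatcher test lean on two cooperating patterns: the producer hands the request header and tensor payload to `send_multiple` as one envelope, so both travel through a single FLI stream and arrive together, while the consumer drains its queue with a timeout-and-retry loop instead of a single blocking get. A rough standard-library approximation of that flow, with all names hypothetical:

import queue
import threading
import time

work_queue: "queue.Queue[list]" = queue.Queue()


def producer(idx: int) -> None:
    header = f"request-{idx}".encode()
    payload = bytes([idx]) * 4
    # one queue item per request keeps header + payload paired,
    # mirroring send_multiple() writing both into one FLI stream
    work_queue.put([header, payload])


threads = [threading.Thread(target=producer, args=(i,)) for i in range(4)]
for th in threads:
    th.start()

received = []
for _ in range(15):  # timeout + retry, like the dispatcher test loop
    try:
        received.append(work_queue.get(timeout=1.0))
    except queue.Empty:
        time.sleep(0.1)
    if len(received) == 4:
        break

for envelope in received:
    assert len(envelope) == 2  # header and payload always arrive together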