From f85db2a584a146f54801dc71107369202c93f1cf Mon Sep 17 00:00:00 2001 From: ankona <3595025+ankona@users.noreply.github.com> Date: Thu, 18 Jul 2024 16:54:43 -0500 Subject: [PATCH 01/49] Add file system descriptor to tensor & model keys --- doc/changelog.md | 1 + .../infrastructure/control/workermanager.py | 212 +++++++----------- .../mli/infrastructure/environmentloader.py | 36 ++- .../storage/dragonfeaturestore.py | 7 + .../infrastructure/storage/featurestore.py | 17 ++ .../_core/mli/infrastructure/worker/worker.py | 149 +++++++++--- smartsim/_core/mli/message_handler.py | 11 +- .../mli_schemas/data/data_references.capnp | 2 + .../data/data_references_capnp.pyi | 2 + tests/dragon/{utils => }/featurestore.py | 14 ++ tests/dragon/test_environment_loader.py | 89 +++++--- tests/mli/featurestore.py | 14 ++ .../mli/test_core_machine_learning_worker.py | 83 ++++--- tests/mli/test_torch_worker.py | 4 +- tests/mli/test_worker_manager.py | 13 +- .../test_build_model_key.py | 7 +- .../test_build_tensor_key.py | 6 +- .../test_output_descriptor.py | 3 +- tests/test_message_handler/test_request.py | 12 +- tests/test_message_handler/test_response.py | 5 +- 20 files changed, 432 insertions(+), 255 deletions(-) rename tests/dragon/{utils => }/featurestore.py (92%) diff --git a/doc/changelog.md b/doc/changelog.md index 0ada4e4ec..809ad5e8e 100644 --- a/doc/changelog.md +++ b/doc/changelog.md @@ -13,6 +13,7 @@ Jump to: Description +- Enable dynamic feature store selection - Fix dragon package installation bug - Adjust schemas for better performance - Add TorchWorker first implementation and mock inference app example diff --git a/smartsim/_core/mli/infrastructure/control/workermanager.py b/smartsim/_core/mli/infrastructure/control/workermanager.py index 27f5bfc97..79ffc6dbd 100644 --- a/smartsim/_core/mli/infrastructure/control/workermanager.py +++ b/smartsim/_core/mli/infrastructure/control/workermanager.py @@ -25,18 +25,9 @@ # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. import sys - -# isort: off -import dragon -from dragon import fli - -# isort: on - import time import typing as t -import numpy as np - from .....error import SmartSimError from .....log import get_logger from ....entrypoints.service import Service @@ -63,96 +54,23 @@ logger = get_logger(__name__) -def deserialize_message( - data_blob: bytes, - channel_type: t.Type[CommChannelBase], - device: t.Literal["cpu", "gpu"], -) -> InferenceRequest: - """Deserialize a message from a byte stream into an InferenceRequest - :param data_blob: The byte stream to deserialize""" - # todo: consider moving to XxxCore and only making - # workers implement the inputs and model conversion? - - # alternatively, consider passing the capnproto models - # to this method instead of the data_blob... - - # something is definitely wrong here... 
client shouldn't have to touch - # callback (or batch size) - - request = MessageHandler.deserialize_request(data_blob) - # return request - model_key: t.Optional[str] = None - model_bytes: t.Optional[Model] = None - - if request.model.which() == "key": - model_key = request.model.key.key - elif request.model.which() == "data": - model_bytes = request.model.data - - callback_key = request.replyChannel.descriptor - - # todo: shouldn't this be `CommChannel.find` instead of `DragonCommChannel` - comm_channel = channel_type(callback_key) - # comm_channel = DragonCommChannel(request.replyChannel) - - input_keys: t.Optional[t.List[str]] = None - input_bytes: t.Optional[t.List[bytes]] = None - - output_keys: t.Optional[t.List[str]] = None - - input_meta: t.Optional[t.List[TensorDescriptor]] = None - - if request.input.which() == "keys": - input_keys = [input_key.key for input_key in request.input.keys] - elif request.input.which() == "descriptors": - input_meta = request.input.descriptors # type: ignore - - if request.output: - output_keys = [tensor_key.key for tensor_key in request.output] - - inference_request = InferenceRequest( - model_key=model_key, - callback=comm_channel, - raw_inputs=input_bytes, - input_keys=input_keys, - input_meta=input_meta, - output_keys=output_keys, - raw_model=model_bytes, - batch_size=0, - ) - return inference_request - - -def build_failure_reply(status: "Status", message: str) -> ResponseBuilder: +def build_failure_reply(status: "StatusEnum", message: str) -> Response: + """Build a response indicating a failure occurred + :param status: The status of the response + :param message: The error message to include in the response""" return MessageHandler.build_response( - status=status, - message=message, - result=[], + status=status, # todo: need to indicate correct status + message=message, # todo: decide what these will be + result=None, custom_attributes=None, ) -def prepare_outputs(reply: InferenceReply) -> t.List[t.Any]: - prepared_outputs: t.List[t.Any] = [] - if reply.output_keys: - for key in reply.output_keys: - if not key: - continue - msg_key = MessageHandler.build_tensor_key(key) - prepared_outputs.append(msg_key) - elif reply.outputs: - for _ in reply.outputs: - msg_tensor_desc = MessageHandler.build_tensor_descriptor( - "c", - "float32", - [1], - ) - prepared_outputs.append(msg_tensor_desc) - return prepared_outputs - - -def build_reply(reply: InferenceReply) -> ResponseBuilder: - results = prepare_outputs(reply) +def build_reply(worker: MachineLearningWorkerBase, reply: InferenceReply) -> Response: + """Builds a response for a successful inference request + :param worker: A worker to process the reply with + :param reply: The internal representation of the reply""" + results = worker.prepare_outputs(reply) return MessageHandler.build_response( status=reply.status_enum, @@ -210,10 +128,6 @@ def __init__( self._task_queue: t.Optional[CommChannelBase] = config_loader.get_queue() """the queue the manager monitors for new tasks""" - self._feature_store: t.Optional[FeatureStore] = ( - config_loader.get_feature_store() - ) - """a feature store to retrieve models from""" self._worker = worker """The ML Worker implementation""" self._comm_channel_type = comm_channel_type @@ -222,37 +136,68 @@ def __init__( """Device on which workers need to run""" self._cached_models: dict[str, t.Any] = {} """Dictionary of previously loaded models""" + self._feature_stores = config_loader.get_feature_stores() + """A collection of attached feature stores""" + + def 
_check_feature_stores(self, request: InferenceRequest) -> bool: + """Ensures that all feature stores required by the request are available + :param request: The request to validate""" + # collect all feature stores required by the request + fs_model = {request.model_key.descriptor} + fs_inputs = {key.descriptor for key in request.input_keys} + fs_outputs = {key.descriptor for key in request.output_keys} + + # identify which feature stores are requested and unknown + fs_desired = fs_model + fs_inputs + fs_outputs + fs_actual = {key for key in self._feature_stores} + fs_missing = fs_desired - fs_actual + + # exit if all desired feature stores are not available + if fs_missing: + logger.error(f"Missing feature store(s): {fs_missing}") + return False - def _validate_request(self, request: InferenceRequest) -> bool: - """Ensure the request can be processed. - :param request: The request to validate - :return: True if the request is valid, False otherwise""" - if not self._feature_store: - if request.model_key: - logger.error("Unable to load model by key without feature store") - return False + return True - if request.input_keys: - logger.error("Unable to load inputs by key without feature store") - return False + def _check_model(self, request: InferenceRequest) -> bool: + """Ensure that a model is available for the request + :param request: The request to validate""" + if request.model_key or request.raw_model: + return True - if request.output_keys: - logger.error("Unable to persist outputs by key without feature store") - return False + logger.error("Unable to continue without model bytes or feature store key") + return False - if not request.model_key and not request.raw_model: - logger.error("Unable to continue without model bytes or feature store key") - return False + def _check_inputs(self, request: InferenceRequest) -> bool: + """Ensure that inputs are available for the request + :param request: The request to validate""" + if request.input_keys or request.raw_inputs: + return True - if not request.input_keys and not request.raw_inputs: - logger.error("Unable to continue without input bytes or feature store keys") - return False + logger.error("Unable to continue without input bytes or feature store keys") + return False - if request.callback is None: - logger.error("No callback channel provided in request") - return False + def _check_callback(self, request: InferenceRequest) -> bool: + """Ensure that a callback channel is available for the request + :param request: The request to validate""" + if request.callback is not None: + return True - return True + logger.error("No callback channel provided in request") + return False + + def _validate_request(self, request: InferenceRequest) -> bool: + """Ensure the request can be processed. 
+ :param request: The request to validate + :return: True if the request is valid, False otherwise""" + checks = [ + self._check_feature_stores(request), + self._check_model(request), + self._check_inputs(request), + self._check_callback(request), + ] + + return all(checks) def _on_iteration(self) -> None: """Executes calls to the machine learning worker implementation to complete @@ -279,8 +224,8 @@ def _on_iteration(self) -> None: tensor_bytes_list = bytes_list[1:] interm = time.perf_counter() # timing - request = deserialize_message( - request_bytes, self._comm_channel_type, self._device + request = self._worker.deserialize_message( + request_bytes, self._comm_channel_type ) if request.input_meta and tensor_bytes_list: @@ -302,10 +247,12 @@ def _on_iteration(self) -> None: "Could not find model key or model.", ) return - if request.model_key in self._cached_models: + if request.model_key.key in self._cached_models: timings.append(time.perf_counter() - interm) # timing interm = time.perf_counter() # timing - model_result = LoadModelResult(self._cached_models[request.model_key]) + model_result = LoadModelResult( + self._cached_models[request.model_key.key] + ) else: timings.append(time.perf_counter() - interm) # timing @@ -328,7 +275,7 @@ def _on_iteration(self) -> None: fetch_result=fetch_model_result, device=self._device, ) - self._cached_models[request.model_key] = model_result.model + self._cached_models[request.model_key.key] = model_result.model except Exception as e: exception_handler( e, request.callback, "Failed while loading the model." @@ -407,9 +354,7 @@ def _on_iteration(self) -> None: if request.output_keys: try: reply.output_keys = self._worker.place_output( - request, - transformed_output, - self._feature_store, + request, transformed_output, self._feature_stores ) except Exception as e: exception_handler( @@ -425,9 +370,10 @@ def _on_iteration(self) -> None: if reply.outputs is None or not reply.outputs: response = build_failure_reply("fail", "Outputs not found.") else: - reply.status_enum = "complete" - reply.message = "Success" - response = build_reply(reply) + if reply.outputs is None or not reply.outputs: + response = build_failure_reply("fail", "no-results") + + response = build_reply(self._worker, reply) timings.append(time.perf_counter() - interm) # timing interm = time.perf_counter() # timing diff --git a/smartsim/_core/mli/infrastructure/environmentloader.py b/smartsim/_core/mli/infrastructure/environmentloader.py index 9f6770623..4502ec2fc 100644 --- a/smartsim/_core/mli/infrastructure/environmentloader.py +++ b/smartsim/_core/mli/infrastructure/environmentloader.py @@ -33,6 +33,10 @@ from smartsim._core.mli.comm.channel.dragonfli import DragonFLIChannel from smartsim._core.mli.infrastructure.storage.featurestore import FeatureStore +from smartsim.error.errors import SmartSimError +from smartsim.log import get_logger + +logger = get_logger(__name__) class EnvironmentConfigLoader: @@ -47,15 +51,35 @@ def __init__(self) -> None: ) self._queue_descriptor: t.Optional[str] = os.getenv("SSQueue", None) self.feature_store: t.Optional[FeatureStore] = None + self.feature_stores: t.Optional[t.Dict[FeatureStore]] = None self.queue: t.Optional[DragonFLIChannel] = None - def get_feature_store(self) -> t.Optional[FeatureStore]: - """Loads the Feature Store previously set in SSFeatureStore""" - if self._feature_store_descriptor is not None: - self.feature_store = pickle.loads( - base64.b64decode(self._feature_store_descriptor) + def _load_feature_store(self, env_var: str) -> 
FeatureStore: + """Load a feature store from a descriptor + :param descriptor: The descriptor of the feature store + :returns: The hydrated feature store""" + logger.debug(f"Loading feature store from env: {env_var}") + + value = os.getenv(env_var) + if not value: + raise SmartSimError(f"Empty feature store descriptor in environment: {env_var}") + + try: + return pickle.loads(base64.b64decode(value)) + except: + raise SmartSimError( + f"Invalid feature store descriptor in environment: {env_var}" ) - return self.feature_store + + def get_feature_stores(self) -> t.Dict[str, FeatureStore]: + """Loads multiple Feature Stores by scanning environment for variables + prefixed with `SSFeatureStore`""" + prefix = "SSFeatureStore" + if self.feature_stores is None: + env_vars = [var for var in os.environ if var.startswith(prefix)] + stores = [self._load_feature_store(var) for var in env_vars] + self.feature_stores = {fs.descriptor: fs for fs in stores} + return self.feature_stores def get_queue(self, sender_supplied: bool = True) -> t.Optional[DragonFLIChannel]: """Returns the Queue previously set in SSQueue""" diff --git a/smartsim/_core/mli/infrastructure/storage/dragonfeaturestore.py b/smartsim/_core/mli/infrastructure/storage/dragonfeaturestore.py index af592ed0a..d5344198a 100644 --- a/smartsim/_core/mli/infrastructure/storage/dragonfeaturestore.py +++ b/smartsim/_core/mli/infrastructure/storage/dragonfeaturestore.py @@ -69,3 +69,10 @@ def __contains__(self, key: str) -> bool: Return `True` if the key is found, `False` otherwise :param key: Unique key of an item to retrieve from the feature store""" return key in self._storage + + @property + def descriptor(self) -> str: + """Return a unique identifier enabling a client to connect to + the feature store + :returns: A descriptor encoded as a string""" + return str(self._storage.serialize()) diff --git a/smartsim/_core/mli/infrastructure/storage/featurestore.py b/smartsim/_core/mli/infrastructure/storage/featurestore.py index 553e13b10..5291b2db3 100644 --- a/smartsim/_core/mli/infrastructure/storage/featurestore.py +++ b/smartsim/_core/mli/infrastructure/storage/featurestore.py @@ -28,6 +28,16 @@ from abc import ABC, abstractmethod +class FeatureStoreKey: + """A key-value pair enabling retrieval of an item in a feature store""" + + def __init__(self, key: str, descriptor: str) -> None: + self.key = key + """The unique key of an item in the feature store""" + self.descriptor = descriptor + """The unique identifier of the feature store containing the key""" + + class FeatureStore(ABC): """Abstract base class providing the common interface for retrieving values from a feature store implementation""" @@ -48,3 +58,10 @@ def __contains__(self, key: str) -> bool: """Membership operator to test for a key existing within the feature store. 
Return `True` if the key is found, `False` otherwise :param key: Unique key of an item to retrieve from the feature store""" + + @property + @abstractmethod + def descriptor(self) -> str: + """Return a unique identifier enabling a client to connect to + the feature store + :returns: A descriptor encoded as a string""" diff --git a/smartsim/_core/mli/infrastructure/worker/worker.py b/smartsim/_core/mli/infrastructure/worker/worker.py index bb8d82231..f7b053245 100644 --- a/smartsim/_core/mli/infrastructure/worker/worker.py +++ b/smartsim/_core/mli/infrastructure/worker/worker.py @@ -27,10 +27,13 @@ import typing as t from abc import ABC, abstractmethod +import numpy as np + from .....error import SmartSimError from .....log import get_logger from ...comm.channel.channel import CommChannelBase -from ...infrastructure.storage.featurestore import FeatureStore +from ...infrastructure.storage.featurestore import FeatureStore, FeatureStoreKey +from ...message_handler import MessageHandler from ...mli_schemas.model.model_capnp import Model if t.TYPE_CHECKING: @@ -44,26 +47,32 @@ class InferenceRequest: def __init__( self, - model_key: t.Optional[str] = None, + model_key: t.Optional[FeatureStoreKey] = None, callback: t.Optional[CommChannelBase] = None, raw_inputs: t.Optional[t.List[bytes]] = None, - # todo: copying byte array is likely to create a copy of the data in - # capnproto and will be a performance issue later - input_keys: t.Optional[t.List[str]] = None, + input_keys: t.Optional[t.List[FeatureStoreKey]] = None, input_meta: t.Optional[t.List[t.Any]] = None, - output_keys: t.Optional[t.List[str]] = None, + output_keys: t.Optional[t.List[FeatureStoreKey]] = None, raw_model: t.Optional[Model] = None, batch_size: int = 0, ): """Initialize the object""" self.model_key = model_key + """A tuple containing a (key, descriptor) pair""" self.raw_model = raw_model + """Raw bytes of an ML model""" self.callback = callback + """The channel used for notification of inference completion""" self.raw_inputs = raw_inputs or [] + """Raw bytes of tensor inputs""" self.input_keys = input_keys or [] + """A list of tuples containing a (key, descriptor) pair""" self.input_meta = input_meta or [] + """Metadata about the input data""" self.output_keys = output_keys or [] + """A list of tuples containing a (key, descriptor) pair""" self.batch_size = batch_size + """The batch size to apply when batching""" class InferenceReply: @@ -148,13 +157,93 @@ def __init__(self, result: bytes) -> None: class MachineLearningWorkerCore: """Basic functionality of ML worker that is shared across all worker types""" + @staticmethod + def deserialize_message( + data_blob: bytes, + channel_type: t.Type[CommChannelBase], + ) -> InferenceRequest: + """Deserialize a message from a byte stream into an InferenceRequest + :param data_blob: The byte stream to deserialize""" + request = MessageHandler.deserialize_request(data_blob) + model_key: t.Optional[FeatureStoreKey] = None + model_bytes: t.Optional[Model] = None + + if request.model.which() == "key": + model_key = FeatureStoreKey( + request.model.key.key, request.model.key.featureStoreDescriptor + ) + elif request.model.which() == "data": + model_bytes = request.model.data + + callback_key = request.replyChannel.reply + comm_channel = channel_type(callback_key) + + input_keys: t.Optional[t.List[FeatureStoreKey]] = None + input_bytes: t.Optional[t.List[bytes]] = None + input_meta: t.List[t.Any] = [] + + if request.input.which() == "keys": + input_keys = [ + FeatureStoreKey(input_key.key, 
input_key.featureStoreDescriptor) + for input_key in request.input.keys + ] + elif request.input.which() == "data": + input_bytes = [data.blob for data in request.input.data] + input_meta = [data.tensorDescriptor for data in request.input.data] + + output_keys: t.List[FeatureStoreKey] = [] + if request.output: + output_keys = [ + FeatureStoreKey(output_key.key, output_key.featureStoreDescriptor) + for output_key in request.output + ] + + inference_request = InferenceRequest( + model_key=model_key, + callback=comm_channel, + raw_inputs=input_bytes, + input_meta=input_meta, + input_keys=input_keys, + output_keys=output_keys, + raw_model=model_bytes, + batch_size=0, + ) + return inference_request + + @staticmethod + def prepare_outputs(reply: InferenceReply) -> t.List[t.Any]: + prepared_outputs: t.List[t.Any] = [] + if reply.output_keys: + for fs_key in reply.output_keys: + if not fs_key: + continue + + msg_key = MessageHandler.build_tensor_key(fs_key.key, fs_key.descriptor) + prepared_outputs.append(msg_key) + elif reply.outputs: + arrays: t.List[np.ndarray[t.Any, np.dtype[t.Any]]] = [ + output.numpy() for output in reply.outputs + ] + for tensor in arrays: + # todo: need to have the output attributes specified in the req? + # maybe, add `MessageHandler.dtype_of(tensor)`? + # can `build_tensor` do dtype and shape? + msg_tensor = MessageHandler.build_tensor( + tensor, + "c", + "float32", + [1], + ) + prepared_outputs.append(msg_tensor) + return prepared_outputs + @staticmethod def fetch_model( - request: InferenceRequest, feature_store: t.Optional[FeatureStore] + request: InferenceRequest, feature_stores: t.Dict[str, FeatureStore] ) -> FetchModelResult: """Given a resource key, retrieve the raw model from a feature store :param request: The request that triggered the pipeline - :param feature_store: The feature store used for persistence + :param feature_stores: Available feature stores used for persistence :return: Raw bytes of the model""" if request.raw_model: @@ -164,7 +253,7 @@ def fetch_model( # short-circuit and return the directly supplied model return FetchModelResult(request.raw_model.data) - if not feature_store: + if not feature_stores: raise ValueError("Feature store is required for model retrieval") if not request.model_key: @@ -172,44 +261,47 @@ def fetch_model( "Key must be provided to retrieve model from feature store" ) + key, fsd = request.model_key.key, request.model_key.descriptor + try: - raw_bytes: bytes = t.cast(bytes, feature_store[request.model_key]) + feature_store = feature_stores[fsd] + raw_bytes: bytes = t.cast(bytes, feature_store[key]) return FetchModelResult(raw_bytes) except FileNotFoundError as ex: logger.exception(ex) - raise SmartSimError( - f"Model could not be retrieved with key {request.model_key}" - ) from ex + raise SmartSimError(f"Model could not be retrieved with key {key}") from ex @staticmethod def fetch_inputs( - request: InferenceRequest, feature_store: t.Optional[FeatureStore] + request: InferenceRequest, feature_stores: t.Dict[str, FeatureStore] ) -> FetchInputResult: """Given a collection of ResourceKeys, identify the physical location and input metadata :param request: The request that triggered the pipeline - :param feature_store: The feature store used for persistence + :param feature_stores: Available feature stores used for persistence :return: the fetched input""" if request.raw_inputs: return FetchInputResult(request.raw_inputs, request.input_meta) - if not feature_store: + if not feature_stores: raise ValueError("No input and no 
feature store provided") if request.input_keys: data: t.List[bytes] = [] - for input_ in request.input_keys: + + for fs_key in request.input_keys: try: - tensor_bytes = t.cast(bytes, feature_store[input_]) + feature_store = feature_stores[fs_key.descriptor] + tensor_bytes = t.cast(bytes, feature_store[fs_key.key]) data.append(tensor_bytes) except KeyError as ex: logger.exception(ex) raise SmartSimError( - f"Model could not be retrieved with key {input_}" + f"Model could not be retrieved with key {fs_key.key}" ) from ex return FetchInputResult( - data, None + data, meta=None ) # fixme: need to get both tensor and descriptor raise ValueError("No input source") @@ -231,25 +323,26 @@ def batch_requests( def place_output( request: InferenceRequest, transform_result: TransformOutputResult, - feature_store: t.Optional[FeatureStore], - ) -> t.Collection[t.Optional[str]]: + feature_stores: t.Dict[str, FeatureStore], + ) -> t.Collection[t.Optional[FeatureStoreKey]]: """Given a collection of data, make it available as a shared resource in the feature store :param request: The request that triggered the pipeline :param execute_result: Results from inference - :param feature_store: The feature store used for persistence + :param feature_stores: Available feature stores used for persistence :return: A collection of keys that were placed in the feature store""" - if not feature_store: + if not feature_stores: raise ValueError("Feature store is required for output persistence") - keys: t.List[t.Optional[str]] = [] + keys: t.List[t.Optional[FeatureStoreKey]] = [] # need to decide how to get back to original sub-batch inputs so they can be # accurately placed, datum might need to include this. # Consider parallelizing all PUT feature_store operations - for k, v in zip(request.output_keys, transform_result.outputs): - feature_store[k] = v - keys.append(k) + for fs_key, v in zip(request.output_keys, transform_result.outputs): + feature_store = feature_stores[fs_key.descriptor] + feature_store[fs_key.key] = v + keys.append(fs_key) return keys diff --git a/smartsim/_core/mli/message_handler.py b/smartsim/_core/mli/message_handler.py index 00670dce8..aafc4a4c2 100644 --- a/smartsim/_core/mli/message_handler.py +++ b/smartsim/_core/mli/message_handler.py @@ -92,7 +92,9 @@ def build_output_tensor_descriptor( return description @staticmethod - def build_tensor_key(key: str) -> data_references_capnp.TensorKey: + def build_tensor_key( + key: str, feature_store_descriptor: str + ) -> data_references_capnp.TensorKey: """ Builds a new TensorKey message with the provided key. @@ -102,6 +104,7 @@ def build_tensor_key(key: str) -> data_references_capnp.TensorKey: try: tensor_key = data_references_capnp.TensorKey.new_message() tensor_key.key = key + tensor_key.featureStoreDescriptor = feature_store_descriptor except Exception as e: raise ValueError("Error building tensor key.") from e return tensor_key @@ -126,7 +129,9 @@ def build_model(data: bytes, name: str, version: str) -> model_capnp.Model: return model @staticmethod - def build_model_key(key: str) -> data_references_capnp.ModelKey: + def build_model_key( + key: str, feature_store_descriptor: str + ) -> data_references_capnp.ModelKey: """ Builds a new ModelKey message with the provided key. 
@@ -136,6 +141,7 @@ def build_model_key(key: str) -> data_references_capnp.ModelKey: try: model_key = data_references_capnp.ModelKey.new_message() model_key.key = key + model_key.featureStoreDescriptor = feature_store_descriptor except Exception as e: raise ValueError("Error building model key.") from e return model_key @@ -498,6 +504,7 @@ def build_response( result: t.Union[ t.List[tensor_capnp.TensorDescriptor], t.List[data_references_capnp.TensorKey], + None ], custom_attributes: t.Union[ response_attributes_capnp.TorchResponseAttributes, diff --git a/smartsim/_core/mli/mli_schemas/data/data_references.capnp b/smartsim/_core/mli/mli_schemas/data/data_references.capnp index f37a95726..699abe5d2 100644 --- a/smartsim/_core/mli/mli_schemas/data/data_references.capnp +++ b/smartsim/_core/mli/mli_schemas/data/data_references.capnp @@ -28,8 +28,10 @@ struct ModelKey { key @0 :Text; + featureStoreDescriptor @1 :Text; } struct TensorKey { key @0 :Text; + featureStoreDescriptor @1 :Text; } diff --git a/smartsim/_core/mli/mli_schemas/data/data_references_capnp.pyi b/smartsim/_core/mli/mli_schemas/data/data_references_capnp.pyi index 6f775cf8f..bcf53e0a0 100644 --- a/smartsim/_core/mli/mli_schemas/data/data_references_capnp.pyi +++ b/smartsim/_core/mli/mli_schemas/data/data_references_capnp.pyi @@ -36,6 +36,7 @@ from typing import Iterator class ModelKey: key: str + featureStoreDescriptor: str @staticmethod @contextmanager def from_bytes( @@ -71,6 +72,7 @@ class ModelKeyBuilder(ModelKey): class TensorKey: key: str + featureStoreDescriptor: str @staticmethod @contextmanager def from_bytes( diff --git a/tests/dragon/utils/featurestore.py b/tests/dragon/featurestore.py similarity index 92% rename from tests/dragon/utils/featurestore.py rename to tests/dragon/featurestore.py index 93b313431..f9d4a1da2 100644 --- a/tests/dragon/utils/featurestore.py +++ b/tests/dragon/featurestore.py @@ -57,6 +57,13 @@ def __contains__(self, key: str) -> bool: :param key: Unique key of an item to retrieve from the feature store""" return key in self._storage + @property + def descriptor(self) -> str: + """Return a unique identifier enabling a client to connect to + the feature store + :returns: A descriptor encoded as a string""" + return "file-system-fs" + class FileSystemFeatureStore(FeatureStore): """Alternative feature store implementation for testing. 
Stores all @@ -103,6 +110,13 @@ def _key_path(self, key: str, create: bool = False) -> pathlib.Path: return value + @property + def descriptor(self) -> str: + """Return a unique identifier enabling a client to connect to + the feature store + :returns: A descriptor encoded as a string""" + return "in-memory-fs" + class DragonDict: """Mock implementation of a dragon dictionary""" diff --git a/tests/dragon/test_environment_loader.py b/tests/dragon/test_environment_loader.py index 00db0a9d3..d4adb3587 100644 --- a/tests/dragon/test_environment_loader.py +++ b/tests/dragon/test_environment_loader.py @@ -27,9 +27,13 @@ import base64 import os import pickle +import typing as t import pytest +from smartsim._core.mli.infrastructure.storage.featurestore import FeatureStore +from smartsim.error.errors import SmartSimError + dragon = pytest.importorskip("dragon") import dragon.utils as du @@ -42,7 +46,7 @@ DragonFeatureStore, ) -from .utils.featurestore import MemoryFeatureStore +from .featurestore import FileSystemFeatureStore, MemoryFeatureStore # The tests in this file belong to the dragon group pytestmark = pytest.mark.dragon @@ -93,59 +97,70 @@ def test_environment_loader_FLI_fails(monkeypatch): @pytest.mark.parametrize( - "expected_keys, expected_values", + "feature_stores", [ - pytest.param(["key1", "key2", "key3"], ["value1", "value2", "value3"]), - pytest.param(["another key"], ["another value"]), + pytest.param([], id="No feature stores"), + pytest.param([MemoryFeatureStore()], id="Single feature store"), + pytest.param( + [MemoryFeatureStore(), FileSystemFeatureStore()], + id="Multiple feature stores", + ), ], ) -def test_environment_loader_memory_featurestore( - expected_keys, expected_values, monkeypatch +def test_environment_loader_featurestores( + feature_stores: t.List[FeatureStore], monkeypatch: pytest.MonkeyPatch ): - """MemoryFeatureStores can be correctly serialized and deserialized""" - feature_store = MemoryFeatureStore() - key_value_pairs = zip(expected_keys, expected_values) - for k, v in key_value_pairs: - feature_store[k] = v - monkeypatch.setenv( - "SSFeatureStore", base64.b64encode(pickle.dumps(feature_store)).decode("utf-8") - ) - config = EnvironmentConfigLoader() - config_feature_store = config.get_feature_store() + """FeatureStore can be correctly identified, serialized and deserialized""" + with monkeypatch.context() as m: + for fs in feature_stores: + value = base64.b64encode(pickle.dumps(fs)).decode("utf-8") + key = f"SSFeatureStore.{fs.descriptor}" + m.setenv(key, value) + + config = EnvironmentConfigLoader() + actual_feature_stores = config.get_feature_stores() + + for fs in feature_stores: + # Confirm that the descriptors were used as keys in the loaded feature stores + assert fs.descriptor in actual_feature_stores - for k, _ in key_value_pairs: - assert config_feature_store[k] == feature_store[k] + # Confirm that the value loaded from env var is a FeatureStore + # and it is consistent w/the key identifying it + loaded_fs = actual_feature_stores[fs.descriptor] + assert loaded_fs.descriptor == fs.descriptor @pytest.mark.parametrize( - "expected_keys, expected_values", + "value_to_use,error_filter", [ - pytest.param(["key1", "key2", "key3"], ["value1", "value2", "value3"]), - pytest.param(["another key"], ["another value"]), + pytest.param("", "empty", id="Empty value"), + pytest.param("abcd", "invalid", id="Incorrectly serialized value"), ], ) -def test_environment_loader_dragon_featurestore( - expected_keys, expected_values, monkeypatch +def 
test_environment_loader_featurestores_errors( + value_to_use: str, error_filter: str, monkeypatch: pytest.MonkeyPatch ): - """DragonFeatureStores can be correctly serialized and deserialized""" - storage = DDict() - feature_store = DragonFeatureStore(storage) - key_value_pairs = zip(expected_keys, expected_values) - for k, v in key_value_pairs: - feature_store[k] = v - monkeypatch.setenv( - "SSFeatureStore", base64.b64encode(pickle.dumps(feature_store)).decode("utf-8") - ) - config = EnvironmentConfigLoader() - config_feature_store = config.get_feature_store() + """Verify that the environment loader reports an error when a feature store + env var is populated with something that cannot be loaded properly""" + + fs = FileSystemFeatureStore() # just use for descriptor... + key = f"SSFeatureStore.{fs.descriptor}" + + with monkeypatch.context() as m, pytest.raises(SmartSimError) as ex: + m.setenv(key, value_to_use) # <----- simulate incorrect value in env var + + config = EnvironmentConfigLoader() + config.get_feature_stores() # <---- kick off validation - for k, _ in key_value_pairs: - assert config_feature_store[k] == feature_store[k] + # confirm the specific key is reported in error message + assert key in ex.value.args[0] + # ensure the failure occurred during loading + assert error_filter in ex.value.args[0].lower() def test_environment_variables_not_set(): """EnvironmentConfigLoader getters return None when environment variables are not set""" config = EnvironmentConfigLoader() - assert config.get_feature_store() == None + assert config.get_feature_stores() == {} assert config.get_queue() == None diff --git a/tests/mli/featurestore.py b/tests/mli/featurestore.py index 93b313431..f9d4a1da2 100644 --- a/tests/mli/featurestore.py +++ b/tests/mli/featurestore.py @@ -57,6 +57,13 @@ def __contains__(self, key: str) -> bool: :param key: Unique key of an item to retrieve from the feature store""" return key in self._storage + @property + def descriptor(self) -> str: + """Return a unique identifier enabling a client to connect to + the feature store + :returns: A descriptor encoded as a string""" + return "file-system-fs" + class FileSystemFeatureStore(FeatureStore): """Alternative feature store implementation for testing. 
Stores all @@ -103,6 +110,13 @@ def _key_path(self, key: str, create: bool = False) -> pathlib.Path: return value + @property + def descriptor(self) -> str: + """Return a unique identifier enabling a client to connect to + the feature store + :returns: A descriptor encoded as a string""" + return "in-memory-fs" + class DragonDict: """Mock implementation of a dragon dictionary""" diff --git a/tests/mli/test_core_machine_learning_worker.py b/tests/mli/test_core_machine_learning_worker.py index cff02c9c1..5e56671c3 100644 --- a/tests/mli/test_core_machine_learning_worker.py +++ b/tests/mli/test_core_machine_learning_worker.py @@ -31,6 +31,7 @@ import torch import smartsim.error as sse +from smartsim._core.mli.infrastructure.storage.featurestore import FeatureStoreKey from smartsim._core.mli.infrastructure.worker.worker import ( InferenceRequest, MachineLearningWorkerCore, @@ -90,11 +91,12 @@ def test_fetch_model_disk(persist_torch_model: pathlib.Path) -> None: worker = MachineLearningWorkerCore key = str(persist_torch_model) feature_store = FileSystemFeatureStore() + fsd = feature_store.descriptor feature_store[str(persist_torch_model)] = persist_torch_model.read_bytes() - request = InferenceRequest(model_key=key) + request = InferenceRequest(model_key=FeatureStoreKey(key, fsd)) - fetch_result = worker.fetch_model(request, feature_store) + fetch_result = worker.fetch_model(request, {fsd: feature_store}) assert fetch_result.model_bytes assert fetch_result.model_bytes == persist_torch_model.read_bytes() @@ -104,13 +106,14 @@ def test_fetch_model_disk_missing() -> None: when given an invalid (file system) key""" worker = MachineLearningWorkerCore feature_store = MemoryFeatureStore() + fsd = feature_store.descriptor key = "/path/that/doesnt/exist" - request = InferenceRequest(model_key=key) + request = InferenceRequest(model_key=FeatureStoreKey(key, fsd)) with pytest.raises(sse.SmartSimError) as ex: - worker.fetch_model(request, feature_store) + worker.fetch_model(request, {fsd: feature_store}) # ensure the error message includes key-identifying information assert key in ex.value.args[0] @@ -127,10 +130,11 @@ def test_fetch_model_feature_store(persist_torch_model: pathlib.Path) -> None: # put model bytes into the feature store feature_store = MemoryFeatureStore() + fsd = feature_store.descriptor feature_store[key] = persist_torch_model.read_bytes() - request = InferenceRequest(model_key=key) - fetch_result = worker.fetch_model(request, feature_store) + request = InferenceRequest(model_key=FeatureStoreKey(key, feature_store.descriptor)) + fetch_result = worker.fetch_model(request, {fsd: feature_store}) assert fetch_result.model_bytes assert fetch_result.model_bytes == persist_torch_model.read_bytes() @@ -142,12 +146,15 @@ def test_fetch_model_feature_store_missing() -> None: bad_key = "some-key" feature_store = MemoryFeatureStore() + fsd = feature_store.descriptor - request = InferenceRequest(model_key=bad_key) + request = InferenceRequest( + model_key=FeatureStoreKey(bad_key, feature_store.descriptor) + ) # todo: consider that raising this exception shows impl. replace... 
with pytest.raises(sse.SmartSimError) as ex: - worker.fetch_model(request, feature_store) + worker.fetch_model(request, {fsd: feature_store}) # ensure the error message includes key-identifying information assert bad_key in ex.value.args[0] @@ -161,11 +168,12 @@ def test_fetch_model_memory(persist_torch_model: pathlib.Path) -> None: key = "test-model" feature_store = MemoryFeatureStore() + fsd = feature_store.descriptor feature_store[key] = persist_torch_model.read_bytes() - request = InferenceRequest(model_key=key) + request = InferenceRequest(model_key=FeatureStoreKey(key, feature_store.descriptor)) - fetch_result = worker.fetch_model(request, feature_store) + fetch_result = worker.fetch_model(request, {fsd: feature_store}) assert fetch_result.model_bytes assert fetch_result.model_bytes == persist_torch_model.read_bytes() @@ -176,13 +184,14 @@ def test_fetch_input_disk(persist_torch_tensor: pathlib.Path) -> None: when given a valid (file system) key""" tensor_name = str(persist_torch_tensor) - request = InferenceRequest(input_keys=[tensor_name]) + feature_store = MemoryFeatureStore() + fsd = feature_store.descriptor + request = InferenceRequest(input_keys=[FeatureStoreKey(tensor_name, fsd)]) worker = MachineLearningWorkerCore - feature_store = MemoryFeatureStore() feature_store[tensor_name] = persist_torch_tensor.read_bytes() - fetch_result = worker.fetch_inputs(request, feature_store) + fetch_result = worker.fetch_inputs(request, {fsd: feature_store}) assert fetch_result.inputs is not None @@ -191,16 +200,17 @@ def test_fetch_input_disk_missing() -> None: when given an invalid (file system) key""" worker = MachineLearningWorkerCore - key = "/path/that/doesnt/exist" feature_store = MemoryFeatureStore() + fsd = feature_store.descriptor + key = "/path/that/doesnt/exist", fsd - request = InferenceRequest(input_keys=[key]) + request = InferenceRequest(input_keys=[FeatureStoreKey(key, fsd)]) with pytest.raises(sse.SmartSimError) as ex: - worker.fetch_inputs(request, feature_store) + worker.fetch_inputs(request, {fsd: feature_store}) # ensure the error message includes key-identifying information - assert key in ex.value.args[0] + assert key[0] in ex.value.args[0] @pytest.mark.skipif(not torch_available, reason="Torch backend is not installed") @@ -211,13 +221,14 @@ def test_fetch_input_feature_store(persist_torch_tensor: pathlib.Path) -> None: tensor_name = "test-tensor" feature_store = MemoryFeatureStore() + fsd = feature_store.descriptor - request = InferenceRequest(input_keys=[tensor_name]) + request = InferenceRequest(input_keys=[FeatureStoreKey(tensor_name, fsd)]) # put model bytes into the feature store feature_store[tensor_name] = persist_torch_tensor.read_bytes() - fetch_result = worker.fetch_inputs(request, feature_store) + fetch_result = worker.fetch_inputs(request, {fsd: feature_store}) assert fetch_result.inputs assert list(fetch_result.inputs)[0][:10] == persist_torch_tensor.read_bytes()[:10] @@ -230,6 +241,7 @@ def test_fetch_multi_input_feature_store(persist_torch_tensor: pathlib.Path) -> tensor_name = "test-tensor" feature_store = MemoryFeatureStore() + fsd = feature_store.descriptor # put model bytes into the feature store body1 = persist_torch_tensor.read_bytes() @@ -242,10 +254,14 @@ def test_fetch_multi_input_feature_store(persist_torch_tensor: pathlib.Path) -> feature_store[tensor_name + "3"] = body3 request = InferenceRequest( - input_keys=[tensor_name + "1", tensor_name + "2", tensor_name + "3"] + input_keys=[ + FeatureStoreKey(tensor_name + "1", fsd), + 
FeatureStoreKey(tensor_name + "2", fsd), + FeatureStoreKey(tensor_name + "3", fsd), + ] ) - fetch_result = worker.fetch_inputs(request, feature_store) + fetch_result = worker.fetch_inputs(request, {fsd: feature_store}) raw_bytes = list(fetch_result.inputs) assert raw_bytes @@ -261,10 +277,11 @@ def test_fetch_input_feature_store_missing() -> None: bad_key = "some-key" feature_store = MemoryFeatureStore() - request = InferenceRequest(input_keys=[bad_key]) + fsd = feature_store.descriptor + request = InferenceRequest(input_keys=[FeatureStoreKey(bad_key, fsd)]) with pytest.raises(sse.SmartSimError) as ex: - worker.fetch_inputs(request, feature_store) + worker.fetch_inputs(request, {fsd: feature_store}) # ensure the error message includes key-identifying information assert bad_key in ex.value.args[0] @@ -276,12 +293,13 @@ def test_fetch_input_memory(persist_torch_tensor: pathlib.Path) -> None: when given a valid (file system) key""" worker = MachineLearningWorkerCore feature_store = MemoryFeatureStore() + fsd = feature_store.descriptor model_name = "test-model" feature_store[model_name] = persist_torch_tensor.read_bytes() - request = InferenceRequest(input_keys=[model_name]) + request = InferenceRequest(input_keys=[FeatureStoreKey(model_name, fsd)]) - fetch_result = worker.fetch_inputs(request, feature_store) + fetch_result = worker.fetch_inputs(request, {fsd: feature_store}) assert fetch_result.inputs is not None @@ -304,18 +322,23 @@ def test_place_outputs() -> None: key_name = "test-model" feature_store = MemoryFeatureStore() + fsd = feature_store.descriptor # create a key to retrieve from the feature store - keys = [key_name + "1", key_name + "2", key_name + "3"] + keys = [ + FeatureStoreKey(key_name + "1", fsd), + FeatureStoreKey(key_name + "2", fsd), + FeatureStoreKey(key_name + "3", fsd), + ] data = [b"abcdef", b"ghijkl", b"mnopqr"] - for k, v in zip(keys, data): - feature_store[k] = v + for fsk, v in zip(keys, data): + feature_store[fsk.key] = v request = InferenceRequest(output_keys=keys) transform_result = TransformOutputResult(data, [1], "c", "float32") - worker.place_output(request, transform_result, feature_store) + worker.place_output(request, transform_result, {fsd: feature_store}) for i in range(3): - assert feature_store[keys[i]] == data[i] + assert feature_store[keys[i].key] == data[i] diff --git a/tests/mli/test_torch_worker.py b/tests/mli/test_torch_worker.py index b73e4a31b..29865ac5b 100644 --- a/tests/mli/test_torch_worker.py +++ b/tests/mli/test_torch_worker.py @@ -26,12 +26,12 @@ import io -import numpy as np import pytest import torch from torch import nn from torch.nn import functional as F +from smartsim._core.mli.infrastructure.storage.featurestore import FeatureStoreKey from smartsim._core.mli.infrastructure.worker.torch_worker import TorchWorker from smartsim._core.mli.infrastructure.worker.worker import ( ExecuteResult, @@ -102,7 +102,7 @@ def get_request() -> InferenceRequest: ] return InferenceRequest( - model_key="model", + model_key=FeatureStoreKey("model", ""), callback=None, raw_inputs=tensor_numpy, input_keys=None, diff --git a/tests/mli/test_worker_manager.py b/tests/mli/test_worker_manager.py index df4b0a637..5abc3852f 100644 --- a/tests/mli/test_worker_manager.py +++ b/tests/mli/test_worker_manager.py @@ -44,7 +44,6 @@ from smartsim.log import get_logger from .channel import FileSystemCommChannel -from .featurestore import FileSystemFeatureStore from .worker import IntegratedTorchWorker logger = get_logger(__name__) @@ -139,10 +138,11 @@ def 
mock_messages( tensor = torch.randn((1, 2), dtype=torch.float32) torch.save(tensor, buffer) feature_store[input_key] = buffer.getvalue() + fsd = feature_store.descriptor() - message_tensor_output_key = MessageHandler.build_tensor_key(output_key) - message_tensor_input_key = MessageHandler.build_tensor_key(input_key) - message_model_key = MessageHandler.build_model_key(model_key) + message_tensor_output_key = MessageHandler.build_tensor_key(output_key, fsd) + message_tensor_input_key = MessageHandler.build_tensor_key(input_key, fsd) + message_model_key = MessageHandler.build_model_key(model_key, fsd) request = MessageHandler.build_request( reply_channel=callback_channel.descriptor, @@ -183,11 +183,14 @@ def test_worker_manager(prepare_environment: pathlib.Path) -> None: ) # create a mock client application to populate the request queue + feature_stores = config_loader.get_feature_stores() + fs_list = list(feature_stores.values()) + msg_pump = mp.Process( target=mock_messages, args=( config_loader.get_queue(), - config_loader.get_feature_store(), + fs_list[0], fs_path, comm_path, ), diff --git a/tests/test_message_handler/test_build_model_key.py b/tests/test_message_handler/test_build_model_key.py index 135e96798..c09c787fc 100644 --- a/tests/test_message_handler/test_build_model_key.py +++ b/tests/test_message_handler/test_build_model_key.py @@ -35,10 +35,13 @@ def test_build_model_key_successful(): - model_key = handler.build_model_key("tensor_key") + fsd = "mock-feature-store-descriptor" + model_key = handler.build_model_key("tensor_key", fsd) assert model_key.key == "tensor_key" + assert model_key.featureStoreDescriptor == fsd def test_build_model_key_unsuccessful(): with pytest.raises(ValueError): - model_key = handler.build_model_key(100) + fsd = "mock-feature-store-descriptor" + model_key = handler.build_model_key(100, fsd) diff --git a/tests/test_message_handler/test_build_tensor_key.py b/tests/test_message_handler/test_build_tensor_key.py index 7abe9e853..6a28b80c4 100644 --- a/tests/test_message_handler/test_build_tensor_key.py +++ b/tests/test_message_handler/test_build_tensor_key.py @@ -35,10 +35,12 @@ def test_build_tensor_key_successful(): - tensor_key = handler.build_tensor_key("tensor_key") + fsd = "mock-feature-store-descriptor" + tensor_key = handler.build_tensor_key("tensor_key", fsd) assert tensor_key.key == "tensor_key" def test_build_tensor_key_unsuccessful(): with pytest.raises(ValueError): - tensor_key = handler.build_tensor_key(100) + fsd = "mock-feature-store-descriptor" + tensor_key = handler.build_tensor_key(100, fsd) diff --git a/tests/test_message_handler/test_output_descriptor.py b/tests/test_message_handler/test_output_descriptor.py index fd21eeb0d..beb9a4765 100644 --- a/tests/test_message_handler/test_output_descriptor.py +++ b/tests/test_message_handler/test_output_descriptor.py @@ -33,7 +33,8 @@ handler = MessageHandler() -tensor_key = handler.build_tensor_key("key") +fsd = "mock-feature-store-descriptor" +tensor_key = handler.build_tensor_key("key", fsd) @pytest.mark.parametrize( diff --git a/tests/test_message_handler/test_request.py b/tests/test_message_handler/test_request.py index 4cfc11584..ea9b04d64 100644 --- a/tests/test_message_handler/test_request.py +++ b/tests/test_message_handler/test_request.py @@ -31,14 +31,16 @@ # The tests in this file belong to the group_a group pytestmark = pytest.mark.group_a -model_key = MessageHandler.build_model_key("model_key") +fsd = "mock-feature-store-descriptor" + +model_key = 
MessageHandler.build_model_key("model_key", fsd) model = MessageHandler.build_model(b"model data", "model_name", "v0.0.1") -input_key1 = MessageHandler.build_tensor_key("input_key1") -input_key2 = MessageHandler.build_tensor_key("input_key2") +input_key1 = MessageHandler.build_tensor_key("input_key1", fsd) +input_key2 = MessageHandler.build_tensor_key("input_key2", fsd) -output_key1 = MessageHandler.build_tensor_key("output_key1") -output_key2 = MessageHandler.build_tensor_key("output_key2") +output_key1 = MessageHandler.build_tensor_key("output_key1", fsd) +output_key2 = MessageHandler.build_tensor_key("output_key2", fsd) output_descriptor1 = MessageHandler.build_output_tensor_descriptor( "c", [output_key1, output_key2], "int64", [] diff --git a/tests/test_message_handler/test_response.py b/tests/test_message_handler/test_response.py index 03bd9ba73..d6894eb5c 100644 --- a/tests/test_message_handler/test_response.py +++ b/tests/test_message_handler/test_response.py @@ -31,9 +31,10 @@ # The tests in this file belong to the group_a group pytestmark = pytest.mark.group_a +fsd = "mock-feature-store-descriptor" -result_key1 = MessageHandler.build_tensor_key("result_key1") -result_key2 = MessageHandler.build_tensor_key("result_key2") +result_key1 = MessageHandler.build_tensor_key("result_key1", fsd) +result_key2 = MessageHandler.build_tensor_key("result_key2", fsd) torch_attributes = MessageHandler.build_torch_response_attributes() tf_attributes = MessageHandler.build_tf_response_attributes() From f2528061dffacd97e56df6a24f2461e5295c87c2 Mon Sep 17 00:00:00 2001 From: ankona <3595025+ankona@users.noreply.github.com> Date: Thu, 18 Jul 2024 18:51:06 -0500 Subject: [PATCH 02/49] post-merge tweaks --- ex/high_throughput_inference/mock_app.py | 2 +- .../infrastructure/control/workermanager.py | 51 +++++++++---------- .../mli/infrastructure/environmentloader.py | 18 +++---- .../_core/mli/infrastructure/worker/worker.py | 47 +++++++---------- smartsim/_core/mli/message_handler.py | 3 +- tests/dragon/test_reply_building.py | 31 +---------- tests/mli/test_worker_manager.py | 7 ++- 7 files changed, 56 insertions(+), 103 deletions(-) diff --git a/ex/high_throughput_inference/mock_app.py b/ex/high_throughput_inference/mock_app.py index e244c93e0..236fab419 100644 --- a/ex/high_throughput_inference/mock_app.py +++ b/ex/high_throughput_inference/mock_app.py @@ -115,7 +115,7 @@ def run_model(self, model: bytes | str, batch: torch.Tensor): self.measure_time("build_tensor_descriptor") built_model = None if isinstance(model, str): - model_arg = MessageHandler.build_model_key(model) + model_arg = MessageHandler.build_model_key(model) # todo: this needs FSD else: model_arg = MessageHandler.build_model(model, "resnet-50", "1.0") request = MessageHandler.build_request( diff --git a/smartsim/_core/mli/infrastructure/control/workermanager.py b/smartsim/_core/mli/infrastructure/control/workermanager.py index 79ffc6dbd..e34f711f5 100644 --- a/smartsim/_core/mli/infrastructure/control/workermanager.py +++ b/smartsim/_core/mli/infrastructure/control/workermanager.py @@ -34,7 +34,6 @@ from ...comm.channel.channel import CommChannelBase from ...comm.channel.dragonchannel import DragonCommChannel from ...infrastructure.environmentloader import EnvironmentConfigLoader -from ...infrastructure.storage.featurestore import FeatureStore from ...infrastructure.worker.worker import ( InferenceReply, InferenceRequest, @@ -54,32 +53,18 @@ logger = get_logger(__name__) -def build_failure_reply(status: "StatusEnum", message: str) -> 
Response: +def build_failure_reply(status: "Status", message: str) -> ResponseBuilder: """Build a response indicating a failure occurred :param status: The status of the response :param message: The error message to include in the response""" return MessageHandler.build_response( - status=status, # todo: need to indicate correct status - message=message, # todo: decide what these will be + status=status, + message=message, result=None, custom_attributes=None, ) -def build_reply(worker: MachineLearningWorkerBase, reply: InferenceReply) -> Response: - """Builds a response for a successful inference request - :param worker: A worker to process the reply with - :param reply: The internal representation of the reply""" - results = worker.prepare_outputs(reply) - - return MessageHandler.build_response( - status=reply.status_enum, - message=reply.message, - result=results, - custom_attributes=None, - ) - - def exception_handler( exc: Exception, reply_channel: t.Optional[CommChannelBase], failure_message: str ) -> None: @@ -143,13 +128,15 @@ def _check_feature_stores(self, request: InferenceRequest) -> bool: """Ensures that all feature stores required by the request are available :param request: The request to validate""" # collect all feature stores required by the request - fs_model = {request.model_key.descriptor} + fs_model: t.Set[str] = set() + if request.model_key: + fs_model = {request.model_key.descriptor} fs_inputs = {key.descriptor for key in request.input_keys} fs_outputs = {key.descriptor for key in request.output_keys} # identify which feature stores are requested and unknown - fs_desired = fs_model + fs_inputs + fs_outputs - fs_actual = {key for key in self._feature_stores} + fs_desired = fs_model.union(fs_inputs).union(fs_outputs) + fs_actual = {item.descriptor for item in self._feature_stores.values()} fs_missing = fs_desired - fs_actual # exit if all desired feature stores are not available @@ -259,7 +246,7 @@ def _on_iteration(self) -> None: interm = time.perf_counter() # timing try: fetch_model_result = self._worker.fetch_model( - request, self._feature_store + request, self._feature_stores ) except Exception as e: exception_handler( @@ -287,7 +274,7 @@ def _on_iteration(self) -> None: interm = time.perf_counter() # timing try: fetch_model_result = self._worker.fetch_model( - request, self._feature_store + request, self._feature_stores ) except Exception as e: exception_handler( @@ -310,7 +297,9 @@ def _on_iteration(self) -> None: timings.append(time.perf_counter() - interm) # timing interm = time.perf_counter() # timing try: - fetch_input_result = self._worker.fetch_inputs(request, self._feature_store) + fetch_input_result = self._worker.fetch_inputs( + request, self._feature_stores + ) except Exception as e: exception_handler(e, request.callback, "Failed while fetching the inputs.") return @@ -370,10 +359,16 @@ def _on_iteration(self) -> None: if reply.outputs is None or not reply.outputs: response = build_failure_reply("fail", "Outputs not found.") else: - if reply.outputs is None or not reply.outputs: - response = build_failure_reply("fail", "no-results") - - response = build_reply(self._worker, reply) + reply.status_enum = "complete" + reply.message = "Success" + + results = self._worker.prepare_outputs(reply) + response = MessageHandler.build_response( + status=reply.status_enum, + message=reply.message, + result=results, + custom_attributes=None, + ) timings.append(time.perf_counter() - interm) # timing interm = time.perf_counter() # timing diff --git 
a/smartsim/_core/mli/infrastructure/environmentloader.py b/smartsim/_core/mli/infrastructure/environmentloader.py index 4502ec2fc..9125ac1cd 100644 --- a/smartsim/_core/mli/infrastructure/environmentloader.py +++ b/smartsim/_core/mli/infrastructure/environmentloader.py @@ -46,13 +46,10 @@ class EnvironmentConfigLoader: """ def __init__(self) -> None: - self._feature_store_descriptor: t.Optional[str] = os.getenv( - "SSFeatureStore", None - ) self._queue_descriptor: t.Optional[str] = os.getenv("SSQueue", None) - self.feature_store: t.Optional[FeatureStore] = None - self.feature_stores: t.Optional[t.Dict[FeatureStore]] = None + self.feature_stores: t.Optional[t.Dict[str, FeatureStore]] = None self.queue: t.Optional[DragonFLIChannel] = None + self._prefix = "SSFeatureStore" def _load_feature_store(self, env_var: str) -> FeatureStore: """Load a feature store from a descriptor @@ -62,10 +59,12 @@ def _load_feature_store(self, env_var: str) -> FeatureStore: value = os.getenv(env_var) if not value: - raise SmartSimError(f"Empty feature store descriptor in environment: {env_var}") + raise SmartSimError( + f"Empty feature store descriptor in environment: {env_var}" + ) try: - return pickle.loads(base64.b64decode(value)) + return t.cast(FeatureStore, pickle.loads(base64.b64decode(value))) except: raise SmartSimError( f"Invalid feature store descriptor in environment: {env_var}" @@ -74,9 +73,8 @@ def _load_feature_store(self, env_var: str) -> FeatureStore: def get_feature_stores(self) -> t.Dict[str, FeatureStore]: """Loads multiple Feature Stores by scanning environment for variables prefixed with `SSFeatureStore`""" - prefix = "SSFeatureStore" - if self.feature_stores is None: - env_vars = [var for var in os.environ if var.startswith(prefix)] + if not self.feature_stores: + env_vars = [var for var in os.environ if var.startswith(self._prefix)] stores = [self._load_feature_store(var) for var in env_vars] self.feature_stores = {fs.descriptor: fs for fs in stores} return self.feature_stores diff --git a/smartsim/_core/mli/infrastructure/worker/worker.py b/smartsim/_core/mli/infrastructure/worker/worker.py index f7b053245..74efec976 100644 --- a/smartsim/_core/mli/infrastructure/worker/worker.py +++ b/smartsim/_core/mli/infrastructure/worker/worker.py @@ -27,8 +27,6 @@ import typing as t from abc import ABC, abstractmethod -import numpy as np - from .....error import SmartSimError from .....log import get_logger from ...comm.channel.channel import CommChannelBase @@ -38,6 +36,7 @@ if t.TYPE_CHECKING: from smartsim._core.mli.mli_schemas.response.response_capnp import Status + from smartsim._core.mli.mli_schemas.tensor.tensor_capnp import TensorDescriptor logger = get_logger(__name__) @@ -81,13 +80,13 @@ class InferenceReply: def __init__( self, outputs: t.Optional[t.Collection[t.Any]] = None, - output_keys: t.Optional[t.Collection[str]] = None, + output_keys: t.Optional[t.Collection[FeatureStoreKey]] = None, status_enum: "Status" = "running", message: str = "In progress", ) -> None: """Initialize the object""" self.outputs: t.Collection[t.Any] = outputs or [] - self.output_keys: t.Collection[t.Optional[str]] = output_keys or [] + self.output_keys: t.Collection[t.Optional[FeatureStoreKey]] = output_keys or [] self.status_enum = status_enum self.message = message @@ -175,27 +174,25 @@ def deserialize_message( elif request.model.which() == "data": model_bytes = request.model.data - callback_key = request.replyChannel.reply + callback_key = request.replyChannel.descriptor comm_channel = 
channel_type(callback_key) - input_keys: t.Optional[t.List[FeatureStoreKey]] = None input_bytes: t.Optional[t.List[bytes]] = None - input_meta: t.List[t.Any] = [] + output_keys: t.Optional[t.List[FeatureStoreKey]] = None + input_meta: t.Optional[t.List[TensorDescriptor]] = None if request.input.which() == "keys": input_keys = [ - FeatureStoreKey(input_key.key, input_key.featureStoreDescriptor) - for input_key in request.input.keys + FeatureStoreKey(value.key, value.featureStoreDescriptor) + for value in request.input.keys ] - elif request.input.which() == "data": - input_bytes = [data.blob for data in request.input.data] - input_meta = [data.tensorDescriptor for data in request.input.data] + elif request.input.which() == "descriptors": + input_meta = request.input.descriptors # type: ignore - output_keys: t.List[FeatureStoreKey] = [] if request.output: output_keys = [ - FeatureStoreKey(output_key.key, output_key.featureStoreDescriptor) - for output_key in request.output + FeatureStoreKey(value.key, value.featureStoreDescriptor) + for value in request.output ] inference_request = InferenceRequest( @@ -214,27 +211,19 @@ def deserialize_message( def prepare_outputs(reply: InferenceReply) -> t.List[t.Any]: prepared_outputs: t.List[t.Any] = [] if reply.output_keys: - for fs_key in reply.output_keys: - if not fs_key: + for value in reply.output_keys: + if not value: continue - - msg_key = MessageHandler.build_tensor_key(fs_key.key, fs_key.descriptor) + msg_key = MessageHandler.build_tensor_key(value.key, value.descriptor) prepared_outputs.append(msg_key) elif reply.outputs: - arrays: t.List[np.ndarray[t.Any, np.dtype[t.Any]]] = [ - output.numpy() for output in reply.outputs - ] - for tensor in arrays: - # todo: need to have the output attributes specified in the req? - # maybe, add `MessageHandler.dtype_of(tensor)`? - # can `build_tensor` do dtype and shape? 
- msg_tensor = MessageHandler.build_tensor( - tensor, + for _ in reply.outputs: + msg_tensor_desc = MessageHandler.build_tensor_descriptor( "c", "float32", [1], ) - prepared_outputs.append(msg_tensor) + prepared_outputs.append(msg_tensor_desc) return prepared_outputs @staticmethod diff --git a/smartsim/_core/mli/message_handler.py b/smartsim/_core/mli/message_handler.py index aafc4a4c2..bbd74fd28 100644 --- a/smartsim/_core/mli/message_handler.py +++ b/smartsim/_core/mli/message_handler.py @@ -439,6 +439,7 @@ def _assign_result( result: t.Union[ t.List[tensor_capnp.TensorDescriptor], t.List[data_references_capnp.TensorKey], + None, ], ) -> None: """ @@ -504,7 +505,7 @@ def build_response( result: t.Union[ t.List[tensor_capnp.TensorDescriptor], t.List[data_references_capnp.TensorKey], - None + None, ], custom_attributes: t.Union[ response_attributes_capnp.TorchResponseAttributes, diff --git a/tests/dragon/test_reply_building.py b/tests/dragon/test_reply_building.py index d1c4d226b..5f179bbae 100644 --- a/tests/dragon/test_reply_building.py +++ b/tests/dragon/test_reply_building.py @@ -30,10 +30,7 @@ dragon = pytest.importorskip("dragon") -from smartsim._core.mli.infrastructure.control.workermanager import ( - build_failure_reply, - build_reply, -) +from smartsim._core.mli.infrastructure.control.workermanager import build_failure_reply from smartsim._core.mli.infrastructure.worker.worker import InferenceReply if t.TYPE_CHECKING: @@ -63,29 +60,3 @@ def test_build_failure_reply_fails(): response = build_failure_reply("not a status enum", "message") assert "Error assigning status to response" in ex.value.args[0] - - -@pytest.mark.parametrize( - "status, message", - [ - pytest.param("complete", "Success", id="complete"), - ], -) -def test_build_reply(status: "Status", message: str): - "Ensures replies can be built successfully" - reply = InferenceReply() - reply.status_enum = status - reply.message = message - response = build_reply(reply) - assert response.status == status - assert response.message == message - - -def test_build_reply_fails(): - "Ensures ValueError is raised if a Status Enum is not used" - with pytest.raises(ValueError) as ex: - reply = InferenceReply() - reply.status_enum = "not a status enum" - response = build_reply(reply) - - assert "Error assigning status to response" in ex.value.args[0] diff --git a/tests/mli/test_worker_manager.py b/tests/mli/test_worker_manager.py index 5abc3852f..dc4c026c0 100644 --- a/tests/mli/test_worker_manager.py +++ b/tests/mli/test_worker_manager.py @@ -32,6 +32,8 @@ import pytest +from tests.mli.featurestore import FileSystemFeatureStore + torch = pytest.importorskip("torch") dragon = pytest.importorskip("dragon") @@ -183,14 +185,11 @@ def test_worker_manager(prepare_environment: pathlib.Path) -> None: ) # create a mock client application to populate the request queue - feature_stores = config_loader.get_feature_stores() - fs_list = list(feature_stores.values()) - msg_pump = mp.Process( target=mock_messages, args=( config_loader.get_queue(), - fs_list[0], + FileSystemFeatureStore(fs_path), fs_path, comm_path, ), From 09eff200fc397e02fe83c3e62ba5b212856a5712 Mon Sep 17 00:00:00 2001 From: Christopher McBride <3595025+ankona@users.noreply.github.com> Date: Thu, 18 Jul 2024 20:56:54 -0400 Subject: [PATCH 03/49] update upstream tests --- .../mli/infrastructure/environmentloader.py | 4 +- .../infrastructure/storage/featurestore.py | 15 +++--- .../_core/mli/infrastructure/worker/worker.py | 12 +++-- smartsim/_core/mli/message_handler.py | 34 
++++++++---- tests/dragon/test_error_handling.py | 6 +-- .../mli/test_core_machine_learning_worker.py | 54 +++++++++++-------- tests/mli/test_torch_worker.py | 2 +- 7 files changed, 77 insertions(+), 50 deletions(-) diff --git a/smartsim/_core/mli/infrastructure/environmentloader.py b/smartsim/_core/mli/infrastructure/environmentloader.py index 9125ac1cd..983afc00c 100644 --- a/smartsim/_core/mli/infrastructure/environmentloader.py +++ b/smartsim/_core/mli/infrastructure/environmentloader.py @@ -49,7 +49,7 @@ def __init__(self) -> None: self._queue_descriptor: t.Optional[str] = os.getenv("SSQueue", None) self.feature_stores: t.Optional[t.Dict[str, FeatureStore]] = None self.queue: t.Optional[DragonFLIChannel] = None - self._prefix = "SSFeatureStore" + self._feature_store_prefix = "SSFeatureStore" def _load_feature_store(self, env_var: str) -> FeatureStore: """Load a feature store from a descriptor @@ -74,7 +74,7 @@ def get_feature_stores(self) -> t.Dict[str, FeatureStore]: """Loads multiple Feature Stores by scanning environment for variables prefixed with `SSFeatureStore`""" if not self.feature_stores: - env_vars = [var for var in os.environ if var.startswith(self._prefix)] + env_vars = [var for var in os.environ if var.startswith(self._feature_store_prefix)] stores = [self._load_feature_store(var) for var in env_vars] self.feature_stores = {fs.descriptor: fs for fs in stores} return self.feature_stores diff --git a/smartsim/_core/mli/infrastructure/storage/featurestore.py b/smartsim/_core/mli/infrastructure/storage/featurestore.py index 5291b2db3..49f16af8a 100644 --- a/smartsim/_core/mli/infrastructure/storage/featurestore.py +++ b/smartsim/_core/mli/infrastructure/storage/featurestore.py @@ -27,15 +27,16 @@ import typing as t from abc import ABC, abstractmethod +from pydantic import BaseModel, Field -class FeatureStoreKey: - """A key-value pair enabling retrieval of an item in a feature store""" - def __init__(self, key: str, descriptor: str) -> None: - self.key = key - """The unique key of an item in the feature store""" - self.descriptor = descriptor - """The unique identifier of the feature store containing the key""" +class FeatureStoreKey(BaseModel): + """A key,descriptor pair enabling retrieval of an item from a feature store""" + + key: str = Field(min_length=1) + """The unique key of an item in a feature store""" + descriptor: str = Field(min_length=1) + """The unique identifier of the feature store containing the key""" class FeatureStore(ABC): diff --git a/smartsim/_core/mli/infrastructure/worker/worker.py b/smartsim/_core/mli/infrastructure/worker/worker.py index 74efec976..984fd10df 100644 --- a/smartsim/_core/mli/infrastructure/worker/worker.py +++ b/smartsim/_core/mli/infrastructure/worker/worker.py @@ -162,14 +162,18 @@ def deserialize_message( channel_type: t.Type[CommChannelBase], ) -> InferenceRequest: """Deserialize a message from a byte stream into an InferenceRequest - :param data_blob: The byte stream to deserialize""" + :param data_blob: The byte stream to deserialize + :param channel_type: Type to be used for callback communications + :returns: The raw input message deserialized into an InferenceRequest + """ request = MessageHandler.deserialize_request(data_blob) model_key: t.Optional[FeatureStoreKey] = None model_bytes: t.Optional[Model] = None if request.model.which() == "key": model_key = FeatureStoreKey( - request.model.key.key, request.model.key.featureStoreDescriptor + key=request.model.key.key, + descriptor=request.model.key.featureStoreDescriptor, ) 
elif request.model.which() == "data": model_bytes = request.model.data @@ -183,7 +187,7 @@ def deserialize_message( if request.input.which() == "keys": input_keys = [ - FeatureStoreKey(value.key, value.featureStoreDescriptor) + FeatureStoreKey(key=value.key, descriptor=value.featureStoreDescriptor) for value in request.input.keys ] elif request.input.which() == "descriptors": @@ -191,7 +195,7 @@ def deserialize_message( if request.output: output_keys = [ - FeatureStoreKey(value.key, value.featureStoreDescriptor) + FeatureStoreKey(key=value.key, descriptor=value.featureStoreDescriptor) for value in request.output ] diff --git a/smartsim/_core/mli/message_handler.py b/smartsim/_core/mli/message_handler.py index bbd74fd28..cb5e56df2 100644 --- a/smartsim/_core/mli/message_handler.py +++ b/smartsim/_core/mli/message_handler.py @@ -99,6 +99,8 @@ def build_tensor_key( Builds a new TensorKey message with the provided key. :param key: String to set the TensorKey + :param feature_store_descriptor: A descriptor identifying the feature store + containing the key :raises ValueError: if building fails """ try: @@ -136,6 +138,8 @@ def build_model_key( Builds a new ModelKey message with the provided key. :param key: String to set the ModelKey + :param feature_store_descriptor: A descriptor identifying the feature store + containing the key :raises ValueError: if building fails """ try: @@ -222,8 +226,10 @@ def _assign_model( elif class_name == "ModelKey": request.model.key = model # type: ignore else: - raise ValueError("""Invalid custom attribute class name. - Expected 'Model' or 'ModelKey'.""") + raise ValueError( + """Invalid custom attribute class name. + Expected 'Model' or 'ModelKey'.""" + ) except Exception as e: raise ValueError("Error building model portion of request.") from e @@ -267,8 +273,10 @@ def _assign_inputs( elif input_class_name == "TensorKey": request.input.keys = inputs # type: ignore else: - raise ValueError("""Invalid input class name. Expected - 'TensorDescriptor' or 'TensorKey'.""") + raise ValueError( + """Invalid input class name. Expected + 'TensorDescriptor' or 'TensorKey'.""" + ) except Exception as e: raise ValueError("Error building inputs portion of request.") from e @@ -337,9 +345,11 @@ def _assign_custom_request_attributes( elif custom_attribute_class_name == "TensorFlowRequestAttributes": request.customAttributes.tf = custom_attrs # type: ignore else: - raise ValueError("""Invalid custom attribute class name. + raise ValueError( + """Invalid custom attribute class name. Expected 'TensorFlowRequestAttributes' or - 'TorchRequestAttributes'.""") + 'TorchRequestAttributes'.""" + ) except Exception as e: raise ValueError( "Error building custom attributes portion of request." @@ -459,8 +469,10 @@ def _assign_result( elif result_class_name == "TensorKey": response.result.keys = result # type: ignore else: - raise ValueError("""Invalid custom attribute class name. - Expected 'TensorDescriptor' or 'TensorKey'.""") + raise ValueError( + """Invalid custom attribute class name. + Expected 'TensorDescriptor' or 'TensorKey'.""" + ) except Exception as e: raise ValueError("Error assigning result to response.") from e @@ -492,9 +504,11 @@ def _assign_custom_response_attributes( elif custom_attribute_class_name == "TensorFlowResponseAttributes": response.customAttributes.tf = custom_attrs # type: ignore else: - raise ValueError("""Invalid custom attribute class name. + raise ValueError( + """Invalid custom attribute class name. 
Expected 'TensorFlowResponseAttributes' or - 'TorchResponseAttributes'.""") + 'TorchResponseAttributes'.""" + ) except Exception as e: raise ValueError("Error assigning custom attributes to response.") from e diff --git a/tests/dragon/test_error_handling.py b/tests/dragon/test_error_handling.py index 151bdd2fc..c929c2341 100644 --- a/tests/dragon/test_error_handling.py +++ b/tests/dragon/test_error_handling.py @@ -84,7 +84,7 @@ def setup_worker_manager_model_bytes(test_dir, monkeypatch: pytest.MonkeyPatch): comm_channel_type=FileSystemCommChannel, ) - tensor_key = MessageHandler.build_tensor_key("key") + tensor_key = MessageHandler.build_tensor_key("key", feature_store.descriptor) model = MessageHandler.build_model(b"model", "model name", "v 0.0.1") request = MessageHandler.build_request( test_dir, model, [tensor_key], [tensor_key], [], None @@ -116,8 +116,8 @@ def setup_worker_manager_model_key(test_dir, monkeypatch: pytest.MonkeyPatch): comm_channel_type=FileSystemCommChannel, ) - tensor_key = MessageHandler.build_tensor_key("key") - model_key = MessageHandler.build_model_key("model key") + tensor_key = MessageHandler.build_tensor_key("key", feature_store.descriptor) + model_key = MessageHandler.build_model_key("model key", feature_store.descriptor) request = MessageHandler.build_request( test_dir, model_key, [tensor_key], [tensor_key], [], None ) diff --git a/tests/mli/test_core_machine_learning_worker.py b/tests/mli/test_core_machine_learning_worker.py index 5e56671c3..c7e1cb286 100644 --- a/tests/mli/test_core_machine_learning_worker.py +++ b/tests/mli/test_core_machine_learning_worker.py @@ -94,7 +94,7 @@ def test_fetch_model_disk(persist_torch_model: pathlib.Path) -> None: fsd = feature_store.descriptor feature_store[str(persist_torch_model)] = persist_torch_model.read_bytes() - request = InferenceRequest(model_key=FeatureStoreKey(key, fsd)) + request = InferenceRequest(model_key=FeatureStoreKey(key=key, descriptor=fsd)) fetch_result = worker.fetch_model(request, {fsd: feature_store}) assert fetch_result.model_bytes @@ -110,7 +110,7 @@ def test_fetch_model_disk_missing() -> None: key = "/path/that/doesnt/exist" - request = InferenceRequest(model_key=FeatureStoreKey(key, fsd)) + request = InferenceRequest(model_key=FeatureStoreKey(key=key, descriptor=fsd)) with pytest.raises(sse.SmartSimError) as ex: worker.fetch_model(request, {fsd: feature_store}) @@ -133,7 +133,9 @@ def test_fetch_model_feature_store(persist_torch_model: pathlib.Path) -> None: fsd = feature_store.descriptor feature_store[key] = persist_torch_model.read_bytes() - request = InferenceRequest(model_key=FeatureStoreKey(key, feature_store.descriptor)) + request = InferenceRequest( + model_key=FeatureStoreKey(key=key, descriptor=feature_store.descriptor) + ) fetch_result = worker.fetch_model(request, {fsd: feature_store}) assert fetch_result.model_bytes assert fetch_result.model_bytes == persist_torch_model.read_bytes() @@ -144,12 +146,12 @@ def test_fetch_model_feature_store_missing() -> None: when given an invalid (feature store) key""" worker = MachineLearningWorkerCore - bad_key = "some-key" + key = "some-key" feature_store = MemoryFeatureStore() fsd = feature_store.descriptor request = InferenceRequest( - model_key=FeatureStoreKey(bad_key, feature_store.descriptor) + model_key=FeatureStoreKey(key=key, descriptor=feature_store.descriptor) ) # todo: consider that raising this exception shows impl. replace... 
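The keyword-only construction used throughout these updated tests follows from the pydantic rewrite of FeatureStoreKey earlier in this patch, where both fields are declared with min_length=1. A minimal sketch of the validation behavior the tests now rely on; the example values are hypothetical and this snippet is illustrative, not part of the diff:

from pydantic import ValidationError

from smartsim._core.mli.infrastructure.storage.featurestore import FeatureStoreKey

# keyword construction succeeds when both fields are non-empty strings
fs_key = FeatureStoreKey(key="tensor-1", descriptor="fs-1")

# an empty key (or descriptor) now fails fast at construction time
# instead of producing a silently invalid key object
try:
    FeatureStoreKey(key="", descriptor="fs-1")
except ValidationError as exc:
    print(exc)  # pydantic reports the min_length violation on `key`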
@@ -157,7 +159,7 @@ def test_fetch_model_feature_store_missing() -> None: worker.fetch_model(request, {fsd: feature_store}) # ensure the error message includes key-identifying information - assert bad_key in ex.value.args[0] + assert key in ex.value.args[0] @pytest.mark.skipif(not torch_available, reason="Torch backend is not installed") @@ -171,7 +173,9 @@ def test_fetch_model_memory(persist_torch_model: pathlib.Path) -> None: fsd = feature_store.descriptor feature_store[key] = persist_torch_model.read_bytes() - request = InferenceRequest(model_key=FeatureStoreKey(key, feature_store.descriptor)) + request = InferenceRequest( + model_key=FeatureStoreKey(key=key, descriptor=feature_store.descriptor) + ) fetch_result = worker.fetch_model(request, {fsd: feature_store}) assert fetch_result.model_bytes @@ -186,7 +190,9 @@ def test_fetch_input_disk(persist_torch_tensor: pathlib.Path) -> None: feature_store = MemoryFeatureStore() fsd = feature_store.descriptor - request = InferenceRequest(input_keys=[FeatureStoreKey(tensor_name, fsd)]) + request = InferenceRequest( + input_keys=[FeatureStoreKey(key=tensor_name, descriptor=fsd)] + ) worker = MachineLearningWorkerCore feature_store[tensor_name] = persist_torch_tensor.read_bytes() @@ -202,9 +208,9 @@ def test_fetch_input_disk_missing() -> None: feature_store = MemoryFeatureStore() fsd = feature_store.descriptor - key = "/path/that/doesnt/exist", fsd + key = "/path/that/doesnt/exist" - request = InferenceRequest(input_keys=[FeatureStoreKey(key, fsd)]) + request = InferenceRequest(input_keys=[FeatureStoreKey(key=key, descriptor=fsd)]) with pytest.raises(sse.SmartSimError) as ex: worker.fetch_inputs(request, {fsd: feature_store}) @@ -223,7 +229,9 @@ def test_fetch_input_feature_store(persist_torch_tensor: pathlib.Path) -> None: feature_store = MemoryFeatureStore() fsd = feature_store.descriptor - request = InferenceRequest(input_keys=[FeatureStoreKey(tensor_name, fsd)]) + request = InferenceRequest( + input_keys=[FeatureStoreKey(key=tensor_name, descriptor=fsd)] + ) # put model bytes into the feature store feature_store[tensor_name] = persist_torch_tensor.read_bytes() @@ -255,9 +263,9 @@ def test_fetch_multi_input_feature_store(persist_torch_tensor: pathlib.Path) -> request = InferenceRequest( input_keys=[ - FeatureStoreKey(tensor_name + "1", fsd), - FeatureStoreKey(tensor_name + "2", fsd), - FeatureStoreKey(tensor_name + "3", fsd), + FeatureStoreKey(key=tensor_name + "1", descriptor=fsd), + FeatureStoreKey(key=tensor_name + "2", descriptor=fsd), + FeatureStoreKey(key=tensor_name + "3", descriptor=fsd), ] ) @@ -275,16 +283,16 @@ def test_fetch_input_feature_store_missing() -> None: when given an invalid (feature store) key""" worker = MachineLearningWorkerCore - bad_key = "some-key" + key = "bad-key" feature_store = MemoryFeatureStore() fsd = feature_store.descriptor - request = InferenceRequest(input_keys=[FeatureStoreKey(bad_key, fsd)]) + request = InferenceRequest(input_keys=[FeatureStoreKey(key=key, descriptor=fsd)]) with pytest.raises(sse.SmartSimError) as ex: worker.fetch_inputs(request, {fsd: feature_store}) # ensure the error message includes key-identifying information - assert bad_key in ex.value.args[0] + assert key in ex.value.args[0] @pytest.mark.skipif(not torch_available, reason="Torch backend is not installed") @@ -295,9 +303,9 @@ def test_fetch_input_memory(persist_torch_tensor: pathlib.Path) -> None: feature_store = MemoryFeatureStore() fsd = feature_store.descriptor - model_name = "test-model" - feature_store[model_name] = 
persist_torch_tensor.read_bytes() - request = InferenceRequest(input_keys=[FeatureStoreKey(model_name, fsd)]) + key = "test-model" + feature_store[key] = persist_torch_tensor.read_bytes() + request = InferenceRequest(input_keys=[FeatureStoreKey(key=key, descriptor=fsd)]) fetch_result = worker.fetch_inputs(request, {fsd: feature_store}) assert fetch_result.inputs is not None @@ -326,9 +334,9 @@ def test_place_outputs() -> None: # create a key to retrieve from the feature store keys = [ - FeatureStoreKey(key_name + "1", fsd), - FeatureStoreKey(key_name + "2", fsd), - FeatureStoreKey(key_name + "3", fsd), + FeatureStoreKey(key=key_name + "1", descriptor=fsd), + FeatureStoreKey(key=key_name + "2", descriptor=fsd), + FeatureStoreKey(key=key_name + "3", descriptor=fsd), ] data = [b"abcdef", b"ghijkl", b"mnopqr"] diff --git a/tests/mli/test_torch_worker.py b/tests/mli/test_torch_worker.py index 29865ac5b..1e8bba7e3 100644 --- a/tests/mli/test_torch_worker.py +++ b/tests/mli/test_torch_worker.py @@ -102,7 +102,7 @@ def get_request() -> InferenceRequest: ] return InferenceRequest( - model_key=FeatureStoreKey("model", ""), + model_key=FeatureStoreKey(key="model", descriptor="xyz"), callback=None, raw_inputs=tensor_numpy, input_keys=None, From 2cedfb336830629c9621dcf837d36c135382c54b Mon Sep 17 00:00:00 2001 From: ankona <3595025+ankona@users.noreply.github.com> Date: Thu, 25 Jul 2024 12:35:41 -0500 Subject: [PATCH 04/49] dynamic fs attachment, add backbone to ML worker mgr & config loader --- ex/high_throughput_inference/mli_driver.py | 21 ++- ex/high_throughput_inference/mock_app.py | 40 ++++-- .../standalone_workermanager.py | 24 ++-- smartsim/_core/mli/comm/channel/dragonfli.py | 12 +- .../infrastructure/control/workermanager.py | 50 ++++++-- .../mli/infrastructure/environmentloader.py | 89 +++++++------ .../storage/dragonfeaturestore.py | 38 +++++- .../_core/mli/infrastructure/worker/worker.py | 4 +- smartsim/_core/mli/message_handler.py | 30 ++--- tests/dragon/featurestore.py | 19 ++- tests/dragon/test_environment_loader.py | 120 ++++++++---------- tests/dragon/test_error_handling.py | 82 ++++++++---- tests/dragon/utils/channel.py | 11 ++ tests/dragon/utils/worker.py | 4 +- tests/mli/channel.py | 13 ++ tests/mli/featurestore.py | 21 ++- tests/mli/test_worker_manager.py | 74 ++++++----- tests/mli/worker.py | 4 +- 18 files changed, 412 insertions(+), 244 deletions(-) diff --git a/ex/high_throughput_inference/mli_driver.py b/ex/high_throughput_inference/mli_driver.py index 6da559aa6..0cf87ef2e 100644 --- a/ex/high_throughput_inference/mli_driver.py +++ b/ex/high_throughput_inference/mli_driver.py @@ -1,5 +1,4 @@ - - +import argparse import os import base64 import cloudpickle @@ -26,11 +25,23 @@ torch_worker_str = base64.b64encode(cloudpickle.dumps(TorchWorker)).decode("ascii") -worker_manager_rs = exp.create_run_settings(sys.executable, [worker_manager_script_name, "--device", device, "--worker_class", torch_worker_str]) +worker_manager_rs = exp.create_run_settings( + sys.executable, + [ + worker_manager_script_name, + "--device", + device, + "--worker_class", + torch_worker_str, + ], +) worker_manager = exp.create_model("worker_manager", run_settings=worker_manager_rs) worker_manager.attach_generator_files(to_copy=[worker_manager_script_name]) -app_rs = exp.create_run_settings(sys.executable, exe_args = [app_script_name, "--device", device]) +app_rs = exp.create_run_settings( + sys.executable, + exe_args=[app_script_name, "--device", device], +) app = exp.create_model("app", run_settings=app_rs) 
app.attach_generator_files(to_copy=[app_script_name], to_symlink=[model_name]) @@ -47,4 +58,4 @@ break time.sleep(5) -print("Exiting.") \ No newline at end of file +print("Exiting.") diff --git a/ex/high_throughput_inference/mock_app.py b/ex/high_throughput_inference/mock_app.py index 236fab419..e34b2676a 100644 --- a/ex/high_throughput_inference/mock_app.py +++ b/ex/high_throughput_inference/mock_app.py @@ -44,16 +44,21 @@ import numbers from collections import OrderedDict +from smartsim._core.mli.infrastructure.storage.dragonfeaturestore import ( + DragonFeatureStore, +) from smartsim._core.mli.message_handler import MessageHandler from smartsim.log import get_logger logger = get_logger("App") + class ProtoClient: def __init__(self, timing_on: bool): connect_to_infrastructure() ddict_str = os.environ["SS_DRG_DDICT"] self._ddict = DDict.attach(ddict_str) + self._backbone_descriptor = DragonFeatureStore(self._ddict).descriptor to_worker_fli_str = None while to_worker_fli_str is None: try: @@ -88,17 +93,23 @@ def start_timings(self, batch_size: int): def end_timings(self): if self._timing_on: self._add_label_to_timings("total_time") - self._timings["total_time"].append(self._format_number(time.perf_counter()-self._start)) + self._timings["total_time"].append( + self._format_number(time.perf_counter() - self._start) + ) def measure_time(self, label: str): if self._timing_on: self._add_label_to_timings(label) - self._timings[label].append(self._format_number(time.perf_counter()-self._interm)) + self._timings[label].append( + self._format_number(time.perf_counter() - self._interm) + ) self._interm = time.perf_counter() def print_timings(self, to_file: bool = False): print(" ".join(self._timings.keys())) - value_array = numpy.array([value for value in self._timings.values()], dtype=float) + value_array = numpy.array( + [value for value in self._timings.values()], dtype=float + ) value_array = numpy.transpose(value_array) for i in range(value_array.shape[0]): print(" ".join(self._format_number(value) for value in value_array[i])) @@ -106,21 +117,21 @@ def print_timings(self, to_file: bool = False): numpy.save("timings.npy", value_array) numpy.savetxt("timings.txt", value_array) - def run_model(self, model: bytes | str, batch: torch.Tensor): tensors = [batch.numpy()] self.start_timings(batch.shape[0]) built_tensor_desc = MessageHandler.build_tensor_descriptor( - "c", "float32", list(batch.shape)) + "c", "float32", list(batch.shape) + ) self.measure_time("build_tensor_descriptor") built_model = None if isinstance(model, str): - model_arg = MessageHandler.build_model_key(model) # todo: this needs FSD + model_arg = MessageHandler.build_model_key(model, self._backbone_descriptor) else: model_arg = MessageHandler.build_model(model, "resnet-50", "1.0") request = MessageHandler.build_request( reply_channel=self._from_worker_ch_serialized, - model= model_arg, + model=model_arg, inputs=[built_tensor_desc], outputs=[], output_descriptors=[], @@ -129,10 +140,12 @@ def run_model(self, model: bytes | str, batch: torch.Tensor): self.measure_time("build_request") request_bytes = MessageHandler.serialize_request(request) self.measure_time("serialize_request") - with self._to_worker_fli.sendh(timeout=None, stream_channel=self._to_worker_ch) as to_sendh: + with self._to_worker_fli.sendh( + timeout=None, stream_channel=self._to_worker_ch + ) as to_sendh: to_sendh.send_bytes(request_bytes) for t in tensors: - to_sendh.send_bytes(t.tobytes()) #TODO NOT FAST ENOUGH!!! 
+ to_sendh.send_bytes(t.tobytes()) # TODO NOT FAST ENOUGH!!! # to_sendh.send_bytes(bytes(t.data)) logger.info(f"Message size: {len(request_bytes)} bytes") @@ -159,7 +172,7 @@ def set_model(self, key: str, model: bytes): self._ddict[key] = model -class ResNetWrapper(): +class ResNetWrapper: def __init__(self, name: str, model: str): self._model = torch.jit.load(model) self._name = name @@ -168,7 +181,7 @@ def __init__(self, name: str, model: str): torch.jit.save(scripted, buffer) self._serialized_model = buffer.getvalue() - def get_batch(self, batch_size: int=32): + def get_batch(self, batch_size: int = 32): return torch.randn((batch_size, 3, 224, 224), dtype=torch.float32) @property @@ -179,6 +192,7 @@ def model(self): def name(self): return self._name + if __name__ == "__main__": parser = argparse.ArgumentParser("Mock application") @@ -194,8 +208,8 @@ def name(self): for batch_size in [1, 2, 4, 8, 16, 32, 64, 128]: logger.info(f"Batch size: {batch_size}") - for iteration_number in range(total_iterations + int(batch_size==1)): + for iteration_number in range(total_iterations + int(batch_size == 1)): logger.info(f"Iteration: {iteration_number}") client.run_model(resnet.name, resnet.get_batch(batch_size)) - client.print_timings(to_file=True) \ No newline at end of file + client.print_timings(to_file=True) diff --git a/ex/high_throughput_inference/standalone_workermanager.py b/ex/high_throughput_inference/standalone_workermanager.py index c56e11a7c..466d2d669 100644 --- a/ex/high_throughput_inference/standalone_workermanager.py +++ b/ex/high_throughput_inference/standalone_workermanager.py @@ -31,17 +31,19 @@ from dragon.data.ddict.ddict import DDict from dragon.utils import b64decode, b64encode from dragon.globalservices.api_setup import connect_to_infrastructure + # isort: on import argparse import base64 import cloudpickle -import pickle +import optparse import os from smartsim._core.mli.comm.channel.dragonchannel import DragonCommChannel -from smartsim._core.mli.infrastructure.storage.dragonfeaturestore import DragonFeatureStore from smartsim._core.mli.comm.channel.dragonfli import DragonFLIChannel -from smartsim._core.mli.infrastructure.worker.torch_worker import TorchWorker +from smartsim._core.mli.infrastructure.storage.dragonfeaturestore import ( + DragonFeatureStore, +) from smartsim._core.mli.infrastructure.control.workermanager import WorkerManager from smartsim._core.mli.infrastructure.environmentloader import EnvironmentConfigLoader @@ -75,22 +77,22 @@ to_worker_fli_serialized = to_worker_fli.serialize() ddict["to_worker_fli"] = to_worker_fli_serialized - torch_worker = cloudpickle.loads(base64.b64decode(args.worker_class.encode('ascii')))() - - dfs = DragonFeatureStore(ddict) - comm_channel = DragonFLIChannel(to_worker_fli_serialized) + worker_type_name = base64.b64decode(args.worker_class.encode("ascii")) + torch_worker = cloudpickle.loads(worker_type_name)() - os.environ["SSFeatureStore"] = base64.b64encode(pickle.dumps(dfs)).decode("utf-8") os.environ["SSQueue"] = base64.b64encode(to_worker_fli_serialized).decode("utf-8") - config_loader = EnvironmentConfigLoader() + config_loader = EnvironmentConfigLoader( + featurestore_factory=DragonFeatureStore.from_descriptor, + callback_factory=DragonCommChannel, + queue_factory=DragonFLIChannel.from_descriptor, + ) worker_manager = WorkerManager( config_loader=config_loader, worker=torch_worker, as_service=True, cooldown=10, - comm_channel_type=DragonCommChannel, - device = args.device, + device=args.device, ) worker_manager.execute() 
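Before the channel changes below: the entrypoint above now publishes only descriptors (the FLI via SSQueue, the backbone dragon dictionary via SS_DRG_DDICT) and hands EnvironmentConfigLoader the factory callables needed to rehydrate them. A minimal sketch of that factory contract, using only names that appear in this patch and assuming a dragon environment where both variables are set; the loader internals are elided:

from smartsim._core.mli.comm.channel.dragonchannel import DragonCommChannel
from smartsim._core.mli.comm.channel.dragonfli import DragonFLIChannel
from smartsim._core.mli.infrastructure.environmentloader import EnvironmentConfigLoader
from smartsim._core.mli.infrastructure.storage.dragonfeaturestore import (
    DragonFeatureStore,
)

# each factory maps a serialized descriptor to a live attachment:
#   featurestore_factory(descriptor: str) -> FeatureStore
#   callback_factory(descriptor: bytes)   -> CommChannelBase
#   queue_factory(descriptor: str)        -> CommChannelBase
config_loader = EnvironmentConfigLoader(
    featurestore_factory=DragonFeatureStore.from_descriptor,
    callback_factory=DragonCommChannel,
    queue_factory=DragonFLIChannel.from_descriptor,
)

backbone = config_loader.get_backbone()  # attaches via SS_DRG_DDICT
task_queue = config_loader.get_queue()   # attaches via SSQueue

This mirrors the design choice visible in the diff: the worker manager no longer takes a comm_channel_type argument and instead pulls the callback factory off the loader, so descriptors, not pickled objects, cross the environment boundary.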
diff --git a/smartsim/_core/mli/comm/channel/dragonfli.py b/smartsim/_core/mli/comm/channel/dragonfli.py index 28b4c2bf3..503c17ad3 100644 --- a/smartsim/_core/mli/comm/channel/dragonfli.py +++ b/smartsim/_core/mli/comm/channel/dragonfli.py @@ -30,7 +30,7 @@ # isort: on -import sys +import base64 import typing as t import smartsim._core.mli.comm.channel.channel as cch @@ -70,3 +70,13 @@ def recv(self) -> t.List[bytes]: except fli.FLIEOT as exc: eot = True return messages + + @classmethod + def from_descriptor( + cls, + descriptor: str, + ) -> "DragonFLIChannel": + return DragonFLIChannel( + fli_desc=base64.b64decode(descriptor), + sender_supplied=True, + ) diff --git a/smartsim/_core/mli/infrastructure/control/workermanager.py b/smartsim/_core/mli/infrastructure/control/workermanager.py index e34f711f5..b7e409e46 100644 --- a/smartsim/_core/mli/infrastructure/control/workermanager.py +++ b/smartsim/_core/mli/infrastructure/control/workermanager.py @@ -24,11 +24,11 @@ # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -import sys import time import typing as t -from .....error import SmartSimError +from smartsim._core.mli.infrastructure.storage.featurestore import FeatureStore + from .....log import get_logger from ....entrypoints.service import Service from ...comm.channel.channel import CommChannelBase @@ -41,14 +41,15 @@ MachineLearningWorkerBase, ) from ...message_handler import MessageHandler -from ...mli_schemas.response.response_capnp import Response, ResponseBuilder +from ...mli_schemas.response.response_capnp import ResponseBuilder if t.TYPE_CHECKING: from dragon.fli import FLInterface - from smartsim._core.mli.mli_schemas.model.model_capnp import Model + # from smartsim._core.mli.mli_schemas.model.model_capnp import Model from smartsim._core.mli.mli_schemas.response.response_capnp import Status - from smartsim._core.mli.mli_schemas.tensor.tensor_capnp import TensorDescriptor + + # from smartsim._core.mli.mli_schemas.tensor.tensor_capnp import TensorDescriptor logger = get_logger(__name__) @@ -95,9 +96,10 @@ def __init__( self, config_loader: EnvironmentConfigLoader, worker: MachineLearningWorkerBase, + # fs_factory: t.Callable[[str], FeatureStore], as_service: bool = False, cooldown: int = 0, - comm_channel_type: t.Type[CommChannelBase] = DragonCommChannel, + # comm_channel_type: t.Type[CommChannelBase] = DragonCommChannel, device: t.Literal["cpu", "gpu"] = "cpu", ) -> None: """Initialize the WorkerManager @@ -115,14 +117,18 @@ def __init__( """the queue the manager monitors for new tasks""" self._worker = worker """The ML Worker implementation""" - self._comm_channel_type = comm_channel_type + self._callback_factory = config_loader._callback_factory """The type of communication channel to construct for callbacks""" self._device = device """Device on which workers need to run""" self._cached_models: dict[str, t.Any] = {} """Dictionary of previously loaded models""" - self._feature_stores = config_loader.get_feature_stores() + self._feature_stores: t.Dict[str, FeatureStore] = {} """A collection of attached feature stores""" + self._fs_factory = config_loader._featurestore_factory + """A factory method to create a desired feature store client type""" + self._backbone: t.Optional[FeatureStore] = config_loader.get_backbone() + """The backbone feature store""" def _check_feature_stores(self, request: InferenceRequest) -> bool: """Ensures that all feature stores required by the request 
are available @@ -139,11 +145,17 @@ def _check_feature_stores(self, request: InferenceRequest) -> bool: fs_actual = {item.descriptor for item in self._feature_stores.values()} fs_missing = fs_desired - fs_actual - # exit if all desired feature stores are not available - if fs_missing: - logger.error(f"Missing feature store(s): {fs_missing}") + if self._fs_factory is None: + logger.warning("No feature store factory configured") return False + # create the feature stores we need to service request + if fs_missing: + logger.info(f"Missing feature store(s): {fs_missing}") + for descriptor in fs_missing: + feature_store = self._fs_factory(descriptor) + self._feature_stores[descriptor] = feature_store + return True def _check_model(self, request: InferenceRequest) -> bool: @@ -212,7 +224,7 @@ def _on_iteration(self) -> None: interm = time.perf_counter() # timing request = self._worker.deserialize_message( - request_bytes, self._comm_channel_type + request_bytes, self._callback_factory ) if request.input_meta and tensor_bytes_list: @@ -234,6 +246,12 @@ def _on_iteration(self) -> None: "Could not find model key or model.", ) return + + # if request.model_key.descriptor not in self._feature_stores: + # self._fs_factory(request.model_key.descriptor) + # todo: decide if we should load here or in _check_feature_stores. + # todo: should i raise error here? + if request.model_key.key in self._cached_models: timings.append(time.perf_counter() - interm) # timing interm = time.perf_counter() # timing @@ -265,7 +283,9 @@ def _on_iteration(self) -> None: self._cached_models[request.model_key.key] = model_result.model except Exception as e: exception_handler( - e, request.callback, "Failed while loading the model." + e, + request.callback, + "Failed while loading model from feature store.", ) return @@ -290,7 +310,9 @@ def _on_iteration(self) -> None: ) except Exception as e: exception_handler( - e, request.callback, "Failed while loading the model." + e, + request.callback, + "Failed while loading model from feature store.", ) return diff --git a/smartsim/_core/mli/infrastructure/environmentloader.py b/smartsim/_core/mli/infrastructure/environmentloader.py index 983afc00c..ec38a56dd 100644 --- a/smartsim/_core/mli/infrastructure/environmentloader.py +++ b/smartsim/_core/mli/infrastructure/environmentloader.py @@ -24,16 +24,11 @@ # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -import base64 import os -import pickle import typing as t -from dragon.fli import FLInterface # pylint: disable=all - -from smartsim._core.mli.comm.channel.dragonfli import DragonFLIChannel +from smartsim._core.mli.comm.channel.channel import CommChannelBase from smartsim._core.mli.infrastructure.storage.featurestore import FeatureStore -from smartsim.error.errors import SmartSimError from smartsim.log import get_logger logger = get_logger(__name__) @@ -45,45 +40,55 @@ class EnvironmentConfigLoader: into the WorkerManager. 
""" - def __init__(self) -> None: + def __init__( + self, + featurestore_factory: t.Callable[[str], FeatureStore], + callback_factory: t.Callable[[bytes], CommChannelBase], + queue_factory: t.Callable[[str], CommChannelBase], + ) -> None: self._queue_descriptor: t.Optional[str] = os.getenv("SSQueue", None) - self.feature_stores: t.Optional[t.Dict[str, FeatureStore]] = None - self.queue: t.Optional[DragonFLIChannel] = None - self._feature_store_prefix = "SSFeatureStore" - - def _load_feature_store(self, env_var: str) -> FeatureStore: - """Load a feature store from a descriptor - :param descriptor: The descriptor of the feature store - :returns: The hydrated feature store""" - logger.debug(f"Loading feature store from env: {env_var}") + """The descriptor used to attach to the incoming event queue""" + self.queue: t.Optional[CommChannelBase] = None + """The attached incoming event queue channel""" + self._backbone_descriptor: t.Optional[str] = os.getenv("SS_DRG_DDICT", None) + """The descriptor used to attach to the backbone feature store""" + self.backbone: t.Optional[FeatureStore] = None + """The attached backbone feature store""" + self._featurestore_factory = featurestore_factory + """A factory method to instantiate a FeatureStore""" + self._callback_factory = callback_factory + """A factory method to instantiate a concrete CommChannelBase + for inference callbacks""" + self._queue_factory = queue_factory + """A factory method to instantiate a concrete CommChannelBase + for inference requests""" - value = os.getenv(env_var) - if not value: - raise SmartSimError( - f"Empty feature store descriptor in environment: {env_var}" - ) + def get_backbone(self) -> t.Optional[FeatureStore]: + """Create the backbone feature store using the descriptor found in + an environment variable""" + descriptor = self._backbone_descriptor or os.getenv("SS_DRG_DDICT", None) + if self._featurestore_factory is None: + logger.warning("No feature store factory is configured") + return None - try: - return t.cast(FeatureStore, pickle.loads(base64.b64decode(value))) - except: - raise SmartSimError( - f"Invalid feature store descriptor in environment: {env_var}" - ) + if descriptor is not None: + self.backbone = self._featurestore_factory(descriptor) + self._backbone_descriptor = descriptor + return self.backbone - def get_feature_stores(self) -> t.Dict[str, FeatureStore]: - """Loads multiple Feature Stores by scanning environment for variables - prefixed with `SSFeatureStore`""" - if not self.feature_stores: - env_vars = [var for var in os.environ if var.startswith(self._feature_store_prefix)] - stores = [self._load_feature_store(var) for var in env_vars] - self.feature_stores = {fs.descriptor: fs for fs in stores} - return self.feature_stores - - def get_queue(self, sender_supplied: bool = True) -> t.Optional[DragonFLIChannel]: + def get_queue(self) -> t.Optional[CommChannelBase]: """Returns the Queue previously set in SSQueue""" - if self._queue_descriptor is not None: - self.queue = DragonFLIChannel( - fli_desc=base64.b64decode(self._queue_descriptor), - sender_supplied=sender_supplied, - ) + descriptor = self._queue_descriptor or os.getenv("SSQueue", None) + if self._queue_factory is None: + logger.warning("No queue factory is configured") + return None + + if descriptor is not None: + # , sender_supplied: bool = True + # self.queue = DragonFLIChannel( + # fli_desc=base64.b64decode(descriptor), + # sender_supplied=sender_supplied, + # ) + self.queue = self._queue_factory(descriptor) + self._queue_descriptor = 
descriptor return self.queue diff --git a/smartsim/_core/mli/infrastructure/storage/dragonfeaturestore.py b/smartsim/_core/mli/infrastructure/storage/dragonfeaturestore.py index d5344198a..213d29cf4 100644 --- a/smartsim/_core/mli/infrastructure/storage/dragonfeaturestore.py +++ b/smartsim/_core/mli/infrastructure/storage/dragonfeaturestore.py @@ -26,13 +26,15 @@ import typing as t -import smartsim.error as sse -from smartsim._core.mli.infrastructure.storage.featurestore import FeatureStore -from smartsim.log import get_logger +# pylint: disable=import-error +# isort: off +import dragon.data.ddict.ddict as dragon_ddict -if t.TYPE_CHECKING: - from dragon.data.ddict.ddict import DDict +# isort: on +from smartsim._core.mli.infrastructure.storage.featurestore import FeatureStore +from smartsim.error import SmartSimError +from smartsim.log import get_logger logger = get_logger(__name__) @@ -40,7 +42,7 @@ class DragonFeatureStore(FeatureStore): """A feature store backed by a dragon distributed dictionary""" - def __init__(self, storage: "DDict") -> None: + def __init__(self, storage: "dragon_ddict.DDict") -> None: """Initialize the DragonFeatureStore instance""" self._storage = storage @@ -54,7 +56,7 @@ def __getitem__(self, key: str) -> t.Union[str, bytes]: raise ex except Exception as ex: # note: explicitly avoid round-trip to check for key existence - raise sse.SmartSimError( + raise SmartSimError( f"Could not get value for existing key {key}, error:\n{ex}" ) from ex @@ -76,3 +78,25 @@ def descriptor(self) -> str: the feature store :returns: A descriptor encoded as a string""" return str(self._storage.serialize()) + + @classmethod + def from_descriptor( + cls, + descriptor: str, + # b64encoded: bool = False, + ) -> "DragonFeatureStore": + # import dragon.data.ddict.ddict as dragon_ddict # pylint: disable=import-outside-toplevel + + # # if b64encoded: + # # descriptor = base64.b64decode(descriptor).encode("utf-8") + # # ddict = DDict.attach(descriptor) + # # ddict.attach(descriptor) + + # storage = dragon_ddict.DDict() + # storage.attach(descriptor) + # return DragonFeatureStore(storage) + + if descriptor is None: + print("foo") + return None + return DragonFeatureStore({"tmp": "here"}) diff --git a/smartsim/_core/mli/infrastructure/worker/worker.py b/smartsim/_core/mli/infrastructure/worker/worker.py index 984fd10df..f1d0775f0 100644 --- a/smartsim/_core/mli/infrastructure/worker/worker.py +++ b/smartsim/_core/mli/infrastructure/worker/worker.py @@ -159,7 +159,7 @@ class MachineLearningWorkerCore: @staticmethod def deserialize_message( data_blob: bytes, - channel_type: t.Type[CommChannelBase], + callback_factory: t.Callable[[bytes], CommChannelBase], ) -> InferenceRequest: """Deserialize a message from a byte stream into an InferenceRequest :param data_blob: The byte stream to deserialize @@ -179,7 +179,7 @@ def deserialize_message( model_bytes = request.model.data callback_key = request.replyChannel.descriptor - comm_channel = channel_type(callback_key) + comm_channel = callback_factory(callback_key) input_keys: t.Optional[t.List[FeatureStoreKey]] = None input_bytes: t.Optional[t.List[bytes]] = None output_keys: t.Optional[t.List[FeatureStoreKey]] = None diff --git a/smartsim/_core/mli/message_handler.py b/smartsim/_core/mli/message_handler.py index cb5e56df2..ee632e24e 100644 --- a/smartsim/_core/mli/message_handler.py +++ b/smartsim/_core/mli/message_handler.py @@ -226,10 +226,8 @@ def _assign_model( elif class_name == "ModelKey": request.model.key = model # type: ignore else: - raise 
ValueError( - """Invalid custom attribute class name. - Expected 'Model' or 'ModelKey'.""" - ) + raise ValueError("""Invalid custom attribute class name. + Expected 'Model' or 'ModelKey'.""") except Exception as e: raise ValueError("Error building model portion of request.") from e @@ -273,10 +271,8 @@ def _assign_inputs( elif input_class_name == "TensorKey": request.input.keys = inputs # type: ignore else: - raise ValueError( - """Invalid input class name. Expected - 'TensorDescriptor' or 'TensorKey'.""" - ) + raise ValueError("""Invalid input class name. Expected + 'TensorDescriptor' or 'TensorKey'.""") except Exception as e: raise ValueError("Error building inputs portion of request.") from e @@ -345,11 +341,9 @@ def _assign_custom_request_attributes( elif custom_attribute_class_name == "TensorFlowRequestAttributes": request.customAttributes.tf = custom_attrs # type: ignore else: - raise ValueError( - """Invalid custom attribute class name. + raise ValueError("""Invalid custom attribute class name. Expected 'TensorFlowRequestAttributes' or - 'TorchRequestAttributes'.""" - ) + 'TorchRequestAttributes'.""") except Exception as e: raise ValueError( "Error building custom attributes portion of request." @@ -469,10 +463,8 @@ def _assign_result( elif result_class_name == "TensorKey": response.result.keys = result # type: ignore else: - raise ValueError( - """Invalid custom attribute class name. - Expected 'TensorDescriptor' or 'TensorKey'.""" - ) + raise ValueError("""Invalid custom attribute class name. + Expected 'TensorDescriptor' or 'TensorKey'.""") except Exception as e: raise ValueError("Error assigning result to response.") from e @@ -504,11 +496,9 @@ def _assign_custom_response_attributes( elif custom_attribute_class_name == "TensorFlowResponseAttributes": response.customAttributes.tf = custom_attrs # type: ignore else: - raise ValueError( - """Invalid custom attribute class name. + raise ValueError("""Invalid custom attribute class name. Expected 'TensorFlowResponseAttributes' or - 'TorchResponseAttributes'.""" - ) + 'TorchResponseAttributes'.""") except Exception as e: raise ValueError("Error assigning custom attributes to response.") from e diff --git a/tests/dragon/featurestore.py b/tests/dragon/featurestore.py index f9d4a1da2..a249620fb 100644 --- a/tests/dragon/featurestore.py +++ b/tests/dragon/featurestore.py @@ -115,7 +115,24 @@ def descriptor(self) -> str: """Return a unique identifier enabling a client to connect to the feature store :returns: A descriptor encoded as a string""" - return "in-memory-fs" + if not self._storage_dir: + raise ValueError("No storage path configured") + return self._storage_dir.as_posix() + + @classmethod + def from_descriptor( + cls, + descriptor: str, + # b64encoded: bool = False, + ) -> "FileSystemFeatureStore": + # if b64encoded: + # descriptor = base64.b64decode(descriptor).encode("utf-8") + path = pathlib.Path(descriptor) + if not path.is_dir(): + raise ValueError("FileSystemFeatureStore requires a directory path") + if not path.exists(): + path.mkdir(parents=True, exist_ok=True) + return FileSystemFeatureStore(path) class DragonDict: diff --git a/tests/dragon/test_environment_loader.py b/tests/dragon/test_environment_loader.py index d4adb3587..12893d3b2 100644 --- a/tests/dragon/test_environment_loader.py +++ b/tests/dragon/test_environment_loader.py @@ -24,15 +24,11 @@ # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
-import base64 -import os -import pickle -import typing as t +import pathlib import pytest -from smartsim._core.mli.infrastructure.storage.featurestore import FeatureStore -from smartsim.error.errors import SmartSimError +from tests.mli.channel import FileSystemCommChannel dragon = pytest.importorskip("dragon") @@ -46,7 +42,7 @@ DragonFeatureStore, ) -from .featurestore import FileSystemFeatureStore, MemoryFeatureStore +from .featurestore import FileSystemFeatureStore # The tests in this file belong to the dragon group pytestmark = pytest.mark.dragon @@ -59,13 +55,17 @@ pytest.param(b"new byte string"), ], ) -def test_environment_loader_attach_FLI(content, monkeypatch): +def test_environment_loader_attach_FLI(content: bytes, monkeypatch: pytest.MonkeyPatch): """A descriptor can be stored, loaded, and reattached""" chan = Channel.make_process_local() queue = FLInterface(main_ch=chan) monkeypatch.setenv("SSQueue", du.B64.bytes_to_str(queue.serialize())) - config = EnvironmentConfigLoader() + config = EnvironmentConfigLoader( + featurestore_factory=DragonFeatureStore.from_descriptor, + callback_factory=FileSystemCommChannel.from_descriptor, + queue_factory=FileSystemCommChannel.from_descriptor, + ) config_queue = config.get_queue() new_sender = config_queue.send(content) @@ -75,92 +75,78 @@ def test_environment_loader_attach_FLI(content, monkeypatch): assert result == content -def test_environment_loader_serialize_FLI(monkeypatch): +def test_environment_loader_serialize_FLI(monkeypatch: pytest.MonkeyPatch): """The serialized descriptors of a loaded and unloaded queue are the same""" chan = Channel.make_process_local() queue = FLInterface(main_ch=chan) monkeypatch.setenv("SSQueue", du.B64.bytes_to_str(queue.serialize())) - config = EnvironmentConfigLoader() + config = EnvironmentConfigLoader( + featurestore_factory=DragonFeatureStore.from_descriptor, + callback_factory=FileSystemCommChannel.from_descriptor, + queue_factory=FileSystemCommChannel.from_descriptor, + ) config_queue = config.get_queue() assert config_queue._fli.serialize() == queue.serialize() -def test_environment_loader_FLI_fails(monkeypatch): +def test_environment_loader_FLI_fails(monkeypatch: pytest.MonkeyPatch): """An incorrect serialized descriptor will fails to attach""" monkeypatch.setenv("SSQueue", "randomstring") - config = EnvironmentConfigLoader() + config = EnvironmentConfigLoader( + featurestore_factory=DragonFeatureStore.from_descriptor, + callback_factory=FileSystemCommChannel.from_descriptor, + queue_factory=FileSystemCommChannel.from_descriptor, + ) with pytest.raises(DragonFLIError): config_queue = config.get_queue() -@pytest.mark.parametrize( - "feature_stores", - [ - pytest.param([], id="No feature stores"), - pytest.param([MemoryFeatureStore()], id="Single feature store"), - pytest.param( - [MemoryFeatureStore(), FileSystemFeatureStore()], - id="Multiple feature stores", - ), - ], -) -def test_environment_loader_featurestores( - feature_stores: t.List[FeatureStore], monkeypatch: pytest.MonkeyPatch +def test_environment_loader_backbone_load_fs( + monkeypatch: pytest.MonkeyPatch, test_dir: str ): - """FeatureStore can be correctly identified, serialized and deserialized""" - with monkeypatch.context() as m: - for fs in feature_stores: - value = base64.b64encode(pickle.dumps(fs)).decode("utf-8") - key = f"SSFeatureStore.{fs.descriptor}" - m.setenv(key, value) - - config = EnvironmentConfigLoader() - actual_feature_stores = config.get_feature_stores() + """Verify the file system feature store is loaded correctly 
by + the EnvironmentConfigLoader to demonstrate fs_factory correctness""" + fs = FileSystemFeatureStore(pathlib.Path(test_dir)) + monkeypatch.setenv("SS_DRG_DDICT", fs.descriptor) - for fs in feature_stores: - # Confirm that the descriptors were used as keys in the loaded feature stores - assert fs.descriptor in actual_feature_stores + config = EnvironmentConfigLoader( + featurestore_factory=DragonFeatureStore.from_descriptor, + callback_factory=FileSystemCommChannel.from_descriptor, + queue_factory=FileSystemCommChannel.from_descriptor, + ) - # Confirm that the value loaded from env var is a FeatureStore - # and it is consistent w/the key identifying it - loaded_fs = actual_feature_stores[fs.descriptor] - assert loaded_fs.descriptor == fs.descriptor + backbone = config.get_backbone() + assert backbone is not None -@pytest.mark.parametrize( - "value_to_use,error_filter", - [ - pytest.param("", "empty", id="Empty value"), - pytest.param("abcd", "invalid", id="Incorrectly serialized value"), - ], -) -def test_environment_loader_featurestores_errors( - value_to_use: str, error_filter: str, monkeypatch: pytest.MonkeyPatch +def test_environment_loader_backbone_load_dfs( + monkeypatch: pytest.MonkeyPatch, test_dir: str ): - """Verify that the environment loader reports an error when a feature store - env var is populated with something that cannot be loaded properly""" - - fs = FileSystemFeatureStore() # just use for descriptor... - key = f"SSFeatureStore.{fs.descriptor}" - - with monkeypatch.context() as m, pytest.raises(SmartSimError) as ex: - m.setenv(key, value_to_use) # <----- simulate incorrect value in env var + """Verify the dragon feature store is loaded correctly by + the EnvironmentConfigLoader to demonstrate fs_factory correctness""" + fs = DragonFeatureStore(DDict()) + monkeypatch.setenv("SS_DRG_DDICT", fs.descriptor) - config = EnvironmentConfigLoader() - config.get_feature_stores() # <---- kick off validation + config = EnvironmentConfigLoader( + featurestore_factory=DragonFeatureStore.from_descriptor, + callback_factory=FileSystemCommChannel.from_descriptor, + queue_factory=FileSystemCommChannel.from_descriptor, + ) - # confirm the specific key is reported in error message - assert key in ex.value.args[0] - # ensure the failure occurred during loading - assert error_filter in ex.value.args[0].lower() + backbone = config.get_backbone() + assert backbone is not None def test_environment_variables_not_set(): """EnvironmentConfigLoader getters return None when environment variables are not set""" - config = EnvironmentConfigLoader() - assert config.get_feature_stores() == {} + config = EnvironmentConfigLoader( + featurestore_factory=DragonFeatureStore.from_descriptor, + callback_factory=FileSystemCommChannel.from_descriptor, + queue_factory=FileSystemCommChannel.from_descriptor, + ) + assert config.get_backbone() == None assert config.get_queue() == None diff --git a/tests/dragon/test_error_handling.py b/tests/dragon/test_error_handling.py index c929c2341..3231b4af2 100644 --- a/tests/dragon/test_error_handling.py +++ b/tests/dragon/test_error_handling.py @@ -24,12 +24,12 @@ # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
-import base64 -import pickle from unittest.mock import MagicMock import pytest +from smartsim._core.mli.comm.channel.dragonfli import DragonFLIChannel + dragon = pytest.importorskip("dragon") import dragon.utils as du @@ -45,6 +45,7 @@ from smartsim._core.mli.infrastructure.storage.dragonfeaturestore import ( DragonFeatureStore, ) +from smartsim._core.mli.infrastructure.storage.featurestore import FeatureStore from smartsim._core.mli.infrastructure.worker.worker import ( ExecuteResult, FetchInputResult, @@ -64,30 +65,51 @@ @pytest.fixture -def setup_worker_manager_model_bytes(test_dir, monkeypatch: pytest.MonkeyPatch): +def backbone_descriptor() -> str: + # create a shared backbone featurestore + feature_store = DragonFeatureStore(DDict()) + return feature_store.descriptor + + +@pytest.fixture +def app_feature_store() -> FeatureStore: + # create a standalone feature store to mimic a user application putting + # data into an application-owned resource (app should not access backbone) + app_fs = DragonFeatureStore(DDict()) + return app_fs + + +@pytest.fixture +def setup_worker_manager_model_bytes( + test_dir, + monkeypatch: pytest.MonkeyPatch, + backbone_descriptor: str, + app_feature_store: FeatureStore, +): integrated_worker = IntegratedTorchWorker() chan = Channel.make_process_local() queue = FLInterface(main_ch=chan) monkeypatch.setenv("SSQueue", du.B64.bytes_to_str(queue.serialize())) - storage = DDict() - feature_store = DragonFeatureStore(storage) - monkeypatch.setenv( - "SSFeatureStore", base64.b64encode(pickle.dumps(feature_store)).decode("utf-8") - ) + # Put backbone descriptor into env var for the `EnvironmentConfigLoader` + monkeypatch.setenv("SS_DRG_DDICT", backbone_descriptor) worker_manager = WorkerManager( - EnvironmentConfigLoader(), + EnvironmentConfigLoader( + featurestore_factory=DragonFeatureStore.from_descriptor, + callback_factory=FileSystemCommChannel.from_descriptor, + queue_factory=DragonFLIChannel.from_descriptor, + ), integrated_worker, as_service=False, cooldown=3, - comm_channel_type=FileSystemCommChannel, ) - tensor_key = MessageHandler.build_tensor_key("key", feature_store.descriptor) + tensor_key = MessageHandler.build_tensor_key("key", app_feature_store.descriptor) + output_key = MessageHandler.build_tensor_key("key", f"{test_dir}/out") model = MessageHandler.build_model(b"model", "model name", "v 0.0.1") request = MessageHandler.build_request( - test_dir, model, [tensor_key], [tensor_key], [], None + test_dir, model, [tensor_key], [output_key], [], None ) ser_request = MessageHandler.serialize_request(request) worker_manager._task_queue.send(ser_request) @@ -96,30 +118,38 @@ def setup_worker_manager_model_bytes(test_dir, monkeypatch: pytest.MonkeyPatch): @pytest.fixture -def setup_worker_manager_model_key(test_dir, monkeypatch: pytest.MonkeyPatch): +def setup_worker_manager_model_key( + test_dir: str, + monkeypatch: pytest.MonkeyPatch, + backbone_descriptor: str, + app_feature_store: FeatureStore, +): integrated_worker = IntegratedTorchWorker() chan = Channel.make_process_local() queue = FLInterface(main_ch=chan) monkeypatch.setenv("SSQueue", du.B64.bytes_to_str(queue.serialize())) - storage = DDict() - feature_store = DragonFeatureStore(storage) - monkeypatch.setenv( - "SSFeatureStore", base64.b64encode(pickle.dumps(feature_store)).decode("utf-8") - ) + # Put backbone descriptor into env var for the `EnvironmentConfigLoader` + monkeypatch.setenv("SS_DRG_DDICT", backbone_descriptor) worker_manager = WorkerManager( - EnvironmentConfigLoader(), + 
EnvironmentConfigLoader( + featurestore_factory=DragonFeatureStore.from_descriptor, + callback_factory=FileSystemCommChannel.from_descriptor, + queue_factory=DragonFLIChannel.from_descriptor, + ), integrated_worker, as_service=False, cooldown=3, - comm_channel_type=FileSystemCommChannel, ) - tensor_key = MessageHandler.build_tensor_key("key", feature_store.descriptor) - model_key = MessageHandler.build_model_key("model key", feature_store.descriptor) + tensor_key = MessageHandler.build_tensor_key("key", app_feature_store.descriptor) + output_key = MessageHandler.build_tensor_key("key", f"{test_dir}/out") + model_key = MessageHandler.build_model_key( + "model key", app_feature_store.descriptor + ) request = MessageHandler.build_request( - test_dir, model_key, [tensor_key], [tensor_key], [], None + test_dir, model_key, [tensor_key], [output_key], [], None ) ser_request = MessageHandler.serialize_request(request) worker_manager._task_queue.send(ser_request) @@ -162,7 +192,11 @@ def mock_exception_handler(exc, reply_channel, failure_message): pytest.param( "fetch_model", "Failed while fetching the model.", id="fetch model" ), - pytest.param("load_model", "Failed while loading the model.", id="load model"), + pytest.param( + "load_model", + "Failed while loading model from feature store.", + id="load model", + ), pytest.param( "fetch_inputs", "Failed while fetching the inputs.", id="fetch inputs" ), diff --git a/tests/dragon/utils/channel.py b/tests/dragon/utils/channel.py index df76c484b..4c677eb4c 100644 --- a/tests/dragon/utils/channel.py +++ b/tests/dragon/utils/channel.py @@ -62,3 +62,14 @@ def recv(self) -> bytes: """Receieve a message through the underlying communication channel :returns: the received message""" ... + + @classmethod + def from_descriptor( + cls, + descriptor: t.Union[str, bytes], + ) -> "FileSystemCommChannel": + if isinstance(descriptor, str): + path = pathlib.Path(descriptor) + else: + path = pathlib.Path(descriptor.decode("utf-8")) + return FileSystemCommChannel(path) diff --git a/tests/dragon/utils/worker.py b/tests/dragon/utils/worker.py index b1de28018..f6c8120e0 100644 --- a/tests/dragon/utils/worker.py +++ b/tests/dragon/utils/worker.py @@ -47,7 +47,7 @@ class IntegratedTorchWorker(mliw.MachineLearningWorkerBase): @staticmethod def load_model( - request: mliw.InferenceRequest, fetch_result: mliw.FetchModelResult + request: mliw.InferenceRequest, fetch_result: mliw.FetchModelResult, device: str ) -> mliw.LoadModelResult: model_bytes = fetch_result.model_bytes or request.raw_model if not model_bytes: @@ -61,6 +61,7 @@ def load_model( def transform_input( request: mliw.InferenceRequest, fetch_result: mliw.FetchInputResult, + device: str, ) -> mliw.TransformInputResult: # extra metadata for assembly can be found in request.input_meta raw_inputs = request.raw_inputs or fetch_result.inputs @@ -93,6 +94,7 @@ def execute( def transform_output( request: mliw.InferenceRequest, execute_result: mliw.ExecuteResult, + result_device: str, ) -> mliw.TransformOutputResult: # transformed = [item.clone() for item in execute_result.predictions] # return OutputTransformResult(transformed) diff --git a/tests/mli/channel.py b/tests/mli/channel.py index 4bc2014ea..743a21595 100644 --- a/tests/mli/channel.py +++ b/tests/mli/channel.py @@ -57,3 +57,16 @@ def send(self, value: bytes) -> None: f"Channel {self.descriptor.decode('utf-8')} sending message to {self._file_path}" ) self._file_path.write_bytes(value) + + def recv(self) -> t.List[bytes]: + """Receieve a message through the 
underlying communication channel + :returns: the received message""" + self._file_path.read_bytes() + + @classmethod + def from_descriptor( + cls, + descriptor: str, + ) -> "FileSystemCommChannel": + path = pathlib.Path(descriptor) + return FileSystemCommChannel(path) diff --git a/tests/mli/featurestore.py b/tests/mli/featurestore.py index f9d4a1da2..c15a20a34 100644 --- a/tests/mli/featurestore.py +++ b/tests/mli/featurestore.py @@ -62,7 +62,7 @@ def descriptor(self) -> str: """Return a unique identifier enabling a client to connect to the feature store :returns: A descriptor encoded as a string""" - return "file-system-fs" + return "in-memory-fs" class FileSystemFeatureStore(FeatureStore): @@ -115,7 +115,24 @@ def descriptor(self) -> str: """Return a unique identifier enabling a client to connect to the feature store :returns: A descriptor encoded as a string""" - return "in-memory-fs" + if not self._storage_dir: + raise ValueError("No storage path configured") + return self._storage_dir.as_posix() + + @classmethod + def from_descriptor( + cls, + descriptor: str, + # b64encoded: bool = False, + ) -> "FileSystemFeatureStore": + # if b64encoded: + # descriptor = base64.b64decode(descriptor).encode("utf-8") + path = pathlib.Path(descriptor) + if not path.is_dir(): + raise ValueError("FileSystemFeatureStore requires a directory path") + if not path.exists(): + path.mkdir(parents=True, exist_ok=True) + return FileSystemFeatureStore(path) class DragonDict: diff --git a/tests/mli/test_worker_manager.py b/tests/mli/test_worker_manager.py index dc4c026c0..f48395a76 100644 --- a/tests/mli/test_worker_manager.py +++ b/tests/mli/test_worker_manager.py @@ -32,50 +32,38 @@ import pytest -from tests.mli.featurestore import FileSystemFeatureStore +from smartsim._core.mli.comm.channel.dragonfli import DragonFLIChannel torch = pytest.importorskip("torch") dragon = pytest.importorskip("dragon") +import base64 +import os + +import dragon.channels as dch +from dragon import fli + +from smartsim._core.mli.comm.channel.channel import CommChannelBase from smartsim._core.mli.infrastructure.control.workermanager import ( EnvironmentConfigLoader, WorkerManager, ) +from smartsim._core.mli.infrastructure.storage.dragonfeaturestore import ( + DragonFeatureStore, +) from smartsim._core.mli.infrastructure.storage.featurestore import FeatureStore +from smartsim._core.mli.infrastructure.worker.torch_worker import TorchWorker from smartsim._core.mli.message_handler import MessageHandler from smartsim.log import get_logger +from tests.mli.featurestore import FileSystemFeatureStore from .channel import FileSystemCommChannel -from .worker import IntegratedTorchWorker logger = get_logger(__name__) # The tests in this file belong to the dragon group pytestmark = pytest.mark.dragon -def mock_work(worker_manager_queue: "mp.Queue[bytes]") -> None: - """Mock event producer for triggering the inference pipeline""" - # todo: move to unit tests - while True: - time.sleep(1) - # 1. for demo, ignore upstream and just put stuff into downstream - # 2. 
for demo, only one downstream but we'd normally have to filter - # msg content and send to the correct downstream (worker) queue - timestamp = time.time_ns() - output_dir = "/lus/bnchlu1/mcbridch/code/ss/_tmp" - output_path = pathlib.Path(output_dir) - - mock_channel = output_path / f"brainstorm-{timestamp}.txt" - mock_model = output_path / "brainstorm.pt" - - output_path.mkdir(parents=True, exist_ok=True) - mock_channel.touch() - mock_model.touch() - - msg = f"PyTorch:{mock_model}:MockInputToReplace:{mock_channel}" - worker_manager_queue.put(msg.encode("utf-8")) - - def persist_model_file(model_path: pathlib.Path) -> pathlib.Path: """Create a simple torch model and persist to disk for testing purposes. @@ -95,7 +83,7 @@ def persist_model_file(model_path: pathlib.Path) -> pathlib.Path: def mock_messages( - worker_manager_queue: "mp.Queue[bytes]", + worker_manager_queue: CommChannelBase, feature_store: FeatureStore, feature_store_root_dir: pathlib.Path, comm_channel_root_dir: pathlib.Path, @@ -140,7 +128,7 @@ def mock_messages( tensor = torch.randn((1, 2), dtype=torch.float32) torch.save(tensor, buffer) feature_store[input_key] = buffer.getvalue() - fsd = feature_store.descriptor() + fsd = feature_store.descriptor message_tensor_output_key = MessageHandler.build_tensor_key(output_key, fsd) message_tensor_input_key = MessageHandler.build_tensor_key(input_key, fsd) @@ -155,7 +143,7 @@ def mock_messages( custom_attributes=None, ) request_bytes = MessageHandler.serialize_request(request) - worker_manager_queue.put(request_bytes) + worker_manager_queue.send(request_bytes) @pytest.fixture @@ -173,22 +161,42 @@ def test_worker_manager(prepare_environment: pathlib.Path) -> None: fs_path = test_path / "feature_store" comm_path = test_path / "comm_store" - config_loader = EnvironmentConfigLoader() - integrated_worker = IntegratedTorchWorker() + to_worker_channel = dch.Channel.make_process_local() + to_worker_fli = fli.FLInterface(main_ch=to_worker_channel, manager_ch=None) + to_worker_fli_serialized = to_worker_fli.serialize() + + # NOTE: env vars should be set prior to instantiating EnvironmentConfigLoader + # or test environment may be unable to send messages w/queue + os.environ["SSQueue"] = base64.b64encode(to_worker_fli_serialized).decode("utf-8") + + config_loader = EnvironmentConfigLoader( + featurestore_factory=DragonFeatureStore.from_descriptor, + callback_factory=FileSystemCommChannel.from_descriptor, + queue_factory=DragonFLIChannel.from_descriptor, + ) + integrated_worker = TorchWorker() worker_manager = WorkerManager( config_loader, integrated_worker, as_service=True, - cooldown=10, - comm_channel_type=FileSystemCommChannel, + cooldown=5, + # comm_channel_type=FileSystemCommChannel, + # featurestore_factory=FileSystemFeatureStore.from_descriptor, + device="cpu", ) + worker_queue = config_loader.get_queue() + if worker_queue is None: + logger.warn( + f"FLI input queue not loaded correctly from config_loader: {config_loader._queue_descriptor}" + ) + # create a mock client application to populate the request queue msg_pump = mp.Process( target=mock_messages, args=( - config_loader.get_queue(), + worker_queue, FileSystemFeatureStore(fs_path), fs_path, comm_path, diff --git a/tests/mli/worker.py b/tests/mli/worker.py index b1de28018..f6c8120e0 100644 --- a/tests/mli/worker.py +++ b/tests/mli/worker.py @@ -47,7 +47,7 @@ class IntegratedTorchWorker(mliw.MachineLearningWorkerBase): @staticmethod def load_model( - request: mliw.InferenceRequest, fetch_result: mliw.FetchModelResult + request: 
mliw.InferenceRequest, fetch_result: mliw.FetchModelResult, device: str ) -> mliw.LoadModelResult: model_bytes = fetch_result.model_bytes or request.raw_model if not model_bytes: @@ -61,6 +61,7 @@ def load_model( def transform_input( request: mliw.InferenceRequest, fetch_result: mliw.FetchInputResult, + device: str, ) -> mliw.TransformInputResult: # extra metadata for assembly can be found in request.input_meta raw_inputs = request.raw_inputs or fetch_result.inputs @@ -93,6 +94,7 @@ def execute( def transform_output( request: mliw.InferenceRequest, execute_result: mliw.ExecuteResult, + result_device: str, ) -> mliw.TransformOutputResult: # transformed = [item.clone() for item in execute_result.predictions] # return OutputTransformResult(transformed) From 24df7da7574dbe4af8479a4b5bcabb2c92254cbf Mon Sep 17 00:00:00 2001 From: ankona <3595025+ankona@users.noreply.github.com> Date: Mon, 29 Jul 2024 09:32:31 -0500 Subject: [PATCH 05/49] add missing from_descriptor methods, --- .../_core/mli/comm/channel/dragonchannel.py | 12 +++++ smartsim/_core/mli/comm/channel/dragonfli.py | 12 +++-- .../mli/infrastructure/environmentloader.py | 7 +-- .../storage/dragonfeaturestore.py | 19 ++------ .../infrastructure/storage/featurestore.py | 4 ++ tests/dragon/featurestore.py | 48 +++++++------------ tests/dragon/utils/channel.py | 25 +++++++--- tests/mli/channel.py | 21 ++++++-- tests/mli/featurestore.py | 48 +++++++------------ 9 files changed, 100 insertions(+), 96 deletions(-) diff --git a/smartsim/_core/mli/comm/channel/dragonchannel.py b/smartsim/_core/mli/comm/channel/dragonchannel.py index 672fce75b..c52c9f68c 100644 --- a/smartsim/_core/mli/comm/channel/dragonchannel.py +++ b/smartsim/_core/mli/comm/channel/dragonchannel.py @@ -24,6 +24,7 @@ # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+import base64 import sys import typing as t @@ -59,3 +60,14 @@ def recv(self) -> t.List[bytes]: with self._channel.recvh(timeout=None) as recvh: message_bytes: bytes = recvh.recv_bytes(timeout=None) return [message_bytes] + + @classmethod + def from_descriptor( + cls, + descriptor: str, + ) -> "DragonCommChannel": + try: + return DragonCommChannel(base64.b64decode(descriptor)) + except: + print(f"failed to create dragon comm channel: {descriptor}") + raise diff --git a/smartsim/_core/mli/comm/channel/dragonfli.py b/smartsim/_core/mli/comm/channel/dragonfli.py index 503c17ad3..84f49fd52 100644 --- a/smartsim/_core/mli/comm/channel/dragonfli.py +++ b/smartsim/_core/mli/comm/channel/dragonfli.py @@ -76,7 +76,11 @@ def from_descriptor( cls, descriptor: str, ) -> "DragonFLIChannel": - return DragonFLIChannel( - fli_desc=base64.b64decode(descriptor), - sender_supplied=True, - ) + try: + return DragonFLIChannel( + fli_desc=base64.b64decode(descriptor), + sender_supplied=True, + ) + except: + logger.error(f"Error while creating DragonFLIChannel: {descriptor}") + raise diff --git a/smartsim/_core/mli/infrastructure/environmentloader.py b/smartsim/_core/mli/infrastructure/environmentloader.py index ec38a56dd..3c64fffe9 100644 --- a/smartsim/_core/mli/infrastructure/environmentloader.py +++ b/smartsim/_core/mli/infrastructure/environmentloader.py @@ -83,12 +83,7 @@ def get_queue(self) -> t.Optional[CommChannelBase]: logger.warning("No queue factory is configured") return None - if descriptor is not None: - # , sender_supplied: bool = True - # self.queue = DragonFLIChannel( - # fli_desc=base64.b64decode(descriptor), - # sender_supplied=sender_supplied, - # ) + if descriptor is not None and descriptor: self.queue = self._queue_factory(descriptor) self._queue_descriptor = descriptor return self.queue diff --git a/smartsim/_core/mli/infrastructure/storage/dragonfeaturestore.py b/smartsim/_core/mli/infrastructure/storage/dragonfeaturestore.py index 213d29cf4..65ebd57b7 100644 --- a/smartsim/_core/mli/infrastructure/storage/dragonfeaturestore.py +++ b/smartsim/_core/mli/infrastructure/storage/dragonfeaturestore.py @@ -85,18 +85,9 @@ def from_descriptor( descriptor: str, # b64encoded: bool = False, ) -> "DragonFeatureStore": - # import dragon.data.ddict.ddict as dragon_ddict # pylint: disable=import-outside-toplevel - # # if b64encoded: - # # descriptor = base64.b64decode(descriptor).encode("utf-8") - # # ddict = DDict.attach(descriptor) - # # ddict.attach(descriptor) - - # storage = dragon_ddict.DDict() - # storage.attach(descriptor) - # return DragonFeatureStore(storage) - - if descriptor is None: - print("foo") - return None - return DragonFeatureStore({"tmp": "here"}) + try: + return DragonFeatureStore(dragon_ddict.DDict.attach(descriptor)) + except: + print(f"error creating dragon feature store: {descriptor}") + raise diff --git a/smartsim/_core/mli/infrastructure/storage/featurestore.py b/smartsim/_core/mli/infrastructure/storage/featurestore.py index 49f16af8a..4531f6696 100644 --- a/smartsim/_core/mli/infrastructure/storage/featurestore.py +++ b/smartsim/_core/mli/infrastructure/storage/featurestore.py @@ -29,6 +29,10 @@ from pydantic import BaseModel, Field +from smartsim.log import get_logger + +logger = get_logger(__name__) + class FeatureStoreKey(BaseModel): """A key,descriptor pair enabling retrieval of an item from a feature store""" diff --git a/tests/dragon/featurestore.py b/tests/dragon/featurestore.py index a249620fb..352cd8661 100644 --- a/tests/dragon/featurestore.py +++ 
b/tests/dragon/featurestore.py
@@ -29,6 +29,9 @@
 
 import smartsim.error as sse
 from smartsim._core.mli.infrastructure.storage.featurestore import FeatureStore
+from smartsim.log import get_logger
+
+logger = get_logger(__name__)
 
 
 class MemoryFeatureStore(FeatureStore):
@@ -69,9 +72,13 @@ class FileSystemFeatureStore(FeatureStore):
     """Alternative feature store implementation for testing.
     Stores all data on the file system"""
 
-    def __init__(self, storage_dir: t.Optional[pathlib.Path] = None) -> None:
+    def __init__(
+        self, storage_dir: t.Optional[t.Union[pathlib.Path, str]] = None
+    ) -> None:
         """Initialize the FileSystemFeatureStore instance
         :param storage_dir: (optional) root directory to store all data relative to"""
+        if isinstance(storage_dir, str):
+            storage_dir = pathlib.Path(storage_dir)
         self._storage_dir = storage_dir
 
     def __getitem__(self, key: str) -> bytes:
@@ -127,33 +134,14 @@ def from_descriptor(
     ) -> "FileSystemFeatureStore":
         # if b64encoded:
         #     descriptor = base64.b64decode(descriptor).encode("utf-8")
-        path = pathlib.Path(descriptor)
-        if not path.is_dir():
-            raise ValueError("FileSystemFeatureStore requires a directory path")
-        if not path.exists():
+        try:
+            path = pathlib.Path(descriptor)
             path.mkdir(parents=True, exist_ok=True)
-        return FileSystemFeatureStore(path)
-
-
-class DragonDict:
-    """Mock implementation of a dragon dictionary"""
-
-    def __init__(self) -> None:
-        """Initialize the mock DragonDict instance"""
-        self._storage: t.Dict[bytes, t.Any] = {}
-
-    def __getitem__(self, key: bytes) -> t.Any:
-        """Retrieve an item using key
-        :param key: Unique key of an item to retrieve from the feature store"""
-        return self._storage[key]
-
-    def __setitem__(self, key: bytes, value: t.Any) -> None:
-        """Assign a value using key
-        :param key: Unique key of an item to set in the feature store
-        :param value: Value to persist in the feature store"""
-        self._storage[key] = value
-
-    def __contains__(self, key: bytes) -> bool:
-        """Return `True` if the key is found, `False` otherwise
-        :param key: Unique key of an item to retrieve from the feature store"""
-        return key in self._storage
+            if not path.is_dir():
+                raise ValueError("FileSystemFeatureStore requires a directory path")
+            if not path.exists():
+                path.mkdir(parents=True, exist_ok=True)
+            return FileSystemFeatureStore(path)
+        except:
+            logger.error(f"Error while creating FileSystemFeatureStore: {descriptor}")
+            raise
diff --git a/tests/dragon/utils/channel.py b/tests/dragon/utils/channel.py
index 4c677eb4c..7141eacec 100644
--- a/tests/dragon/utils/channel.py
+++ b/tests/dragon/utils/channel.py
@@ -25,6 +25,7 @@
 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 import pathlib
+import threading
 import typing as t
 
 from smartsim._core.mli.comm.channel.channel import CommChannelBase
@@ -38,6 +39,8 @@ class FileSystemCommChannel(CommChannelBase):
 
     def __init__(self, key: t.Union[bytes, pathlib.Path]) -> None:
         """Initialize the FileSystemCommChannel instance"""
+        self._lock = threading.RLock()
+
         if not isinstance(key, bytes):
             super().__init__(key.as_posix().encode("utf-8"))
             self._file_path = key
@@ -56,20 +59,28 @@ def send(self, value: bytes) -> None:
         logger.debug(
             f"Channel {self.descriptor.decode('utf-8')} sending message to {self._file_path}"
         )
-        self._file_path.write_bytes(value)
+        with self._lock:
+            self._file_path.write_bytes(value)
 
     def recv(self) -> bytes:
         """Receive a message through the underlying communication channel
         :returns: the received message"""
-        ...
+        with self._lock:
+            incoming = b""  # default so a missing message file cannot trigger UnboundLocalError
+            if self._file_path.exists():
+                incoming = self._file_path.read_bytes()
+                self._file_path.unlink()
+            return incoming
 
     @classmethod
     def from_descriptor(
         cls,
         descriptor: t.Union[str, bytes],
    ) -> "FileSystemCommChannel":
+        try:
+            if isinstance(descriptor, str):
+                path = pathlib.Path(descriptor)
+            else:
+                path = pathlib.Path(descriptor.decode("utf-8"))
+            return FileSystemCommChannel(path)
+        except:
+            print(f"failed to create FS comm channel: {descriptor}")
+            raise
diff --git a/tests/mli/channel.py b/tests/mli/channel.py
index 743a21595..bf155b24b 100644
--- a/tests/mli/channel.py
+++ b/tests/mli/channel.py
@@ -25,6 +25,7 @@
 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 import pathlib
+import threading
 import typing as t
 
 from smartsim._core.mli.comm.channel.channel import CommChannelBase
@@ -38,6 +39,7 @@ class FileSystemCommChannel(CommChannelBase):
 
     def __init__(self, key: t.Union[bytes, pathlib.Path]) -> None:
         """Initialize the FileSystemCommChannel instance"""
+        self._lock = threading.RLock()
         if not isinstance(key, bytes):
             super().__init__(key.as_posix().encode("utf-8"))
             self._file_path = key
@@ -56,17 +58,26 @@ def send(self, value: bytes) -> None:
         logger.debug(
             f"Channel {self.descriptor.decode('utf-8')} sending message to {self._file_path}"
         )
-        self._file_path.write_bytes(value)
+        with self._lock:
+            self._file_path.write_bytes(value)
 
-    def recv(self) -> t.List[bytes]:
+    def recv(self) -> bytes:
         """Receive a message through the underlying communication channel
         :returns: the received message"""
-        self._file_path.read_bytes()
+        with self._lock:
+            incoming = b""  # default so a missing message file cannot trigger UnboundLocalError
+            if self._file_path.exists():
+                incoming = self._file_path.read_bytes()
+                self._file_path.unlink()
+            return incoming
 
     @classmethod
     def from_descriptor(
         cls,
         descriptor: str,
     ) -> "FileSystemCommChannel":
-        path = pathlib.Path(descriptor)
-        return FileSystemCommChannel(path)
+        try:
+            path = pathlib.Path(descriptor)
+            return FileSystemCommChannel(path)
+        except:
+            print(f"failed to create fs comm channel: {descriptor}")
+            raise
diff --git a/tests/mli/featurestore.py b/tests/mli/featurestore.py
index c15a20a34..ecae32203 100644
--- a/tests/mli/featurestore.py
+++ b/tests/mli/featurestore.py
@@ -29,6 +29,9 @@
 
 import smartsim.error as sse
 from smartsim._core.mli.infrastructure.storage.featurestore import FeatureStore
+from smartsim.log import get_logger
+
+logger = get_logger(__name__)
 
 
 class MemoryFeatureStore(FeatureStore):
@@ -69,9 +72,13 @@ class FileSystemFeatureStore(FeatureStore):
     """Alternative feature store implementation for testing. 
Stores all data on the file system""" - def __init__(self, storage_dir: t.Optional[pathlib.Path] = None) -> None: + def __init__( + self, storage_dir: t.Optional[t.Union[pathlib.Path, str]] = None + ) -> None: """Initialize the FileSystemFeatureStore instance :param storage_dir: (optional) root directory to store all data relative to""" + if isinstance(storage_dir, str): + storage_dir = pathlib.Path(storage_dir) self._storage_dir = storage_dir def __getitem__(self, key: str) -> bytes: @@ -127,33 +134,14 @@ def from_descriptor( ) -> "FileSystemFeatureStore": # if b64encoded: # descriptor = base64.b64decode(descriptor).encode("utf-8") - path = pathlib.Path(descriptor) - if not path.is_dir(): - raise ValueError("FileSystemFeatureStore requires a directory path") - if not path.exists(): + try: + path = pathlib.Path(descriptor) path.mkdir(parents=True, exist_ok=True) - return FileSystemFeatureStore(path) - - -class DragonDict: - """Mock implementation of a dragon dictionary""" - - def __init__(self) -> None: - """Initialize the mock DragonDict instance""" - self._storage: t.Dict[bytes, t.Any] = {} - - def __getitem__(self, key: bytes) -> t.Any: - """Retrieve an item using key - :param key: Unique key of an item to retrieve from the feature store""" - return self._storage[key] - - def __setitem__(self, key: bytes, value: t.Any) -> None: - """Assign a value using key - :param key: Unique key of an item to set in the feature store - :param value: Value to persist in the feature store""" - self._storage[key] = value - - def __contains__(self, key: bytes) -> bool: - """Return `True` if the key is found, `False` otherwise - :param key: Unique key of an item to retrieve from the feature store""" - return key in self._storage + if not path.is_dir(): + raise ValueError("FileSystemFeatureStore requires a directory path") + if not path.exists(): + path.mkdir(parents=True, exist_ok=True) + return FileSystemFeatureStore(path) + except: + logger.error(f"Error while creating FileSystemFeatureStore: {descriptor}") + raise From 82fb67a853bf3dd90723fa27759443d1dc3f3f8a Mon Sep 17 00:00:00 2001 From: ankona <3595025+ankona@users.noreply.github.com> Date: Mon, 29 Jul 2024 10:45:29 -0500 Subject: [PATCH 06/49] fix --- .../_core/mli/comm/channel/dragonchannel.py | 2 +- .../storage/dragonfeaturestore.py | 2 +- tests/dragon/test_environment_loader.py | 32 +++++++++---------- 3 files changed, 17 insertions(+), 19 deletions(-) diff --git a/smartsim/_core/mli/comm/channel/dragonchannel.py b/smartsim/_core/mli/comm/channel/dragonchannel.py index c52c9f68c..d8c0a22ac 100644 --- a/smartsim/_core/mli/comm/channel/dragonchannel.py +++ b/smartsim/_core/mli/comm/channel/dragonchannel.py @@ -69,5 +69,5 @@ def from_descriptor( try: return DragonCommChannel(base64.b64decode(descriptor)) except: - print(f"failed to create dragon comm channel: {descriptor}") + logger.error(f"Failed to create dragon comm channel: {descriptor}") raise diff --git a/smartsim/_core/mli/infrastructure/storage/dragonfeaturestore.py b/smartsim/_core/mli/infrastructure/storage/dragonfeaturestore.py index 65ebd57b7..96940886b 100644 --- a/smartsim/_core/mli/infrastructure/storage/dragonfeaturestore.py +++ b/smartsim/_core/mli/infrastructure/storage/dragonfeaturestore.py @@ -89,5 +89,5 @@ def from_descriptor( try: return DragonFeatureStore(dragon_ddict.DDict.attach(descriptor)) except: - print(f"error creating dragon feature store: {descriptor}") + logger.error(f"Error creating dragon feature store: {descriptor}") raise diff --git 
a/tests/dragon/test_environment_loader.py b/tests/dragon/test_environment_loader.py index 12893d3b2..72c3ba4f9 100644 --- a/tests/dragon/test_environment_loader.py +++ b/tests/dragon/test_environment_loader.py @@ -24,11 +24,8 @@ # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -import pathlib - import pytest -from tests.mli.channel import FileSystemCommChannel dragon = pytest.importorskip("dragon") @@ -42,7 +39,8 @@ DragonFeatureStore, ) -from .featurestore import FileSystemFeatureStore +from smartsim._core.mli.comm.channel.dragonchannel import DragonCommChannel + # The tests in this file belong to the dragon group pytestmark = pytest.mark.dragon @@ -63,8 +61,8 @@ def test_environment_loader_attach_FLI(content: bytes, monkeypatch: pytest.Monke config = EnvironmentConfigLoader( featurestore_factory=DragonFeatureStore.from_descriptor, - callback_factory=FileSystemCommChannel.from_descriptor, - queue_factory=FileSystemCommChannel.from_descriptor, + callback_factory=DragonCommChannel.from_descriptor, + queue_factory=DragonCommChannel.from_descriptor, ) config_queue = config.get_queue() @@ -84,8 +82,8 @@ def test_environment_loader_serialize_FLI(monkeypatch: pytest.MonkeyPatch): config = EnvironmentConfigLoader( featurestore_factory=DragonFeatureStore.from_descriptor, - callback_factory=FileSystemCommChannel.from_descriptor, - queue_factory=FileSystemCommChannel.from_descriptor, + callback_factory=DragonCommChannel.from_descriptor, + queue_factory=DragonCommChannel.from_descriptor, ) config_queue = config.get_queue() assert config_queue._fli.serialize() == queue.serialize() @@ -96,8 +94,8 @@ def test_environment_loader_FLI_fails(monkeypatch: pytest.MonkeyPatch): monkeypatch.setenv("SSQueue", "randomstring") config = EnvironmentConfigLoader( featurestore_factory=DragonFeatureStore.from_descriptor, - callback_factory=FileSystemCommChannel.from_descriptor, - queue_factory=FileSystemCommChannel.from_descriptor, + callback_factory=DragonCommChannel.from_descriptor, + queue_factory=DragonCommChannel.from_descriptor, ) with pytest.raises(DragonFLIError): @@ -109,13 +107,13 @@ def test_environment_loader_backbone_load_fs( ): """Verify the file system feature store is loaded correctly by the EnvironmentConfigLoader to demonstrate fs_factory correctness""" - fs = FileSystemFeatureStore(pathlib.Path(test_dir)) + fs = DragonFeatureStore(DDict()) monkeypatch.setenv("SS_DRG_DDICT", fs.descriptor) config = EnvironmentConfigLoader( featurestore_factory=DragonFeatureStore.from_descriptor, - callback_factory=FileSystemCommChannel.from_descriptor, - queue_factory=FileSystemCommChannel.from_descriptor, + callback_factory=DragonCommChannel.from_descriptor, + queue_factory=DragonCommChannel.from_descriptor, ) backbone = config.get_backbone() @@ -132,8 +130,8 @@ def test_environment_loader_backbone_load_dfs( config = EnvironmentConfigLoader( featurestore_factory=DragonFeatureStore.from_descriptor, - callback_factory=FileSystemCommChannel.from_descriptor, - queue_factory=FileSystemCommChannel.from_descriptor, + callback_factory=DragonCommChannel.from_descriptor, + queue_factory=DragonCommChannel.from_descriptor, ) backbone = config.get_backbone() @@ -145,8 +143,8 @@ def test_environment_variables_not_set(): variables are not set""" config = EnvironmentConfigLoader( featurestore_factory=DragonFeatureStore.from_descriptor, - callback_factory=FileSystemCommChannel.from_descriptor, - 
queue_factory=FileSystemCommChannel.from_descriptor,
+        callback_factory=DragonCommChannel.from_descriptor,
+        queue_factory=DragonCommChannel.from_descriptor,
     )
     assert config.get_backbone() == None
     assert config.get_queue() == None

From 65cf4d1dc7a4547b630b9d4988ec0f0696c40810 Mon Sep 17 00:00:00 2001
From: ankona <3595025+ankona@users.noreply.github.com>
Date: Mon, 29 Jul 2024 11:01:11 -0500
Subject: [PATCH 07/49] fix env loader tests

---
 tests/dragon/test_environment_loader.py | 58 ++++++++-----------------
 1 file changed, 18 insertions(+), 40 deletions(-)

diff --git a/tests/dragon/test_environment_loader.py b/tests/dragon/test_environment_loader.py
index 72c3ba4f9..1b338e1d9 100644
--- a/tests/dragon/test_environment_loader.py
+++ b/tests/dragon/test_environment_loader.py
@@ -26,7 +26,6 @@
 
 import pytest
 
-
 dragon = pytest.importorskip("dragon")
 
 import dragon.utils as du
@@ -34,14 +33,13 @@
 from dragon.data.ddict.ddict import DDict
 from dragon.fli import DragonFLIError, FLInterface
 
+from smartsim._core.mli.comm.channel.dragonchannel import DragonCommChannel
+from smartsim._core.mli.comm.channel.dragonfli import DragonFLIChannel
 from smartsim._core.mli.infrastructure.environmentloader import EnvironmentConfigLoader
 from smartsim._core.mli.infrastructure.storage.dragonfeaturestore import (
     DragonFeatureStore,
 )
 
-from smartsim._core.mli.comm.channel.dragonchannel import DragonCommChannel
-
-
 # The tests in this file belong to the dragon group
 pytestmark = pytest.mark.dragon
 
@@ -53,7 +51,7 @@
         pytest.param(b"new byte string"),
     ],
 )
-def test_environment_loader_attach_FLI(content: bytes, monkeypatch: pytest.MonkeyPatch):
+def test_environment_loader_attach_fli(content: bytes, monkeypatch: pytest.MonkeyPatch):
     """A descriptor can be stored, loaded, and reattached"""
     chan = Channel.make_process_local()
     queue = FLInterface(main_ch=chan)
@@ -62,18 +60,18 @@ def test_environment_loader_attach_FLI(content: bytes, monkeypatch: pytest.Monke
     config = EnvironmentConfigLoader(
         featurestore_factory=DragonFeatureStore.from_descriptor,
         callback_factory=DragonCommChannel.from_descriptor,
-        queue_factory=DragonCommChannel.from_descriptor,
+        queue_factory=DragonFLIChannel.from_descriptor,
     )
 
     config_queue = config.get_queue()
-    new_sender = config_queue.send(content)
+    _ = config_queue.send(content)
 
     old_recv = queue.recvh()
     result, _ = old_recv.recv_bytes()
     assert result == content
 
 
-def test_environment_loader_serialize_FLI(monkeypatch: pytest.MonkeyPatch):
+def test_environment_loader_serialize_fli(monkeypatch: pytest.MonkeyPatch):
     """The serialized descriptors of a loaded and unloaded queue
     are the same"""
 
     chan = Channel.make_process_local()
@@ -83,55 +81,35 @@ def test_environment_loader_serialize_FLI(monkeypatch: pytest.MonkeyPatch):
     config = EnvironmentConfigLoader(
         featurestore_factory=DragonFeatureStore.from_descriptor,
         callback_factory=DragonCommChannel.from_descriptor,
-        queue_factory=DragonCommChannel.from_descriptor,
+        queue_factory=DragonFLIChannel.from_descriptor,
    )
     config_queue = config.get_queue()
     assert config_queue._fli.serialize() == queue.serialize()
 
 
-def test_environment_loader_FLI_fails(monkeypatch: pytest.MonkeyPatch):
+def test_environment_loader_flifails(monkeypatch: pytest.MonkeyPatch):
    """An incorrect serialized descriptor will fail to attach"""
    monkeypatch.setenv("SSQueue", "randomstring")
    config = EnvironmentConfigLoader(
        featurestore_factory=DragonFeatureStore.from_descriptor,
-        callback_factory=DragonCommChannel.from_descriptor,
-        queue_factory=DragonCommChannel.from_descriptor,
+        
callback_factory=None, + queue_factory=DragonFLIChannel.from_descriptor, ) with pytest.raises(DragonFLIError): - config_queue = config.get_queue() + config.get_queue() -def test_environment_loader_backbone_load_fs( - monkeypatch: pytest.MonkeyPatch, test_dir: str -): - """Verify the file system feature store is loaded correctly by - the EnvironmentConfigLoader to demonstrate fs_factory correctness""" - fs = DragonFeatureStore(DDict()) - monkeypatch.setenv("SS_DRG_DDICT", fs.descriptor) - - config = EnvironmentConfigLoader( - featurestore_factory=DragonFeatureStore.from_descriptor, - callback_factory=DragonCommChannel.from_descriptor, - queue_factory=DragonCommChannel.from_descriptor, - ) - - backbone = config.get_backbone() - assert backbone is not None - - -def test_environment_loader_backbone_load_dfs( - monkeypatch: pytest.MonkeyPatch, test_dir: str -): +def test_environment_loader_backbone_load_dfs(monkeypatch: pytest.MonkeyPatch): """Verify the dragon feature store is loaded correctly by the EnvironmentConfigLoader to demonstrate fs_factory correctness""" - fs = DragonFeatureStore(DDict()) - monkeypatch.setenv("SS_DRG_DDICT", fs.descriptor) + feature_store = DragonFeatureStore(DDict()) + monkeypatch.setenv("SS_DRG_DDICT", feature_store.descriptor) config = EnvironmentConfigLoader( featurestore_factory=DragonFeatureStore.from_descriptor, - callback_factory=DragonCommChannel.from_descriptor, - queue_factory=DragonCommChannel.from_descriptor, + callback_factory=None, + queue_factory=None, ) backbone = config.get_backbone() @@ -146,5 +124,5 @@ def test_environment_variables_not_set(): callback_factory=DragonCommChannel.from_descriptor, queue_factory=DragonCommChannel.from_descriptor, ) - assert config.get_backbone() == None - assert config.get_queue() == None + assert config.get_backbone() is None + assert config.get_queue() is None From 15806fe33b3ef0f22e08dc4bea5b24a9d1c95f79 Mon Sep 17 00:00:00 2001 From: ankona <3595025+ankona@users.noreply.github.com> Date: Mon, 29 Jul 2024 11:10:41 -0500 Subject: [PATCH 08/49] move import below conditional --- tests/mli/test_worker_manager.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/tests/mli/test_worker_manager.py b/tests/mli/test_worker_manager.py index f48395a76..d2fe85d00 100644 --- a/tests/mli/test_worker_manager.py +++ b/tests/mli/test_worker_manager.py @@ -32,8 +32,6 @@ import pytest -from smartsim._core.mli.comm.channel.dragonfli import DragonFLIChannel - torch = pytest.importorskip("torch") dragon = pytest.importorskip("dragon") @@ -44,6 +42,7 @@ from dragon import fli from smartsim._core.mli.comm.channel.channel import CommChannelBase +from smartsim._core.mli.comm.channel.dragonfli import DragonFLIChannel from smartsim._core.mli.infrastructure.control.workermanager import ( EnvironmentConfigLoader, WorkerManager, From cb962be2d05a724c04f953b82ef82e239fe267c1 Mon Sep 17 00:00:00 2001 From: ankona <3595025+ankona@users.noreply.github.com> Date: Mon, 29 Jul 2024 11:24:04 -0500 Subject: [PATCH 09/49] sort imports for dragon --- tests/dragon/test_error_handling.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/dragon/test_error_handling.py b/tests/dragon/test_error_handling.py index 3231b4af2..ecd2c8e41 100644 --- a/tests/dragon/test_error_handling.py +++ b/tests/dragon/test_error_handling.py @@ -28,7 +28,6 @@ import pytest -from smartsim._core.mli.comm.channel.dragonfli import DragonFLIChannel dragon = pytest.importorskip("dragon") @@ -37,6 +36,7 @@ from dragon.data.ddict.ddict import DDict 
from dragon.fli import FLInterface +from smartsim._core.mli.comm.channel.dragonfli import DragonFLIChannel from smartsim._core.mli.infrastructure.control.workermanager import ( WorkerManager, exception_handler, From 36883c9301ca0ad3d8260134736d5d2a82446b0a Mon Sep 17 00:00:00 2001 From: ankona <3595025+ankona@users.noreply.github.com> Date: Mon, 29 Jul 2024 11:44:26 -0500 Subject: [PATCH 10/49] fix feature store type interleaving bug --- tests/dragon/test_error_handling.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/dragon/test_error_handling.py b/tests/dragon/test_error_handling.py index ecd2c8e41..e071f80ea 100644 --- a/tests/dragon/test_error_handling.py +++ b/tests/dragon/test_error_handling.py @@ -106,7 +106,7 @@ def setup_worker_manager_model_bytes( ) tensor_key = MessageHandler.build_tensor_key("key", app_feature_store.descriptor) - output_key = MessageHandler.build_tensor_key("key", f"{test_dir}/out") + output_key = MessageHandler.build_tensor_key("key", app_feature_store.descriptor) model = MessageHandler.build_model(b"model", "model name", "v 0.0.1") request = MessageHandler.build_request( test_dir, model, [tensor_key], [output_key], [], None @@ -144,7 +144,7 @@ def setup_worker_manager_model_key( ) tensor_key = MessageHandler.build_tensor_key("key", app_feature_store.descriptor) - output_key = MessageHandler.build_tensor_key("key", f"{test_dir}/out") + output_key = MessageHandler.build_tensor_key("key", app_feature_store.descriptor) model_key = MessageHandler.build_model_key( "model key", app_feature_store.descriptor ) From 2e9f146553dbb846487569d9d8a0ae61e489fecd Mon Sep 17 00:00:00 2001 From: ankona <3595025+ankona@users.noreply.github.com> Date: Mon, 29 Jul 2024 11:51:17 -0500 Subject: [PATCH 11/49] isort --- tests/dragon/test_error_handling.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/dragon/test_error_handling.py b/tests/dragon/test_error_handling.py index e071f80ea..73757014d 100644 --- a/tests/dragon/test_error_handling.py +++ b/tests/dragon/test_error_handling.py @@ -28,7 +28,6 @@ import pytest - dragon = pytest.importorskip("dragon") import dragon.utils as du From e011b70ff1646748252b0c8b9af0cc01c0b79612 Mon Sep 17 00:00:00 2001 From: ankona <3595025+ankona@users.noreply.github.com> Date: Mon, 29 Jul 2024 12:30:06 -0500 Subject: [PATCH 12/49] fix test failing new validation check --- tests/mli/test_core_machine_learning_worker.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/mli/test_core_machine_learning_worker.py b/tests/mli/test_core_machine_learning_worker.py index c7e1cb286..6fa9f9944 100644 --- a/tests/mli/test_core_machine_learning_worker.py +++ b/tests/mli/test_core_machine_learning_worker.py @@ -85,12 +85,12 @@ def persist_torch_tensor(test_dir: str) -> pathlib.Path: @pytest.mark.skipif(not torch_available, reason="Torch backend is not installed") -def test_fetch_model_disk(persist_torch_model: pathlib.Path) -> None: +def test_fetch_model_disk(persist_torch_model: pathlib.Path, test_dir: str) -> None: """Verify that the ML worker successfully retrieves a model when given a valid (file system) key""" worker = MachineLearningWorkerCore key = str(persist_torch_model) - feature_store = FileSystemFeatureStore() + feature_store = FileSystemFeatureStore(test_dir) fsd = feature_store.descriptor feature_store[str(persist_torch_model)] = persist_torch_model.read_bytes() From e6dae22ba8c40f41ff2984ff5aa7c3b761f62d6d Mon Sep 17 00:00:00 2001 From: ankona <3595025+ankona@users.noreply.github.com> 
Date: Wed, 31 Jul 2024 17:50:37 -0500 Subject: [PATCH 13/49] revert gh workflow changes that will be merged later --- .github/workflows/run_tests.yml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.github/workflows/run_tests.yml b/.github/workflows/run_tests.yml index b8e96f05b..8ed348cbd 100644 --- a/.github/workflows/run_tests.yml +++ b/.github/workflows/run_tests.yml @@ -109,7 +109,7 @@ jobs: - name: Install SmartSim (with ML backends) run: | python -m pip install git+https://github.com/CrayLabs/SmartRedis.git@develop#egg=smartredis - python -m pip install .[dev,mypy,ml] + python -m pip install .[dev,ml] - name: Install ML Runtimes with Smart (with pt, tf, and onnx support) if: (contains( matrix.os, 'ubuntu' ) || contains( matrix.os, 'macos-12')) && ( matrix.subset != 'dragon' ) @@ -129,6 +129,7 @@ jobs: - name: Run mypy run: | + python -m pip install .[mypy] make check-mypy - name: Run Pylint From 4548fec2a75207ae1cd9336fababc613661e0dcc Mon Sep 17 00:00:00 2001 From: ankona <3595025+ankona@users.noreply.github.com> Date: Wed, 31 Jul 2024 17:55:42 -0500 Subject: [PATCH 14/49] add missing docstrings, remove commented parameters --- smartsim/_core/mli/comm/channel/dragonchannel.py | 3 +++ smartsim/_core/mli/comm/channel/dragonfli.py | 2 ++ .../_core/mli/infrastructure/storage/dragonfeaturestore.py | 5 +++-- tests/dragon/featurestore.py | 6 +++--- tests/dragon/utils/channel.py | 3 +++ tests/mli/channel.py | 3 +++ tests/mli/featurestore.py | 6 +++--- 7 files changed, 20 insertions(+), 8 deletions(-) diff --git a/smartsim/_core/mli/comm/channel/dragonchannel.py b/smartsim/_core/mli/comm/channel/dragonchannel.py index d8c0a22ac..c9eca9046 100644 --- a/smartsim/_core/mli/comm/channel/dragonchannel.py +++ b/smartsim/_core/mli/comm/channel/dragonchannel.py @@ -66,6 +66,9 @@ def from_descriptor( cls, descriptor: str, ) -> "DragonCommChannel": + """A factory method that creates an instance from a descriptor string + :param descriptor: The descriptor that uniquely identifies the resource + :returns: An attached DragonCommChannel""" try: return DragonCommChannel(base64.b64decode(descriptor)) except: diff --git a/smartsim/_core/mli/comm/channel/dragonfli.py b/smartsim/_core/mli/comm/channel/dragonfli.py index 84f49fd52..ff95b2889 100644 --- a/smartsim/_core/mli/comm/channel/dragonfli.py +++ b/smartsim/_core/mli/comm/channel/dragonfli.py @@ -76,6 +76,8 @@ def from_descriptor( cls, descriptor: str, ) -> "DragonFLIChannel": + """A factory method that creates an instance from a descriptor string + :param descriptor: The descriptor that uniquely identifies the resource""" try: return DragonFLIChannel( fli_desc=base64.b64decode(descriptor), diff --git a/smartsim/_core/mli/infrastructure/storage/dragonfeaturestore.py b/smartsim/_core/mli/infrastructure/storage/dragonfeaturestore.py index 96940886b..a90c1f901 100644 --- a/smartsim/_core/mli/infrastructure/storage/dragonfeaturestore.py +++ b/smartsim/_core/mli/infrastructure/storage/dragonfeaturestore.py @@ -83,9 +83,10 @@ def descriptor(self) -> str: def from_descriptor( cls, descriptor: str, - # b64encoded: bool = False, ) -> "DragonFeatureStore": - + """A factory method that creates an instance from a descriptor string + :param descriptor: The descriptor that uniquely identifies the resource + :returns: An attached DragonFeatureStore""" try: return DragonFeatureStore(dragon_ddict.DDict.attach(descriptor)) except: diff --git a/tests/dragon/featurestore.py b/tests/dragon/featurestore.py index 352cd8661..f8c645f6e 100644 --- 
a/tests/dragon/featurestore.py +++ b/tests/dragon/featurestore.py @@ -130,10 +130,10 @@ def descriptor(self) -> str: def from_descriptor( cls, descriptor: str, - # b64encoded: bool = False, ) -> "FileSystemFeatureStore": - # if b64encoded: - # descriptor = base64.b64decode(descriptor).encode("utf-8") + """A factory method that creates an instance from a descriptor string + :param descriptor: The descriptor that uniquely identifies the resource + :returns: An attached FileSystemFeatureStore""" try: path = pathlib.Path(descriptor) path.mkdir(parents=True, exist_ok=True) diff --git a/tests/dragon/utils/channel.py b/tests/dragon/utils/channel.py index 7141eacec..7efe9b523 100644 --- a/tests/dragon/utils/channel.py +++ b/tests/dragon/utils/channel.py @@ -76,6 +76,9 @@ def from_descriptor( cls, descriptor: t.Union[str, bytes], ) -> "FileSystemCommChannel": + """A factory method that creates an instance from a descriptor string + :param descriptor: The descriptor that uniquely identifies the resource + :returns: An attached FileSystemCommChannel""" try: if isinstance(descriptor, str): path = pathlib.Path(descriptor) diff --git a/tests/mli/channel.py b/tests/mli/channel.py index bf155b24b..9ae61a89b 100644 --- a/tests/mli/channel.py +++ b/tests/mli/channel.py @@ -75,6 +75,9 @@ def from_descriptor( cls, descriptor: str, ) -> "FileSystemCommChannel": + """A factory method that creates an instance from a descriptor string + :param descriptor: The descriptor that uniquely identifies the resource + :returns: An attached FileSystemCommChannel""" try: path = pathlib.Path(descriptor) return FileSystemCommChannel(path) diff --git a/tests/mli/featurestore.py b/tests/mli/featurestore.py index ecae32203..5545168b7 100644 --- a/tests/mli/featurestore.py +++ b/tests/mli/featurestore.py @@ -130,10 +130,10 @@ def descriptor(self) -> str: def from_descriptor( cls, descriptor: str, - # b64encoded: bool = False, ) -> "FileSystemFeatureStore": - # if b64encoded: - # descriptor = base64.b64decode(descriptor).encode("utf-8") + """A factory method that creates an instance from a descriptor string + :param descriptor: The descriptor that uniquely identifies the resource + :returns: An attached FileSystemFeatureStore""" try: path = pathlib.Path(descriptor) path.mkdir(parents=True, exist_ok=True) From c29dc6b3d4defdc083bc5509f0a431f8e62f2c41 Mon Sep 17 00:00:00 2001 From: ankona <3595025+ankona@users.noreply.github.com> Date: Wed, 31 Jul 2024 17:56:49 -0500 Subject: [PATCH 15/49] docstring --- smartsim/_core/mli/comm/channel/dragonfli.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/smartsim/_core/mli/comm/channel/dragonfli.py b/smartsim/_core/mli/comm/channel/dragonfli.py index ff95b2889..2cbcb6944 100644 --- a/smartsim/_core/mli/comm/channel/dragonfli.py +++ b/smartsim/_core/mli/comm/channel/dragonfli.py @@ -77,7 +77,8 @@ def from_descriptor( descriptor: str, ) -> "DragonFLIChannel": """A factory method that creates an instance from a descriptor string - :param descriptor: The descriptor that uniquely identifies the resource""" + :param descriptor: The descriptor that uniquely identifies the resource + :returns: An attached DragonFLIChannel""" try: return DragonFLIChannel( fli_desc=base64.b64decode(descriptor), From 24cbef2b78fb41e378b186bc47d495428769a806 Mon Sep 17 00:00:00 2001 From: ankona <3595025+ankona@users.noreply.github.com> Date: Wed, 31 Jul 2024 17:57:24 -0500 Subject: [PATCH 16/49] remove commented out imports --- smartsim/_core/mli/infrastructure/control/workermanager.py | 4 ---- 1 
file changed, 4 deletions(-)

diff --git a/smartsim/_core/mli/infrastructure/control/workermanager.py b/smartsim/_core/mli/infrastructure/control/workermanager.py
index b7e409e46..6f960ced9 100644
--- a/smartsim/_core/mli/infrastructure/control/workermanager.py
+++ b/smartsim/_core/mli/infrastructure/control/workermanager.py
@@ -45,12 +45,8 @@
 
 if t.TYPE_CHECKING:
     from dragon.fli import FLInterface
-
-    # from smartsim._core.mli.mli_schemas.model.model_capnp import Model
     from smartsim._core.mli.mli_schemas.response.response_capnp import Status
 
-    # from smartsim._core.mli.mli_schemas.tensor.tensor_capnp import TensorDescriptor
-
 logger = get_logger(__name__)
 

From 4eb29b993a9b5bab0ae6061d6df454e7ba5bd863 Mon Sep 17 00:00:00 2001
From: ankona <3595025+ankona@users.noreply.github.com>
Date: Wed, 31 Jul 2024 17:59:07 -0500
Subject: [PATCH 17/49] remove commented out code

---
 smartsim/_core/mli/infrastructure/control/workermanager.py | 2 --
 1 file changed, 2 deletions(-)

diff --git a/smartsim/_core/mli/infrastructure/control/workermanager.py b/smartsim/_core/mli/infrastructure/control/workermanager.py
index 6f960ced9..2ffb4d97e 100644
--- a/smartsim/_core/mli/infrastructure/control/workermanager.py
+++ b/smartsim/_core/mli/infrastructure/control/workermanager.py
@@ -92,10 +92,8 @@ def __init__(
         self,
         config_loader: EnvironmentConfigLoader,
         worker: MachineLearningWorkerBase,
-        # fs_factory: t.Callable[[str], FeatureStore],
         as_service: bool = False,
         cooldown: int = 0,
-        # comm_channel_type: t.Type[CommChannelBase] = DragonCommChannel,
         device: t.Literal["cpu", "gpu"] = "cpu",
     ) -> None:
         """Initialize the WorkerManager

From eb793b600e3b516324a54909d378b118182b3dbd Mon Sep 17 00:00:00 2001
From: ankona <3595025+ankona@users.noreply.github.com>
Date: Wed, 31 Jul 2024 18:02:36 -0500
Subject: [PATCH 18/49] improve documentation on purpose of backbone fs

---
 smartsim/_core/mli/infrastructure/control/workermanager.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/smartsim/_core/mli/infrastructure/control/workermanager.py b/smartsim/_core/mli/infrastructure/control/workermanager.py
index 2ffb4d97e..940f70f98 100644
--- a/smartsim/_core/mli/infrastructure/control/workermanager.py
+++ b/smartsim/_core/mli/infrastructure/control/workermanager.py
@@ -122,7 +122,8 @@ def __init__(
         self._fs_factory = config_loader._featurestore_factory
         """A factory method to create a desired feature store client type"""
         self._backbone: t.Optional[FeatureStore] = config_loader.get_backbone()
-        """The backbone feature store"""
+        """A standalone, system-created feature store used to share internal
+        information among MLI components"""
 
     def _check_feature_stores(self, request: InferenceRequest) -> bool:
         """Ensures that all feature stores required by the request are available

From 318deacb2f8c17c95c897a892550136a2fe044cc Mon Sep 17 00:00:00 2001
From: ankona <3595025+ankona@users.noreply.github.com>
Date: Wed, 31 Jul 2024 18:10:51 -0500
Subject: [PATCH 19/49] improve documentation about backbone usage

---
 smartsim/_core/mli/infrastructure/environmentloader.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/smartsim/_core/mli/infrastructure/environmentloader.py b/smartsim/_core/mli/infrastructure/environmentloader.py
index 3c64fffe9..81551093f 100644
--- a/smartsim/_core/mli/infrastructure/environmentloader.py
+++ b/smartsim/_core/mli/infrastructure/environmentloader.py
@@ -65,7 +65,8 @@ def __init__(
         for inference requests"""
 
     def get_backbone(self) -> t.Optional[FeatureStore]:
-        """Create the backbone feature store using the 
descriptor found in - an environment variable""" + an environment variable. The backbone is a standalone, system-created + feature store used to share internal information among MLI components""" descriptor = self._backbone_descriptor or os.getenv("SS_DRG_DDICT", None) if self._featurestore_factory is None: logger.warning("No feature store factory is configured") From 0eac344f0f4112971043de4ae3ec3240576a39da Mon Sep 17 00:00:00 2001 From: ankona <3595025+ankona@users.noreply.github.com> Date: Wed, 31 Jul 2024 18:14:15 -0500 Subject: [PATCH 20/49] remove deprecated & add missing docstring params --- smartsim/_core/mli/infrastructure/control/workermanager.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/smartsim/_core/mli/infrastructure/control/workermanager.py b/smartsim/_core/mli/infrastructure/control/workermanager.py index 940f70f98..57254de93 100644 --- a/smartsim/_core/mli/infrastructure/control/workermanager.py +++ b/smartsim/_core/mli/infrastructure/control/workermanager.py @@ -103,7 +103,7 @@ def __init__( :param as_service: Specifies run-once or run-until-complete behavior of service :param cooldown: Number of seconds to wait before shutting down after shutdown criteria are met - :param comm_channel_type: The type of communication channel used for callbacks + :param device: The type of hardware the workers must be executed on """ super().__init__(as_service, cooldown) From d3b951284323e6347e5612948e186d591f6b09fd Mon Sep 17 00:00:00 2001 From: ankona <3595025+ankona@users.noreply.github.com> Date: Wed, 31 Jul 2024 18:15:41 -0500 Subject: [PATCH 21/49] fix renamed param in docstring --- smartsim/_core/mli/infrastructure/worker/worker.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/smartsim/_core/mli/infrastructure/worker/worker.py b/smartsim/_core/mli/infrastructure/worker/worker.py index f1d0775f0..89fb63524 100644 --- a/smartsim/_core/mli/infrastructure/worker/worker.py +++ b/smartsim/_core/mli/infrastructure/worker/worker.py @@ -163,7 +163,8 @@ def deserialize_message( ) -> InferenceRequest: """Deserialize a message from a byte stream into an InferenceRequest :param data_blob: The byte stream to deserialize - :param channel_type: Type to be used for callback communications + :param callback_factory: A factory method that can create an instance + of the desired concrete comm channel type :returns: The raw input message deserialized into an InferenceRequest """ request = MessageHandler.deserialize_request(data_blob) From 6e387e80dbf881c0c28c4cbc0b40df3593017f1a Mon Sep 17 00:00:00 2001 From: ankona <3595025+ankona@users.noreply.github.com> Date: Wed, 31 Jul 2024 18:19:26 -0500 Subject: [PATCH 22/49] remove commented lines --- tests/mli/test_worker_manager.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/tests/mli/test_worker_manager.py b/tests/mli/test_worker_manager.py index d2fe85d00..026d1f32f 100644 --- a/tests/mli/test_worker_manager.py +++ b/tests/mli/test_worker_manager.py @@ -180,8 +180,6 @@ def test_worker_manager(prepare_environment: pathlib.Path) -> None: integrated_worker, as_service=True, cooldown=5, - # comm_channel_type=FileSystemCommChannel, - # featurestore_factory=FileSystemFeatureStore.from_descriptor, device="cpu", ) @@ -203,7 +201,7 @@ def test_worker_manager(prepare_environment: pathlib.Path) -> None: ) msg_pump.start() - # # create a process to process commands + # create a process to execute commands process = mp.Process(target=worker_manager.execute) process.start() process.join(timeout=5) 
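
Taken together, the preceding patches replace the pickled `SSFeatureStore` payload
with descriptor strings that are resolved through injected `from_descriptor`
factories. The sketch below summarizes that wiring end to end. It is illustrative
only: it assumes a live dragon runtime and reuses the environment variable names
(`SSQueue`, `SS_DRG_DDICT`) and factory signatures shown in the diffs above, but it
is not itself code from the patch series.

    import base64
    import os

    import dragon.channels as dch
    from dragon import fli
    from dragon.data.ddict.ddict import DDict

    from smartsim._core.mli.comm.channel.dragonchannel import DragonCommChannel
    from smartsim._core.mli.comm.channel.dragonfli import DragonFLIChannel
    from smartsim._core.mli.infrastructure.environmentloader import (
        EnvironmentConfigLoader,
    )
    from smartsim._core.mli.infrastructure.storage.dragonfeaturestore import (
        DragonFeatureStore,
    )

    # the launching process publishes descriptors through the environment
    backbone = DragonFeatureStore(DDict())
    os.environ["SS_DRG_DDICT"] = backbone.descriptor

    to_worker_channel = dch.Channel.make_process_local()
    to_worker_fli = fli.FLInterface(main_ch=to_worker_channel, manager_ch=None)
    os.environ["SSQueue"] = base64.b64encode(to_worker_fli.serialize()).decode("utf-8")

    # any MLI component can then re-attach by injecting `from_descriptor` factories
    config = EnvironmentConfigLoader(
        featurestore_factory=DragonFeatureStore.from_descriptor,
        callback_factory=DragonCommChannel.from_descriptor,
        queue_factory=DragonFLIChannel.from_descriptor,
    )
    assert config.get_backbone() is not None
    assert config.get_queue() is not None
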
From a89f1608fe339e0f9caacb1cabc2e952190032a0 Mon Sep 17 00:00:00 2001
From: ankona <3595025+ankona@users.noreply.github.com>
Date: Wed, 31 Jul 2024 18:20:27 -0500
Subject: [PATCH 23/49] remove commented lines

---
 tests/dragon/utils/worker.py | 26 --------------------------
 tests/mli/worker.py          | 26 --------------------------
 2 files changed, 52 deletions(-)

diff --git a/tests/dragon/utils/worker.py b/tests/dragon/utils/worker.py
index f6c8120e0..0582cae56 100644
--- a/tests/dragon/utils/worker.py
+++ b/tests/dragon/utils/worker.py
@@ -96,35 +96,9 @@ def transform_output(
         execute_result: mliw.ExecuteResult,
         result_device: str,
     ) -> mliw.TransformOutputResult:
-        # transformed = [item.clone() for item in execute_result.predictions]
-        # return OutputTransformResult(transformed)
-
-        # transformed = [item.bytes() for item in execute_result.predictions]
-
-        # OutputTransformResult.transformed SHOULD be a list of
-        # capnproto Tensors Or tensor descriptors accompanying bytes
-
         # send the original tensors...
         execute_result.predictions = [t.detach() for t in execute_result.predictions]
         # todo: solve sending all tensor metadata that coincides with each prediction
         return mliw.TransformOutputResult(
             execute_result.predictions, [1], "c", "float32"
         )
-        # return OutputTransformResult(transformed)
-
-    # @staticmethod
-    # def serialize_reply(
-    #     request: InferenceRequest, results: OutputTransformResult
-    # ) -> t.Any:
-    #     # results = IntegratedTorchWorker._prepare_outputs(results.outputs)
-    #     # return results
-    #     return None
-    #     # response = MessageHandler.build_response(
-    #     #     status=200,  # todo: are we satisfied with 0/1 (success, fail)
-    #     #     # todo: if not detailed messages, this shouldn't be returned.
-    #     #     message="success",
-    #     #     result=results,
-    #     #     custom_attributes=None,
-    #     # )
-    #     # serialized_resp = MessageHandler.serialize_response(response)
-    #     # return serialized_resp
diff --git a/tests/mli/worker.py b/tests/mli/worker.py
index f6c8120e0..0582cae56 100644
--- a/tests/mli/worker.py
+++ b/tests/mli/worker.py
@@ -96,35 +96,9 @@ def transform_output(
         execute_result: mliw.ExecuteResult,
         result_device: str,
     ) -> mliw.TransformOutputResult:
-        # transformed = [item.clone() for item in execute_result.predictions]
-        # return OutputTransformResult(transformed)
-
-        # transformed = [item.bytes() for item in execute_result.predictions]
-
-        # OutputTransformResult.transformed SHOULD be a list of
-        # capnproto Tensors Or tensor descriptors accompanying bytes
-
         # send the original tensors...
         execute_result.predictions = [t.detach() for t in execute_result.predictions]
         # todo: solve sending all tensor metadata that coincides with each prediction
         return mliw.TransformOutputResult(
             execute_result.predictions, [1], "c", "float32"
         )
-        # return OutputTransformResult(transformed)
-
-    # @staticmethod
-    # def serialize_reply(
-    #     request: InferenceRequest, results: OutputTransformResult
-    # ) -> t.Any:
-    #     # results = IntegratedTorchWorker._prepare_outputs(results.outputs)
-    #     # return results
-    #     return None
-    #     # response = MessageHandler.build_response(
-    #     #     status=200,  # todo: are we satisfied with 0/1 (success, fail)
-    #     #     # todo: if not detailed messages, this shouldn't be returned.
- # # message="success", - # # result=results, - # # custom_attributes=None, - # # ) - # # serialized_resp = MessageHandler.serialize_response(response) - # # return serialized_resp From 73c7f9b454d4107b5b458a47ac1f38b3a53c4812 Mon Sep 17 00:00:00 2001 From: ankona <3595025+ankona@users.noreply.github.com> Date: Wed, 31 Jul 2024 18:22:16 -0500 Subject: [PATCH 24/49] formatting --- smartsim/_core/mli/infrastructure/control/workermanager.py | 1 + 1 file changed, 1 insertion(+) diff --git a/smartsim/_core/mli/infrastructure/control/workermanager.py b/smartsim/_core/mli/infrastructure/control/workermanager.py index 57254de93..ee1ba6e83 100644 --- a/smartsim/_core/mli/infrastructure/control/workermanager.py +++ b/smartsim/_core/mli/infrastructure/control/workermanager.py @@ -45,6 +45,7 @@ if t.TYPE_CHECKING: from dragon.fli import FLInterface + from smartsim._core.mli.mli_schemas.response.response_capnp import Status logger = get_logger(__name__) From a5bda09a15489bee68097f327c517eef6a042dcd Mon Sep 17 00:00:00 2001 From: ankona <3595025+ankona@users.noreply.github.com> Date: Wed, 31 Jul 2024 18:53:04 -0500 Subject: [PATCH 25/49] revert dupe change from upstream --- .github/workflows/run_tests.yml | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/.github/workflows/run_tests.yml b/.github/workflows/run_tests.yml index 8ed348cbd..b8e96f05b 100644 --- a/.github/workflows/run_tests.yml +++ b/.github/workflows/run_tests.yml @@ -109,7 +109,7 @@ jobs: - name: Install SmartSim (with ML backends) run: | python -m pip install git+https://github.com/CrayLabs/SmartRedis.git@develop#egg=smartredis - python -m pip install .[dev,ml] + python -m pip install .[dev,mypy,ml] - name: Install ML Runtimes with Smart (with pt, tf, and onnx support) if: (contains( matrix.os, 'ubuntu' ) || contains( matrix.os, 'macos-12')) && ( matrix.subset != 'dragon' ) @@ -129,7 +129,6 @@ jobs: - name: Run mypy run: | - python -m pip install .[mypy] make check-mypy - name: Run Pylint From d50a540210e9880c5ccf2e441141195f4353a365 Mon Sep 17 00:00:00 2001 From: ankona <3595025+ankona@users.noreply.github.com> Date: Thu, 1 Aug 2024 16:27:10 -0500 Subject: [PATCH 26/49] fix confusing docstring --- smartsim/_core/mli/infrastructure/environmentloader.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/smartsim/_core/mli/infrastructure/environmentloader.py b/smartsim/_core/mli/infrastructure/environmentloader.py index 81551093f..265de3d9d 100644 --- a/smartsim/_core/mli/infrastructure/environmentloader.py +++ b/smartsim/_core/mli/infrastructure/environmentloader.py @@ -64,7 +64,7 @@ def __init__( for inference requests""" def get_backbone(self) -> t.Optional[FeatureStore]: - """Create the backbone feature store using the descriptor found in + """Attach to the backbone feature store using the descriptor found in an environment variable. 
The backbone is a standalone, system-created feature store used to share internal information among MLI components""" descriptor = self._backbone_descriptor or os.getenv("SS_DRG_DDICT", None) From 86b4c2e1454b42bdfd183fd081e2d9a07334bf70 Mon Sep 17 00:00:00 2001 From: ankona <3595025+ankona@users.noreply.github.com> Date: Thu, 1 Aug 2024 16:59:30 -0500 Subject: [PATCH 27/49] fix incomplete docstrings, tweak logs --- .../infrastructure/control/workermanager.py | 20 +++++++++++++------ .../mli/infrastructure/environmentloader.py | 16 +++++++++++++-- tests/dragon/utils/channel.py | 3 ++- tests/mli/channel.py | 3 ++- 4 files changed, 32 insertions(+), 10 deletions(-) diff --git a/smartsim/_core/mli/infrastructure/control/workermanager.py b/smartsim/_core/mli/infrastructure/control/workermanager.py index ee1ba6e83..9928b4cd3 100644 --- a/smartsim/_core/mli/infrastructure/control/workermanager.py +++ b/smartsim/_core/mli/infrastructure/control/workermanager.py @@ -128,7 +128,9 @@ def __init__( def _check_feature_stores(self, request: InferenceRequest) -> bool: """Ensures that all feature stores required by the request are available - :param request: The request to validate""" + :param request: The request to validate + :returns: False if feature store validation fails for the request, True otherwise + """ # collect all feature stores required by the request fs_model: t.Set[str] = set() if request.model_key: @@ -147,7 +149,7 @@ def _check_feature_stores(self, request: InferenceRequest) -> bool: # create the feature stores we need to service request if fs_missing: - logger.info(f"Missing feature store(s): {fs_missing}") + logger.debug(f"Adding feature store(s): {fs_missing}") for descriptor in fs_missing: feature_store = self._fs_factory(descriptor) self._feature_stores[descriptor] = feature_store @@ -156,7 +158,9 @@ def _check_feature_stores(self, request: InferenceRequest) -> bool: def _check_model(self, request: InferenceRequest) -> bool: """Ensure that a model is available for the request - :param request: The request to validate""" + :param request: The request to validate + :returns: False if model validation fails for the request, True otherwise + """ if request.model_key or request.raw_model: return True @@ -165,7 +169,9 @@ def _check_model(self, request: InferenceRequest) -> bool: def _check_inputs(self, request: InferenceRequest) -> bool: """Ensure that inputs are available for the request - :param request: The request to validate""" + :param request: The request to validate + :returns: False if input validation fails for the request, True otherwise + """ if request.input_keys or request.raw_inputs: return True @@ -174,7 +180,9 @@ def _check_inputs(self, request: InferenceRequest) -> bool: def _check_callback(self, request: InferenceRequest) -> bool: """Ensure that a callback channel is available for the request - :param request: The request to validate""" + :param request: The request to validate + :returns: False if callback validation fails for the request, True otherwise + """ if request.callback is not None: return True @@ -184,7 +192,7 @@ def _check_callback(self, request: InferenceRequest) -> bool: def _validate_request(self, request: InferenceRequest) -> bool: """Ensure the request can be processed. 
:param request: The request to validate - :return: True if the request is valid, False otherwise""" + :return: False if the request fails any validation checks, True otherwise""" checks = [ self._check_feature_stores(request), self._check_model(request), diff --git a/smartsim/_core/mli/infrastructure/environmentloader.py b/smartsim/_core/mli/infrastructure/environmentloader.py index 265de3d9d..f7056f4a0 100644 --- a/smartsim/_core/mli/infrastructure/environmentloader.py +++ b/smartsim/_core/mli/infrastructure/environmentloader.py @@ -46,6 +46,15 @@ def __init__( callback_factory: t.Callable[[bytes], CommChannelBase], queue_factory: t.Callable[[str], CommChannelBase], ) -> None: + """Initialize the config loader instance with the factories necessary for + creating additional objects. + + :param featurestore_factory: A factory method that produces a feature store + given a descriptor + :param callback_factory: A factory method that produces a callback + channel given a descriptor + :param featurestore_factory: A factory method that produces a queue + channel given a descriptor""" self._queue_descriptor: t.Optional[str] = os.getenv("SSQueue", None) """The descriptor used to attach to the incoming event queue""" self.queue: t.Optional[CommChannelBase] = None @@ -66,7 +75,8 @@ def __init__( def get_backbone(self) -> t.Optional[FeatureStore]: """Attach to the backbone feature store using the descriptor found in an environment variable. The backbone is a standalone, system-created - feature store used to share internal information among MLI components""" + feature store used to share internal information among MLI components + :returns: The attached feature store via SS_DRG_DDICT""" descriptor = self._backbone_descriptor or os.getenv("SS_DRG_DDICT", None) if self._featurestore_factory is None: logger.warning("No feature store factory is configured") @@ -78,7 +88,9 @@ def get_backbone(self) -> t.Optional[FeatureStore]: return self.backbone def get_queue(self) -> t.Optional[CommChannelBase]: - """Returns the Queue previously set in SSQueue""" + """Attach to a queue-like communication channel using the descriptor + found in an environment variable. 
+        :returns: The attached queue specified via SSQueue"""
         descriptor = self._queue_descriptor or os.getenv("SSQueue", None)
         if self._queue_factory is None:
             logger.warning("No queue factory is configured")
diff --git a/tests/dragon/utils/channel.py b/tests/dragon/utils/channel.py
index 7efe9b523..4314b494e 100644
--- a/tests/dragon/utils/channel.py
+++ b/tests/dragon/utils/channel.py
@@ -38,7 +38,8 @@ class FileSystemCommChannel(CommChannelBase):
     """Passes messages by writing to a file"""
 
     def __init__(self, key: t.Union[bytes, pathlib.Path]) -> None:
-        """Initialize the FileSystemCommChannel instance"""
+        """Initialize the FileSystemCommChannel instance
+        :param key: a path to the file backing the channel"""
         self._lock = threading.RLock()
 
         if not isinstance(key, bytes):
diff --git a/tests/mli/channel.py b/tests/mli/channel.py
index 9ae61a89b..9e8acd359 100644
--- a/tests/mli/channel.py
+++ b/tests/mli/channel.py
@@ -38,7 +38,8 @@ class FileSystemCommChannel(CommChannelBase):
     """Passes messages by writing to a file"""
 
     def __init__(self, key: t.Union[bytes, pathlib.Path]) -> None:
-        """Initialize the FileSystemCommChannel instance"""
+        """Initialize the FileSystemCommChannel instance
+        :param key: a path to the file backing the channel"""
         self._lock = threading.RLock()
         if not isinstance(key, bytes):
             super().__init__(key.as_posix().encode("utf-8"))

From 23464837c542fd707924df444804c4a48959f3be Mon Sep 17 00:00:00 2001
From: ankona <3595025+ankona@users.noreply.github.com>
Date: Thu, 1 Aug 2024 17:56:10 -0500
Subject: [PATCH 28/49] docstring fix

---
 smartsim/_core/mli/comm/channel/dragonfli.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/smartsim/_core/mli/comm/channel/dragonfli.py b/smartsim/_core/mli/comm/channel/dragonfli.py
index 2cbcb6944..b4b4c2e5b 100644
--- a/smartsim/_core/mli/comm/channel/dragonfli.py
+++ b/smartsim/_core/mli/comm/channel/dragonfli.py
@@ -43,7 +43,10 @@ class DragonFLIChannel(cch.CommChannelBase):
     """Passes messages by writing to a Dragon FLI Channel"""
 
     def __init__(self, fli_desc: bytes, sender_supplied: bool = True) -> None:
-        """Initialize the DragonFLIChannel instance"""
+        """Initialize the DragonFLIChannel instance
+        :param fli_desc: the descriptor of the FLI channel to attach
+        :param sender_supplied: flag indicating if the FLI uses sender-supplied streams
+        """
         super().__init__(fli_desc)
         # todo: do we need memory pool information to construct the channel correctly?
         self._fli: "fli" = fli.FLInterface.attach(fli_desc)

From d4194659592b02ee2e24b2313b6dff1dab9e6b46 Mon Sep 17 00:00:00 2001
From: ankona <3595025+ankona@users.noreply.github.com>
Date: Thu, 1 Aug 2024 18:11:20 -0500
Subject: [PATCH 29/49] validate & report env config loader attempts to call
 factories

---
 .../mli/infrastructure/environmentloader.py   | 37 ++++++++++---------
 1 file changed, 19 insertions(+), 18 deletions(-)

diff --git a/smartsim/_core/mli/infrastructure/environmentloader.py b/smartsim/_core/mli/infrastructure/environmentloader.py
index f7056f4a0..21cac2731 100644
--- a/smartsim/_core/mli/infrastructure/environmentloader.py
+++ b/smartsim/_core/mli/infrastructure/environmentloader.py
@@ -36,15 +36,14 @@ class EnvironmentConfigLoader:
     """
-    Facilitates the loading of a FeatureStore and Queue
-    into the WorkerManager.
+    Facilitates the loading of a FeatureStore and Queue into the WorkerManager.
""" def __init__( self, - featurestore_factory: t.Callable[[str], FeatureStore], - callback_factory: t.Callable[[bytes], CommChannelBase], - queue_factory: t.Callable[[str], CommChannelBase], + featurestore_factory: t.Optional[t.Callable[[str], FeatureStore]] = None, + callback_factory: t.Optional[t.Callable[[bytes], CommChannelBase]] = None, + queue_factory: t.Optional[t.Callable[[str], CommChannelBase]] = None, ) -> None: """Initialize the config loader instance with the factories necessary for creating additional objects. @@ -53,14 +52,10 @@ def __init__( given a descriptor :param callback_factory: A factory method that produces a callback channel given a descriptor - :param featurestore_factory: A factory method that produces a queue + :param queue_factory: A factory method that produces a queue channel given a descriptor""" - self._queue_descriptor: t.Optional[str] = os.getenv("SSQueue", None) - """The descriptor used to attach to the incoming event queue""" self.queue: t.Optional[CommChannelBase] = None """The attached incoming event queue channel""" - self._backbone_descriptor: t.Optional[str] = os.getenv("SS_DRG_DDICT", None) - """The descriptor used to attach to the backbone feature store""" self.backbone: t.Optional[FeatureStore] = None """The attached backbone feature store""" self._featurestore_factory = featurestore_factory @@ -76,27 +71,33 @@ def get_backbone(self) -> t.Optional[FeatureStore]: """Attach to the backbone feature store using the descriptor found in an environment variable. The backbone is a standalone, system-created feature store used to share internal information among MLI components + :returns: The attached feature store via SS_DRG_DDICT""" - descriptor = self._backbone_descriptor or os.getenv("SS_DRG_DDICT", None) + descriptor = os.getenv("SS_DRG_DDICT", "") + + if not descriptor: + logger.warning("No backbone descriptor is configured") + if self._featurestore_factory is None: logger.warning("No feature store factory is configured") return None - if descriptor is not None: - self.backbone = self._featurestore_factory(descriptor) - self._backbone_descriptor = descriptor + self.backbone = self._featurestore_factory(descriptor) return self.backbone def get_queue(self) -> t.Optional[CommChannelBase]: """Attach to a queue-like communication channel using the descriptor found in an environment variable. 
+ :returns: The attached queue specified via SSQueue""" - descriptor = self._queue_descriptor or os.getenv("SSQueue", None) + descriptor = os.getenv("SSQueue", "") + + if not descriptor: + logger.warning("No queue descriptor is configured") + if self._queue_factory is None: logger.warning("No queue factory is configured") return None - if descriptor is not None and descriptor: - self.queue = self._queue_factory(descriptor) - self._queue_descriptor = descriptor + self.queue = self._queue_factory(descriptor) return self.queue From 85a6ee049b7c5dd4f369664690371f32c2680de1 Mon Sep 17 00:00:00 2001 From: ankona <3595025+ankona@users.noreply.github.com> Date: Thu, 1 Aug 2024 18:44:58 -0500 Subject: [PATCH 30/49] report validation failures in MLI pipeline through callback --- smartsim/_core/mli/infrastructure/control/workermanager.py | 6 +++++- smartsim/_core/mli/infrastructure/environmentloader.py | 6 +++--- 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/smartsim/_core/mli/infrastructure/control/workermanager.py b/smartsim/_core/mli/infrastructure/control/workermanager.py index 9928b4cd3..eb1273b04 100644 --- a/smartsim/_core/mli/infrastructure/control/workermanager.py +++ b/smartsim/_core/mli/infrastructure/control/workermanager.py @@ -235,7 +235,11 @@ def _on_iteration(self) -> None: request.raw_inputs = tensor_bytes_list if not self._validate_request(request): - return + exception_handler( + ValueError("Error validating the request"), + request.callback, + "Error validating the request.", + ) timings.append(time.perf_counter() - interm) # timing interm = time.perf_counter() # timing diff --git a/smartsim/_core/mli/infrastructure/environmentloader.py b/smartsim/_core/mli/infrastructure/environmentloader.py index 21cac2731..c62645f3f 100644 --- a/smartsim/_core/mli/infrastructure/environmentloader.py +++ b/smartsim/_core/mli/infrastructure/environmentloader.py @@ -41,9 +41,9 @@ class EnvironmentConfigLoader: def __init__( self, - featurestore_factory: t.Optional[t.Callable[[str], FeatureStore]] = None, - callback_factory: t.Optional[t.Callable[[bytes], CommChannelBase]] = None, - queue_factory: t.Optional[t.Callable[[str], CommChannelBase]] = None, + featurestore_factory: t.Callable[[str], FeatureStore], + callback_factory: t.Callable[[bytes], CommChannelBase], + queue_factory: t.Callable[[str], CommChannelBase], ) -> None: """Initialize the config loader instance with the factories necessary for creating additional objects. 
From d9a30d78e89764fde6c6e3ffb7e1bf7fd36eb538 Mon Sep 17 00:00:00 2001
From: ankona <3595025+ankona@users.noreply.github.com>
Date: Fri, 2 Aug 2024 14:55:31 -0500
Subject: [PATCH 31/49] fix removal of early return on empty descriptors

---
 smartsim/_core/mli/infrastructure/environmentloader.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/smartsim/_core/mli/infrastructure/environmentloader.py b/smartsim/_core/mli/infrastructure/environmentloader.py
index c62645f3f..ea8e6b2ad 100644
--- a/smartsim/_core/mli/infrastructure/environmentloader.py
+++ b/smartsim/_core/mli/infrastructure/environmentloader.py
@@ -77,6 +77,7 @@ def get_backbone(self) -> t.Optional[FeatureStore]:
 
         if not descriptor:
             logger.warning("No backbone descriptor is configured")
+            return None
 
         if self._featurestore_factory is None:
             logger.warning("No feature store factory is configured")
@@ -94,6 +95,7 @@ def get_queue(self) -> t.Optional[CommChannelBase]:
 
         if not descriptor:
             logger.warning("No queue descriptor is configured")
+            return None
 
         if self._queue_factory is None:
             logger.warning("No queue factory is configured")

From 5f9c727f9fc75c4cf784322a0a418d922eaa1c4d Mon Sep 17 00:00:00 2001
From: ankona <3595025+ankona@users.noreply.github.com>
Date: Fri, 2 Aug 2024 15:26:45 -0500
Subject: [PATCH 32/49] format docstrings to render correctly

---
 smartsim/_core/mli/comm/channel/channel.py    |  2 ++
 .../_core/mli/comm/channel/dragonchannel.py   |  2 ++
 smartsim/_core/mli/comm/channel/dragonfli.py  |  4 +++
 .../infrastructure/control/workermanager.py   |  9 +++++-
 .../storage/dragonfeaturestore.py             | 12 +++++---
 .../infrastructure/storage/featurestore.py    | 11 ++++---
 tests/dragon/featurestore.py                  | 29 ++++++++++++-------
 tests/dragon/utils/channel.py                 |  4 +++
 tests/mli/channel.py                          |  4 +++
 tests/mli/featurestore.py                     | 24 ++++++++++-----
 10 files changed, 74 insertions(+), 27 deletions(-)

diff --git a/smartsim/_core/mli/comm/channel/channel.py b/smartsim/_core/mli/comm/channel/channel.py
index a3cce2181..d91859126 100644
--- a/smartsim/_core/mli/comm/channel/channel.py
+++ b/smartsim/_core/mli/comm/channel/channel.py
@@ -42,11 +42,13 @@ def __init__(self, descriptor: t.Union[str, bytes]) -> None:
     @abstractmethod
     def send(self, value: bytes) -> None:
         """Send a message through the underlying communication channel
+
        :param value: The value to send"""
 
    @abstractmethod
    def recv(self) -> t.List[bytes]:
        """Receive a message through the underlying communication channel
+
        :returns: the received message"""
 
    @property
diff --git a/smartsim/_core/mli/comm/channel/dragonchannel.py b/smartsim/_core/mli/comm/channel/dragonchannel.py
index c9eca9046..80fdd9cdc 100644
--- a/smartsim/_core/mli/comm/channel/dragonchannel.py
+++ b/smartsim/_core/mli/comm/channel/dragonchannel.py
@@ -56,6 +56,7 @@ def send(self, value: bytes) -> None:
 
     def recv(self) -> t.List[bytes]:
         """Receive a message through the underlying communication channel
+
         :returns: the received message"""
         with self._channel.recvh(timeout=None) as recvh:
             message_bytes: bytes = recvh.recv_bytes(timeout=None)
@@ -67,6 +68,7 @@ def from_descriptor(
         descriptor: str,
     ) -> "DragonCommChannel":
         """A factory method that creates an instance from a descriptor string
+
         :param descriptor: The descriptor that uniquely identifies the resource
         :returns: An attached DragonCommChannel"""
         try:
diff --git a/smartsim/_core/mli/comm/channel/dragonfli.py b/smartsim/_core/mli/comm/channel/dragonfli.py
index b4b4c2e5b..4636894bd 100644
--- a/smartsim/_core/mli/comm/channel/dragonfli.py
+++ b/smartsim/_core/mli/comm/channel/dragonfli.py
@@ -44,6 +44,7 @@ class DragonFLIChannel(cch.CommChannelBase):
 
     def __init__(self, fli_desc: bytes, sender_supplied: bool = True) -> None:
         """Initialize the DragonFLIChannel instance
+
         :param fli_desc: the descriptor of the FLI channel to attach
         :param sender_supplied: flag indicating if the FLI uses sender-supplied streams
         """
@@ -56,12 +57,14 @@ def __init__(self, fli_desc: bytes, sender_supplied: bool = True) -> None:
 
     def send(self, value: bytes) -> None:
         """Send a message through the underlying communication channel
+
         :param value: The value to send"""
         with self._fli.sendh(timeout=None, stream_channel=self._channel) as sendh:
             sendh.send_bytes(value)
 
     def recv(self) -> t.List[bytes]:
         """Receive a message through the underlying communication channel
+
         :returns: the received message"""
         messages = []
         eot = False
@@ -80,6 +83,7 @@ def from_descriptor(
         descriptor: str,
     ) -> "DragonFLIChannel":
         """A factory method that creates an instance from a descriptor string
+
         :param descriptor: The descriptor that uniquely identifies the resource
         :returns: An attached DragonFLIChannel"""
         try:
diff --git a/smartsim/_core/mli/infrastructure/control/workermanager.py b/smartsim/_core/mli/infrastructure/control/workermanager.py
index eb1273b04..6f52d5364 100644
--- a/smartsim/_core/mli/infrastructure/control/workermanager.py
+++ b/smartsim/_core/mli/infrastructure/control/workermanager.py
@@ -98,6 +98,7 @@ def __init__(
         device: t.Literal["cpu", "gpu"] = "cpu",
     ) -> None:
         """Initialize the WorkerManager
+
         :param config_loader: Environment config loader that loads the task queue
         and feature store
         :param workers: A worker to manage
@@ -128,6 +129,7 @@ def __init__(
 
     def _check_feature_stores(self, request: InferenceRequest) -> bool:
         """Ensures that all feature stores required by the request are available
+
         :param request: The request to validate
         :returns: False if feature store validation fails for the request, True otherwise
         """
@@ -158,6 +160,7 @@ def _check_feature_stores(self, request: InferenceRequest) -> bool:
 
     def _check_model(self, request: InferenceRequest) -> bool:
         """Ensure that a model is available for the request
+
         :param request: The request to validate
         :returns: False if model validation fails for the request, True otherwise
         """
@@ -169,6 +172,7 @@ def _check_model(self, request: InferenceRequest) -> bool:
 
     def _check_inputs(self, request: InferenceRequest) -> bool:
         """Ensure that inputs are available for the request
+
         :param request: The request to validate
         :returns: False if input validation fails for the request, True otherwise
         """
@@ -180,6 +184,7 @@ def _check_inputs(self, request: InferenceRequest) -> bool:
 
     def _check_callback(self, request: InferenceRequest) -> bool:
         """Ensure that a callback channel is available for the request
+
         :param request: The request to validate
         :returns: False if callback validation fails for the request, True otherwise
         """
@@ -190,7 +195,8 @@ def _check_callback(self, request: InferenceRequest) -> bool:
 
         return False
 
     def _validate_request(self, request: InferenceRequest) -> bool:
-        """Ensure the request can be processed.
+        """Ensure the request can be processed
+
         :param request: The request to validate
         :return: False if the request fails any validation checks, True otherwise"""
         checks = [
@@ -204,6 +210,7 @@ def _validate_request(self, request: InferenceRequest) -> bool:
 
     def _on_iteration(self) -> None:
-        """Executes calls to the machine learning worker implementation to complete
-        the inference pipeline"""
+        """Executes calls to the machine learning worker implementation to
+        complete the inference pipeline"""
         logger.debug("executing worker manager pipeline")
diff --git a/smartsim/_core/mli/infrastructure/storage/dragonfeaturestore.py b/smartsim/_core/mli/infrastructure/storage/dragonfeaturestore.py
index a90c1f901..5f42ef0bd 100644
--- a/smartsim/_core/mli/infrastructure/storage/dragonfeaturestore.py
+++ b/smartsim/_core/mli/infrastructure/storage/dragonfeaturestore.py
@@ -48,6 +48,7 @@ def __init__(self, storage: "dragon_ddict.DDict") -> None:
 
     def __getitem__(self, key: str) -> t.Union[str, bytes]:
         """Retrieve an item using key
+
         :param key: Unique key of an item to retrieve from the feature store"""
         try:
             value: t.Union[str, bytes] = self._storage[key]
@@ -62,20 +63,22 @@ def __getitem__(self, key: str) -> t.Union[str, bytes]:
 
     def __setitem__(self, key: str, value: t.Union[str, bytes]) -> None:
         """Assign a value using key
+
         :param key: Unique key of an item to set in the feature store
         :param value: Value to persist in the feature store"""
         self._storage[key] = value
 
     def __contains__(self, key: str) -> bool:
         """Membership operator to test for a key existing within the feature store.
-        Return `True` if the key is found, `False` otherwise
-        :param key: Unique key of an item to retrieve from the feature store"""
+
+        :param key: Unique key of an item to retrieve from the feature store
+        :returns: `True` if the key is found, `False` otherwise"""
         return key in self._storage
 
     @property
     def descriptor(self) -> str:
-        """Return a unique identifier enabling a client to connect to
-        the feature store
+        """A unique identifier enabling a client to connect to the feature store
+
         :returns: A descriptor encoded as a string"""
         return str(self._storage.serialize())
 
@@ -85,6 +88,7 @@ def from_descriptor(
         descriptor: str,
     ) -> "DragonFeatureStore":
         """A factory method that creates an instance from a descriptor string
+
         :param descriptor: The descriptor that uniquely identifies the resource
         :returns: An attached DragonFeatureStore"""
         try:
diff --git a/smartsim/_core/mli/infrastructure/storage/featurestore.py b/smartsim/_core/mli/infrastructure/storage/featurestore.py
index 4531f6696..d511d588e 100644
--- a/smartsim/_core/mli/infrastructure/storage/featurestore.py
+++ b/smartsim/_core/mli/infrastructure/storage/featurestore.py
@@ -50,23 +50,26 @@ class FeatureStore(ABC):
     @abstractmethod
     def __getitem__(self, key: str) -> t.Union[str, bytes]:
         """Retrieve an item using key
+
         :param key: Unique key of an item to retrieve from the feature store"""
 
     @abstractmethod
     def __setitem__(self, key: str, value: t.Union[str, bytes]) -> None:
         """Assign a value using key
+
         :param key: Unique key of an item to set in the feature store
         :param value: Value to persist in the feature store"""
 
     @abstractmethod
     def __contains__(self, key: str) -> bool:
         """Membership operator to test for a key existing within the feature store.
-        Return `True` if the key is found, `False` otherwise
-        :param key: Unique key of an item to retrieve from the feature store"""
+
+        :param key: Unique key of an item to retrieve from the feature store
+        :returns: `True` if the key is found, `False` otherwise"""
 
     @property
     @abstractmethod
     def descriptor(self) -> str:
-        """Return a unique identifier enabling a client to connect to
-        the feature store
+        """Unique identifier enabling a client to connect to the feature store
+
         :returns: A descriptor encoded as a string"""
diff --git a/tests/dragon/featurestore.py b/tests/dragon/featurestore.py
index f8c645f6e..d06035fd7 100644
--- a/tests/dragon/featurestore.py
+++ b/tests/dragon/featurestore.py
@@ -43,6 +43,7 @@ def __init__(self) -> None:
 
     def __getitem__(self, key: str) -> bytes:
         """Retrieve an item using key
+
         :param key: Unique key of an item to retrieve from the feature store"""
         if key not in self._storage:
             raise sse.SmartSimError(f"{key} not found in feature store")
@@ -50,20 +51,22 @@ def __getitem__(self, key: str) -> bytes:
 
     def __setitem__(self, key: str, value: bytes) -> None:
-        """Membership operator to test for a key existing within the feature store.
-        Return `True` if the key is found, `False` otherwise
-        :param key: Unique key of an item to retrieve from the feature store"""
+        """Assign a value using key
+
+        :param key: Unique key of an item to set in the feature store
+        :param value: Value to persist in the feature store"""
         self._storage[key] = value
 
     def __contains__(self, key: str) -> bool:
         """Membership operator to test for a key existing within the feature store.
-        Return `True` if the key is found, `False` otherwise
-        :param key: Unique key of an item to retrieve from the feature store"""
+
+        :param key: Unique key of an item to retrieve from the feature store
+        :returns: `True` if the key is found, `False` otherwise"""
         return key in self._storage
 
     @property
     def descriptor(self) -> str:
-        """Return a unique identifier enabling a client to connect to
-        the feature store
+        """Unique identifier enabling a client to connect to the feature store
+
         :returns: A descriptor encoded as a string"""
         return "file-system-fs"
 
@@ -76,6 +79,7 @@ def __init__(
         self, storage_dir: t.Optional[t.Union[pathlib.Path, str]] = None
     ) -> None:
         """Initialize the FileSystemFeatureStore instance
+
         :param storage_dir: (optional) root directory to store all data relative to"""
         if isinstance(storage_dir, str):
             storage_dir = pathlib.Path(storage_dir)
@@ -83,6 +87,7 @@ def __init__(
 
     def __getitem__(self, key: str) -> bytes:
         """Retrieve an item using key
+
         :param key: Unique key of an item to retrieve from the feature store"""
         path = self._key_path(key)
         if not path.exists():
@@ -91,6 +96,7 @@ def __getitem__(self, key: str) -> bytes:
 
     def __setitem__(self, key: str, value: bytes) -> None:
         """Assign a value using key
+
         :param key: Unique key of an item to set in the feature store
         :param value: Value to persist in the feature store"""
         path = self._key_path(key, create=True)
@@ -98,14 +104,16 @@ def __setitem__(self, key: str, value: bytes) -> None:
 
     def __contains__(self, key: str) -> bool:
         """Membership operator to test for a key existing within the feature store.
+
+        :param key: Unique key of an item to retrieve from the feature store
+        :returns: `True` if the key is found, `False` otherwise"""
         path = self._key_path(key)
         return path.exists()
 
     def _key_path(self, key: str, create: bool = False) -> pathlib.Path:
         """Given a key, return a path that is optionally combined with a base
         directory used by the FileSystemFeatureStore.
+
         :param key: Unique key of an item to retrieve from the feature store"""
         value = pathlib.Path(key)
 
@@ -119,8 +127,8 @@ def _key_path(self, key: str, create: bool = False) -> pathlib.Path:
 
     @property
     def descriptor(self) -> str:
-        """Return a unique identifier enabling a client to connect to
-        the feature store
+        """Unique identifier enabling a client to connect to the feature store
+
         :returns: A descriptor encoded as a string"""
         if not self._storage_dir:
             raise ValueError("No storage path configured")
@@ -132,6 +140,7 @@ def from_descriptor(
         descriptor: str,
     ) -> "FileSystemFeatureStore":
         """A factory method that creates an instance from a descriptor string
+
         :param descriptor: The descriptor that uniquely identifies the resource
         :returns: An attached FileSystemFeatureStore"""
         try:
diff --git a/tests/dragon/utils/channel.py b/tests/dragon/utils/channel.py
index 4314b494e..08b659c07 100644
--- a/tests/dragon/utils/channel.py
+++ b/tests/dragon/utils/channel.py
@@ -39,6 +39,7 @@ class FileSystemCommChannel(CommChannelBase):
 
     def __init__(self, key: t.Union[bytes, pathlib.Path]) -> None:
         """Initialize the FileSystemCommChannel instance
+
         :param key: a path to the file backing the channel"""
         self._lock = threading.RLock()
 
@@ -56,6 +57,7 @@ def __init__(self, key: t.Union[bytes, pathlib.Path]) -> None:
 
     def send(self, value: bytes) -> None:
         """Send a message through the underlying communication channel
+
         :param value: The value to send"""
         logger.debug(
             f"Channel {self.descriptor.decode('utf-8')} sending message to {self._file_path}"
@@ -65,6 +67,7 @@ def send(self, value: bytes) -> None:
 
     def recv(self) -> bytes:
         """Receive a message through the underlying communication channel
+
         :returns: the received message"""
         with self._lock:
             if self._file_path.exists():
@@ -78,6 +81,7 @@ def from_descriptor(
         descriptor: t.Union[str, bytes],
     ) -> "FileSystemCommChannel":
         """A factory method that creates an instance from a descriptor string
+
         :param descriptor: The descriptor that uniquely identifies the resource
         :returns: An attached FileSystemCommChannel"""
         try:
diff --git a/tests/mli/channel.py b/tests/mli/channel.py
index 9e8acd359..226e8683d 100644
--- a/tests/mli/channel.py
+++ b/tests/mli/channel.py
@@ -39,6 +39,7 @@ class FileSystemCommChannel(CommChannelBase):
 
     def __init__(self, key: t.Union[bytes, pathlib.Path]) -> None:
         """Initialize the FileSystemCommChannel instance
+
         :param key: a path to the file backing the channel"""
         self._lock = threading.RLock()
         if not isinstance(key, bytes):
@@ -55,6 +56,7 @@ def __init__(self, key: t.Union[bytes, pathlib.Path]) -> None:
 
     def send(self, value: bytes) -> None:
         """Send a message through the underlying communication channel
+
         :param value: The value to send"""
         logger.debug(
             f"Channel {self.descriptor.decode('utf-8')} sending message to {self._file_path}"
@@ -64,6 +66,7 @@ def send(self, value: bytes) -> None:
 
     def recv(self) -> bytes:
         """Receive a message through the underlying communication channel
+
         :returns: the received message"""
         with self._lock:
             if self._file_path.exists():
@@ -77,6 +80,7 @@ def from_descriptor(
         descriptor: str,
     ) -> "FileSystemCommChannel":
         """A factory method that creates an instance from a descriptor string
+
         :param descriptor: The descriptor that uniquely identifies the resource
         :returns: An attached FileSystemCommChannel"""
         try:
diff --git a/tests/mli/featurestore.py b/tests/mli/featurestore.py
index 5545168b7..de748ae6e 100644
--- a/tests/mli/featurestore.py
+++ b/tests/mli/featurestore.py
@@ -43,6 +43,7 @@ def __init__(self) -> None:
 
     def __getitem__(self, key: str) -> bytes:
         """Retrieve an item using key
+
         :param key: Unique key of an item to retrieve from the feature store"""
         if key not in self._storage:
             raise sse.SmartSimError(f"{key} not found in feature store")
@@ -50,8 +51,9 @@ def __getitem__(self, key: str) -> bytes:
 
     def __setitem__(self, key: str, value: bytes) -> None:
-        """Membership operator to test for a key existing within the feature store.
-        Return `True` if the key is found, `False` otherwise
-        :param key: Unique key of an item to retrieve from the feature store"""
+        """Assign a value using key
+
+        :param key: Unique key of an item to set in the feature store
+        :param value: Value to persist in the feature store"""
         self._storage[key] = value
 
     def __contains__(self, key: str) -> bool:
@@ -62,8 +64,8 @@ def __contains__(self, key: str) -> bool:
 
     @property
     def descriptor(self) -> str:
-        """Return a unique identifier enabling a client to connect to
-        the feature store
+        """Unique identifier enabling a client to connect to the feature store
+
         :returns: A descriptor encoded as a string"""
         return "in-memory-fs"
 
@@ -76,6 +78,7 @@ def __init__(
         self, storage_dir: t.Optional[t.Union[pathlib.Path, str]] = None
     ) -> None:
         """Initialize the FileSystemFeatureStore instance
+
         :param storage_dir: (optional) root directory to store all data relative to"""
         if isinstance(storage_dir, str):
             storage_dir = pathlib.Path(storage_dir)
@@ -83,6 +86,7 @@ def __init__(
 
     def __getitem__(self, key: str) -> bytes:
         """Retrieve an item using key
+
         :param key: Unique key of an item to retrieve from the feature store"""
         path = self._key_path(key)
         if not path.exists():
@@ -91,6 +95,7 @@ def __getitem__(self, key: str) -> bytes:
 
     def __setitem__(self, key: str, value: bytes) -> None:
         """Assign a value using key
+
         :param key: Unique key of an item to set in the feature store
         :param value: Value to persist in the feature store"""
         path = self._key_path(key, create=True)
@@ -98,14 +103,16 @@ def __setitem__(self, key: str, value: bytes) -> None:
 
     def __contains__(self, key: str) -> bool:
         """Membership operator to test for a key existing within the feature store.
-        Return `True` if the key is found, `False` otherwise
-        :param key: Unique key of an item to retrieve from the feature store"""
+
+        :param key: Unique key of an item to retrieve from the feature store
+        :returns: `True` if the key is found, `False` otherwise"""
         path = self._key_path(key)
         return path.exists()
 
     def _key_path(self, key: str, create: bool = False) -> pathlib.Path:
         """Given a key, return a path that is optionally combined with a base
         directory used by the FileSystemFeatureStore.
+ :param key: Unique key of an item to retrieve from the feature store""" value = pathlib.Path(key) @@ -119,8 +126,8 @@ def _key_path(self, key: str, create: bool = False) -> pathlib.Path: @property def descriptor(self) -> str: - """Return a unique identifier enabling a client to connect to - the feature store + """Unique identifier enabling a client to connect to the feature store + :returns: A descriptor encoded as a string""" if not self._storage_dir: raise ValueError("No storage path configured") @@ -132,6 +139,7 @@ def from_descriptor( descriptor: str, ) -> "FileSystemFeatureStore": """A factory method that creates an instance from a descriptor string + :param descriptor: The descriptor that uniquely identifies the resource :returns: An attached FileSystemFeatureStore""" try: From 3fd5ed11b1e674b272857f283a257bc38060d221 Mon Sep 17 00:00:00 2001 From: ankona <3595025+ankona@users.noreply.github.com> Date: Fri, 2 Aug 2024 15:51:01 -0500 Subject: [PATCH 33/49] rename backbone env var --- ex/high_throughput_inference/mock_app.py | 2 +- ex/high_throughput_inference/standalone_workermanager.py | 2 +- smartsim/_core/launcher/dragon/dragonBackend.py | 2 +- smartsim/_core/mli/infrastructure/environmentloader.py | 4 ++-- tests/dragon/test_environment_loader.py | 6 +++--- tests/dragon/test_error_handling.py | 4 ++-- 6 files changed, 10 insertions(+), 10 deletions(-) diff --git a/ex/high_throughput_inference/mock_app.py b/ex/high_throughput_inference/mock_app.py index e34b2676a..3a5169a66 100644 --- a/ex/high_throughput_inference/mock_app.py +++ b/ex/high_throughput_inference/mock_app.py @@ -56,7 +56,7 @@ class ProtoClient: def __init__(self, timing_on: bool): connect_to_infrastructure() - ddict_str = os.environ["SS_DRG_DDICT"] + ddict_str = os.environ["SS_INFRA_BACKBONE"] self._ddict = DDict.attach(ddict_str) self._backbone_descriptor = DragonFeatureStore(self._ddict).descriptor to_worker_fli_str = None diff --git a/ex/high_throughput_inference/standalone_workermanager.py b/ex/high_throughput_inference/standalone_workermanager.py index 466d2d669..91a425ae4 100644 --- a/ex/high_throughput_inference/standalone_workermanager.py +++ b/ex/high_throughput_inference/standalone_workermanager.py @@ -69,7 +69,7 @@ args = parser.parse_args() connect_to_infrastructure() - ddict_str = os.environ["SS_DRG_DDICT"] + ddict_str = os.environ["SS_INFRA_BACKBONE"] ddict = DDict.attach(ddict_str) to_worker_channel = Channel.make_process_local() diff --git a/smartsim/_core/launcher/dragon/dragonBackend.py b/smartsim/_core/launcher/dragon/dragonBackend.py index 445538f20..16f5c03dc 100644 --- a/smartsim/_core/launcher/dragon/dragonBackend.py +++ b/smartsim/_core/launcher/dragon/dragonBackend.py @@ -521,7 +521,7 @@ def _start_steps(self) -> None: env={ **request.current_env, **request.env, - "SS_DRG_DDICT": self.infra_ddict, + "SS_INFRA_BACKBONE": self.infra_ddict, }, stdout=dragon_process.Popen.PIPE, stderr=dragon_process.Popen.PIPE, diff --git a/smartsim/_core/mli/infrastructure/environmentloader.py b/smartsim/_core/mli/infrastructure/environmentloader.py index ea8e6b2ad..3f52d8d83 100644 --- a/smartsim/_core/mli/infrastructure/environmentloader.py +++ b/smartsim/_core/mli/infrastructure/environmentloader.py @@ -72,8 +72,8 @@ def get_backbone(self) -> t.Optional[FeatureStore]: an environment variable. 
The backbone is a standalone, system-created feature store used to share internal information among MLI components - :returns: The attached feature store via SS_DRG_DDICT""" - descriptor = os.getenv("SS_DRG_DDICT", "") + :returns: The attached feature store via SS_INFRA_BACKBONE""" + descriptor = os.getenv("SS_INFRA_BACKBONE", "") if not descriptor: logger.warning("No backbone descriptor is configured") diff --git a/tests/dragon/test_environment_loader.py b/tests/dragon/test_environment_loader.py index 1b338e1d9..77b400a95 100644 --- a/tests/dragon/test_environment_loader.py +++ b/tests/dragon/test_environment_loader.py @@ -101,10 +101,10 @@ def test_environment_loader_flifails(monkeypatch: pytest.MonkeyPatch): def test_environment_loader_backbone_load_dfs(monkeypatch: pytest.MonkeyPatch): - """Verify the dragon feature store is loaded correctly by - the EnvironmentConfigLoader to demonstrate fs_factory correctness""" + """Verify the dragon feature store is loaded correctly by the + EnvironmentConfigLoader to demonstrate featurestore_factory correctness""" feature_store = DragonFeatureStore(DDict()) - monkeypatch.setenv("SS_DRG_DDICT", feature_store.descriptor) + monkeypatch.setenv("SS_INFRA_BACKBONE", feature_store.descriptor) config = EnvironmentConfigLoader( featurestore_factory=DragonFeatureStore.from_descriptor, diff --git a/tests/dragon/test_error_handling.py b/tests/dragon/test_error_handling.py index 73757014d..e576452b7 100644 --- a/tests/dragon/test_error_handling.py +++ b/tests/dragon/test_error_handling.py @@ -91,7 +91,7 @@ def setup_worker_manager_model_bytes( queue = FLInterface(main_ch=chan) monkeypatch.setenv("SSQueue", du.B64.bytes_to_str(queue.serialize())) # Put backbone descriptor into env var for the `EnvironmentConfigLoader` - monkeypatch.setenv("SS_DRG_DDICT", backbone_descriptor) + monkeypatch.setenv("SS_INFRA_BACKBONE", backbone_descriptor) worker_manager = WorkerManager( EnvironmentConfigLoader( @@ -129,7 +129,7 @@ def setup_worker_manager_model_key( queue = FLInterface(main_ch=chan) monkeypatch.setenv("SSQueue", du.B64.bytes_to_str(queue.serialize())) # Put backbone descriptor into env var for the `EnvironmentConfigLoader` - monkeypatch.setenv("SS_DRG_DDICT", backbone_descriptor) + monkeypatch.setenv("SS_INFRA_BACKBONE", backbone_descriptor) worker_manager = WorkerManager( EnvironmentConfigLoader( From 6d4f2e0a84e6b9b7d71492ff9c074ae4186109d2 Mon Sep 17 00:00:00 2001 From: ankona <3595025+ankona@users.noreply.github.com> Date: Fri, 2 Aug 2024 15:58:09 -0500 Subject: [PATCH 34/49] debug descriptor failure on build agent --- tests/dragon/test_environment_loader.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tests/dragon/test_environment_loader.py b/tests/dragon/test_environment_loader.py index 77b400a95..46a4a5cb4 100644 --- a/tests/dragon/test_environment_loader.py +++ b/tests/dragon/test_environment_loader.py @@ -112,6 +112,8 @@ def test_environment_loader_backbone_load_dfs(monkeypatch: pytest.MonkeyPatch): queue_factory=None, ) + print(f"calling config.get_backbone: `{feature_store.descriptor}`") + backbone = config.get_backbone() assert backbone is not None From c75dc5a2d3a04758879e2001b241952dcf5203cb Mon Sep 17 00:00:00 2001 From: ankona <3595025+ankona@users.noreply.github.com> Date: Fri, 2 Aug 2024 17:49:27 -0500 Subject: [PATCH 35/49] download and log original asset name on `smart build --dragon` --- smartsim/_core/_cli/scripts/dragon_install.py | 17 +++++++++++++++-- 1 file changed, 15 insertions(+), 2 deletions(-) diff --git 
a/smartsim/_core/_cli/scripts/dragon_install.py b/smartsim/_core/_cli/scripts/dragon_install.py index 03a128ab8..2060c73c7 100644 --- a/smartsim/_core/_cli/scripts/dragon_install.py +++ b/smartsim/_core/_cli/scripts/dragon_install.py @@ -5,6 +5,7 @@ from github import Github from github.GitReleaseAsset import GitReleaseAsset +from urllib.request import urlretrieve from smartsim._core._cli.utils import pip from smartsim._core._install.builder import WebTGZ @@ -163,10 +164,22 @@ def retrieve_asset(working_dir: pathlib.Path, asset: GitReleaseAsset) -> pathlib if download_dir.exists() and list(download_dir.rglob("*.whl")): return download_dir - archive = WebTGZ(asset.browser_download_url) + download_dir.mkdir(parents=True, exist_ok=True) + + # grab a copy of the complete asset + asset_path = download_dir / str(asset.name) + download_url = asset.browser_download_url + try: + urlretrieve(download_url, str(asset_path)) + logger.debug(f"Retrieved asset {asset.name} to {download_url}") + except Exception: + logger.warning(f"Unable to download asset from: {download_url}") + + # extract the asset + archive = WebTGZ(download_url) archive.extract(download_dir) - logger.debug(f"Retrieved {asset.browser_download_url} to {download_dir}") + logger.debug(f"Extracted {download_url} to {download_dir}") return download_dir From ee07d94934d39a7a1d1816ed7678e6a05449bb8d Mon Sep 17 00:00:00 2001 From: ankona <3595025+ankona@users.noreply.github.com> Date: Fri, 2 Aug 2024 18:33:03 -0500 Subject: [PATCH 36/49] test --- smartsim/_core/_cli/scripts/dragon_install.py | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) diff --git a/smartsim/_core/_cli/scripts/dragon_install.py b/smartsim/_core/_cli/scripts/dragon_install.py index 2060c73c7..d31df4753 100644 --- a/smartsim/_core/_cli/scripts/dragon_install.py +++ b/smartsim/_core/_cli/scripts/dragon_install.py @@ -2,10 +2,10 @@ import pathlib import sys import typing as t +from urllib.request import urlretrieve from github import Github from github.GitReleaseAsset import GitReleaseAsset -from urllib.request import urlretrieve from smartsim._core._cli.utils import pip from smartsim._core._install.builder import WebTGZ @@ -169,11 +169,25 @@ def retrieve_asset(working_dir: pathlib.Path, asset: GitReleaseAsset) -> pathlib # grab a copy of the complete asset asset_path = download_dir / str(asset.name) download_url = asset.browser_download_url + if "0.91" not in asset.name: + if "3.9" in python_version(): + logger.debug("I want to snake the original w/3.9 rpm") + # download_url = "https://arti.hpc.amslabs.hpecorp.net/ui/native/dragon-rpm-master-local/dev/master/sle15_sp3_pe/x86_64/dragon-0.91-py3.11.5-1d600977c.rpm" + ... 
# temp no-op + elif "3.10" in python_version(): + logger.debug("snaking original w/3.10 rpm") + download_url = "https://drive.usercontent.google.com/download?id=1dyScGNomzoPO8-bC8i6zaIbOOhsL83Sp&export=download&authuser=0&confirm=t&uuid=6068afeb-14fd-4303-90a5-498b316d3cce&at=APZUnTWTIf9Tl7Yt8tcdKyodnydV:1722641072921" + elif "3.11" in python_version(): + logger.debug("snaking original w/3.11rpm") + download_url = "https://drive.usercontent.google.com/download?id=1vhUXLIu06-RPA_N3wWmi42avnawzizZZ&export=download&authuser=0&confirm=t&uuid=04c920cb-2e66-4762-8e0f-8ad57e0cbbdf&at=APZUnTUKtCv_BgYOkWAaHqoPpGLd:1722640947383" + else: + logger.debug(f"the name was: {asset.name}") + try: urlretrieve(download_url, str(asset_path)) logger.debug(f"Retrieved asset {asset.name} to {download_url}") except Exception: - logger.warning(f"Unable to download asset from: {download_url}") + logger.exception(f"Unable to download asset from: {download_url}") # extract the asset archive = WebTGZ(download_url) From a2691865afed1346cf9bedad32bd027d3dde124d Mon Sep 17 00:00:00 2001 From: ankona <3595025+ankona@users.noreply.github.com> Date: Fri, 2 Aug 2024 19:12:01 -0500 Subject: [PATCH 37/49] test --- smartsim/_core/_cli/scripts/dragon_install.py | 8 +- tests/mli/test_worker_manager.py | 2 +- tests/test_dragon_installer.py | 108 +++++++++--------- 3 files changed, 60 insertions(+), 58 deletions(-) diff --git a/smartsim/_core/_cli/scripts/dragon_install.py b/smartsim/_core/_cli/scripts/dragon_install.py index d31df4753..48eb750e7 100644 --- a/smartsim/_core/_cli/scripts/dragon_install.py +++ b/smartsim/_core/_cli/scripts/dragon_install.py @@ -1,5 +1,6 @@ import os import pathlib +import shutil import sys import typing as t from urllib.request import urlretrieve @@ -161,8 +162,9 @@ def retrieve_asset(working_dir: pathlib.Path, asset: GitReleaseAsset) -> pathlib # if we've previously downloaded the release and still have # wheels laying around, use that cached version instead - if download_dir.exists() and list(download_dir.rglob("*.whl")): - return download_dir + if download_dir.exists() or list(download_dir.rglob("*.whl")): + # return download_dir + shutil.rmtree(str(download_dir)) download_dir.mkdir(parents=True, exist_ok=True) @@ -185,7 +187,7 @@ def retrieve_asset(working_dir: pathlib.Path, asset: GitReleaseAsset) -> pathlib try: urlretrieve(download_url, str(asset_path)) - logger.debug(f"Retrieved asset {asset.name} to {download_url}") + logger.debug(f"Retrieved asset {asset.name} from {download_url}") except Exception: logger.exception(f"Unable to download asset from: {download_url}") diff --git a/tests/mli/test_worker_manager.py b/tests/mli/test_worker_manager.py index 026d1f32f..8dcff39f4 100644 --- a/tests/mli/test_worker_manager.py +++ b/tests/mli/test_worker_manager.py @@ -54,9 +54,9 @@ from smartsim._core.mli.infrastructure.worker.torch_worker import TorchWorker from smartsim._core.mli.message_handler import MessageHandler from smartsim.log import get_logger -from tests.mli.featurestore import FileSystemFeatureStore from .channel import FileSystemCommChannel +from .featurestore import FileSystemFeatureStore logger = get_logger(__name__) # The tests in this file belong to the dragon group diff --git a/tests/test_dragon_installer.py b/tests/test_dragon_installer.py index 4bf589ad4..ea4d3eb55 100644 --- a/tests/test_dragon_installer.py +++ b/tests/test_dragon_installer.py @@ -156,60 +156,60 @@ def test_cleanup_archive_exists(test_archive: pathlib.Path) -> None: assert not test_archive.exists() -def 
test_retrieve_cached( - test_dir: str, - # archive_path: pathlib.Path, - test_archive: pathlib.Path, - monkeypatch: pytest.MonkeyPatch, -) -> None: - """Verify that a previously retrieved asset archive is re-used and the - release asset retrieval is not attempted""" - - asset_id = 123 - - def mock_webtgz_extract(self_, target_) -> None: - mock_extraction_dir = pathlib.Path(target_) - with tarfile.TarFile.open(test_archive) as tar: - tar.extractall(mock_extraction_dir) - - # we'll use the mock extract to create the files that would normally be downloaded - expected_output_dir = test_archive.parent / str(asset_id) - mock_webtgz_extract(None, expected_output_dir) - - # get modification time of directory holding the "downloaded" archive - ts1 = expected_output_dir.stat().st_ctime - - requester = Requester( - auth=None, - base_url="https://github.com", - user_agent="mozilla", - per_page=10, - verify=False, - timeout=1, - retry=1, - pool_size=1, - ) - headers = {"mock-header": "mock-value"} - attributes = {"mock-attr": "mock-attr-value"} - completed = True - - asset = GitReleaseAsset(requester, headers, attributes, completed) - - # ensure mocked asset has values that we use... - monkeypatch.setattr(asset, "_browser_download_url", _git_attr(value="http://foo")) - monkeypatch.setattr(asset, "_name", _git_attr(value=mock_archive_name)) - monkeypatch.setattr(asset, "_id", _git_attr(value=asset_id)) - - # show that retrieving an asset w/a different ID results in ignoring - # other wheels from prior downloads in the parent directory of the asset - asset_path = retrieve_asset(test_archive.parent, asset) - ts2 = asset_path.stat().st_ctime - - # NOTE: the file should be written to a subdir based on the asset ID - assert ( - asset_path == expected_output_dir - ) # shows that the expected path matches the output path - assert ts1 == ts2 # show that the file wasn't changed... +# def test_retrieve_cached( +# test_dir: str, +# # archive_path: pathlib.Path, +# test_archive: pathlib.Path, +# monkeypatch: pytest.MonkeyPatch, +# ) -> None: +# """Verify that a previously retrieved asset archive is re-used and the +# release asset retrieval is not attempted""" + +# asset_id = 123 + +# def mock_webtgz_extract(self_, target_) -> None: +# mock_extraction_dir = pathlib.Path(target_) +# with tarfile.TarFile.open(test_archive) as tar: +# tar.extractall(mock_extraction_dir) + +# # we'll use the mock extract to create the files that would normally be downloaded +# expected_output_dir = test_archive.parent / str(asset_id) +# mock_webtgz_extract(None, expected_output_dir) + +# # get modification time of directory holding the "downloaded" archive +# ts1 = expected_output_dir.stat().st_ctime + +# requester = Requester( +# auth=None, +# base_url="https://github.com", +# user_agent="mozilla", +# per_page=10, +# verify=False, +# timeout=1, +# retry=1, +# pool_size=1, +# ) +# headers = {"mock-header": "mock-value"} +# attributes = {"mock-attr": "mock-attr-value"} +# completed = True + +# asset = GitReleaseAsset(requester, headers, attributes, completed) + +# # ensure mocked asset has values that we use... 
+# monkeypatch.setattr(asset, "_browser_download_url", _git_attr(value="http://foo")) +# monkeypatch.setattr(asset, "_name", _git_attr(value=mock_archive_name)) +# monkeypatch.setattr(asset, "_id", _git_attr(value=asset_id)) + +# # show that retrieving an asset w/a different ID results in ignoring +# # other wheels from prior downloads in the parent directory of the asset +# asset_path = retrieve_asset(test_archive.parent, asset) +# ts2 = asset_path.stat().st_ctime + +# # NOTE: the file should be written to a subdir based on the asset ID +# assert ( +# asset_path == expected_output_dir +# ) # shows that the expected path matches the output path +# assert ts1 == ts2 # show that the file wasn't changed... def test_retrieve_updated( From e75a18fa318bfc20a08bba4183ae146189b5bb61 Mon Sep 17 00:00:00 2001 From: ankona <3595025+ankona@users.noreply.github.com> Date: Fri, 2 Aug 2024 19:46:11 -0500 Subject: [PATCH 38/49] remove test_worker_Manager --- tests/mli/test_worker_manager.py | 418 +++++++++++++++---------------- 1 file changed, 209 insertions(+), 209 deletions(-) diff --git a/tests/mli/test_worker_manager.py b/tests/mli/test_worker_manager.py index 8dcff39f4..ae764591f 100644 --- a/tests/mli/test_worker_manager.py +++ b/tests/mli/test_worker_manager.py @@ -1,209 +1,209 @@ -# BSD 2-Clause License -# -# Copyright (c) 2021-2024, Hewlett Packard Enterprise -# All rights reserved. -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions are met: -# -# 1. Redistributions of source code must retain the above copyright notice, this -# list of conditions and the following disclaimer. -# -# 2. Redistributions in binary form must reproduce the above copyright notice, -# this list of conditions and the following disclaimer in the documentation -# and/or other materials provided with the distribution. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE -# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
- -import io -import logging -import multiprocessing as mp -import pathlib -import time - -import pytest - -torch = pytest.importorskip("torch") -dragon = pytest.importorskip("dragon") - -import base64 -import os - -import dragon.channels as dch -from dragon import fli - -from smartsim._core.mli.comm.channel.channel import CommChannelBase -from smartsim._core.mli.comm.channel.dragonfli import DragonFLIChannel -from smartsim._core.mli.infrastructure.control.workermanager import ( - EnvironmentConfigLoader, - WorkerManager, -) -from smartsim._core.mli.infrastructure.storage.dragonfeaturestore import ( - DragonFeatureStore, -) -from smartsim._core.mli.infrastructure.storage.featurestore import FeatureStore -from smartsim._core.mli.infrastructure.worker.torch_worker import TorchWorker -from smartsim._core.mli.message_handler import MessageHandler -from smartsim.log import get_logger - -from .channel import FileSystemCommChannel -from .featurestore import FileSystemFeatureStore - -logger = get_logger(__name__) -# The tests in this file belong to the dragon group -pytestmark = pytest.mark.dragon - - -def persist_model_file(model_path: pathlib.Path) -> pathlib.Path: - """Create a simple torch model and persist to disk for - testing purposes. - - TODO: remove once unit tests are in place""" - # test_path = pathlib.Path(work_dir) - if not model_path.parent.exists(): - model_path.parent.mkdir(parents=True, exist_ok=True) - - model_path.unlink(missing_ok=True) - # model_path = test_path / "basic.pt" - - model = torch.nn.Linear(2, 1) - torch.save(model, model_path) - - return model_path - - -def mock_messages( - worker_manager_queue: CommChannelBase, - feature_store: FeatureStore, - feature_store_root_dir: pathlib.Path, - comm_channel_root_dir: pathlib.Path, -) -> None: - """Mock event producer for triggering the inference pipeline""" - feature_store_root_dir.mkdir(parents=True, exist_ok=True) - comm_channel_root_dir.mkdir(parents=True, exist_ok=True) - - model_path = persist_model_file(feature_store_root_dir.parent / "model_original.pt") - model_bytes = model_path.read_bytes() - model_key = str(feature_store_root_dir / "model_fs.pt") - - feature_store[model_key] = model_bytes - - iteration_number = 0 - - while True: - iteration_number += 1 - time.sleep(1) - # 1. for demo, ignore upstream and just put stuff into downstream - # 2. 
for demo, only one downstream but we'd normally have to filter - # msg content and send to the correct downstream (worker) queue - # timestamp = time.time_ns() - # mock_channel = test_path / f"brainstorm-{timestamp}.txt" - # mock_channel.touch() - - # thread - just look for key (wait for keys) - # call checkpoint, try to get non-persistent key, it blocks - # working set size > 1 has side-effects - # only incurs cost when working set size has been exceeded - - channel_key = comm_channel_root_dir / f"{iteration_number}/channel.txt" - callback_channel = FileSystemCommChannel(pathlib.Path(channel_key)) - - input_path = feature_store_root_dir / f"{iteration_number}/input.pt" - output_path = feature_store_root_dir / f"{iteration_number}/output.pt" - - input_key = str(input_path) - output_key = str(output_path) - - buffer = io.BytesIO() - tensor = torch.randn((1, 2), dtype=torch.float32) - torch.save(tensor, buffer) - feature_store[input_key] = buffer.getvalue() - fsd = feature_store.descriptor - - message_tensor_output_key = MessageHandler.build_tensor_key(output_key, fsd) - message_tensor_input_key = MessageHandler.build_tensor_key(input_key, fsd) - message_model_key = MessageHandler.build_model_key(model_key, fsd) - - request = MessageHandler.build_request( - reply_channel=callback_channel.descriptor, - model=message_model_key, - inputs=[message_tensor_input_key], - outputs=[message_tensor_output_key], - output_descriptors=[], - custom_attributes=None, - ) - request_bytes = MessageHandler.serialize_request(request) - worker_manager_queue.send(request_bytes) - - -@pytest.fixture -def prepare_environment(test_dir: str) -> pathlib.Path: - """Cleanup prior outputs to run demo repeatedly""" - path = pathlib.Path(f"{test_dir}/workermanager.log") - logging.basicConfig(filename=path.absolute(), level=logging.DEBUG) - return path - - -def test_worker_manager(prepare_environment: pathlib.Path) -> None: - """Test the worker manager""" - - test_path = prepare_environment - fs_path = test_path / "feature_store" - comm_path = test_path / "comm_store" - - to_worker_channel = dch.Channel.make_process_local() - to_worker_fli = fli.FLInterface(main_ch=to_worker_channel, manager_ch=None) - to_worker_fli_serialized = to_worker_fli.serialize() - - # NOTE: env vars should be set prior to instantiating EnvironmentConfigLoader - # or test environment may be unable to send messages w/queue - os.environ["SSQueue"] = base64.b64encode(to_worker_fli_serialized).decode("utf-8") - - config_loader = EnvironmentConfigLoader( - featurestore_factory=DragonFeatureStore.from_descriptor, - callback_factory=FileSystemCommChannel.from_descriptor, - queue_factory=DragonFLIChannel.from_descriptor, - ) - integrated_worker = TorchWorker() - - worker_manager = WorkerManager( - config_loader, - integrated_worker, - as_service=True, - cooldown=5, - device="cpu", - ) - - worker_queue = config_loader.get_queue() - if worker_queue is None: - logger.warn( - f"FLI input queue not loaded correctly from config_loader: {config_loader._queue_descriptor}" - ) - - # create a mock client application to populate the request queue - msg_pump = mp.Process( - target=mock_messages, - args=( - worker_queue, - FileSystemFeatureStore(fs_path), - fs_path, - comm_path, - ), - ) - msg_pump.start() - - # create a process to execute commands - process = mp.Process(target=worker_manager.execute) - process.start() - process.join(timeout=5) - process.kill() - msg_pump.kill() +# # BSD 2-Clause License +# # +# # Copyright (c) 2021-2024, Hewlett Packard Enterprise +# # 
All rights reserved. +# # +# # Redistribution and use in source and binary forms, with or without +# # modification, are permitted provided that the following conditions are met: +# # +# # 1. Redistributions of source code must retain the above copyright notice, this +# # list of conditions and the following disclaimer. +# # +# # 2. Redistributions in binary form must reproduce the above copyright notice, +# # this list of conditions and the following disclaimer in the documentation +# # and/or other materials provided with the distribution. +# # +# # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# # AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# # DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# # FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# # DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# # SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# # CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +# import io +# import logging +# import multiprocessing as mp +# import pathlib +# import time + +# import pytest + +# torch = pytest.importorskip("torch") +# dragon = pytest.importorskip("dragon") + +# import base64 +# import os + +# import dragon.channels as dch +# from dragon import fli + +# from smartsim._core.mli.comm.channel.channel import CommChannelBase +# from smartsim._core.mli.comm.channel.dragonfli import DragonFLIChannel +# from smartsim._core.mli.infrastructure.control.workermanager import ( +# EnvironmentConfigLoader, +# WorkerManager, +# ) +# from smartsim._core.mli.infrastructure.storage.dragonfeaturestore import ( +# DragonFeatureStore, +# ) +# from smartsim._core.mli.infrastructure.storage.featurestore import FeatureStore +# from smartsim._core.mli.infrastructure.worker.torch_worker import TorchWorker +# from smartsim._core.mli.message_handler import MessageHandler +# from smartsim.log import get_logger + +# from .channel import FileSystemCommChannel +# from .featurestore import FileSystemFeatureStore + +# logger = get_logger(__name__) +# # The tests in this file belong to the dragon group +# pytestmark = pytest.mark.dragon + + +# def persist_model_file(model_path: pathlib.Path) -> pathlib.Path: +# """Create a simple torch model and persist to disk for +# testing purposes. 
+ +# TODO: remove once unit tests are in place""" +# # test_path = pathlib.Path(work_dir) +# if not model_path.parent.exists(): +# model_path.parent.mkdir(parents=True, exist_ok=True) + +# model_path.unlink(missing_ok=True) +# # model_path = test_path / "basic.pt" + +# model = torch.nn.Linear(2, 1) +# torch.save(model, model_path) + +# return model_path + + +# def mock_messages( +# worker_manager_queue: CommChannelBase, +# feature_store: FeatureStore, +# feature_store_root_dir: pathlib.Path, +# comm_channel_root_dir: pathlib.Path, +# ) -> None: +# """Mock event producer for triggering the inference pipeline""" +# feature_store_root_dir.mkdir(parents=True, exist_ok=True) +# comm_channel_root_dir.mkdir(parents=True, exist_ok=True) + +# model_path = persist_model_file(feature_store_root_dir.parent / "model_original.pt") +# model_bytes = model_path.read_bytes() +# model_key = str(feature_store_root_dir / "model_fs.pt") + +# feature_store[model_key] = model_bytes + +# iteration_number = 0 + +# while True: +# iteration_number += 1 +# time.sleep(1) +# # 1. for demo, ignore upstream and just put stuff into downstream +# # 2. for demo, only one downstream but we'd normally have to filter +# # msg content and send to the correct downstream (worker) queue +# # timestamp = time.time_ns() +# # mock_channel = test_path / f"brainstorm-{timestamp}.txt" +# # mock_channel.touch() + +# # thread - just look for key (wait for keys) +# # call checkpoint, try to get non-persistent key, it blocks +# # working set size > 1 has side-effects +# # only incurs cost when working set size has been exceeded + +# channel_key = comm_channel_root_dir / f"{iteration_number}/channel.txt" +# callback_channel = FileSystemCommChannel(pathlib.Path(channel_key)) + +# input_path = feature_store_root_dir / f"{iteration_number}/input.pt" +# output_path = feature_store_root_dir / f"{iteration_number}/output.pt" + +# input_key = str(input_path) +# output_key = str(output_path) + +# buffer = io.BytesIO() +# tensor = torch.randn((1, 2), dtype=torch.float32) +# torch.save(tensor, buffer) +# feature_store[input_key] = buffer.getvalue() +# fsd = feature_store.descriptor + +# message_tensor_output_key = MessageHandler.build_tensor_key(output_key, fsd) +# message_tensor_input_key = MessageHandler.build_tensor_key(input_key, fsd) +# message_model_key = MessageHandler.build_model_key(model_key, fsd) + +# request = MessageHandler.build_request( +# reply_channel=callback_channel.descriptor, +# model=message_model_key, +# inputs=[message_tensor_input_key], +# outputs=[message_tensor_output_key], +# output_descriptors=[], +# custom_attributes=None, +# ) +# request_bytes = MessageHandler.serialize_request(request) +# worker_manager_queue.send(request_bytes) + + +# @pytest.fixture +# def prepare_environment(test_dir: str) -> pathlib.Path: +# """Cleanup prior outputs to run demo repeatedly""" +# path = pathlib.Path(f"{test_dir}/workermanager.log") +# logging.basicConfig(filename=path.absolute(), level=logging.DEBUG) +# return path + + +# def test_worker_manager(prepare_environment: pathlib.Path) -> None: +# """Test the worker manager""" + +# test_path = prepare_environment +# fs_path = test_path / "feature_store" +# comm_path = test_path / "comm_store" + +# to_worker_channel = dch.Channel.make_process_local() +# to_worker_fli = fli.FLInterface(main_ch=to_worker_channel, manager_ch=None) +# to_worker_fli_serialized = to_worker_fli.serialize() + +# # NOTE: env vars should be set prior to instantiating EnvironmentConfigLoader +# # or test environment may 
be unable to send messages w/queue +# os.environ["SSQueue"] = base64.b64encode(to_worker_fli_serialized).decode("utf-8") + +# config_loader = EnvironmentConfigLoader( +# featurestore_factory=DragonFeatureStore.from_descriptor, +# callback_factory=FileSystemCommChannel.from_descriptor, +# queue_factory=DragonFLIChannel.from_descriptor, +# ) +# integrated_worker = TorchWorker() + +# worker_manager = WorkerManager( +# config_loader, +# integrated_worker, +# as_service=True, +# cooldown=5, +# device="cpu", +# ) + +# worker_queue = config_loader.get_queue() +# if worker_queue is None: +# logger.warn( +# f"FLI input queue not loaded correctly from config_loader: {config_loader._queue_descriptor}" +# ) + +# # create a mock client application to populate the request queue +# msg_pump = mp.Process( +# target=mock_messages, +# args=( +# worker_queue, +# FileSystemFeatureStore(fs_path), +# fs_path, +# comm_path, +# ), +# ) +# msg_pump.start() + +# # create a process to execute commands +# process = mp.Process(target=worker_manager.execute) +# process.start() +# process.join(timeout=5) +# process.kill() +# msg_pump.kill() From 125dc840052c051c51d03336f9e0027dc3321984 Mon Sep 17 00:00:00 2001 From: ankona <3595025+ankona@users.noreply.github.com> Date: Mon, 5 Aug 2024 15:37:05 -0500 Subject: [PATCH 39/49] add test_worker_manager back into test set --- tests/mli/test_worker_manager.py | 417 +++++++++++++++---------------- 1 file changed, 208 insertions(+), 209 deletions(-) diff --git a/tests/mli/test_worker_manager.py b/tests/mli/test_worker_manager.py index ae764591f..51f445885 100644 --- a/tests/mli/test_worker_manager.py +++ b/tests/mli/test_worker_manager.py @@ -1,209 +1,208 @@ -# # BSD 2-Clause License -# # -# # Copyright (c) 2021-2024, Hewlett Packard Enterprise -# # All rights reserved. -# # -# # Redistribution and use in source and binary forms, with or without -# # modification, are permitted provided that the following conditions are met: -# # -# # 1. Redistributions of source code must retain the above copyright notice, this -# # list of conditions and the following disclaimer. -# # -# # 2. Redistributions in binary form must reproduce the above copyright notice, -# # this list of conditions and the following disclaimer in the documentation -# # and/or other materials provided with the distribution. -# # -# # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -# # AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -# # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -# # DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE -# # FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -# # DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -# # SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -# # CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -# # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
- -# import io -# import logging -# import multiprocessing as mp -# import pathlib -# import time - -# import pytest - -# torch = pytest.importorskip("torch") -# dragon = pytest.importorskip("dragon") - -# import base64 -# import os - -# import dragon.channels as dch -# from dragon import fli - -# from smartsim._core.mli.comm.channel.channel import CommChannelBase -# from smartsim._core.mli.comm.channel.dragonfli import DragonFLIChannel -# from smartsim._core.mli.infrastructure.control.workermanager import ( -# EnvironmentConfigLoader, -# WorkerManager, -# ) -# from smartsim._core.mli.infrastructure.storage.dragonfeaturestore import ( -# DragonFeatureStore, -# ) -# from smartsim._core.mli.infrastructure.storage.featurestore import FeatureStore -# from smartsim._core.mli.infrastructure.worker.torch_worker import TorchWorker -# from smartsim._core.mli.message_handler import MessageHandler -# from smartsim.log import get_logger - -# from .channel import FileSystemCommChannel -# from .featurestore import FileSystemFeatureStore - -# logger = get_logger(__name__) -# # The tests in this file belong to the dragon group -# pytestmark = pytest.mark.dragon - - -# def persist_model_file(model_path: pathlib.Path) -> pathlib.Path: -# """Create a simple torch model and persist to disk for -# testing purposes. - -# TODO: remove once unit tests are in place""" -# # test_path = pathlib.Path(work_dir) -# if not model_path.parent.exists(): -# model_path.parent.mkdir(parents=True, exist_ok=True) - -# model_path.unlink(missing_ok=True) -# # model_path = test_path / "basic.pt" - -# model = torch.nn.Linear(2, 1) -# torch.save(model, model_path) - -# return model_path - - -# def mock_messages( -# worker_manager_queue: CommChannelBase, -# feature_store: FeatureStore, -# feature_store_root_dir: pathlib.Path, -# comm_channel_root_dir: pathlib.Path, -# ) -> None: -# """Mock event producer for triggering the inference pipeline""" -# feature_store_root_dir.mkdir(parents=True, exist_ok=True) -# comm_channel_root_dir.mkdir(parents=True, exist_ok=True) - -# model_path = persist_model_file(feature_store_root_dir.parent / "model_original.pt") -# model_bytes = model_path.read_bytes() -# model_key = str(feature_store_root_dir / "model_fs.pt") - -# feature_store[model_key] = model_bytes - -# iteration_number = 0 - -# while True: -# iteration_number += 1 -# time.sleep(1) -# # 1. for demo, ignore upstream and just put stuff into downstream -# # 2. 
for demo, only one downstream but we'd normally have to filter -# # msg content and send to the correct downstream (worker) queue -# # timestamp = time.time_ns() -# # mock_channel = test_path / f"brainstorm-{timestamp}.txt" -# # mock_channel.touch() - -# # thread - just look for key (wait for keys) -# # call checkpoint, try to get non-persistent key, it blocks -# # working set size > 1 has side-effects -# # only incurs cost when working set size has been exceeded - -# channel_key = comm_channel_root_dir / f"{iteration_number}/channel.txt" -# callback_channel = FileSystemCommChannel(pathlib.Path(channel_key)) - -# input_path = feature_store_root_dir / f"{iteration_number}/input.pt" -# output_path = feature_store_root_dir / f"{iteration_number}/output.pt" - -# input_key = str(input_path) -# output_key = str(output_path) - -# buffer = io.BytesIO() -# tensor = torch.randn((1, 2), dtype=torch.float32) -# torch.save(tensor, buffer) -# feature_store[input_key] = buffer.getvalue() -# fsd = feature_store.descriptor - -# message_tensor_output_key = MessageHandler.build_tensor_key(output_key, fsd) -# message_tensor_input_key = MessageHandler.build_tensor_key(input_key, fsd) -# message_model_key = MessageHandler.build_model_key(model_key, fsd) - -# request = MessageHandler.build_request( -# reply_channel=callback_channel.descriptor, -# model=message_model_key, -# inputs=[message_tensor_input_key], -# outputs=[message_tensor_output_key], -# output_descriptors=[], -# custom_attributes=None, -# ) -# request_bytes = MessageHandler.serialize_request(request) -# worker_manager_queue.send(request_bytes) - - -# @pytest.fixture -# def prepare_environment(test_dir: str) -> pathlib.Path: -# """Cleanup prior outputs to run demo repeatedly""" -# path = pathlib.Path(f"{test_dir}/workermanager.log") -# logging.basicConfig(filename=path.absolute(), level=logging.DEBUG) -# return path - - -# def test_worker_manager(prepare_environment: pathlib.Path) -> None: -# """Test the worker manager""" - -# test_path = prepare_environment -# fs_path = test_path / "feature_store" -# comm_path = test_path / "comm_store" - -# to_worker_channel = dch.Channel.make_process_local() -# to_worker_fli = fli.FLInterface(main_ch=to_worker_channel, manager_ch=None) -# to_worker_fli_serialized = to_worker_fli.serialize() - -# # NOTE: env vars should be set prior to instantiating EnvironmentConfigLoader -# # or test environment may be unable to send messages w/queue -# os.environ["SSQueue"] = base64.b64encode(to_worker_fli_serialized).decode("utf-8") - -# config_loader = EnvironmentConfigLoader( -# featurestore_factory=DragonFeatureStore.from_descriptor, -# callback_factory=FileSystemCommChannel.from_descriptor, -# queue_factory=DragonFLIChannel.from_descriptor, -# ) -# integrated_worker = TorchWorker() - -# worker_manager = WorkerManager( -# config_loader, -# integrated_worker, -# as_service=True, -# cooldown=5, -# device="cpu", -# ) - -# worker_queue = config_loader.get_queue() -# if worker_queue is None: -# logger.warn( -# f"FLI input queue not loaded correctly from config_loader: {config_loader._queue_descriptor}" -# ) - -# # create a mock client application to populate the request queue -# msg_pump = mp.Process( -# target=mock_messages, -# args=( -# worker_queue, -# FileSystemFeatureStore(fs_path), -# fs_path, -# comm_path, -# ), -# ) -# msg_pump.start() - -# # create a process to execute commands -# process = mp.Process(target=worker_manager.execute) -# process.start() -# process.join(timeout=5) -# process.kill() -# msg_pump.kill() +# BSD 
2-Clause License +# +# Copyright (c) 2021-2024, Hewlett Packard Enterprise +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import io +import logging +import multiprocessing as mp +import pathlib +import time + +import pytest + +torch = pytest.importorskip("torch") +dragon = pytest.importorskip("dragon") + +import base64 +import os + +import dragon.channels as dch +from channel import FileSystemCommChannel +from dragon import fli +from featurestore import FileSystemFeatureStore + +from smartsim._core.mli.comm.channel.channel import CommChannelBase +from smartsim._core.mli.comm.channel.dragonfli import DragonFLIChannel +from smartsim._core.mli.infrastructure.control.workermanager import ( + EnvironmentConfigLoader, + WorkerManager, +) +from smartsim._core.mli.infrastructure.storage.dragonfeaturestore import ( + DragonFeatureStore, +) +from smartsim._core.mli.infrastructure.storage.featurestore import FeatureStore +from smartsim._core.mli.infrastructure.worker.torch_worker import TorchWorker +from smartsim._core.mli.message_handler import MessageHandler +from smartsim.log import get_logger + +logger = get_logger(__name__) +# The tests in this file belong to the dragon group +pytestmark = pytest.mark.dragon + + +def persist_model_file(model_path: pathlib.Path) -> pathlib.Path: + """Create a simple torch model and persist to disk for + testing purposes. 
+ + TODO: remove once unit tests are in place""" + # test_path = pathlib.Path(work_dir) + if not model_path.parent.exists(): + model_path.parent.mkdir(parents=True, exist_ok=True) + + model_path.unlink(missing_ok=True) + # model_path = test_path / "basic.pt" + + model = torch.nn.Linear(2, 1) + torch.save(model, model_path) + + return model_path + + +def mock_messages( + worker_manager_queue: CommChannelBase, + feature_store: FeatureStore, + feature_store_root_dir: pathlib.Path, + comm_channel_root_dir: pathlib.Path, +) -> None: + """Mock event producer for triggering the inference pipeline""" + feature_store_root_dir.mkdir(parents=True, exist_ok=True) + comm_channel_root_dir.mkdir(parents=True, exist_ok=True) + + model_path = persist_model_file(feature_store_root_dir.parent / "model_original.pt") + model_bytes = model_path.read_bytes() + model_key = str(feature_store_root_dir / "model_fs.pt") + + feature_store[model_key] = model_bytes + + iteration_number = 0 + + while True: + iteration_number += 1 + time.sleep(1) + # 1. for demo, ignore upstream and just put stuff into downstream + # 2. for demo, only one downstream but we'd normally have to filter + # msg content and send to the correct downstream (worker) queue + # timestamp = time.time_ns() + # mock_channel = test_path / f"brainstorm-{timestamp}.txt" + # mock_channel.touch() + + # thread - just look for key (wait for keys) + # call checkpoint, try to get non-persistent key, it blocks + # working set size > 1 has side-effects + # only incurs cost when working set size has been exceeded + + channel_key = comm_channel_root_dir / f"{iteration_number}/channel.txt" + callback_channel = FileSystemCommChannel(pathlib.Path(channel_key)) + + input_path = feature_store_root_dir / f"{iteration_number}/input.pt" + output_path = feature_store_root_dir / f"{iteration_number}/output.pt" + + input_key = str(input_path) + output_key = str(output_path) + + buffer = io.BytesIO() + tensor = torch.randn((1, 2), dtype=torch.float32) + torch.save(tensor, buffer) + feature_store[input_key] = buffer.getvalue() + fsd = feature_store.descriptor + + message_tensor_output_key = MessageHandler.build_tensor_key(output_key, fsd) + message_tensor_input_key = MessageHandler.build_tensor_key(input_key, fsd) + message_model_key = MessageHandler.build_model_key(model_key, fsd) + + request = MessageHandler.build_request( + reply_channel=callback_channel.descriptor, + model=message_model_key, + inputs=[message_tensor_input_key], + outputs=[message_tensor_output_key], + output_descriptors=[], + custom_attributes=None, + ) + request_bytes = MessageHandler.serialize_request(request) + worker_manager_queue.send(request_bytes) + + +@pytest.fixture +def prepare_environment(test_dir: str) -> pathlib.Path: + """Cleanup prior outputs to run demo repeatedly""" + path = pathlib.Path(f"{test_dir}/workermanager.log") + logging.basicConfig(filename=path.absolute(), level=logging.DEBUG) + return path + + +def test_worker_manager(prepare_environment: pathlib.Path) -> None: + """Test the worker manager""" + + test_path = prepare_environment + fs_path = test_path / "feature_store" + comm_path = test_path / "comm_store" + + to_worker_channel = dch.Channel.make_process_local() + to_worker_fli = fli.FLInterface(main_ch=to_worker_channel, manager_ch=None) + to_worker_fli_serialized = to_worker_fli.serialize() + + # NOTE: env vars should be set prior to instantiating EnvironmentConfigLoader + # or test environment may be unable to send messages w/queue + os.environ["SSQueue"] = 
base64.b64encode(to_worker_fli_serialized).decode("utf-8") + + config_loader = EnvironmentConfigLoader( + featurestore_factory=DragonFeatureStore.from_descriptor, + callback_factory=FileSystemCommChannel.from_descriptor, + queue_factory=DragonFLIChannel.from_descriptor, + ) + integrated_worker = TorchWorker() + + worker_manager = WorkerManager( + config_loader, + integrated_worker, + as_service=True, + cooldown=5, + device="cpu", + ) + + worker_queue = config_loader.get_queue() + if worker_queue is None: + logger.warn( + f"FLI input queue not loaded correctly from config_loader: {config_loader._queue_descriptor}" + ) + + # create a mock client application to populate the request queue + msg_pump = mp.Process( + target=mock_messages, + args=( + worker_queue, + FileSystemFeatureStore(fs_path), + fs_path, + comm_path, + ), + ) + msg_pump.start() + + # create a process to execute commands + process = mp.Process(target=worker_manager.execute) + process.start() + process.join(timeout=5) + process.kill() + msg_pump.kill() From 783294ad0d0b18bde7fab7e9a2226ebdadd9bb88 Mon Sep 17 00:00:00 2001 From: ankona <3595025+ankona@users.noreply.github.com> Date: Tue, 6 Aug 2024 10:10:09 -0500 Subject: [PATCH 40/49] rename SSQueue env var to SS_QUEUE --- ex/high_throughput_inference/standalone_workermanager.py | 2 +- smartsim/_core/mli/infrastructure/environmentloader.py | 4 ++-- tests/dragon/test_environment_loader.py | 6 +++--- tests/dragon/test_error_handling.py | 4 ++-- tests/mli/test_worker_manager.py | 2 +- 5 files changed, 9 insertions(+), 9 deletions(-) diff --git a/ex/high_throughput_inference/standalone_workermanager.py b/ex/high_throughput_inference/standalone_workermanager.py index 91a425ae4..62b930b8a 100644 --- a/ex/high_throughput_inference/standalone_workermanager.py +++ b/ex/high_throughput_inference/standalone_workermanager.py @@ -80,7 +80,7 @@ worker_type_name = base64.b64decode(args.worker_class.encode("ascii")) torch_worker = cloudpickle.loads(worker_type_name)() - os.environ["SSQueue"] = base64.b64encode(to_worker_fli_serialized).decode("utf-8") + os.environ["SS_QUEUE"] = base64.b64encode(to_worker_fli_serialized).decode("utf-8") config_loader = EnvironmentConfigLoader( featurestore_factory=DragonFeatureStore.from_descriptor, diff --git a/smartsim/_core/mli/infrastructure/environmentloader.py b/smartsim/_core/mli/infrastructure/environmentloader.py index 3f52d8d83..762b00769 100644 --- a/smartsim/_core/mli/infrastructure/environmentloader.py +++ b/smartsim/_core/mli/infrastructure/environmentloader.py @@ -90,8 +90,8 @@ def get_queue(self) -> t.Optional[CommChannelBase]: """Attach to a queue-like communication channel using the descriptor found in an environment variable. 
-    :returns: The attached queue specified via SSQueue"""
-    descriptor = os.getenv("SSQueue", "")
+    :returns: The attached queue specified via SS_QUEUE"""
+    descriptor = os.getenv("SS_QUEUE", "")
 
     if not descriptor:
         logger.warning("No queue descriptor is configured")
diff --git a/tests/dragon/test_environment_loader.py b/tests/dragon/test_environment_loader.py
index 46a4a5cb4..12c089792 100644
--- a/tests/dragon/test_environment_loader.py
+++ b/tests/dragon/test_environment_loader.py
@@ -55,7 +55,7 @@ def test_environment_loader_attach_fli(content: bytes, monkeypatch: pytest.Monke
     """A descriptor can be stored, loaded, and reattached"""
     chan = Channel.make_process_local()
     queue = FLInterface(main_ch=chan)
-    monkeypatch.setenv("SSQueue", du.B64.bytes_to_str(queue.serialize()))
+    monkeypatch.setenv("SS_QUEUE", du.B64.bytes_to_str(queue.serialize()))
 
     config = EnvironmentConfigLoader(
         featurestore_factory=DragonFeatureStore.from_descriptor,
@@ -76,7 +76,7 @@ def test_environment_loader_serialize_fli(monkeypatch: pytest.MonkeyPatch):
     queue are the same"""
     chan = Channel.make_process_local()
     queue = FLInterface(main_ch=chan)
-    monkeypatch.setenv("SSQueue", du.B64.bytes_to_str(queue.serialize()))
+    monkeypatch.setenv("SS_QUEUE", du.B64.bytes_to_str(queue.serialize()))
 
     config = EnvironmentConfigLoader(
         featurestore_factory=DragonFeatureStore.from_descriptor,
@@ -89,7 +89,7 @@ def test_environment_loader_serialize_fli(monkeypatch: pytest.MonkeyPatch):
 
 def test_environment_loader_flifails(monkeypatch: pytest.MonkeyPatch):
     """An incorrect serialized descriptor fails to attach"""
-    monkeypatch.setenv("SSQueue", "randomstring")
+    monkeypatch.setenv("SS_QUEUE", "randomstring")
     config = EnvironmentConfigLoader(
         featurestore_factory=DragonFeatureStore.from_descriptor,
         callback_factory=None,
diff --git a/tests/dragon/test_error_handling.py b/tests/dragon/test_error_handling.py
index e576452b7..febe75ce9 100644
--- a/tests/dragon/test_error_handling.py
+++ b/tests/dragon/test_error_handling.py
@@ -89,7 +89,7 @@ def setup_worker_manager_model_bytes(
 
     chan = Channel.make_process_local()
     queue = FLInterface(main_ch=chan)
-    monkeypatch.setenv("SSQueue", du.B64.bytes_to_str(queue.serialize()))
+    monkeypatch.setenv("SS_QUEUE", du.B64.bytes_to_str(queue.serialize()))
 
     # Put backbone descriptor into env var for the `EnvironmentConfigLoader`
     monkeypatch.setenv("SS_INFRA_BACKBONE", backbone_descriptor)
@@ -127,7 +127,7 @@ def setup_worker_manager_model_key(
 
     chan = Channel.make_process_local()
     queue = FLInterface(main_ch=chan)
-    monkeypatch.setenv("SSQueue", du.B64.bytes_to_str(queue.serialize()))
+    monkeypatch.setenv("SS_QUEUE", du.B64.bytes_to_str(queue.serialize()))
 
     # Put backbone descriptor into env var for the `EnvironmentConfigLoader`
     monkeypatch.setenv("SS_INFRA_BACKBONE", backbone_descriptor)
diff --git a/tests/mli/test_worker_manager.py b/tests/mli/test_worker_manager.py
index 51f445885..380c6b06e 100644
--- a/tests/mli/test_worker_manager.py
+++ b/tests/mli/test_worker_manager.py
@@ -165,7 +165,7 @@ def test_worker_manager(prepare_environment: pathlib.Path) -> None:
 
     # NOTE: env vars should be set prior to instantiating EnvironmentConfigLoader
     # or test environment may be unable to send messages w/queue
-    os.environ["SSQueue"] = base64.b64encode(to_worker_fli_serialized).decode("utf-8")
+    os.environ["SS_QUEUE"] = base64.b64encode(to_worker_fli_serialized).decode("utf-8")
 
     config_loader = EnvironmentConfigLoader(

From 
9bce16a275c30004995d038cd6268c39540791a3 Mon Sep 17 00:00:00 2001 From: ankona <3595025+ankona@users.noreply.github.com> Date: Tue, 6 Aug 2024 10:14:25 -0500 Subject: [PATCH 41/49] remove commented code, rename variable for clarity --- .../_core/mli/infrastructure/control/workermanager.py | 11 +++-------- 1 file changed, 3 insertions(+), 8 deletions(-) diff --git a/smartsim/_core/mli/infrastructure/control/workermanager.py b/smartsim/_core/mli/infrastructure/control/workermanager.py index 6f52d5364..aa30b019f 100644 --- a/smartsim/_core/mli/infrastructure/control/workermanager.py +++ b/smartsim/_core/mli/infrastructure/control/workermanager.py @@ -121,7 +121,7 @@ def __init__( """Dictionary of previously loaded models""" self._feature_stores: t.Dict[str, FeatureStore] = {} """A collection of attached feature stores""" - self._fs_factory = config_loader._featurestore_factory + self._featurestore_factory = config_loader._featurestore_factory """A factory method to create a desired feature store client type""" self._backbone: t.Optional[FeatureStore] = config_loader.get_backbone() """A standalone, system-created feature store used to share internal @@ -145,7 +145,7 @@ def _check_feature_stores(self, request: InferenceRequest) -> bool: fs_actual = {item.descriptor for item in self._feature_stores.values()} fs_missing = fs_desired - fs_actual - if self._fs_factory is None: + if self._featurestore_factory is None: logger.warning("No feature store factory configured") return False @@ -153,7 +153,7 @@ def _check_feature_stores(self, request: InferenceRequest) -> bool: if fs_missing: logger.debug(f"Adding feature store(s): {fs_missing}") for descriptor in fs_missing: - feature_store = self._fs_factory(descriptor) + feature_store = self._featurestore_factory(descriptor) self._feature_stores[descriptor] = feature_store return True @@ -262,11 +262,6 @@ def _on_iteration(self) -> None: ) return - # if request.model_key.descriptor not in self._feature_stores: - # self._fs_factory(request.model_key.descriptor) - # todo: decide if we should load here or in _check_feature_stores. - # todo: should i raise error here? 
-
         if request.model_key.key in self._cached_models:
             timings.append(time.perf_counter() - interm) # timing
             interm = time.perf_counter() # timing

From 446d0008d1ac329119fc919000b19fcbb605ce7b Mon Sep 17 00:00:00 2001
From: ankona <3595025+ankona@users.noreply.github.com>
Date: Tue, 6 Aug 2024 10:20:22 -0500
Subject: [PATCH 42/49] rename SS_QUEUE env var to SS_REQUEST_QUEUE

---
 ex/high_throughput_inference/standalone_workermanager.py | 3 ++-
 smartsim/_core/mli/infrastructure/environmentloader.py | 4 ++--
 tests/dragon/test_environment_loader.py | 6 +++---
 tests/dragon/test_error_handling.py | 4 ++--
 tests/mli/test_worker_manager.py | 3 ++-
 5 files changed, 11 insertions(+), 9 deletions(-)

diff --git a/ex/high_throughput_inference/standalone_workermanager.py b/ex/high_throughput_inference/standalone_workermanager.py
index 62b930b8a..2b5ba7df4 100644
--- a/ex/high_throughput_inference/standalone_workermanager.py
+++ b/ex/high_throughput_inference/standalone_workermanager.py
@@ -80,7 +80,8 @@
     worker_type_name = base64.b64decode(args.worker_class.encode("ascii"))
     torch_worker = cloudpickle.loads(worker_type_name)()
 
-    os.environ["SS_QUEUE"] = base64.b64encode(to_worker_fli_serialized).decode("utf-8")
+    descriptor = base64.b64encode(to_worker_fli_serialized).decode("utf-8")
+    os.environ["SS_REQUEST_QUEUE"] = descriptor
 
     config_loader = EnvironmentConfigLoader(
         featurestore_factory=DragonFeatureStore.from_descriptor,
diff --git a/smartsim/_core/mli/infrastructure/environmentloader.py b/smartsim/_core/mli/infrastructure/environmentloader.py
index 762b00769..b4b9e565c 100644
--- a/smartsim/_core/mli/infrastructure/environmentloader.py
+++ b/smartsim/_core/mli/infrastructure/environmentloader.py
@@ -90,8 +90,8 @@ def get_queue(self) -> t.Optional[CommChannelBase]:
     """Attach to a queue-like communication channel using the descriptor
     found in an environment variable.
 
-    :returns: The attached queue specified via SS_QUEUE"""
-    descriptor = os.getenv("SS_QUEUE", "")
+    :returns: The attached queue specified via `SS_REQUEST_QUEUE`"""
+    descriptor = os.getenv("SS_REQUEST_QUEUE", "")
 
     if not descriptor:
         logger.warning("No queue descriptor is configured")
diff --git a/tests/dragon/test_environment_loader.py b/tests/dragon/test_environment_loader.py
index 12c089792..6ae5d2b30 100644
--- a/tests/dragon/test_environment_loader.py
+++ b/tests/dragon/test_environment_loader.py
@@ -55,7 +55,7 @@ def test_environment_loader_attach_fli(content: bytes, monkeypatch: pytest.Monke
     """A descriptor can be stored, loaded, and reattached"""
     chan = Channel.make_process_local()
     queue = FLInterface(main_ch=chan)
-    monkeypatch.setenv("SS_QUEUE", du.B64.bytes_to_str(queue.serialize()))
+    monkeypatch.setenv("SS_REQUEST_QUEUE", du.B64.bytes_to_str(queue.serialize()))
 
     config = EnvironmentConfigLoader(
         featurestore_factory=DragonFeatureStore.from_descriptor,
@@ -76,7 +76,7 @@ def test_environment_loader_serialize_fli(monkeypatch: pytest.MonkeyPatch):
     queue are the same"""
     chan = Channel.make_process_local()
     queue = FLInterface(main_ch=chan)
-    monkeypatch.setenv("SS_QUEUE", du.B64.bytes_to_str(queue.serialize()))
+    monkeypatch.setenv("SS_REQUEST_QUEUE", du.B64.bytes_to_str(queue.serialize()))
 
     config = EnvironmentConfigLoader(
         featurestore_factory=DragonFeatureStore.from_descriptor,
@@ -89,7 +89,7 @@ def test_environment_loader_serialize_fli(monkeypatch: pytest.MonkeyPatch):
 
 def test_environment_loader_flifails(monkeypatch: pytest.MonkeyPatch):
     """An incorrect serialized descriptor fails to attach"""
-    monkeypatch.setenv("SS_QUEUE", "randomstring")
+    monkeypatch.setenv("SS_REQUEST_QUEUE", "randomstring")
     config = EnvironmentConfigLoader(
         featurestore_factory=DragonFeatureStore.from_descriptor,
         callback_factory=None,
diff --git a/tests/dragon/test_error_handling.py b/tests/dragon/test_error_handling.py
index febe75ce9..208ab1e5e 100644
--- a/tests/dragon/test_error_handling.py
+++ b/tests/dragon/test_error_handling.py
@@ -89,7 +89,7 @@ def setup_worker_manager_model_bytes(
 
     chan = Channel.make_process_local()
     queue = FLInterface(main_ch=chan)
-    monkeypatch.setenv("SS_QUEUE", du.B64.bytes_to_str(queue.serialize()))
+    monkeypatch.setenv("SS_REQUEST_QUEUE", du.B64.bytes_to_str(queue.serialize()))
 
     # Put backbone descriptor into env var for the `EnvironmentConfigLoader`
     monkeypatch.setenv("SS_INFRA_BACKBONE", backbone_descriptor)
@@ -127,7 +127,7 @@ def setup_worker_manager_model_key(
 
     chan = Channel.make_process_local()
     queue = FLInterface(main_ch=chan)
-    monkeypatch.setenv("SS_QUEUE", du.B64.bytes_to_str(queue.serialize()))
+    monkeypatch.setenv("SS_REQUEST_QUEUE", du.B64.bytes_to_str(queue.serialize()))
 
     # Put backbone descriptor into env var for the `EnvironmentConfigLoader`
     monkeypatch.setenv("SS_INFRA_BACKBONE", backbone_descriptor)
diff --git a/tests/mli/test_worker_manager.py b/tests/mli/test_worker_manager.py
index 380c6b06e..cf385354e 100644
--- a/tests/mli/test_worker_manager.py
+++ b/tests/mli/test_worker_manager.py
@@ -165,7 +165,8 @@ def test_worker_manager(prepare_environment: pathlib.Path) -> None:
 
     # NOTE: env vars should be set prior to instantiating EnvironmentConfigLoader
     # or test environment may be unable to send messages w/queue
-    os.environ["SS_QUEUE"] = base64.b64encode(to_worker_fli_serialized).decode("utf-8")
+    descriptor = base64.b64encode(to_worker_fli_serialized).decode("utf-8")
+    os.environ["SS_REQUEST_QUEUE"] = descriptor
 
     config_loader = EnvironmentConfigLoader(
featurestore_factory=DragonFeatureStore.from_descriptor, From 989db29a77c4eb2de917ebc651e2c380ae018cb9 Mon Sep 17 00:00:00 2001 From: ankona <3595025+ankona@users.noreply.github.com> Date: Tue, 6 Aug 2024 10:33:43 -0500 Subject: [PATCH 43/49] replaced log.warning w/log.error on missing components --- smartsim/_core/mli/infrastructure/control/workermanager.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/smartsim/_core/mli/infrastructure/control/workermanager.py b/smartsim/_core/mli/infrastructure/control/workermanager.py index aa30b019f..dcc35ae83 100644 --- a/smartsim/_core/mli/infrastructure/control/workermanager.py +++ b/smartsim/_core/mli/infrastructure/control/workermanager.py @@ -146,7 +146,7 @@ def _check_feature_stores(self, request: InferenceRequest) -> bool: fs_missing = fs_desired - fs_actual if self._featurestore_factory is None: - logger.warning("No feature store factory configured") + logger.error("No feature store factory configured") return False # create the feature stores we need to service request @@ -215,7 +215,7 @@ def _on_iteration(self) -> None: logger.debug("executing worker manager pipeline") if self._task_queue is None: - logger.warning("No queue to check for tasks") + logger.error("No queue to check for tasks") return timings = [] # timing From 3e8d6ebc6b4722642af45b1be88c58e6ff96ad1e Mon Sep 17 00:00:00 2001 From: ankona <3595025+ankona@users.noreply.github.com> Date: Tue, 6 Aug 2024 10:43:07 -0500 Subject: [PATCH 44/49] improve DragonFeatureStore docstrings --- .../storage/dragonfeaturestore.py | 21 +++++++++++++------ 1 file changed, 15 insertions(+), 6 deletions(-) diff --git a/smartsim/_core/mli/infrastructure/storage/dragonfeaturestore.py b/smartsim/_core/mli/infrastructure/storage/dragonfeaturestore.py index 5f42ef0bd..012f3cb2e 100644 --- a/smartsim/_core/mli/infrastructure/storage/dragonfeaturestore.py +++ b/smartsim/_core/mli/infrastructure/storage/dragonfeaturestore.py @@ -43,18 +43,24 @@ class DragonFeatureStore(FeatureStore): """A feature store backed by a dragon distributed dictionary""" def __init__(self, storage: "dragon_ddict.DDict") -> None: - """Initialize the DragonFeatureStore instance""" + """Initialize the DragonFeatureStore instance + + :param storage: A distributed dictionary to be used as the underlying + storage mechanism of the feature store""" self._storage = storage def __getitem__(self, key: str) -> t.Union[str, bytes]: """Retrieve an item using key - :param key: Unique key of an item to retrieve from the feature store""" + :param key: Unique key of an item to retrieve from the feature store + :returns: The value identified by the supplied key + :raises KeyError: if the key is not found in the feature store + :raises SmartSimError: if retrieval from the feature store fails""" try: value: t.Union[str, bytes] = self._storage[key] return value except KeyError as ex: - raise ex + raise except Exception as ex: # note: explicitly avoid round-trip to check for key existence raise SmartSimError( @@ -90,9 +96,12 @@ def from_descriptor( """A factory method that creates an instance from a descriptor string :param descriptor: The descriptor that uniquely identifies the resource - :returns: An attached DragonFeatureStore""" + :returns: An attached DragonFeatureStore + :raises SmartSimError: if attachment to DragonFeatureStore fails""" try: return DragonFeatureStore(dragon_ddict.DDict.attach(descriptor)) - except: + except Exception as ex: logger.error(f"Error creating dragon feature store: {descriptor}") - raise + raise 
SmartSimError( + f"Error creating dragon feature store: {descriptor}" + ) from ex From 0344398ebf870ed4c37b59b70aa0c05e47112799 Mon Sep 17 00:00:00 2001 From: ankona <3595025+ankona@users.noreply.github.com> Date: Tue, 6 Aug 2024 11:08:43 -0500 Subject: [PATCH 45/49] ensure KeyError is logged --- .../_core/mli/infrastructure/storage/dragonfeaturestore.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/smartsim/_core/mli/infrastructure/storage/dragonfeaturestore.py b/smartsim/_core/mli/infrastructure/storage/dragonfeaturestore.py index 012f3cb2e..e89abcd2a 100644 --- a/smartsim/_core/mli/infrastructure/storage/dragonfeaturestore.py +++ b/smartsim/_core/mli/infrastructure/storage/dragonfeaturestore.py @@ -59,7 +59,8 @@ def __getitem__(self, key: str) -> t.Union[str, bytes]: try: value: t.Union[str, bytes] = self._storage[key] return value - except KeyError as ex: + except KeyError: + logger.warning(f"An unknown key was requested: {key}") raise except Exception as ex: # note: explicitly avoid round-trip to check for key existence From d040e289a5662176ef9db76a8686ed9b12706866 Mon Sep 17 00:00:00 2001 From: ankona <3595025+ankona@users.noreply.github.com> Date: Tue, 6 Aug 2024 15:37:01 -0500 Subject: [PATCH 46/49] move dragon-based test into correct subdir --- tests/{mli => dragon}/test_worker_manager.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) rename tests/{mli => dragon}/test_worker_manager.py (98%) diff --git a/tests/mli/test_worker_manager.py b/tests/dragon/test_worker_manager.py similarity index 98% rename from tests/mli/test_worker_manager.py rename to tests/dragon/test_worker_manager.py index cf385354e..57585aac9 100644 --- a/tests/mli/test_worker_manager.py +++ b/tests/dragon/test_worker_manager.py @@ -39,9 +39,9 @@ import os import dragon.channels as dch -from channel import FileSystemCommChannel +from .utils.channel import FileSystemCommChannel +from .featurestore import FileSystemFeatureStore from dragon import fli -from featurestore import FileSystemFeatureStore from smartsim._core.mli.comm.channel.channel import CommChannelBase from smartsim._core.mli.comm.channel.dragonfli import DragonFLIChannel From 5645f79d0e9aba0e917ca95d6587e3dc93418ef2 Mon Sep 17 00:00:00 2001 From: ankona <3595025+ankona@users.noreply.github.com> Date: Tue, 6 Aug 2024 17:54:20 -0500 Subject: [PATCH 47/49] formatting fix --- tests/dragon/test_worker_manager.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/tests/dragon/test_worker_manager.py b/tests/dragon/test_worker_manager.py index 57585aac9..864e14993 100644 --- a/tests/dragon/test_worker_manager.py +++ b/tests/dragon/test_worker_manager.py @@ -39,8 +39,6 @@ import os import dragon.channels as dch -from .utils.channel import FileSystemCommChannel -from .featurestore import FileSystemFeatureStore from dragon import fli from smartsim._core.mli.comm.channel.channel import CommChannelBase @@ -57,6 +55,9 @@ from smartsim._core.mli.message_handler import MessageHandler from smartsim.log import get_logger +from .featurestore import FileSystemFeatureStore +from .utils.channel import FileSystemCommChannel + logger = get_logger(__name__) # The tests in this file belong to the dragon group pytestmark = pytest.mark.dragon From 1e743156787a18ff9bcef3c1cd81aa3720a71bf5 Mon Sep 17 00:00:00 2001 From: ankona <3595025+ankona@users.noreply.github.com> Date: Wed, 7 Aug 2024 10:10:28 -0500 Subject: [PATCH 48/49] remove asset URL overrides --- smartsim/_core/_cli/scripts/dragon_install.py | 16 +-- 
tests/test_dragon_installer.py | 108 +++++++++--------- 2 files changed, 55 insertions(+), 69 deletions(-) diff --git a/smartsim/_core/_cli/scripts/dragon_install.py b/smartsim/_core/_cli/scripts/dragon_install.py index 48eb750e7..f88af4eb4 100644 --- a/smartsim/_core/_cli/scripts/dragon_install.py +++ b/smartsim/_core/_cli/scripts/dragon_install.py @@ -163,27 +163,13 @@ def retrieve_asset(working_dir: pathlib.Path, asset: GitReleaseAsset) -> pathlib # if we've previously downloaded the release and still have # wheels laying around, use that cached version instead if download_dir.exists() or list(download_dir.rglob("*.whl")): - # return download_dir - shutil.rmtree(str(download_dir)) + return download_dir download_dir.mkdir(parents=True, exist_ok=True) # grab a copy of the complete asset asset_path = download_dir / str(asset.name) download_url = asset.browser_download_url - if "0.91" not in asset.name: - if "3.9" in python_version(): - logger.debug("I want to snake the original w/3.9 rpm") - # download_url = "https://arti.hpc.amslabs.hpecorp.net/ui/native/dragon-rpm-master-local/dev/master/sle15_sp3_pe/x86_64/dragon-0.91-py3.11.5-1d600977c.rpm" - ... # temp no-op - elif "3.10" in python_version(): - logger.debug("snaking original w/3.10 rpm") - download_url = "https://drive.usercontent.google.com/download?id=1dyScGNomzoPO8-bC8i6zaIbOOhsL83Sp&export=download&authuser=0&confirm=t&uuid=6068afeb-14fd-4303-90a5-498b316d3cce&at=APZUnTWTIf9Tl7Yt8tcdKyodnydV:1722641072921" - elif "3.11" in python_version(): - logger.debug("snaking original w/3.11rpm") - download_url = "https://drive.usercontent.google.com/download?id=1vhUXLIu06-RPA_N3wWmi42avnawzizZZ&export=download&authuser=0&confirm=t&uuid=04c920cb-2e66-4762-8e0f-8ad57e0cbbdf&at=APZUnTUKtCv_BgYOkWAaHqoPpGLd:1722640947383" - else: - logger.debug(f"the name was: {asset.name}") try: urlretrieve(download_url, str(asset_path)) diff --git a/tests/test_dragon_installer.py b/tests/test_dragon_installer.py index ea4d3eb55..4bf589ad4 100644 --- a/tests/test_dragon_installer.py +++ b/tests/test_dragon_installer.py @@ -156,60 +156,60 @@ def test_cleanup_archive_exists(test_archive: pathlib.Path) -> None: assert not test_archive.exists() -# def test_retrieve_cached( -# test_dir: str, -# # archive_path: pathlib.Path, -# test_archive: pathlib.Path, -# monkeypatch: pytest.MonkeyPatch, -# ) -> None: -# """Verify that a previously retrieved asset archive is re-used and the -# release asset retrieval is not attempted""" - -# asset_id = 123 - -# def mock_webtgz_extract(self_, target_) -> None: -# mock_extraction_dir = pathlib.Path(target_) -# with tarfile.TarFile.open(test_archive) as tar: -# tar.extractall(mock_extraction_dir) - -# # we'll use the mock extract to create the files that would normally be downloaded -# expected_output_dir = test_archive.parent / str(asset_id) -# mock_webtgz_extract(None, expected_output_dir) - -# # get modification time of directory holding the "downloaded" archive -# ts1 = expected_output_dir.stat().st_ctime - -# requester = Requester( -# auth=None, -# base_url="https://github.com", -# user_agent="mozilla", -# per_page=10, -# verify=False, -# timeout=1, -# retry=1, -# pool_size=1, -# ) -# headers = {"mock-header": "mock-value"} -# attributes = {"mock-attr": "mock-attr-value"} -# completed = True - -# asset = GitReleaseAsset(requester, headers, attributes, completed) - -# # ensure mocked asset has values that we use... 
-# monkeypatch.setattr(asset, "_browser_download_url", _git_attr(value="http://foo")) -# monkeypatch.setattr(asset, "_name", _git_attr(value=mock_archive_name)) -# monkeypatch.setattr(asset, "_id", _git_attr(value=asset_id)) - -# # show that retrieving an asset w/a different ID results in ignoring -# # other wheels from prior downloads in the parent directory of the asset -# asset_path = retrieve_asset(test_archive.parent, asset) -# ts2 = asset_path.stat().st_ctime - -# # NOTE: the file should be written to a subdir based on the asset ID -# assert ( -# asset_path == expected_output_dir -# ) # shows that the expected path matches the output path -# assert ts1 == ts2 # show that the file wasn't changed... +def test_retrieve_cached( + test_dir: str, + # archive_path: pathlib.Path, + test_archive: pathlib.Path, + monkeypatch: pytest.MonkeyPatch, +) -> None: + """Verify that a previously retrieved asset archive is re-used and the + release asset retrieval is not attempted""" + + asset_id = 123 + + def mock_webtgz_extract(self_, target_) -> None: + mock_extraction_dir = pathlib.Path(target_) + with tarfile.TarFile.open(test_archive) as tar: + tar.extractall(mock_extraction_dir) + + # we'll use the mock extract to create the files that would normally be downloaded + expected_output_dir = test_archive.parent / str(asset_id) + mock_webtgz_extract(None, expected_output_dir) + + # get modification time of directory holding the "downloaded" archive + ts1 = expected_output_dir.stat().st_ctime + + requester = Requester( + auth=None, + base_url="https://github.com", + user_agent="mozilla", + per_page=10, + verify=False, + timeout=1, + retry=1, + pool_size=1, + ) + headers = {"mock-header": "mock-value"} + attributes = {"mock-attr": "mock-attr-value"} + completed = True + + asset = GitReleaseAsset(requester, headers, attributes, completed) + + # ensure mocked asset has values that we use... + monkeypatch.setattr(asset, "_browser_download_url", _git_attr(value="http://foo")) + monkeypatch.setattr(asset, "_name", _git_attr(value=mock_archive_name)) + monkeypatch.setattr(asset, "_id", _git_attr(value=asset_id)) + + # show that retrieving an asset w/a different ID results in ignoring + # other wheels from prior downloads in the parent directory of the asset + asset_path = retrieve_asset(test_archive.parent, asset) + ts2 = asset_path.stat().st_ctime + + # NOTE: the file should be written to a subdir based on the asset ID + assert ( + asset_path == expected_output_dir + ) # shows that the expected path matches the output path + assert ts1 == ts2 # show that the file wasn't changed... 
def test_retrieve_updated( From 6097c46ec8d535be98f1772d69e53021154722cc Mon Sep 17 00:00:00 2001 From: ankona <3595025+ankona@users.noreply.github.com> Date: Wed, 7 Aug 2024 12:57:04 -0500 Subject: [PATCH 49/49] remove usage of deprecated dragon policy affinity --- .../_core/launcher/dragon/dragonBackend.py | 18 ++++++------------ tests/test_dragon_run_policy.py | 5 ----- 2 files changed, 6 insertions(+), 17 deletions(-) diff --git a/smartsim/_core/launcher/dragon/dragonBackend.py b/smartsim/_core/launcher/dragon/dragonBackend.py index 16f5c03dc..4fe6d55ad 100644 --- a/smartsim/_core/launcher/dragon/dragonBackend.py +++ b/smartsim/_core/launcher/dragon/dragonBackend.py @@ -457,7 +457,6 @@ def create_run_policy( if isinstance(request, DragonRunRequest): run_request: DragonRunRequest = request - affinity = dragon_policy.Policy.Affinity.DEFAULT cpu_affinity: t.List[int] = [] gpu_affinity: t.List[int] = [] @@ -465,25 +464,20 @@ def create_run_policy( if run_request.policy is not None: # Affinities are not mutually exclusive. If specified, both are used if run_request.policy.cpu_affinity: - affinity = dragon_policy.Policy.Affinity.SPECIFIC cpu_affinity = run_request.policy.cpu_affinity if run_request.policy.gpu_affinity: - affinity = dragon_policy.Policy.Affinity.SPECIFIC gpu_affinity = run_request.policy.gpu_affinity logger.debug( - f"Affinity strategy: {affinity}, " f"CPU affinity mask: {cpu_affinity}, " f"GPU affinity mask: {gpu_affinity}" ) - if affinity != dragon_policy.Policy.Affinity.DEFAULT: - return dragon_policy.Policy( - placement=dragon_policy.Policy.Placement.HOST_NAME, - host_name=node_name, - affinity=affinity, - cpu_affinity=cpu_affinity, - gpu_affinity=gpu_affinity, - ) + return dragon_policy.Policy( + placement=dragon_policy.Policy.Placement.HOST_NAME, + host_name=node_name, + cpu_affinity=cpu_affinity, + gpu_affinity=gpu_affinity, + ) return dragon_policy.Policy( placement=dragon_policy.Policy.Placement.HOST_NAME, diff --git a/tests/test_dragon_run_policy.py b/tests/test_dragon_run_policy.py index 1d8d069fa..c94ae375b 100644 --- a/tests/test_dragon_run_policy.py +++ b/tests/test_dragon_run_policy.py @@ -143,7 +143,6 @@ def test_create_run_policy_run_request_no_run_policy() -> None: assert policy.device == Policy.Device.DEFAULT assert set(policy.cpu_affinity) == set() assert policy.gpu_affinity == [] - assert policy.affinity == Policy.Affinity.DEFAULT @pytest.mark.skipif(not dragon_loaded, reason="Test is only for Dragon WLM systems") @@ -167,7 +166,6 @@ def test_create_run_policy_run_request_default_run_policy() -> None: assert set(policy.cpu_affinity) == set() assert set(policy.gpu_affinity) == set() - assert policy.affinity == Policy.Affinity.DEFAULT @pytest.mark.skipif(not dragon_loaded, reason="Test is only for Dragon WLM systems") @@ -192,7 +190,6 @@ def test_create_run_policy_run_request_cpu_affinity_no_device() -> None: assert set(policy.cpu_affinity) == affinity assert policy.gpu_affinity == [] - assert policy.affinity == Policy.Affinity.SPECIFIC @pytest.mark.skipif(not dragon_loaded, reason="Test is only for Dragon WLM systems") @@ -216,7 +213,6 @@ def test_create_run_policy_run_request_cpu_affinity() -> None: assert set(policy.cpu_affinity) == affinity assert policy.gpu_affinity == [] - assert policy.affinity == Policy.Affinity.SPECIFIC @pytest.mark.skipif(not dragon_loaded, reason="Test is only for Dragon WLM systems") @@ -240,7 +236,6 @@ def test_create_run_policy_run_request_gpu_affinity() -> None: assert policy.cpu_affinity == [] assert set(policy.gpu_affinity) 
== set(affinity) - assert policy.affinity == Policy.Affinity.SPECIFIC @pytest.mark.skipif(not dragon_loaded, reason="Test is only for Dragon WLM systems")
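
The final patch relies on `dragon_policy.Policy` accepting (possibly empty) `cpu_affinity` and `gpu_affinity` lists directly, rather than gating them behind the removed `Policy.Affinity` enum: the host-pinned policy is now always constructed, and an empty list simply means "no pinning." What follows is a minimal, hypothetical sketch of that control flow which runs without Dragon installed; `StubPolicy` is a stand-in for `dragon.infrastructure.policy.Policy` (importable only on Dragon-enabled systems) and mirrors just the fields used in the hunk above.

import typing as t
from dataclasses import dataclass, field


@dataclass
class StubPolicy:
    """Hypothetical stand-in for dragon.infrastructure.policy.Policy."""

    placement: str
    host_name: str
    cpu_affinity: t.List[int] = field(default_factory=list)
    gpu_affinity: t.List[int] = field(default_factory=list)


def create_run_policy(
    host_name: str,
    cpu_affinity: t.Optional[t.List[int]] = None,
    gpu_affinity: t.Optional[t.List[int]] = None,
) -> StubPolicy:
    # After PATCH 49, both affinity lists are forwarded unconditionally;
    # empty lists fall back to the scheduler default, so no SPECIFIC/DEFAULT
    # affinity flag is needed to distinguish the two cases.
    return StubPolicy(
        placement="HOST_NAME",
        host_name=host_name,
        cpu_affinity=cpu_affinity or [],
        gpu_affinity=gpu_affinity or [],
    )


# pinned and unpinned requests now take the same code path
assert create_run_policy("node001", cpu_affinity=[0, 1]).cpu_affinity == [0, 1]
assert create_run_policy("node002").gpu_affinity == []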