From 58cb6f55388b4b019d4ef69a252884e5aea30388 Mon Sep 17 00:00:00 2001 From: Julia Putko Date: Tue, 7 May 2024 16:58:42 -0500 Subject: [PATCH 01/11] orchestrator to feature store, db to fs, database to feauture store first iteration rename --- conftest.py | 134 +++---- .../application_producer_script.py | 2 +- .../experiment_driver.py | 14 +- .../experiment_doc_examples/exp.py | 14 +- .../doc_examples/orch_examples/colo_driver.py | 2 +- .../doc_examples/orch_examples/std_app.py | 2 +- .../doc_examples/orch_examples/std_driver.py | 20 +- doc/tutorials/getting_started/consumer.py | 6 +- .../multi_db_example/application_script.py | 24 +- .../multi_db_example/multidb_driver.py | 48 +-- .../ml_inference/colo-db-torch-example.py | 6 +- doc/tutorials/ml_training/surrogate/fd_sim.py | 2 +- .../ml_training/surrogate/tf_training.py | 2 +- .../online_analysis/lattice/driver.py | 6 +- .../online_analysis/lattice/fv_sim.py | 4 +- setup.py | 8 +- smartsim/_core/_cli/build.py | 24 +- smartsim/_core/_cli/dbcli.py | 8 +- smartsim/_core/_cli/info.py | 20 +- smartsim/_core/_cli/utils.py | 4 +- smartsim/_core/_cli/validate.py | 18 +- smartsim/_core/_install/buildenv.py | 4 +- smartsim/_core/_install/builder.py | 28 +- smartsim/_core/config/config.py | 8 +- smartsim/_core/control/controller.py | 335 +++++++++--------- smartsim/_core/control/job.py | 16 +- smartsim/_core/control/jobmanager.py | 64 ++-- smartsim/_core/control/manifest.py | 66 ++-- smartsim/_core/entrypoints/colocated.py | 104 +++--- smartsim/_core/entrypoints/indirect.py | 4 +- smartsim/_core/entrypoints/redis.py | 20 +- smartsim/_core/generation/generator.py | 28 +- smartsim/_core/launcher/colocated.py | 154 ++++---- smartsim/_core/launcher/step/alpsStep.py | 4 +- smartsim/_core/launcher/step/localStep.py | 2 +- smartsim/_core/launcher/step/lsfStep.py | 4 +- smartsim/_core/launcher/step/mpiStep.py | 4 +- smartsim/_core/launcher/step/slurmStep.py | 2 +- smartsim/_core/launcher/step/step.py | 14 +- 
smartsim/_core/utils/__init__.py | 2 +- smartsim/_core/utils/helpers.py | 26 +- smartsim/_core/utils/redis.py | 96 ++--- smartsim/_core/utils/serialize.py | 52 +-- smartsim/_core/utils/telemetry/collector.py | 50 +-- smartsim/_core/utils/telemetry/manifest.py | 14 +- smartsim/_core/utils/telemetry/telemetry.py | 14 +- smartsim/_core/utils/telemetry/util.py | 2 +- smartsim/database/__init__.py | 2 +- smartsim/database/orchestrator.py | 278 +++++++-------- smartsim/entity/__init__.py | 2 +- smartsim/entity/dbnode.py | 38 +- smartsim/entity/dbobject.py | 22 +- smartsim/entity/ensemble.py | 90 ++--- smartsim/entity/entityList.py | 18 +- smartsim/entity/model.py | 200 +++++------ smartsim/error/errors.py | 4 +- smartsim/experiment.py | 134 +++---- smartsim/ml/data.py | 16 +- smartsim/ml/tf/utils.py | 4 +- smartsim/settings/alpsSettings.py | 2 +- smartsim/settings/base.py | 12 +- smartsim/settings/lsfSettings.py | 34 +- smartsim/settings/mpiSettings.py | 2 +- smartsim/settings/pbsSettings.py | 2 +- smartsim/settings/slurmSettings.py | 2 +- tests/backends/run_sklearn_onnx.py | 2 +- tests/backends/run_torch.py | 2 +- tests/backends/test_cli_mini_exp.py | 14 +- tests/backends/test_dataloader.py | 36 +- tests/backends/test_dbmodel.py | 164 ++++----- tests/backends/test_dbscript.py | 156 ++++---- tests/backends/test_onnx.py | 2 +- tests/backends/test_tf.py | 2 +- tests/backends/test_torch.py | 2 +- .../full_wlm/test_generic_orc_launch_batch.py | 114 +++--- tests/on_wlm/test_colocated_model.py | 74 ++-- tests/on_wlm/test_containers_wlm.py | 14 +- tests/on_wlm/test_generic_orc_launch.py | 62 ++-- tests/on_wlm/test_het_job.py | 10 +- tests/on_wlm/test_symlinking.py | 6 +- tests/on_wlm/test_wlm_orc_config_settings.py | 48 +-- tests/test_alps_settings.py | 2 +- tests/test_cli.py | 16 +- tests/test_collector_manager.py | 26 +- tests/test_collectors.py | 12 +- tests/test_colo_model_local.py | 112 +++--- tests/test_colo_model_lsf.py | 86 ++--- .../telemetry/colocatedmodel.json | 10 
+- .../test_configs/telemetry/db_and_model.json | 14 +- .../telemetry/db_and_model_1run.json | 12 +- tests/test_configs/telemetry/ensembles.json | 4 +- .../test_configs/telemetry/serialmodels.json | 4 +- tests/test_configs/telemetry/telemetry.json | 126 +++---- tests/test_containers.py | 16 +- tests/test_controller.py | 6 +- tests/test_controller_errors.py | 40 +-- tests/test_dbnode.py | 58 +-- tests/test_experiment.py | 32 +- tests/test_generator.py | 10 +- tests/test_indirect.py | 2 +- tests/test_interrupt.py | 8 +- tests/test_launch_errors.py | 24 +- tests/test_lsf_settings.py | 2 +- tests/test_manifest.py | 54 +-- tests/test_model.py | 2 +- tests/test_mpi_settings.py | 2 +- tests/test_multidb.py | 276 +++++++-------- tests/test_orc_config_settings.py | 30 +- tests/test_orchestrator.py | 208 +++++------ tests/test_output_files.py | 6 +- tests/test_reconnect_orchestrator.py | 33 +- tests/test_serialize.py | 10 +- tests/test_slurm_parser.py | 4 +- tests/test_slurm_settings.py | 2 +- tests/test_smartredis.py | 26 +- tests/test_symlinking.py | 14 +- tests/test_telemetry_monitor.py | 128 +++---- 117 files changed, 2236 insertions(+), 2236 deletions(-) diff --git a/conftest.py b/conftest.py index 8d6f6fb2a..89191ee5b 100644 --- a/conftest.py +++ b/conftest.py @@ -47,7 +47,7 @@ from smartsim._core.config import CONFIG from smartsim._core.config.config import Config from smartsim._core.utils.telemetry.telemetry import JobEntity -from smartsim.database import Orchestrator +from smartsim.database import FeatureStore from smartsim.entity import Model from smartsim.error import SSConfigError from smartsim.settings import ( @@ -340,14 +340,14 @@ def get_run_settings( return RunSettings(exe, args) @staticmethod - def get_orchestrator(nodes: int = 1, batch: bool = False) -> Orchestrator: + def get_feature_store(nodes: int = 1, batch: bool = False) -> FeatureStore: if test_launcher == "pbs": if not shutil.which("aprun"): hostlist = get_hostlist() else: hostlist = None - return 
Orchestrator( - db_nodes=nodes, + return FeatureStore( + fs_nodes=nodes, port=test_port, batch=batch, interface=test_nic, @@ -356,8 +356,8 @@ def get_orchestrator(nodes: int = 1, batch: bool = False) -> Orchestrator: ) if test_launcher == "pals": hostlist = get_hostlist() - return Orchestrator( - db_nodes=nodes, + return FeatureStore( + fs_nodes=nodes, port=test_port, batch=batch, interface=test_nic, @@ -365,16 +365,16 @@ def get_orchestrator(nodes: int = 1, batch: bool = False) -> Orchestrator: hosts=hostlist, ) if test_launcher == "slurm": - return Orchestrator( - db_nodes=nodes, + return FeatureStore( + fs_nodes=nodes, port=test_port, batch=batch, interface=test_nic, launcher=test_launcher, ) if test_launcher == "lsf": - return Orchestrator( - db_nodes=nodes, + return FeatureStore( + fs_nodes=nodes, port=test_port, batch=batch, cpus_per_shard=4, @@ -384,7 +384,7 @@ def get_orchestrator(nodes: int = 1, batch: bool = False) -> Orchestrator: launcher=test_launcher, ) - return Orchestrator(port=test_port, interface="lo") + return FeatureStore(port=test_port, interface="lo") @staticmethod def choose_host(rs: RunSettings) -> t.Optional[str]: @@ -397,62 +397,62 @@ def choose_host(rs: RunSettings) -> t.Optional[str]: @pytest.fixture -def local_db( +def local_fs( request: t.Any, wlmutils: t.Type[WLMUtils], test_dir: str -) -> t.Generator[Orchestrator, None, None]: - """Yield fixture for startup and teardown of an local orchestrator""" +) -> t.Generator[FeatureStore, None, None]: + """Yield fixture for startup and teardown of an local feature_store""" exp_name = request.function.__name__ exp = Experiment(exp_name, launcher="local", exp_path=test_dir) - db = Orchestrator(port=wlmutils.get_test_port(), interface="lo") - db.set_path(test_dir) - exp.start(db) + fs = FeatureStore(port=wlmutils.get_test_port(), interface="lo") + fs.set_path(test_dir) + exp.start(fs) - yield db + yield fs # pass or fail, the teardown code below is ran after the # completion of a test case that 
uses this fixture - exp.stop(db) + exp.stop(fs) @pytest.fixture -def db( +def fs( request: t.Any, wlmutils: t.Type[WLMUtils], test_dir: str -) -> t.Generator[Orchestrator, None, None]: - """Yield fixture for startup and teardown of an orchestrator""" +) -> t.Generator[FeatureStore, None, None]: + """Yield fixture for startup and teardown of an feature_store""" launcher = wlmutils.get_test_launcher() exp_name = request.function.__name__ exp = Experiment(exp_name, launcher=launcher, exp_path=test_dir) - db = wlmutils.get_orchestrator() - db.set_path(test_dir) - exp.start(db) + fs = wlmutils.get_feature_store() + fs.set_path(test_dir) + exp.start(fs) - yield db + yield fs # pass or fail, the teardown code below is ran after the # completion of a test case that uses this fixture - exp.stop(db) + exp.stop(fs) @pytest.fixture -def db_cluster( +def fs_cluster( test_dir: str, wlmutils: t.Type[WLMUtils], request: t.Any -) -> t.Generator[Orchestrator, None, None]: +) -> t.Generator[FeatureStore, None, None]: """ - Yield fixture for startup and teardown of a clustered orchestrator. + Yield fixture for startup and teardown of a clustered feature_store. This should only be used in on_wlm and full_wlm tests. 
""" launcher = wlmutils.get_test_launcher() exp_name = request.function.__name__ exp = Experiment(exp_name, launcher=launcher, exp_path=test_dir) - db = wlmutils.get_orchestrator(nodes=3) - db.set_path(test_dir) - exp.start(db) + fs = wlmutils.get_feature_store(nodes=3) + fs.set_path(test_dir) + exp.start(fs) - yield db + yield fs # pass or fail, the teardown code below is ran after the # completion of a test case that uses this fixture - exp.stop(db) + exp.stop(fs) @pytest.fixture(scope="function", autouse=True) @@ -465,13 +465,13 @@ def environment_cleanup(monkeypatch: pytest.MonkeyPatch) -> None: @pytest.fixture -def dbutils() -> t.Type[DBUtils]: - return DBUtils +def fsutils() -> t.Type[FSUtils]: + return FSUtils -class DBUtils: +class FSUtils: @staticmethod - def get_db_configs() -> t.Dict[str, t.Any]: + def get_fs_configs() -> t.Dict[str, t.Any]: config_settings = { "enable_checkpoints": 1, "set_max_memory": "3gb", @@ -485,7 +485,7 @@ def get_db_configs() -> t.Dict[str, t.Any]: return config_settings @staticmethod - def get_smartsim_error_db_configs() -> t.Dict[str, t.Any]: + def get_smartsim_error_fs_configs() -> t.Dict[str, t.Any]: bad_configs = { "save": [ "-1", # frequency must be positive @@ -512,7 +512,7 @@ def get_smartsim_error_db_configs() -> t.Dict[str, t.Any]: return bad_configs @staticmethod - def get_type_error_db_configs() -> t.Dict[t.Union[int, str], t.Any]: + def get_type_error_fs_configs() -> t.Dict[t.Union[int, str], t.Any]: bad_configs: t.Dict[t.Union[int, str], t.Any] = { "save": [2, True, ["2"]], # frequency must be specified as a string "maxmemory": [99, True, ["99"]], # memory form must be a string @@ -533,15 +533,15 @@ def get_type_error_db_configs() -> t.Dict[t.Union[int, str], t.Any]: @staticmethod def get_config_edit_method( - db: Orchestrator, config_setting: str + fs: FeatureStore, config_setting: str ) -> t.Optional[t.Callable[..., None]]: - """Get a db configuration file edit method from a str""" + """Get a fs configuration file 
edit method from a str""" config_edit_methods: t.Dict[str, t.Callable[..., None]] = { - "enable_checkpoints": db.enable_checkpoints, - "set_max_memory": db.set_max_memory, - "set_eviction_strategy": db.set_eviction_strategy, - "set_max_clients": db.set_max_clients, - "set_max_message_size": db.set_max_message_size, + "enable_checkpoints": fs.enable_checkpoints, + "set_max_memory": fs.set_max_memory, + "set_eviction_strategy": fs.set_eviction_strategy, + "set_max_clients": fs.set_max_clients, + "set_max_message_size": fs.set_max_message_size, } return config_edit_methods.get(config_setting, None) @@ -647,21 +647,21 @@ class ColoUtils: @staticmethod def setup_test_colo( fileutils: t.Type[FileUtils], - db_type: str, + fs_type: str, exp: Experiment, application_file: str, - db_args: t.Dict[str, t.Any], + fs_args: t.Dict[str, t.Any], colo_settings: t.Optional[RunSettings] = None, colo_model_name: str = "colocated_model", port: int = test_port, on_wlm: bool = False, ) -> Model: - """Setup database needed for the colo pinning tests""" + """Setup feature store needed for the colo pinning tests""" # get test setup sr_test_script = fileutils.get_test_conf_path(application_file) - # Create an app with a colo_db which uses 1 db_cpu + # Create an app with a colo_fs which uses 1 fs_cpu if colo_settings is None: colo_settings = exp.create_run_settings( exe=sys.executable, exe_args=[sr_test_script] @@ -671,28 +671,28 @@ def setup_test_colo( colo_settings.set_nodes(1) colo_model = exp.create_model(colo_model_name, colo_settings) - if db_type in ["tcp", "deprecated"]: - db_args["port"] = port - db_args["ifname"] = "lo" - if db_type == "uds" and colo_model_name is not None: + if fs_type in ["tcp", "deprecated"]: + fs_args["port"] = port + fs_args["ifname"] = "lo" + if fs_type == "uds" and colo_model_name is not None: tmp_dir = tempfile.gettempdir() socket_suffix = str(uuid.uuid4())[:7] socket_name = f"{colo_model_name}_{socket_suffix}.socket" - db_args["unix_socket"] = 
os.path.join(tmp_dir, socket_name) + fs_args["unix_socket"] = os.path.join(tmp_dir, socket_name) colocate_fun: t.Dict[str, t.Callable[..., None]] = { - "tcp": colo_model.colocate_db_tcp, - "deprecated": colo_model.colocate_db, - "uds": colo_model.colocate_db_uds, + "tcp": colo_model.colocate_fs_tcp, + "deprecated": colo_model.colocate_fs, + "uds": colo_model.colocate_fs_uds, } with warnings.catch_warnings(): - if db_type == "deprecated": - message = "`colocate_db` has been deprecated" + if fs_type == "deprecated": + message = "`colocate_fs` has been deprecated" warnings.filterwarnings("ignore", message=message) - colocate_fun[db_type](**db_args) - # assert model will launch with colocated db + colocate_fun[fs_type](**fs_args) + # assert model will launch with colocated fs assert colo_model.colocated - # Check to make sure that limit_db_cpus made it into the colo settings + # Check to make sure that limit_fs_cpus made it into the colo settings return colo_model @@ -726,7 +726,7 @@ def mock_sink() -> t.Type[MockSink]: @pytest.fixture def mock_con() -> t.Callable[[int, int], t.Iterable[t.Any]]: - """Generates mock db connection telemetry""" + """Generates mock fs connection telemetry""" def _mock_con(min: int = 1, max: int = 254) -> t.Iterable[t.Any]: for i in range(min, max): @@ -740,7 +740,7 @@ def _mock_con(min: int = 1, max: int = 254) -> t.Iterable[t.Any]: @pytest.fixture def mock_mem() -> t.Callable[[int, int], t.Iterable[t.Any]]: - """Generates mock db memory usage telemetry""" + """Generates mock fs memory usage telemetry""" def _mock_mem(min: int = 1, max: int = 1000) -> t.Iterable[t.Any]: for i in range(min, max): diff --git a/doc/tutorials/doc_examples/ensemble_doc_examples/application_producer_script.py b/doc/tutorials/doc_examples/ensemble_doc_examples/application_producer_script.py index 619a56e05..cc294e47e 100644 --- a/doc/tutorials/doc_examples/ensemble_doc_examples/application_producer_script.py +++ 
b/doc/tutorials/doc_examples/ensemble_doc_examples/application_producer_script.py @@ -6,5 +6,5 @@ # Create NumPy array array = np.array([1, 2, 3, 4]) -# Use SmartRedis Client to place tensor in standalone Orchestrator +# Use SmartRedis Client to place tensor in standalone Feature Store client.put_tensor("tensor", array) \ No newline at end of file diff --git a/doc/tutorials/doc_examples/ensemble_doc_examples/experiment_driver.py b/doc/tutorials/doc_examples/ensemble_doc_examples/experiment_driver.py index 1a1db58e4..fbf35611b 100644 --- a/doc/tutorials/doc_examples/ensemble_doc_examples/experiment_driver.py +++ b/doc/tutorials/doc_examples/ensemble_doc_examples/experiment_driver.py @@ -5,8 +5,8 @@ # Initialize the Experiment exp = Experiment("getting-started", launcher="auto") -# Initialize a standalone Orchestrator -standalone_orch = exp.create_database(db_nodes=1) +# Initialize a standalone Feature Store +standalone_feature_store = exp.create_feature_store(fs_nodes=1) # Initialize a RunSettings object for Ensemble ensemble_settings = exp.create_run_settings(exe="/path/to/executable_producer_simulation") @@ -23,10 +23,10 @@ consumer_model = exp.create_model("consumer", model_settings) # Generate SmartSim entity folder tree -exp.generate(standalone_orch, producer_ensemble, consumer_model, overwrite=True) +exp.generate(standalone_feature_store, producer_ensemble, consumer_model, overwrite=True) -# Launch Orchestrator -exp.start(standalone_orch, summary=True) +# Launch Feature Store +exp.start(standalone_feature_store, summary=True) # Launch Ensemble exp.start(producer_ensemble, block=True, summary=True) @@ -38,5 +38,5 @@ # Launch consumer Model exp.start(consumer_model, block=True, summary=True) -# Clobber Orchestrator -exp.stop(standalone_orch) \ No newline at end of file +# Clobber Feature Store +exp.stop(standalone_feature_store) \ No newline at end of file diff --git a/doc/tutorials/doc_examples/experiment_doc_examples/exp.py 
b/doc/tutorials/doc_examples/experiment_doc_examples/exp.py index 7a36262be..738b767d3 100644 --- a/doc/tutorials/doc_examples/experiment_doc_examples/exp.py +++ b/doc/tutorials/doc_examples/experiment_doc_examples/exp.py @@ -6,8 +6,8 @@ # Initialize a SmartSim logger smartsim_logger = get_logger("logger") -# Initialize an Orchestrator -standalone_database = exp.create_database(db_nodes=3, port=6379, interface="ib0") +# Initialize an Feature Store +standalone_feature_store = exp.create_feature_store(fs_nodes=3, port=6379, interface="ib0") # Initialize the Model RunSettings settings = exp.create_run_settings("echo", exe_args="Hello World") @@ -15,12 +15,12 @@ model = exp.create_model("hello_world", settings) # Generate the output directory -exp.generate(standalone_database, model, overwrite=True) +exp.generate(standalone_feature_store, model, overwrite=True) -# Launch the Orchestrator then Model instance -exp.start(standalone_database, model) +# Launch the Feature Store then Model instance +exp.start(standalone_feature_store, model) -# Clobber the Orchestrator -exp.stop(standalone_database) +# Clobber the Feature Store +exp.stop(standalone_feature_store) # Log the summary of the Experiment smartsim_logger.info(exp.summary()) \ No newline at end of file diff --git a/doc/tutorials/doc_examples/orch_examples/colo_driver.py b/doc/tutorials/doc_examples/orch_examples/colo_driver.py index fde06e9b7..9e6c104ac 100644 --- a/doc/tutorials/doc_examples/orch_examples/colo_driver.py +++ b/doc/tutorials/doc_examples/orch_examples/colo_driver.py @@ -17,7 +17,7 @@ model = exp.create_model("colo_model", model_settings) # Colocate the Model -model.colocate_db_uds() +model.colocate_fs_uds() # Generate output files exp.generate(model) diff --git a/doc/tutorials/doc_examples/orch_examples/std_app.py b/doc/tutorials/doc_examples/orch_examples/std_app.py index 67129fbf4..79d548a32 100644 --- a/doc/tutorials/doc_examples/orch_examples/std_app.py +++ 
b/doc/tutorials/doc_examples/orch_examples/std_app.py @@ -4,7 +4,7 @@ # Initialize a SmartRedis Client application_client = Client(cluster=True) -# Retrieve the driver script tensor from Orchestrator +# Retrieve the driver script tensor from FeatureStore driver_script_tensor = application_client.get_tensor("tensor_1") # Log the tensor application_client.log_data(LLInfo, f"The multi-sharded db tensor is: {driver_script_tensor}") diff --git a/doc/tutorials/doc_examples/orch_examples/std_driver.py b/doc/tutorials/doc_examples/orch_examples/std_driver.py index cf425125b..3605d13d0 100644 --- a/doc/tutorials/doc_examples/orch_examples/std_driver.py +++ b/doc/tutorials/doc_examples/orch_examples/std_driver.py @@ -8,15 +8,15 @@ # Initialize the Experiment exp = Experiment("getting-started", launcher="auto") -# Initialize a multi-sharded Orchestrator -standalone_orchestrator = exp.create_database(db_nodes=3) +# Initialize a multi-sharded feature store +standalone_feature_store = exp.create_feature_store(fs_nodes=3) -# Initialize a SmartRedis client for multi-sharded Orchestrator -driver_client = Client(cluster=True, address=standalone_orchestrator.get_address()[0]) +# Initialize a SmartRedis client for multi-sharded feature store +driver_client = Client(cluster=True, address=standalone_feature_store.get_address()[0]) # Create NumPy array local_array = np.array([1, 2, 3, 4]) -# Use the SmartRedis client to place tensor in the standalone Orchestrator +# Use the SmartRedis client to place tensor in the standalone feature store driver_client.put_tensor("tensor_1", local_array) # Initialize a RunSettings object @@ -27,10 +27,10 @@ model = exp.create_model("model", model_settings) # Create the output directory -exp.generate(standalone_orchestrator, model) +exp.generate(standalone_feature_store, model) -# Launch the multi-sharded Orchestrator -exp.start(standalone_orchestrator) +# Launch the multi-sharded feature store +exp.start(standalone_feature_store) # Launch the Model 
exp.start(model, block=True, summary=True) @@ -40,7 +40,7 @@ # Validate that the tensor exists logger.info(f"The tensor exists: {app_tensor}") -# Cleanup the Orchestrator -exp.stop(standalone_orchestrator) +# Cleanup the feature store +exp.stop(standalone_feature_store) # Print the Experiment summary logger.info(exp.summary()) \ No newline at end of file diff --git a/doc/tutorials/getting_started/consumer.py b/doc/tutorials/getting_started/consumer.py index aef71f220..841e3cbc8 100644 --- a/doc/tutorials/getting_started/consumer.py +++ b/doc/tutorials/getting_started/consumer.py @@ -6,7 +6,7 @@ parser.add_argument("--redis-port") args = parser.parse_args() -# get model and set into database +# get model and set into feature store address = "127.0.0.1:" + str(args.redis_port) os.environ["SSDB"] = address c = Client(None, logger_name="SmartSim") @@ -21,5 +21,5 @@ for key in data_sources: c.set_data_source(key) input_exists = c.poll_tensor("product", 100, 100) - db_tensor = c.get_tensor("product") - print(f"Tensor for {key} is:", db_tensor) \ No newline at end of file + fs_tensor = c.get_tensor("product") + print(f"Tensor for {key} is:", fs_tensor) \ No newline at end of file diff --git a/doc/tutorials/getting_started/multi_db_example/application_script.py b/doc/tutorials/getting_started/multi_db_example/application_script.py index 239c16684..57caaf910 100644 --- a/doc/tutorials/getting_started/multi_db_example/application_script.py +++ b/doc/tutorials/getting_started/multi_db_example/application_script.py @@ -3,18 +3,18 @@ from smartredis.error import * # Initialize a ConfigOptions object -single_shard_config = ConfigOptions.create_from_environment("single_shard_db_identifier") -# Initialize a SmartRedis client for the single sharded database +single_shard_config = ConfigOptions.create_from_environment("single_shard_fs_identifier") +# Initialize a SmartRedis client for the single sharded feature store app_single_shard_client = Client(single_shard_config, 
logger_name="Model: single shard logger") # Initialize a ConfigOptions object -multi_shard_config = ConfigOptions.create_from_environment("multi_shard_db_identifier") -# Initialize a SmartRedis client for the multi sharded database +multi_shard_config = ConfigOptions.create_from_environment("multi_shard_fs_identifier") +# Initialize a SmartRedis client for the multi sharded feature store app_multi_shard_client = Client(multi_shard_config, logger_name="Model: multi shard logger") # Initialize a ConfigOptions object -colo_config = ConfigOptions.create_from_environment("colo_db_identifier") -# Initialize a SmartRedis client for the colocated database +colo_config = ConfigOptions.create_from_environment("colo_fs_identifier") +# Initialize a SmartRedis client for the colocated feature store colo_client = Client(colo_config, logger_name="Model: colo logger") # Retrieve the tensor placed in driver script using the associated client @@ -22,16 +22,16 @@ val2 = app_multi_shard_client.get_tensor("tensor_2") # Print message to stdout using SmartRedis Client logger -app_single_shard_client.log_data(LLInfo, f"The single sharded db tensor is: {val1}") -app_multi_shard_client.log_data(LLInfo, f"The multi sharded db tensor is: {val2}") +app_single_shard_client.log_data(LLInfo, f"The single sharded fs tensor is: {val1}") +app_multi_shard_client.log_data(LLInfo, f"The multi sharded fs tensor is: {val2}") -# Place retrieved tensors in colocated database +# Place retrieved tensors in colocated feature store colo_client.put_tensor("tensor_1", val1) colo_client.put_tensor("tensor_2", val2) -# Check that tensors are in colocated database +# Check that tensors are in colocated feature store colo_val1 = colo_client.poll_tensor("tensor_1", 10, 10) colo_val2 = colo_client.poll_tensor("tensor_2", 10, 10) # Print message to stdout using SmartRedis Client logger -colo_client.log_data(LLInfo, f"The colocated db has tensor_1: {colo_val1}") -colo_client.log_data(LLInfo, f"The colocated db has 
tensor_2: {colo_val2}") \ No newline at end of file +colo_client.log_data(LLInfo, f"The colocated fs has tensor_1: {colo_val1}") +colo_client.log_data(LLInfo, f"The colocated fs has tensor_2: {colo_val2}") \ No newline at end of file diff --git a/doc/tutorials/getting_started/multi_db_example/multidb_driver.py b/doc/tutorials/getting_started/multi_db_example/multidb_driver.py index fae6a9b15..9f4ed2b2d 100644 --- a/doc/tutorials/getting_started/multi_db_example/multidb_driver.py +++ b/doc/tutorials/getting_started/multi_db_example/multidb_driver.py @@ -5,41 +5,41 @@ import sys exe_ex = sys.executable -logger = get_logger("Multidb Experiment Log") +logger = get_logger("MultiFS Experiment Log") # Initialize the Experiment -exp = Experiment("getting-started-multidb", launcher="auto") +exp = Experiment("getting-started-multifs", launcher="auto") -# Initialize a single sharded database -single_shard_db = exp.create_database(port=6379, db_nodes=1, interface="ib0", db_identifier="single_shard_db_identifier") -exp.generate(single_shard_db, overwrite=True) +# Initialize a single sharded feature store +single_shard_fs = exp.create_feature_store(port=6379, fs_nodes=1, interface="ib0", fs_identifier="single_shard_fs_identifier") +exp.generate(single_shard_fs, overwrite=True) -# Initialize a multi sharded database -multi_shard_db = exp.create_database(port=6380, db_nodes=3, interface="ib0", db_identifier="multi_shard_db_identifier") -exp.generate(multi_shard_db, overwrite=True) +# Initialize a multi sharded feature store +multi_shard_fs = exp.create_feature_store(port=6380, fs_nodes=3, interface="ib0", fs_identifier="multi_shard_fs_identifier") +exp.generate(multi_shard_fs, overwrite=True) -# Launch the single and multi sharded database -exp.start(single_shard_db, multi_shard_db, summary=True) +# Launch the single and multi sharded feature store +exp.start(single_shard_fs, multi_shard_fs, summary=True) -# Initialize SmartRedis client for single sharded database 
-driver_client_single_shard = Client(cluster=False, address=single_shard_db.get_address()[0], logger_name="Single shard db logger") -# Initialize SmartRedis client for multi sharded database -driver_client_multi_shard = Client(cluster=True, address=multi_shard_db.get_address()[0], logger_name="Multi shard db logger") +# Initialize SmartRedis client for single sharded feature store +driver_client_single_shard = Client(cluster=False, address=single_shard_fs.get_address()[0], logger_name="Single shard fs logger") +# Initialize SmartRedis client for multi sharded feature store +driver_client_multi_shard = Client(cluster=True, address=multi_shard_fs.get_address()[0], logger_name="Multi shard fs logger") # Create NumPy array array_1 = np.array([1, 2, 3, 4]) -# Use single shard db SmartRedis client to place tensor in single sharded db +# Use single shard fs SmartRedis client to place tensor in single sharded fs driver_client_single_shard.put_tensor("tensor_1", array_1) # Create NumPy array array_2 = np.array([5, 6, 7, 8]) -# Use single shard db SmartRedis client to place tensor in multi sharded db +# Use single shard fs SmartRedis client to place tensor in multi sharded fs driver_client_multi_shard.put_tensor("tensor_2", array_2) -# Check that tensors are in correct databases -check_single_shard_db_tensor_incorrect = driver_client_single_shard.key_exists("tensor_2") -check_multi_shard_db_tensor_incorrect = driver_client_multi_shard.key_exists("tensor_1") -logger.info(f"The multi shard array key exists in the incorrect database: {check_single_shard_db_tensor_incorrect}") -logger.info(f"The single shard array key exists in the incorrect database: {check_multi_shard_db_tensor_incorrect}") +# Check that tensors are in correct feature stores +check_single_shard_fs_tensor_incorrect = driver_client_single_shard.key_exists("tensor_2") +check_multi_shard_fs_tensor_incorrect = driver_client_multi_shard.key_exists("tensor_1") +logger.info(f"The multi shard array key exists in the 
incorrect feature store: {check_single_shard_fs_tensor_incorrect}") +logger.info(f"The single shard array key exists in the incorrect feature store: {check_multi_shard_fs_tensor_incorrect}") # Initialize a RunSettings object model_settings = exp.create_run_settings(exe=exe_ex, exe_args="./path/to/application_script.py") @@ -49,11 +49,11 @@ # Initialize a SmartSim Model model = exp.create_model("colo_model", model_settings) # Colocate the Model -model.colocate_db_tcp(db_identifier="colo_db_identifier") +model.colocate_fs_tcp(fs_identifier="colo_fs_identifier") # Launch the colocated Model exp.start(model, block=True, summary=True) -# Tear down the single and multi sharded databases -exp.stop(single_shard_db, multi_shard_db) +# Tear down the single and multi sharded feature stores +exp.stop(single_shard_fs, multi_shard_fs) # Print the Experiment summary logger.info(exp.summary()) \ No newline at end of file diff --git a/doc/tutorials/ml_inference/colo-db-torch-example.py b/doc/tutorials/ml_inference/colo-db-torch-example.py index af6e90a50..970beba91 100644 --- a/doc/tutorials/ml_inference/colo-db-torch-example.py +++ b/doc/tutorials/ml_inference/colo-db-torch-example.py @@ -4,13 +4,13 @@ def calc_svd(input_tensor): # svd function from TorchScript API # torch isn't imported since we don't need that dependency - # in the client code to call this function in the database. + # in the client code to call this function in the feature store. return input_tensor.svd() -# connect a client to the database +# connect a client to the feature store # no address required since this `Model` was launched through SmartSim -# Cluster=False since colocated databases are never clustered. +# Cluster=False since colocated feature stores are never clustered. 
client = Client(cluster=False) tensor = np.random.randint(0, 100, size=(5, 3, 2)).astype(np.float32) diff --git a/doc/tutorials/ml_training/surrogate/fd_sim.py b/doc/tutorials/ml_training/surrogate/fd_sim.py index db68b24b2..660b20997 100644 --- a/doc/tutorials/ml_training/surrogate/fd_sim.py +++ b/doc/tutorials/ml_training/surrogate/fd_sim.py @@ -108,7 +108,7 @@ def simulate(steps, size): def create_dataset(idx, u_init, u_steady): """Create SmartRedis Dataset containing multiple NumPy arrays - to be stored at a single key within the database""" + to be stored at a single key within the feature store""" dataset = Dataset(f"sim_data_{idx}") dataset.add_tensor("u_steady", np.expand_dims(u_steady, axis=[0,-1])) dataset.add_tensor("u_init", np.expand_dims(u_init, axis=[0,-1])) diff --git a/doc/tutorials/ml_training/surrogate/tf_training.py b/doc/tutorials/ml_training/surrogate/tf_training.py index 932cb2df3..51c781d99 100644 --- a/doc/tutorials/ml_training/surrogate/tf_training.py +++ b/doc/tutorials/ml_training/surrogate/tf_training.py @@ -11,7 +11,7 @@ def create_dataset(idx, F): """Create SmartRedis Dataset containing multiple NumPy arrays - to be stored at a single key within the database""" + to be stored at a single key within the feature store""" dataset = Dataset(f"ml_data_{idx}") dataset.add_tensor("steady", F) diff --git a/doc/tutorials/online_analysis/lattice/driver.py b/doc/tutorials/online_analysis/lattice/driver.py index 702d9e50c..a7064b7c6 100644 --- a/doc/tutorials/online_analysis/lattice/driver.py +++ b/doc/tutorials/online_analysis/lattice/driver.py @@ -4,11 +4,11 @@ from smartredis import Client from smartsim import Experiment -from smartsim.database import Orchestrator +from smartsim.database import FeatureStore from smartsim.settings import RunSettings exp = Experiment("finite_volume_simulation", launcher="local") -db = Orchestrator(port=6780) +db = FeatureStore(port=6780) # simulation parameters and plot settings fig = plt.figure(figsize=(12,6), 
dpi=80) @@ -27,7 +27,7 @@ # generate directories for output, error and results exp.generate(db, model, overwrite=True) -# start the database and connect client to get data +# start the feature store and connect client to get data exp.start(db) client = Client(address="127.0.0.1:6780", cluster=False) diff --git a/doc/tutorials/online_analysis/lattice/fv_sim.py b/doc/tutorials/online_analysis/lattice/fv_sim.py index c9c75b88d..102bd5b78 100644 --- a/doc/tutorials/online_analysis/lattice/fv_sim.py +++ b/doc/tutorials/online_analysis/lattice/fv_sim.py @@ -23,7 +23,7 @@ def finite_volume_simulation(steps=4000, x_res=400, y_res=100, for i in idxs: F[:,:,i] *= rho0 / rho - # save cylinder location to database + # save cylinder location to feature store cylinder = (X - x_res/4)**2 + (Y - y_res/2)**2 < (y_res/4)**2 # bool array client.put_tensor("cylinder", cylinder.astype(np.int8)) @@ -59,7 +59,7 @@ def finite_volume_simulation(steps=4000, x_res=400, y_res=100, def create_dataset(time_step, ux, uy): """Create SmartRedis Dataset containing multiple NumPy arrays - to be stored at a single key within the database""" + to be stored at a single key within the feature store""" dataset = Dataset(f"data_{time_step}") dataset.add_tensor("ux", ux) dataset.add_tensor("uy", uy) diff --git a/setup.py b/setup.py index 236b59c1a..f377051b1 100644 --- a/setup.py +++ b/setup.py @@ -141,13 +141,13 @@ def finalize_options(self): class SmartSimBuild(build_py): def run(self): - database_builder = builder.DatabaseBuilder( + feature_store_builder = builder.FeatureStoreBuilder( build_env(), build_env.MALLOC, build_env.JOBS ) - if not database_builder.is_built: - database_builder.build_from_git(versions.REDIS_URL, versions.REDIS) + if not feature_store_builder.is_built: + feature_store_builder.build_from_git(versions.REDIS_URL, versions.REDIS) - database_builder.cleanup() + feature_store_builder.cleanup() # run original build_py command super().run() diff --git a/smartsim/_core/_cli/build.py 
b/smartsim/_core/_cli/build.py index ab982ac1b..0e53c7181 100644 --- a/smartsim/_core/_cli/build.py +++ b/smartsim/_core/_cli/build.py @@ -106,12 +106,12 @@ def check_backends_install() -> bool: return not bool(msg) -def build_database( +def build_feature_store( build_env: BuildEnv, versions: Versioner, keydb: bool, verbose: bool ) -> None: - # check database installation - database_name = "KeyDB" if keydb else "Redis" - database_builder = builder.DatabaseBuilder( + # check feature store installation + feature_store_name = "KeyDB" if keydb else "Redis" + feature_store_builder = builder.FeatureStoreBuilder( build_env(), jobs=build_env.JOBS, _os=builder.OperatingSystem.from_str(platform.system()), @@ -119,14 +119,14 @@ def build_database( malloc=build_env.MALLOC, verbose=verbose, ) - if not database_builder.is_built: + if not feature_store_builder.is_built: logger.info( - f"Building {database_name} version {versions.REDIS} " + f"Building {feature_store_name} version {versions.REDIS} " f"from {versions.REDIS_URL}" ) - database_builder.build_from_git(versions.REDIS_URL, versions.REDIS_BRANCH) - database_builder.cleanup() - logger.info(f"{database_name} build complete!") + feature_store_builder.build_from_git(versions.REDIS_URL, versions.REDIS_BRANCH) + feature_store_builder.cleanup() + logger.info(f"{feature_store_name} build complete!") def build_redis_ai( @@ -394,16 +394,16 @@ def execute( ) if verbose: - db_name: DbEngine = "KEYDB" if keydb else "REDIS" + fs_name: DbEngine = "KEYDB" if keydb else "REDIS" logger.info("Version Information:") - vers = versions.as_dict(db_name=db_name) + vers = versions.as_dict(fs_name=fs_name) version_names = list(vers.keys()) print(tabulate(vers, headers=version_names, tablefmt="github"), "\n") try: if not args.only_python_packages: # REDIS/KeyDB - build_database(build_env, versions, keydb, verbose) + build_feature_store(build_env, versions, keydb, verbose) # REDISAI build_redis_ai( diff --git a/smartsim/_core/_cli/dbcli.py 
b/smartsim/_core/_cli/dbcli.py index 733c2fe4d..b06e5984f 100644 --- a/smartsim/_core/_cli/dbcli.py +++ b/smartsim/_core/_cli/dbcli.py @@ -28,14 +28,14 @@ import os import typing as t -from smartsim._core._cli.utils import get_db_path +from smartsim._core._cli.utils import get_fs_path def execute( _args: argparse.Namespace, _unparsed_args: t.Optional[t.List[str]] = None, / ) -> int: - if db_path := get_db_path(): - print(db_path) + if fs_path := get_fs_path(): + print(fs_path) return os.EX_OK - print("Database (Redis or KeyDB) dependencies not found") + print("Feature store (Redis or KeyDB) dependencies not found") return os.EX_SOFTWARE diff --git a/smartsim/_core/_cli/info.py b/smartsim/_core/_cli/info.py index c08fcb1a3..4f4137cd2 100644 --- a/smartsim/_core/_cli/info.py +++ b/smartsim/_core/_cli/info.py @@ -29,12 +29,12 @@ def execute( end="\n\n", ) - print("Orchestrator Configuration:") - db_path = _utils.get_db_path() - db_table = [["Installed", _fmt_installed_db(db_path)]] - if db_path: - db_table.append(["Location", str(db_path)]) - print(tabulate(db_table, tablefmt="fancy_outline"), end="\n\n") + print("FeatureStore Configuration:") + fs_path = _utils.get_fs_path() + fs_table = [["Installed", _fmt_installed_fs(fs_path)]] + if fs_path: + fs_table.append(["Location", str(fs_path)]) + print(tabulate(fs_table, tablefmt="fancy_outline"), end="\n\n") print("Redis AI Configuration:") rai_path = _helpers.redis_install_base().parent / "redisai.so" @@ -72,11 +72,11 @@ def execute( return os.EX_OK -def _fmt_installed_db(db_path: t.Optional[pathlib.Path]) -> str: - if db_path is None: +def _fmt_installed_fs(fs_path: t.Optional[pathlib.Path]) -> str: + if fs_path is None: return _MISSING_DEP - db_name, _ = db_path.name.split("-", 1) - return _helpers.colorize(db_name.upper(), "green") + fs_name, _ = fs_path.name.split("-", 1) + return _helpers.colorize(fs_name.upper(), "green") def _fmt_installed_redis_ai(rai_path: pathlib.Path) -> str: diff --git 
a/smartsim/_core/_cli/utils.py b/smartsim/_core/_cli/utils.py index 1b099c248..af0aba417 100644 --- a/smartsim/_core/_cli/utils.py +++ b/smartsim/_core/_cli/utils.py @@ -108,12 +108,12 @@ def clean(core_path: Path, _all: bool = False) -> int: removed = True file_path.unlink() if removed: - logger.info("Successfully removed SmartSim database installation") + logger.info("Successfully removed SmartSim feature store installation") return os.EX_OK -def get_db_path() -> t.Optional[Path]: +def get_fs_path() -> t.Optional[Path]: bin_path = get_install_path() / "_core" / "bin" for option in bin_path.iterdir(): if option.name in ("redis-cli", "keydb-cli"): diff --git a/smartsim/_core/_cli/validate.py b/smartsim/_core/_cli/validate.py index f957d541d..9ddd559f8 100644 --- a/smartsim/_core/_cli/validate.py +++ b/smartsim/_core/_cli/validate.py @@ -128,7 +128,7 @@ def configure_parser(parser: argparse.ArgumentParser) -> None: type=int, default=None, help=( - "The port on which to run the orchestrator for the mini experiment. " + "The port on which to run the feature store for the mini experiment. 
" "If not provided, `smart` will attempt to automatically select an " "open port" ), @@ -154,7 +154,7 @@ def test_install( exp.telemetry.disable() port = _find_free_port() if port is None else port - with _make_managed_local_orc(exp, port) as client: + with _make_managed_local_feature_store(exp, port) as client: logger.info("Verifying Tensor Transfer") client.put_tensor("plain-tensor", np.ones((1, 1, 3, 3))) client.get_tensor("plain-tensor") @@ -192,18 +192,18 @@ def _set_or_del_env_var(var: str, val: t.Optional[str]) -> None: @contextlib.contextmanager -def _make_managed_local_orc( +def _make_managed_local_feature_store( exp: Experiment, port: int ) -> t.Generator[Client, None, None]: - """Context managed orc that will be stopped if an exception is raised""" - orc = exp.create_database(db_nodes=1, interface="lo", port=port) - exp.generate(orc) - exp.start(orc) + """Context managed feature store that will be stopped if an exception is raised""" + feature_store = exp.create_feature_store(fs_nodes=1, interface="lo", port=port) + exp.generate(feature_store) + exp.start(feature_store) try: - (client_addr,) = orc.get_address() + (client_addr,) = feature_store.get_address() yield Client(False, address=client_addr) finally: - exp.stop(orc) + exp.stop(feature_store) def _find_free_port() -> int: diff --git a/smartsim/_core/_install/buildenv.py b/smartsim/_core/_install/buildenv.py index 476d0374c..64d1cc3f4 100644 --- a/smartsim/_core/_install/buildenv.py +++ b/smartsim/_core/_install/buildenv.py @@ -300,11 +300,11 @@ class Versioner: TENSORFLOW = Version_(REDISAI.tensorflow) ONNX = Version_(REDISAI.onnx) - def as_dict(self, db_name: DbEngine = "REDIS") -> t.Dict[str, t.Tuple[str, ...]]: + def as_dict(self, fs_name: DbEngine = "REDIS") -> t.Dict[str, t.Tuple[str, ...]]: pkg_map = { "SMARTSIM": self.SMARTSIM, "SMARTREDIS": self.SMARTREDIS, - db_name: self.REDIS, + fs_name: self.REDIS, "REDISAI": self.REDISAI, "TORCH": self.TORCH, "TENSORFLOW": self.TENSORFLOW, diff --git 
a/smartsim/_core/_install/builder.py b/smartsim/_core/_install/builder.py index 55b1e90b9..f32802074 100644 --- a/smartsim/_core/_install/builder.py +++ b/smartsim/_core/_install/builder.py @@ -243,7 +243,7 @@ def run_command( raise BuildError(e) from e -class DatabaseBuilder(Builder): +class FeatureStoreBuilder(Builder): """Class to build Redis or KeyDB from Source Supported build methods: - from git @@ -285,8 +285,8 @@ def build_from_git( :param branch: branch to checkout """ # pylint: disable=too-many-locals - database_name = "keydb" if "KeyDB" in git_url else "redis" - database_build_path = Path(self.build_dir, database_name.lower()) + feature_store_name = "keydb" if "KeyDB" in git_url else "redis" + feature_store_build_path = Path(self.build_dir, feature_store_name.lower()) # remove git directory if it exists as it should # really never exist as we delete after build @@ -297,9 +297,9 @@ def build_from_git( if keydb_build_path.is_dir(): shutil.rmtree(str(keydb_build_path)) - # Check database URL + # Check feature store URL if not self.is_valid_url(git_url): - raise BuildError(f"Malformed {database_name} URL: {git_url}") + raise BuildError(f"Malformed {feature_store_name} URL: {git_url}") clone_cmd = config_git_command( self._platform, @@ -311,7 +311,7 @@ def build_from_git( branch, "--depth", "1", - database_name, + feature_store_name, ], ) @@ -325,14 +325,14 @@ def build_from_git( str(self.jobs), f"MALLOC={self.malloc}", ] - self.run_command(build_cmd, cwd=str(database_build_path)) + self.run_command(build_cmd, cwd=str(feature_store_build_path)) # move redis binaries to smartsim/smartsim/_core/bin - database_src_dir = database_build_path / "src" - server_source = database_src_dir / (database_name.lower() + "-server") - server_destination = self.bin_path / (database_name.lower() + "-server") - cli_source = database_src_dir / (database_name.lower() + "-cli") - cli_destination = self.bin_path / (database_name.lower() + "-cli") + feature_store_src_dir = 
feature_store_build_path / "src" + server_source = feature_store_src_dir / (feature_store_name.lower() + "-server") + server_destination = self.bin_path / (feature_store_name.lower() + "-server") + cli_source = feature_store_src_dir / (feature_store_name.lower() + "-cli") + cli_destination = self.bin_path / (feature_store_name.lower() + "-cli") self.copy_file(server_source, server_destination, set_exe=True) self.copy_file(cli_source, cli_destination, set_exe=True) @@ -342,8 +342,8 @@ def build_from_git( bin_path = Path(dependency_path, "bin").resolve() try: database_exe = next(bin_path.glob("*-server")) - database = Path(os.environ.get("REDIS_PATH", database_exe)).resolve() - _ = expand_exe_path(str(database)) + feature_store = Path(os.environ.get("REDIS_PATH", database_exe)).resolve() + _ = expand_exe_path(str(feature_store)) except (TypeError, FileNotFoundError) as e: raise BuildError("Installation of redis-server failed!") from e diff --git a/smartsim/_core/config/config.py b/smartsim/_core/config/config.py index 42a548c42..fc3d41bb0 100644 --- a/smartsim/_core/config/config.py +++ b/smartsim/_core/config/config.py @@ -116,7 +116,7 @@ def database_conf(self) -> str: conf = Path(os.environ.get("REDIS_CONF", self.conf_path)).resolve() if not conf.is_file(): raise SSConfigError( - "Database configuration file at REDIS_CONF could not be found" + "Feature store configuration file at REDIS_CONF could not be found" ) return str(conf) @@ -124,12 +124,12 @@ def database_conf(self) -> str: def database_exe(self) -> str: try: database_exe = next(self.bin_path.glob("*-server")) - database = Path(os.environ.get("REDIS_PATH", database_exe)).resolve() - exe = expand_exe_path(str(database)) + feature_store = Path(os.environ.get("REDIS_PATH", database_exe)).resolve() + exe = expand_exe_path(str(feature_store)) return exe except (TypeError, FileNotFoundError) as e: raise SSConfigError( - "Specified database binary at REDIS_PATH could not be used" + "Specified feature store binary 
at REDIS_PATH could not be used" ) from e @property diff --git a/smartsim/_core/control/controller.py b/smartsim/_core/control/controller.py index 0724235c4..82785fedd 100644 --- a/smartsim/_core/control/controller.py +++ b/smartsim/_core/control/controller.py @@ -45,16 +45,16 @@ from ..._core.launcher.step import Step from ..._core.utils.helpers import ( SignalInterceptionStack, - unpack_colo_db_identifier, - unpack_db_identifier, + unpack_colo_fs_identifier, + unpack_fs_identifier, ) from ..._core.utils.redis import ( - db_is_active, + fs_is_active, set_ml_model, set_script, - shutdown_db_node, + shutdown_fs_node, ) -from ...database import Orchestrator +from ...database import FeatureStore from ...entity import Ensemble, EntitySequence, Model, SmartSimEntity from ...error import ( LauncherError, @@ -138,16 +138,16 @@ def start( if CONFIG.telemetry_enabled: self._start_telemetry_monitor(exp_path) - # block until all non-database jobs are complete + # block until all non-feature store jobs are complete if block: # poll handles its own keyboard interrupt as # it may be called separately self.poll(5, True, kill_on_interrupt=kill_on_interrupt) @property - def orchestrator_active(self) -> bool: + def feature_store_active(self) -> bool: with JM_LOCK: - if len(self._jobs.db_jobs) > 0: + if len(self._jobs.fs_jobs) > 0: return True return False @@ -182,8 +182,8 @@ def finished( :raises ValueError: if entity has not been launched yet """ try: - if isinstance(entity, Orchestrator): - raise TypeError("Finished() does not support Orchestrator instances") + if isinstance(entity, FeatureStore): + raise TypeError("Finished() does not support FeatureStore instances") if isinstance(entity, EntitySequence): return all(self.finished(ent) for ent in entity.entities) if not isinstance(entity, SmartSimEntity): @@ -227,21 +227,21 @@ def stop_entity( ) self._jobs.move_to_completed(job) - def stop_db(self, db: Orchestrator) -> None: - """Stop an orchestrator + def stop_fs(self, fs: 
FeatureStore) -> None: + """Stop an FeatureStore - :param db: orchestrator to be stopped + :param fs: FeatureStore to be stopped """ - if db.batch: - self.stop_entity(db) + if fs.batch: + self.stop_entity(fs) else: with JM_LOCK: - for node in db.entities: + for node in fs.entities: for host_ip, port in itertools.product( - (get_ip_from_host(host) for host in node.hosts), db.ports + (get_ip_from_host(host) for host in node.hosts), fs.ports ): - retcode, _, _ = shutdown_db_node(host_ip, port) - # Sometimes the DB will not shutdown (unless we force NOSAVE) + retcode, _, _ = shutdown_fs_node(host_ip, port) + # Sometimes the fs will not shutdown (unless we force NOSAVE) if retcode != 0: self.stop_entity(node) continue @@ -256,7 +256,7 @@ def stop_db(self, db: Orchestrator) -> None: ) self._jobs.move_to_completed(job) - db.reset_hosts() + fs.reset_hosts() def stop_entity_list(self, entity_list: EntitySequence[SmartSimEntity]) -> None: """Stop an instance of an entity list @@ -380,8 +380,8 @@ def _launch( ) -> LaunchedManifest[t.Tuple[str, Step]]: """Main launching function of the controller - Orchestrators are always launched first so that the - address of the database can be given to following entities + FeatureStores are always launched first so that the + address of the feature store can be given to following entities :param exp_name: The name of the launching experiment :param exp_path: path to location of ``Experiment`` directory if generated @@ -393,27 +393,27 @@ def _launch( exp_path=exp_path, launcher_name=str(self._launcher), ) - # Loop over deployables to launch and launch multiple orchestrators - for orchestrator in manifest.dbs: - for key in self._jobs.get_db_host_addresses(): - _, db_id = unpack_db_identifier(key, "_") - if orchestrator.db_identifier == db_id: + # Loop over deployables to launch and launch multiple FeatureStores + for featurestore in manifest.fss: + for key in self._jobs.get_fs_host_addresses(): + _, fs_id = unpack_fs_identifier(key, "_") + 
if featurestore.fs_identifier == fs_id: raise SSDBIDConflictError( - f"Database identifier {orchestrator.db_identifier}" + f"Feature store identifier {featurestore.fs_identifier}" " has already been used. Pass in a unique" - " name for db_identifier" + " name for fs_identifier" ) - if orchestrator.num_shards > 1 and isinstance( + if featurestore.num_shards > 1 and isinstance( self._launcher, LocalLauncher ): raise SmartSimError( - "Local launcher does not support multi-host orchestrators" + "Local launcher does not support multi-host feature stores" ) - self._launch_orchestrator(orchestrator, manifest_builder) + self._launch_feature_store(featurestore, manifest_builder) - if self.orchestrator_active: - self._set_dbobjects(manifest) + if self.feature_store_active: + self._set_fsobjects(manifest) # create all steps prior to launch steps: t.List[ @@ -476,70 +476,70 @@ def _launch( return manifest_builder.finalize() - def _launch_orchestrator( + def _launch_feature_store( self, - orchestrator: Orchestrator, + featurestore: FeatureStore, manifest_builder: LaunchedManifestBuilder[t.Tuple[str, Step]], ) -> None: - """Launch an Orchestrator instance + """Launch an FeatureStore instance - This function will launch the Orchestrator instance and + This function will launch the FeatureStore instance and if on WLM, find the nodes where it was launched and set them in the JobManager - :param orchestrator: orchestrator to launch + :param featurestore: FeatureStore to launch :param manifest_builder: An `LaunchedManifestBuilder` to record the - names and `Step`s of the launched orchestrator + names and `Step`s of the launched featurestore """ - orchestrator.remove_stale_files() - orc_telem_dir = manifest_builder.run_telemetry_subdirectory / "database" + featurestore.remove_stale_files() + feature_store_telem_dir = manifest_builder.run_telemetry_subdirectory / "database" - # if the orchestrator was launched as a batch workload - if orchestrator.batch: - orc_batch_step, substeps = 
self._create_batch_job_step( - orchestrator, orc_telem_dir + # if the featurestore was launched as a batch workload + if featurestore.batch: + feature_store_batch_step, substeps = self._create_batch_job_step( + featurestore, feature_store_telem_dir ) - manifest_builder.add_database( - orchestrator, [(orc_batch_step.name, step) for step in substeps] + manifest_builder.add_feature_store( + featurestore, [(feature_store_batch_step.name, step) for step in substeps] ) - self._launch_step(orc_batch_step, orchestrator) - self.symlink_output_files(orc_batch_step, orchestrator) + self._launch_step(feature_store_batch_step, featurestore) + self.symlink_output_files(feature_store_batch_step, featurestore) # symlink substeps to maintain directory structure - for substep, substep_entity in zip(substeps, orchestrator.entities): + for substep, substep_entity in zip(substeps, featurestore.entities): self.symlink_output_files(substep, substep_entity) - # if orchestrator was run on existing allocation, locally, or in allocation + # if featurestore was run on existing allocation, locally, or in allocation else: - db_steps = [ - (self._create_job_step(db, orc_telem_dir / orchestrator.name), db) - for db in orchestrator.entities + fs_steps = [ + (self._create_job_step(fs, feature_store_telem_dir / featurestore.name), fs) + for fs in featurestore.entities ] - manifest_builder.add_database( - orchestrator, [(step.name, step) for step, _ in db_steps] + manifest_builder.add_feature_store( + featurestore, [(step.name, step) for step, _ in fs_steps] ) - for db_step in db_steps: - self._launch_step(*db_step) - self.symlink_output_files(*db_step) + for fs_step in fs_steps: + self._launch_step(*fs_step) + self.symlink_output_files(*fs_step) - # wait for orchestrator to spin up - self._orchestrator_launch_wait(orchestrator) + # wait for featurestore to spin up + self._feature_store_launch_wait(featurestore) # set the jobs in the job manager to provide SSDB variable to entities # if _host isnt 
set within each - self._jobs.set_db_hosts(orchestrator) + self._jobs.set_fs_hosts(featurestore) - # create the database cluster - if orchestrator.num_shards > 2: + # create the feature store cluster + if featurestore.num_shards > 2: num_trials = 5 cluster_created = False while not cluster_created: try: - create_cluster(orchestrator.hosts, orchestrator.ports) - check_cluster_status(orchestrator.hosts, orchestrator.ports) - num_shards = orchestrator.num_shards - logger.info(f"Database cluster created with {num_shards} shards") + create_cluster(featurestore.hosts, featurestore.ports) + check_cluster_status(featurestore.hosts, featurestore.ports) + num_shards = featurestore.num_shards + logger.info(f"Feature store cluster created with {num_shards} shards") cluster_created = True except SSInternalError: if num_trials > 0: @@ -551,8 +551,8 @@ def _launch_orchestrator( else: # surface SSInternalError as we have no way to recover raise - self._save_orchestrator(orchestrator) - logger.debug(f"Orchestrator launched on nodes: {orchestrator.hosts}") + self._save_feature_store(featurestore) + logger.debug(f"FeatureStore launched on nodes: {featurestore.hosts}") def _launch_step( self, @@ -569,10 +569,10 @@ def _launch_step( completed_job = self._jobs.completed.get(entity.name, None) # if completed job DNE and is the entity name is not - # running in JobManager.jobs or JobManager.db_jobs, + # running in JobManager.jobs or JobManager.fs_jobs, # launch the job if completed_job is None and ( - entity.name not in self._jobs.jobs and entity.name not in self._jobs.db_jobs + entity.name not in self._jobs.jobs and entity.name not in self._jobs.fs_jobs ): try: job_id = self._launcher.run(job_step) @@ -614,7 +614,7 @@ def _launch_step( def _create_batch_job_step( self, - entity_list: t.Union[Orchestrator, Ensemble, _AnonymousBatchJob], + entity_list: t.Union[FeatureStore, Ensemble, _AnonymousBatchJob], telemetry_dir: pathlib.Path, ) -> t.Tuple[Step, t.List[Step]]: """Use launcher to 
create batch job step @@ -674,16 +674,16 @@ def _prep_entity_client_env(self, entity: Model) -> None: """ client_env: t.Dict[str, t.Union[str, int, float, bool]] = {} - address_dict = self._jobs.get_db_host_addresses() + address_dict = self._jobs.get_fs_host_addresses() - for db_id, addresses in address_dict.items(): - db_name, _ = unpack_db_identifier(db_id, "_") + for fs_id, addresses in address_dict.items(): + fs_name, _ = unpack_fs_identifier(fs_id, "_") if addresses: # Cap max length of SSDB - client_env[f"SSDB{db_name}"] = ",".join(addresses[:128]) + client_env[f"SSDB{fs_name}"] = ",".join(addresses[:128]) # Retrieve num_shards to append to client env - client_env[f"SR_DB_TYPE{db_name}"] = ( + client_env[f"SR_DB_TYPE{fs_name}"] = ( CLUSTERED if len(addresses) > 1 else STANDALONE ) @@ -695,20 +695,20 @@ def _prep_entity_client_env(self, entity: Model) -> None: client_env["SSKEYOUT"] = entity.name # Set address to local if it's a colocated model - if entity.colocated and entity.run_settings.colocated_db_settings is not None: - db_name_colo = entity.run_settings.colocated_db_settings["db_identifier"] - assert isinstance(db_name_colo, str) + if entity.colocated and entity.run_settings.colocated_fs_settings is not None: + fs_name_colo = entity.run_settings.colocated_fs_settings["fs_identifier"] + assert isinstance(fs_name_colo, str) for key in address_dict: - _, db_id = unpack_db_identifier(key, "_") - if db_name_colo == db_id: + _, fs_id = unpack_fs_identifier(key, "_") + if fs_name_colo == fs_id: raise SSDBIDConflictError( - f"Database identifier {db_name_colo}" + f"Feature store identifier {fs_name_colo}" " has already been used. 
Pass in a unique" - " name for db_identifier" + " name for fs_identifier" ) - db_name_colo = unpack_colo_db_identifier(db_name_colo) - if colo_cfg := entity.run_settings.colocated_db_settings: + fs_name_colo = unpack_colo_fs_identifier(fs_name_colo) + if colo_cfg := entity.run_settings.colocated_fs_settings: port = colo_cfg.get("port", None) socket = colo_cfg.get("unix_socket", None) if socket and port: @@ -716,51 +716,50 @@ def _prep_entity_client_env(self, entity: Model) -> None: "Co-located was configured for both TCP/IP and UDS" ) if port: - client_env[f"SSDB{db_name_colo}"] = f"127.0.0.1:{str(port)}" + client_env[f"SSDB{fs_name_colo}"] = f"127.0.0.1:{str(port)}" elif socket: - client_env[f"SSDB{db_name_colo}"] = f"unix://{socket}" + client_env[f"SSDB{fs_name_colo}"] = f"unix://{socket}" else: raise SSInternalError( - "Colocated database was not configured for either TCP or UDS" + "Colocated feature store was not configured for either TCP or UDS" ) - client_env[f"SR_DB_TYPE{db_name_colo}"] = STANDALONE + client_env[f"SR_DB_TYPE{fs_name_colo}"] = STANDALONE entity.run_settings.update_env(client_env) - def _save_orchestrator(self, orchestrator: Orchestrator) -> None: - """Save the orchestrator object via pickle + def _save_feature_store(self, featurestore: FeatureStore) -> None: + """Save the FeatureStore object via pickle - This function saves the orchestrator information to a pickle + This function saves the feature store information to a pickle file that can be imported by subsequent experiments to reconnect - to the orchestrator. + to the featurestore. 
- :param orchestrator: Orchestrator configuration to be saved + :param featurestore: FeatureStore configuration to be saved """ - - dat_file = "/".join((orchestrator.path, "smartsim_db.dat")) - db_jobs = self._jobs.db_jobs - orc_data = {"db": orchestrator, "db_jobs": db_jobs} + dat_file = "/".join((featurestore.path, "smartsim_db.dat")) + fs_jobs = self._jobs.fs_jobs + feature_store_data = {"fs": featurestore, "fs_jobs": fs_jobs} steps = [] - for db_job in db_jobs.values(): - steps.append(self._launcher.step_mapping[db_job.name]) - orc_data["steps"] = steps + for fs_job in fs_jobs.values(): + steps.append(self._launcher.step_mapping[fs_job.name]) + feature_store_data["steps"] = steps with open(dat_file, "wb") as pickle_file: - pickle.dump(orc_data, pickle_file) + pickle.dump(feature_store_data, pickle_file) - def _orchestrator_launch_wait(self, orchestrator: Orchestrator) -> None: - """Wait for the orchestrator instances to run + def _feature_store_launch_wait(self, featurestore: FeatureStore) -> None: + """Wait for the featurestore instances to run - In the case where the orchestrator is launched as a batch - through a WLM, we wait for the orchestrator to exit the + In the case where the featurestore is launched as a batch + through a WLM, we wait for the featurestore to exit the queue before proceeding so new launched entities can be launched with SSDB address - :param orchestrator: orchestrator instance + :param featurestore: FeatureStore instance :raises SmartSimError: if launch fails or manually stopped by user """ - if orchestrator.batch: - logger.info("Orchestrator launched as a batch") - logger.info("While queued, SmartSim will wait for Orchestrator to run") + if featurestore.batch: + logger.info("FeatureStore launched as a batch") + logger.info("While queued, SmartSim will wait for FeatureStore to run") logger.info("CTRL+C interrupt to abort and cancel launch") ready = False @@ -772,21 +771,21 @@ def _orchestrator_launch_wait(self, orchestrator: 
Orchestrator) -> None: self._jobs.check_jobs() # _jobs.get_status acquires JM lock for main thread, no need for locking - statuses = self.get_entity_list_status(orchestrator) + statuses = self.get_entity_list_status(featurestore) if all(stat == SmartSimStatus.STATUS_RUNNING for stat in statuses): ready = True # TODO remove in favor of by node status check time.sleep(CONFIG.jm_interval) elif any(stat in TERMINAL_STATUSES for stat in statuses): - self.stop_db(orchestrator) - msg = "Orchestrator failed during startup" - msg += f" See {orchestrator.path} for details" + self.stop_fs(featurestore) + msg = "FeatureStore failed during startup" + msg += f" See {featurestore.path} for details" raise SmartSimError(msg) else: - logger.debug("Waiting for orchestrator instances to spin up...") + logger.debug("Waiting for featurestore instances to spin up...") except KeyboardInterrupt: - logger.info("Orchestrator launch cancelled - requesting to stop") - self.stop_db(orchestrator) + logger.info("FeatureStore launch cancelled - requesting to stop") + self.stop_fs(featurestore) # re-raise keyboard interrupt so the job manager will display # any running and un-killed jobs as this method is only called @@ -794,82 +793,82 @@ def _orchestrator_launch_wait(self, orchestrator: Orchestrator) -> None: # launch explicitly raise - def reload_saved_db(self, checkpoint_file: str) -> Orchestrator: + def reload_saved_fs(self, checkpoint_file: str) -> FeatureStore: with JM_LOCK: - if self.orchestrator_active: - raise SmartSimError("Orchestrator exists and is active") + if self.feature_store_active: + raise SmartSimError("FeatureStore exists and is active") if not osp.exists(checkpoint_file): raise FileNotFoundError( - f"The SmartSim database config file {checkpoint_file} " + f"The SmartSim feature store config file {checkpoint_file} " "cannot be found." 
) try: with open(checkpoint_file, "rb") as pickle_file: - db_config = pickle.load(pickle_file) + fs_config = pickle.load(pickle_file) except (OSError, IOError) as e: - msg = "Database checkpoint corrupted" + msg = "Feature store checkpoint corrupted" raise SmartSimError(msg) from e err_message = ( - "The SmartSim database checkpoint is incomplete or corrupted. " + "The SmartSim feature store checkpoint is incomplete or corrupted. " ) - if not "db" in db_config: + if not "fs" in fs_config: raise SmartSimError( - err_message + "Could not find the orchestrator object." + err_message + "Could not find the feature store object." ) - if not "db_jobs" in db_config: + if not "fs_jobs" in fs_config: raise SmartSimError( - err_message + "Could not find database job objects." + err_message + "Could not find feature store job objects." ) - if not "steps" in db_config: + if not "steps" in fs_config: raise SmartSimError( - err_message + "Could not find database job objects." + err_message + "Could not find feature store step objects." 
) - orc: Orchestrator = db_config["db"] + feature_store: FeatureStore = fs_config["fs"] - # TODO check that each db_object is running + # TODO check that each fs_object is running - job_steps = zip(db_config["db_jobs"].values(), db_config["steps"]) + job_steps = zip(fs_config["fs_jobs"].values(), fs_config["steps"]) try: - for db_job, step in job_steps: - self._jobs.db_jobs[db_job.ename] = db_job - self._launcher.step_mapping[db_job.name] = step + for fs_job, step in job_steps: + self._jobs.fs_jobs[fs_job.ename] = fs_job + self._launcher.step_mapping[fs_job.name] = step if step.task_id: self._launcher.task_manager.add_existing(int(step.task_id)) except LauncherError as e: - raise SmartSimError("Failed to reconnect orchestrator") from e + raise SmartSimError("Failed to reconnect feature store") from e # start job manager if not already started if not self._jobs.actively_monitoring: self._jobs.start() - return orc + return feature_store - def _set_dbobjects(self, manifest: Manifest) -> None: - if not manifest.has_db_objects: + def _set_fsobjects(self, manifest: Manifest) -> None: + if not manifest.has_fs_objects: return - address_dict = self._jobs.get_db_host_addresses() + address_dict = self._jobs.get_fs_host_addresses() for ( - db_id, - db_addresses, + fs_id, + fs_addresses, ) in address_dict.items(): - db_name, name = unpack_db_identifier(db_id, "_") + fs_name, name = unpack_fs_identifier(fs_id, "_") - hosts = list({address.split(":")[0] for address in db_addresses}) - ports = list({int(address.split(":")[-1]) for address in db_addresses}) + hosts = list({address.split(":")[0] for address in fs_addresses}) + ports = list({int(address.split(":")[-1]) for address in fs_addresses}) - if not db_is_active(hosts=hosts, ports=ports, num_shards=len(db_addresses)): - raise SSInternalError("Cannot set DB Objects, DB is not running") + if not fs_is_active(hosts=hosts, ports=ports, num_shards=len(fs_addresses)): + raise SSInternalError("Cannot set FS Objects, FS is not
running") - environ[f"SSDB{db_name}"] = db_addresses[0] + environ[f"SSDB{fs_name}"] = fs_addresses[0] - environ[f"SR_DB_TYPE{db_name}"] = ( - CLUSTERED if len(db_addresses) > 1 else STANDALONE + environ[f"SR_DB_TYPE{fs_name}"] = ( + CLUSTERED if len(fs_addresses) > 1 else STANDALONE ) options = ConfigOptions.create_from_environment(name) @@ -877,27 +876,27 @@ def _set_dbobjects(self, manifest: Manifest) -> None: for model in manifest.models: if not model.colocated: - for db_model in model.db_models: - set_ml_model(db_model, client) - for db_script in model.db_scripts: - set_script(db_script, client) + for fs_model in model.fs_models: + set_ml_model(fs_model, client) + for fs_script in model.fs_scripts: + set_script(fs_script, client) for ensemble in manifest.ensembles: - for db_model in ensemble.db_models: - set_ml_model(db_model, client) - for db_script in ensemble.db_scripts: - set_script(db_script, client) + for fs_model in ensemble.fs_models: + set_ml_model(fs_model, client) + for fs_script in ensemble.fs_scripts: + set_script(fs_script, client) for entity in ensemble.models: if not entity.colocated: # Set models which could belong only # to the entities and not to the ensemble # but avoid duplicates - for db_model in entity.db_models: - if db_model not in ensemble.db_models: - set_ml_model(db_model, client) - for db_script in entity.db_scripts: - if db_script not in ensemble.db_scripts: - set_script(db_script, client) + for fs_model in entity.fs_models: + if fs_model not in ensemble.fs_models: + set_ml_model(fs_model, client) + for fs_script in entity.fs_scripts: + if fs_script not in ensemble.fs_scripts: + set_script(fs_script, client) def _start_telemetry_monitor(self, exp_dir: str) -> None: """Spawns a telemetry monitor process to keep track of the life times diff --git a/smartsim/_core/control/job.py b/smartsim/_core/control/job.py index eeefaf001..1c72e6b46 100644 --- a/smartsim/_core/control/job.py +++ b/smartsim/_core/control/job.py @@ -76,9 +76,9 @@ 
def __init__(self) -> None: """Flag indicating if the entity has completed execution""" @property - def is_db(self) -> bool: - """Returns `True` if the entity represents a database or database shard""" - return self.type in ["orchestrator", "dbnode"] + def is_fs(self) -> bool: + """Returns `True` if the entity represents a feature store or feature store shard""" + return self.type in ["featurestore", "fsnode"] @property def is_managed(self) -> bool: @@ -112,13 +112,13 @@ def check_completion_status(self) -> None: self._is_complete = True @staticmethod - def _map_db_metadata(entity_dict: t.Dict[str, t.Any], entity: "JobEntity") -> None: - """Map DB-specific properties from a runtime manifest onto a `JobEntity` + def _map_fs_metadata(entity_dict: t.Dict[str, t.Any], entity: "JobEntity") -> None: + """Map FS-specific properties from a runtime manifest onto a `JobEntity` :param entity_dict: The raw dictionary deserialized from manifest JSON :param entity: The entity instance to modify """ - if entity.is_db: + if entity.is_fs: # add collectors if they're configured to be enabled in the manifest entity.collectors = { "client": entity_dict.get("client_file", ""), @@ -169,7 +169,7 @@ def from_manifest( entity = JobEntity() cls._map_standard_metadata(entity_type, entity_dict, entity, exp_dir) - cls._map_db_metadata(entity_dict, entity) + cls._map_fs_metadata(entity_dict, entity) return entity @@ -207,7 +207,7 @@ def __init__( # output is only populated if it's system related (e.g. 
cmd failed immediately) self.output: t.Optional[str] = None self.error: t.Optional[str] = None # same as output - self.hosts: t.List[str] = [] # currently only used for DB jobs + self.hosts: t.List[str] = [] # currently only used for FS jobs self.launched_with = launcher self.is_task = is_task self.start_time = time.time() diff --git a/smartsim/_core/control/jobmanager.py b/smartsim/_core/control/jobmanager.py index 1bc24cf9a..1c33c3846 100644 --- a/smartsim/_core/control/jobmanager.py +++ b/smartsim/_core/control/jobmanager.py @@ -32,8 +32,8 @@ from threading import RLock, Thread from types import FrameType -from ...database import Orchestrator -from ...entity import DBNode, EntitySequence, SmartSimEntity +from ...database import FeatureStore +from ...entity import FSNode, EntitySequence, SmartSimEntity from ...log import ContextThread, get_logger from ...status import TERMINAL_STATUSES, SmartSimStatus from ..config import CONFIG @@ -66,7 +66,7 @@ def __init__(self, lock: RLock, launcher: t.Optional[Launcher] = None) -> None: # active jobs self.jobs: t.Dict[str, Job] = {} - self.db_jobs: t.Dict[str, Job] = {} + self.fs_jobs: t.Dict[str, Job] = {} # completed jobs self.completed: t.Dict[str, Job] = {} @@ -129,8 +129,8 @@ def move_to_completed(self, job: Job) -> None: job.record_history() # remove from actively monitored jobs - if job.ename in self.db_jobs: - del self.db_jobs[job.ename] + if job.ename in self.fs_jobs: + del self.fs_jobs[job.ename] elif job.ename in self.jobs: del self.jobs[job.ename] @@ -142,7 +142,7 @@ def __getitem__(self, entity_name: str) -> Job: :returns: the Job associated with the entity_name """ with self._lock: - entities = ChainMap(self.db_jobs, self.jobs, self.completed) + entities = ChainMap(self.fs_jobs, self.jobs, self.completed) return entities[entity_name] def __call__(self) -> t.Dict[str, Job]: @@ -150,7 +150,7 @@ def __call__(self) -> t.Dict[str, Job]: :returns: Dictionary of all jobs """ - all_jobs = {**self.jobs, **self.db_jobs} 
+ all_jobs = {**self.jobs, **self.fs_jobs} return all_jobs def __contains__(self, key: str) -> bool: @@ -177,10 +177,10 @@ def add_job( launcher = str(self._launcher) # all operations here should be atomic job = Job(job_name, job_id, entity, launcher, is_task) - if isinstance(entity, (DBNode, Orchestrator)): - self.db_jobs[entity.name] = job - elif isinstance(entity, JobEntity) and entity.is_db: - self.db_jobs[entity.name] = job + if isinstance(entity, (FSNode, FeatureStore)): + self.fs_jobs[entity.name] = job + elif isinstance(entity, JobEntity) and entity.is_fs: + self.fs_jobs[entity.name] = job else: self.jobs[entity.name] = job @@ -282,50 +282,50 @@ def restart_job( del self.completed[entity_name] job.reset(job_name, job_id, is_task) - if isinstance(job.entity, (DBNode, Orchestrator)): - self.db_jobs[entity_name] = job + if isinstance(job.entity, (FSNode, FeatureStore)): + self.fs_jobs[entity_name] = job else: self.jobs[entity_name] = job - def get_db_host_addresses(self) -> t.Dict[str, t.List[str]]: - """Retrieve the list of hosts for the database - for corresponding database identifiers + def get_fs_host_addresses(self) -> t.Dict[str, t.List[str]]: + """Retrieve the list of hosts for the feature store + for corresponding feature store identifiers :return: dictionary of host ip addresses """ address_dict: t.Dict[str, t.List[str]] = {} - for db_job in self.db_jobs.values(): + for fs_job in self.fs_jobs.values(): addresses = [] - if isinstance(db_job.entity, (DBNode, Orchestrator)): - db_entity = db_job.entity - for combine in itertools.product(db_job.hosts, db_entity.ports): + if isinstance(fs_job.entity, (FSNode, FeatureStore)): + fs_entity = fs_job.entity + for combine in itertools.product(fs_job.hosts, fs_entity.ports): ip_addr = get_ip_from_host(combine[0]) addresses.append(":".join((ip_addr, str(combine[1])))) - dict_entry: t.List[str] = address_dict.get(db_entity.db_identifier, []) + dict_entry: t.List[str] = address_dict.get(fs_entity.fs_identifier, []) 
dict_entry.extend(addresses) - address_dict[db_entity.db_identifier] = dict_entry + address_dict[fs_entity.fs_identifier] = dict_entry return address_dict - def set_db_hosts(self, orchestrator: Orchestrator) -> None: - """Set the DB hosts in db_jobs so future entities can query this + def set_fs_hosts(self, FeatureStore: FeatureStore) -> None: + """Set the fs hosts in fs_jobs so future entities can query this - :param orchestrator: orchestrator instance + :param FeatureStore: FeatureStore instance """ # should only be called during launch in the controller with self._lock: - if orchestrator.batch: - self.db_jobs[orchestrator.name].hosts = orchestrator.hosts + if FeatureStore.batch: + self.fs_jobs[FeatureStore.name].hosts = FeatureStore.hosts else: - for dbnode in orchestrator.entities: - if not dbnode.is_mpmd: - self.db_jobs[dbnode.name].hosts = [dbnode.host] + for fsnode in FeatureStore.entities: + if not fsnode.is_mpmd: + self.fs_jobs[fsnode.name].hosts = [fsnode.host] else: - self.db_jobs[dbnode.name].hosts = dbnode.hosts + self.fs_jobs[fsnode.name].hosts = fsnode.hosts def signal_interrupt(self, signo: int, _frame: t.Optional[FrameType]) -> None: """Custom handler for whenever SIGINT is received""" @@ -361,4 +361,4 @@ def _thread_sleep(self) -> None: def __len__(self) -> int: # number of active jobs - return len(self.db_jobs) + len(self.jobs) + return len(self.fs_jobs) + len(self.jobs) diff --git a/smartsim/_core/control/manifest.py b/smartsim/_core/control/manifest.py index 97fc0ba8e..8358d1512 100644 --- a/smartsim/_core/control/manifest.py +++ b/smartsim/_core/control/manifest.py @@ -29,8 +29,8 @@ import typing as t from dataclasses import dataclass, field -from ...database import Orchestrator -from ...entity import DBNode, Ensemble, EntitySequence, Model, SmartSimEntity +from ...database import FeatureStore +from ...entity import FSNode, Ensemble, EntitySequence, Model, SmartSimEntity from ...error import SmartSimError from ..config import CONFIG from 
..utils import helpers as _helpers @@ -38,7 +38,7 @@ _T = t.TypeVar("_T") _U = t.TypeVar("_U") -_AtomicLaunchableT = t.TypeVar("_AtomicLaunchableT", Model, DBNode) +_AtomicLaunchableT = t.TypeVar("_AtomicLaunchableT", Model, FSNode) if t.TYPE_CHECKING: import os @@ -50,7 +50,7 @@ class Manifest: `SmartSimEntity`-derived objects or `EntitySequence`-derived objects) can be accessed by using the corresponding accessor. - Instances of ``Model``, ``Ensemble`` and ``Orchestrator`` + Instances of ``Model``, ``Ensemble`` and ``FeatureStore`` can all be passed as arguments """ @@ -63,14 +63,14 @@ def __init__( self._check_entity_lists_nonempty() @property - def dbs(self) -> t.List[Orchestrator]: - """Return a list of Orchestrator instances in Manifest + def fss(self) -> t.List[FeatureStore]: + """Return a list of FeatureStore instances in Manifest - :raises SmartSimError: if user added to databases to manifest - :return: List of orchestrator instances + :raises SmartSimError: if user added to feature stores to manifest + :return: List of feature store instances """ - dbs = [item for item in self._deployables if isinstance(item, Orchestrator)] - return dbs + fss = [item for item in self._deployables if isinstance(item, FeatureStore)] + return fss @property def models(self) -> t.List[Model]: @@ -94,14 +94,14 @@ def ensembles(self) -> t.List[Ensemble]: @property def all_entity_lists(self) -> t.List[EntitySequence[SmartSimEntity]]: """All entity lists, including ensembles and - exceptional ones like Orchestrator + exceptional ones like FeatureStore :return: list of entity lists """ _all_entity_lists: t.List[EntitySequence[SmartSimEntity]] = list(self.ensembles) - for db in self.dbs: - _all_entity_lists.append(db) + for fs in self.fss: + _all_entity_lists.append(fs) return _all_entity_lists @@ -136,7 +136,7 @@ def __str__(self) -> str: output = "" e_header = "=== Ensembles ===\n" m_header = "=== Models ===\n" - db_header = "=== Database ===\n" + fs_header = "=== Feature Stores 
===\n" if self.ensembles: output += e_header @@ -160,27 +160,27 @@ def __str__(self) -> str: output += f"Parameters: \n{_helpers.fmt_dict(model.params)}\n" output += "\n" - for adb in self.dbs: - output += db_header - output += f"Shards: {adb.num_shards}\n" - output += f"Port: {str(adb.ports[0])}\n" - output += f"Network: {adb._interfaces}\n" - output += f"Batch Launch: {adb.batch}\n" - if adb.batch: - output += f"{str(adb.batch_settings)}\n" + for afs in self.fss: + output += fs_header + output += f"Shards: {afs.num_shards}\n" + output += f"Port: {str(afs.ports[0])}\n" + output += f"Network: {afs._interfaces}\n" + output += f"Batch Launch: {afs.batch}\n" + if afs.batch: + output += f"{str(afs.batch_settings)}\n" output += "\n" return output @property - def has_db_objects(self) -> bool: - """Check if any entity has DBObjects to set""" + def has_fs_objects(self) -> bool: + """Check if any entity has fsObjects to set""" ents: t.Iterable[t.Union[Model, Ensemble]] = itertools.chain( self.models, self.ensembles, (member for ens in self.ensembles for member in ens.entities), ) - return any(any(ent.db_models) or any(ent.db_scripts) for ent in ents) + return any(any(ent.fs_models) or any(ent.fs_scripts) for ent in ents) class _LaunchedManifestMetadata(t.NamedTuple): @@ -214,7 +214,7 @@ class LaunchedManifest(t.Generic[_T]): metadata: _LaunchedManifestMetadata models: t.Tuple[t.Tuple[Model, _T], ...] ensembles: t.Tuple[t.Tuple[Ensemble, t.Tuple[t.Tuple[Model, _T], ...]], ...] - databases: t.Tuple[t.Tuple[Orchestrator, t.Tuple[t.Tuple[DBNode, _T], ...]], ...] + featurestores: t.Tuple[t.Tuple[FeatureStore, t.Tuple[t.Tuple[FSNode, _T], ...]], ...] 
def map(self, func: t.Callable[[_T], _U]) -> "LaunchedManifest[_U]": def _map_entity_data( @@ -230,9 +230,9 @@ def _map_entity_data( (ens, _map_entity_data(func, model_data)) for ens, model_data in self.ensembles ), - databases=tuple( - (db_, _map_entity_data(func, node_data)) - for db_, node_data in self.databases + featurestores=tuple( + (fs_, _map_entity_data(func, node_data)) + for fs_, node_data in self.featurestores ), ) @@ -253,7 +253,7 @@ class LaunchedManifestBuilder(t.Generic[_T]): _ensembles: t.List[t.Tuple[Ensemble, t.Tuple[t.Tuple[Model, _T], ...]]] = field( default_factory=list, init=False ) - _databases: t.List[t.Tuple[Orchestrator, t.Tuple[t.Tuple[DBNode, _T], ...]]] = ( + _featurestores: t.List[t.Tuple[FeatureStore, t.Tuple[t.Tuple[FSNode, _T], ...]]] = ( field(default_factory=list, init=False) ) @@ -271,8 +271,8 @@ def add_model(self, model: Model, data: _T) -> None: def add_ensemble(self, ens: Ensemble, data: t.Sequence[_T]) -> None: self._ensembles.append((ens, self._entities_to_data(ens.entities, data))) - def add_database(self, db_: Orchestrator, data: t.Sequence[_T]) -> None: - self._databases.append((db_, self._entities_to_data(db_.entities, data))) + def add_feature_store(self, fs_: FeatureStore, data: t.Sequence[_T]) -> None: + self._featurestores.append((fs_, self._entities_to_data(fs_.entities, data))) @staticmethod def _entities_to_data( @@ -297,7 +297,7 @@ def finalize(self) -> LaunchedManifest[_T]: ), models=tuple(self._models), ensembles=tuple(self._ensembles), - databases=tuple(self._databases), + featurestores=tuple(self._featurestores), ) diff --git a/smartsim/_core/entrypoints/colocated.py b/smartsim/_core/entrypoints/colocated.py index 508251fe0..28fb1edca 100644 --- a/smartsim/_core/entrypoints/colocated.py +++ b/smartsim/_core/entrypoints/colocated.py @@ -58,14 +58,14 @@ def handle_signal(signo: int, _frame: t.Optional[FrameType]) -> None: cleanup() -def launch_db_model(client: Client, db_model: t.List[str]) -> str: +def 
launch_fs_model(client: Client, fs_model: t.List[str]) -> str: """Parse options to launch model on local cluster - :param client: SmartRedis client connected to local DB - :param db_model: List of arguments defining the model + :param client: SmartRedis client connected to local FS + :param fs_model: List of arguments defining the model :return: Name of model """ - parser = argparse.ArgumentParser("Set ML model on DB") + parser = argparse.ArgumentParser("Set ML model on FS") parser.add_argument("--name", type=str) parser.add_argument("--file", type=str) parser.add_argument("--backend", type=str) @@ -78,7 +78,7 @@ def launch_db_model(client: Client, db_model: t.List[str]) -> str: parser.add_argument("--tag", type=str, default="") parser.add_argument("--inputs", nargs="+", default=None) parser.add_argument("--outputs", nargs="+", default=None) - args = parser.parse_args(db_model) + args = parser.parse_args(fs_model) inputs = None outputs = None @@ -122,14 +122,14 @@ def launch_db_model(client: Client, db_model: t.List[str]) -> str: return name -def launch_db_script(client: Client, db_script: t.List[str]) -> str: +def launch_fs_script(client: Client, fs_script: t.List[str]) -> str: """Parse options to launch script on local cluster - :param client: SmartRedis client connected to local DB - :param db_model: List of arguments defining the script + :param client: SmartRedis client connected to local FS + :param fs_model: List of arguments defining the script :return: Name of model """ - parser = argparse.ArgumentParser("Set script on DB") + parser = argparse.ArgumentParser("Set script on FS") parser.add_argument("--name", type=str) parser.add_argument("--func", type=str) parser.add_argument("--file", type=str) @@ -137,7 +137,7 @@ def launch_db_script(client: Client, db_script: t.List[str]) -> str: parser.add_argument("--device", type=str) parser.add_argument("--devices_per_node", type=int, default=1) parser.add_argument("--first_device", type=int, default=0) - args = 
parser.parse_args(db_script) + args = parser.parse_args(fs_script) if args.file and args.func: raise ValueError("Both file and func cannot be provided.") @@ -165,11 +165,11 @@ def launch_db_script(client: Client, db_script: t.List[str]) -> str: def main( network_interface: str, - db_cpus: int, + fs_cpus: int, command: t.List[str], - db_models: t.List[t.List[str]], - db_scripts: t.List[t.List[str]], - db_identifier: str, + fs_models: t.List[t.List[str]], + fs_scripts: t.List[t.List[str]], + fs_identifier: str, ) -> None: # pylint: disable=too-many-statements global DBPID # pylint: disable=global-statement @@ -198,7 +198,7 @@ def main( try: hostname = socket.gethostname() filename = ( - f"colo_orc_{hostname}.log" + f"colo_feature_store_{hostname}.log" if os.getenv("SMARTSIM_LOG_LEVEL") == "debug" else os.devnull ) @@ -210,66 +210,66 @@ def main( except Exception as e: cleanup() - logger.error(f"Failed to start database process: {str(e)}") + logger.error(f"Failed to start feature store process: {str(e)}") raise SSInternalError("Colocated process failed to start") from e try: logger.debug( - "\n\nColocated database information\n" + "\n\nColocated feature store information\n" f"\n\tIP Address(es): {' '.join(ip_addresses + [lo_address])}" f"\n\tCommand: {' '.join(cmd)}\n\n" - f"\n\t# of Database CPUs: {db_cpus}" - f"\n\tDatabase Identifier: {db_identifier}" + f"\n\t# of Feature Store CPUs: {fs_cpus}" + f"\n\tFeature Store Identifier: {fs_identifier}" ) except Exception as e: cleanup() - logger.error(f"Failed to start database process: {str(e)}") + logger.error(f"Failed to start feature store process: {str(e)}") raise SSInternalError("Colocated process failed to start") from e - def launch_models(client: Client, db_models: t.List[t.List[str]]) -> None: - for i, db_model in enumerate(db_models): + def launch_models(client: Client, fs_models: t.List[t.List[str]]) -> None: + for i, fs_model in enumerate(fs_models): logger.debug("Uploading model") - model_name = 
launch_db_model(client, db_model) - logger.debug(f"Added model {model_name} ({i+1}/{len(db_models)})") + model_name = launch_fs_model(client, fs_model) + logger.debug(f"Added model {model_name} ({i+1}/{len(fs_models)})") - def launch_db_scripts(client: Client, db_scripts: t.List[t.List[str]]) -> None: - for i, db_script in enumerate(db_scripts): + def launch_fs_scripts(client: Client, fs_scripts: t.List[t.List[str]]) -> None: + for i, fs_script in enumerate(fs_scripts): logger.debug("Uploading script") - script_name = launch_db_script(client, db_script) - logger.debug(f"Added script {script_name} ({i+1}/{len(db_scripts)})") + script_name = launch_fs_script(client, fs_script) + logger.debug(f"Added script {script_name} ({i+1}/{len(fs_scripts)})") try: - if db_models or db_scripts: + if fs_models or fs_scripts: try: - options = ConfigOptions.create_from_environment(db_identifier) + options = ConfigOptions.create_from_environment(fs_identifier) client = Client(options, logger_name="SmartSim") - launch_models(client, db_models) - launch_db_scripts(client, db_scripts) + launch_models(client, fs_models) + launch_fs_scripts(client, fs_scripts) except (RedisConnectionError, RedisReplyError) as ex: raise SSInternalError( - "Failed to set model or script, could not connect to database" + "Failed to set model or script, could not connect to feature store" ) from ex # Make sure we don't keep this around del client except Exception as e: cleanup() - logger.error(f"Colocated database process failed: {str(e)}") + logger.error(f"Colocated feature store process failed: {str(e)}") raise SSInternalError("Colocated entrypoint raised an error") from e def cleanup() -> None: try: - logger.debug("Cleaning up colocated database") - # attempt to stop the database process - db_proc = psutil.Process(DBPID) - db_proc.terminate() + logger.debug("Cleaning up colocated feature store") + # attempt to stop the feature store process + fs_proc = psutil.Process(DBPID) + fs_proc.terminate() except 
psutil.NoSuchProcess: - logger.warning("Couldn't find database process to kill.") + logger.warning("Couldn't find feature store process to kill.") except OSError as e: - logger.warning(f"Failed to clean up colocated database gracefully: {str(e)}") + logger.warning(f"Failed to clean up colocated feature store gracefully: {str(e)}") finally: if LOCK.is_locked: LOCK.release() @@ -294,27 +294,27 @@ def register_signal_handlers() -> None: "+lockfile", type=str, help="Filename to create for single proc per host" ) arg_parser.add_argument( - "+db_cpus", type=int, default=2, help="Number of CPUs to use for DB" + "+fs_cpus", type=int, default=2, help="Number of CPUs to use for FS" ) arg_parser.add_argument( - "+db_identifier", type=str, default="", help="Database Identifier" + "+fs_identifier", type=str, default="", help="Feature Store Identifier" ) arg_parser.add_argument("+command", nargs="+", help="Command to run") arg_parser.add_argument( - "+db_model", + "+fs_model", nargs="+", action="append", default=[], - help="Model to set on DB", + help="Model to set on FS", ) arg_parser.add_argument( - "+db_script", + "+fs_script", nargs="+", action="append", default=[], - help="Script to set on DB", + help="Script to set on FS", ) os.environ["PYTHONUNBUFFERED"] = "1" @@ -325,20 +325,20 @@ def register_signal_handlers() -> None: LOCK = filelock.FileLock(tmp_lockfile) LOCK.acquire(timeout=0.1) - logger.debug(f"Starting colocated database on host: {socket.gethostname()}") + logger.debug(f"Starting colocated feature store on host: {socket.gethostname()}") # make sure to register the cleanup before we start # the proecss so our signaller will be able to stop - # the database process. + # the feature store process. 
register_signal_handlers() main( parsed_args.ifname, - parsed_args.db_cpus, + parsed_args.fs_cpus, parsed_args.command, - parsed_args.db_model, - parsed_args.db_script, - parsed_args.db_identifier, + parsed_args.fs_model, + parsed_args.fs_script, + parsed_args.fs_identifier, ) # gracefully exit the processes in the distributed application that diff --git a/smartsim/_core/entrypoints/indirect.py b/smartsim/_core/entrypoints/indirect.py index 1f445ac4a..79d487466 100644 --- a/smartsim/_core/entrypoints/indirect.py +++ b/smartsim/_core/entrypoints/indirect.py @@ -61,7 +61,7 @@ def main( :param cmd: a base64 encoded cmd to execute :param entity_type: `SmartSimEntity` entity class. Valid values - include: orchestrator, dbnode, ensemble, model + include: feature store, fsnode, ensemble, model :param cwd: working directory to execute the cmd from :param status_dir: path to the output directory for status updates """ @@ -233,7 +233,7 @@ def get_parser() -> argparse.ArgumentParser: logger.debug("Starting indirect step execution") # make sure to register the cleanup before the start the process - # so our signaller will be able to stop the database process. + # so our signaller will be able to stop the feature store process. 
register_signal_handlers() rc = main( diff --git a/smartsim/_core/entrypoints/redis.py b/smartsim/_core/entrypoints/redis.py index 018fc26fd..216130629 100644 --- a/smartsim/_core/entrypoints/redis.py +++ b/smartsim/_core/entrypoints/redis.py @@ -121,20 +121,20 @@ def main(args: argparse.Namespace) -> int: print(line.decode("utf-8").rstrip(), flush=True) except Exception as e: cleanup() - raise SSInternalError("Database process starter raised an exception") from e + raise SSInternalError("Feature store process starter raised an exception") from e return 0 def cleanup() -> None: - logger.debug("Cleaning up database instance") + logger.debug("Cleaning up feature store instance") try: - # attempt to stop the database process + # attempt to stop the feature store process if DBPID is not None: psutil.Process(DBPID).terminate() except psutil.NoSuchProcess: - logger.warning("Couldn't find database process to kill.") + logger.warning("Couldn't find feature store process to kill.") except OSError as e: - logger.warning(f"Failed to clean up database gracefully: {str(e)}") + logger.warning(f"Failed to clean up feature store gracefully: {str(e)}") if __name__ == "__main__": @@ -144,12 +144,12 @@ def cleanup() -> None: prefix_chars="+", description="SmartSim Process Launcher" ) parser.add_argument( - "+orc-exe", type=str, help="Path to the orchestrator executable", required=True + "+orc-exe", type=str, help="Path to the feature store executable", required=True ) parser.add_argument( "+conf-file", type=str, - help="Path to the orchestrator configuration file", + help="Path to the feature store configuration file", required=True, ) parser.add_argument( @@ -168,7 +168,7 @@ def cleanup() -> None: parser.add_argument( "+port", type=int, - help="The port on which to launch the shard of the orchestrator", + help="The port on which to launch the shard of the feature store", required=True, ) parser.add_argument( @@ -177,13 +177,13 @@ def cleanup() -> None: parser.add_argument( 
"+cluster", action="store_true", - help="Specify if this orchestrator shard is part of a cluster", + help="Specify if this feature store shard is part of a cluster", ) args_ = parser.parse_args() # make sure to register the cleanup before the start # the process so our signaller will be able to stop - # the database process. + # the feature store process. for sig in SIGNALS: signal.signal(sig, handle_signal) diff --git a/smartsim/_core/generation/generator.py b/smartsim/_core/generation/generator.py index 624a43379..934f285eb 100644 --- a/smartsim/_core/generation/generator.py +++ b/smartsim/_core/generation/generator.py @@ -35,7 +35,7 @@ from tabulate import tabulate -from ...database import Orchestrator +from ...database import FeatureStore from ...entity import Ensemble, Model, TaggedFilesHierarchy from ...log import get_logger from ..control import Manifest @@ -105,7 +105,7 @@ def generate_experiment(self, *args: t.Any) -> None: generator_manifest = Manifest(*args) self._gen_exp_dir() - self._gen_orc_dir(generator_manifest.dbs) + self._gen_feature_store_dir(generator_manifest.fss) self._gen_entity_list_dir(generator_manifest.ensembles) self._gen_entity_dirs(generator_manifest.models) @@ -154,21 +154,21 @@ def _gen_exp_dir(self) -> None: dt_string = datetime.now().strftime("%d/%m/%Y %H:%M:%S") log_file.write(f"Generation start date and time: {dt_string}\n") - def _gen_orc_dir(self, orchestrator_list: t.List[Orchestrator]) -> None: + def _gen_feature_store_dir(self, feature_store_list: t.List[FeatureStore]) -> None: """Create the directory that will hold the error, output and - configuration files for the orchestrator. + configuration files for the feature store. 
- :param orchestrator: Orchestrator instance + :param featurestore: FeatureStore instance """ - # Loop through orchestrators - for orchestrator in orchestrator_list: - orc_path = path.join(self.gen_path, orchestrator.name) - - orchestrator.set_path(orc_path) - # Always remove orchestrator files if present. - if path.isdir(orc_path): - shutil.rmtree(orc_path, ignore_errors=True) - pathlib.Path(orc_path).mkdir(exist_ok=self.overwrite) + # Loop through feature stores + for featurestore in feature_store_list: + feature_store_path = path.join(self.gen_path, featurestore.name) + + featurestore.set_path(feature_store_path) + # Always remove feature store files if present. + if path.isdir(feature_store_path): + shutil.rmtree(feature_store_path, ignore_errors=True) + pathlib.Path(feature_store_path).mkdir(exist_ok=self.overwrite) def _gen_entity_list_dir(self, entity_lists: t.List[Ensemble]) -> None: """Generate directories for Ensemble instances diff --git a/smartsim/_core/launcher/colocated.py b/smartsim/_core/launcher/colocated.py index 03540ce0f..4a1393082 100644 --- a/smartsim/_core/launcher/colocated.py +++ b/smartsim/_core/launcher/colocated.py @@ -27,14 +27,14 @@ import sys import typing as t -from ...entity.dbobject import DBModel, DBScript +from ...entity.dbobject import FSModel, FSScript from ...error import SSInternalError from ..config import CONFIG from ..utils.helpers import create_lockfile_name def write_colocated_launch_script( - file_name: str, db_log: str, colocated_settings: t.Dict[str, t.Any] + file_name: str, fs_log: str, colocated_settings: t.Dict[str, t.Any] ) -> None: """Write the colocated launch script @@ -42,11 +42,11 @@ def write_colocated_launch_script( is created for this entity. 
:param file_name: name of the script to write - :param db_log: log file for the db - :param colocated_settings: db settings from entity run_settings + :param fs_log: log file for the fs + :param colocated_settings: fs settings from entity run_settings """ - colocated_cmd = _build_colocated_wrapper_cmd(db_log, **colocated_settings) + colocated_cmd = _build_colocated_wrapper_cmd(fs_log, **colocated_settings) with open(file_name, "w", encoding="utf-8") as script_file: script_file.write("#!/bin/bash\n") @@ -78,24 +78,24 @@ def write_colocated_launch_script( def _build_colocated_wrapper_cmd( - db_log: str, + fs_log: str, cpus: int = 1, rai_args: t.Optional[t.Dict[str, str]] = None, - extra_db_args: t.Optional[t.Dict[str, str]] = None, + extra_fs_args: t.Optional[t.Dict[str, str]] = None, port: int = 6780, ifname: t.Optional[t.Union[str, t.List[str]]] = None, custom_pinning: t.Optional[str] = None, **kwargs: t.Any, ) -> str: - """Build the command use to run a colocated DB application + """Build the command use to run a colocated fs application - :param db_log: log file for the db - :param cpus: db cpus + :param fs_log: log file for the fs + :param cpus: fs cpus :param rai_args: redisai args - :param extra_db_args: extra redis args - :param port: port to bind DB to - :param ifname: network interface(s) to bind DB to - :param db_cpu_list: The list of CPUs that the database should be limited to + :param extra_fs_args: extra redis args + :param port: port to bind fs to + :param ifname: network interface(s) to bind fs to + :param fs_cpu_list: The list of CPUs that the feature store should be limited to :return: the command to run """ # pylint: disable=too-many-locals @@ -108,8 +108,8 @@ def _build_colocated_wrapper_cmd( lockfile = create_lockfile_name() # create the command that will be used to launch the - # database with the python entrypoint for starting - # up the backgrounded db process + # feature store with the python entrypoint for starting + # up the backgrounded fs 
process cmd = [ sys.executable, @@ -117,7 +117,7 @@ def _build_colocated_wrapper_cmd( "smartsim._core.entrypoints.colocated", "+lockfile", lockfile, - "+db_cpus", + "+fs_cpus", str(cpus), ] # Add in the interface if using TCP/IP @@ -126,12 +126,12 @@ def _build_colocated_wrapper_cmd( ifname = [ifname] cmd.extend(["+ifname", ",".join(ifname)]) cmd.append("+command") - # collect DB binaries and libraries from the config + # collect fs binaries and libraries from the config - db_cmd = [] + fs_cmd = [] if custom_pinning: - db_cmd.extend(["taskset", "-c", custom_pinning]) - db_cmd.extend( + fs_cmd.extend(["taskset", "-c", custom_pinning]) + fs_cmd.extend( [CONFIG.database_exe, CONFIG.database_conf, "--loadmodule", CONFIG.redisai] ) @@ -140,16 +140,16 @@ def _build_colocated_wrapper_cmd( if value: # RAI wants arguments for inference in all caps # ex. THREADS_PER_QUEUE=1 - db_cmd.append(f"{arg.upper()} {str(value)}") + fs_cmd.append(f"{arg.upper()} {str(value)}") - db_cmd.extend(["--port", str(port)]) + fs_cmd.extend(["--port", str(port)]) # Add socket and permissions for UDS unix_socket = kwargs.get("unix_socket", None) socket_permissions = kwargs.get("socket_permissions", None) if unix_socket and socket_permissions: - db_cmd.extend( + fs_cmd.extend( [ "--unixsocket", str(unix_socket), @@ -162,72 +162,72 @@ def _build_colocated_wrapper_cmd( "`unix_socket` and `socket_permissions` must both be defined or undefined." ) - db_cmd.extend( - ["--logfile", db_log] + fs_cmd.extend( + ["--logfile", fs_log] ) # usually /dev/null, unless debug was specified - if extra_db_args: - for db_arg, value in extra_db_args.items(): - # replace "_" with "-" in the db_arg because we use kwargs + if extra_fs_args: + for fs_arg, value in extra_fs_args.items(): + # replace "_" with "-" in the fs_arg because we use kwargs # for the extra configurations and Python doesn't allow a hyphen # in a variable name. All redis and KeyDB configuration options # use hyphens in their names. 
- db_arg = db_arg.replace("_", "-") - db_cmd.extend([f"--{db_arg}", value]) + fs_arg = fs_arg.replace("_", "-") + fs_cmd.extend([f"--{fs_arg}", value]) - db_models = kwargs.get("db_models", None) - if db_models: - db_model_cmd = _build_db_model_cmd(db_models) - db_cmd.extend(db_model_cmd) + fs_models = kwargs.get("fs_models", None) + if fs_models: + fs_model_cmd = _build_fs_model_cmd(fs_models) + fs_cmd.extend(fs_model_cmd) - db_scripts = kwargs.get("db_scripts", None) - if db_scripts: - db_script_cmd = _build_db_script_cmd(db_scripts) - db_cmd.extend(db_script_cmd) + fs_scripts = kwargs.get("fs_scripts", None) + if fs_scripts: + fs_script_cmd = _build_fs_script_cmd(fs_scripts) + fs_cmd.extend(fs_script_cmd) - cmd.extend(db_cmd) + cmd.extend(fs_cmd) return " ".join(cmd) -def _build_db_model_cmd(db_models: t.List[DBModel]) -> t.List[str]: +def _build_fs_model_cmd(fs_models: t.List[FSModel]) -> t.List[str]: cmd = [] - for db_model in db_models: - cmd.append("+db_model") - cmd.append(f"--name={db_model.name}") - - # Here db_model.file is guaranteed to exist - # because we don't allow the user to pass a serialized DBModel - cmd.append(f"--file={db_model.file}") - - cmd.append(f"--backend={db_model.backend}") - cmd.append(f"--device={db_model.device}") - cmd.append(f"--devices_per_node={db_model.devices_per_node}") - cmd.append(f"--first_device={db_model.first_device}") - if db_model.batch_size: - cmd.append(f"--batch_size={db_model.batch_size}") - if db_model.min_batch_size: - cmd.append(f"--min_batch_size={db_model.min_batch_size}") - if db_model.min_batch_timeout: - cmd.append(f"--min_batch_timeout={db_model.min_batch_timeout}") - if db_model.tag: - cmd.append(f"--tag={db_model.tag}") - if db_model.inputs: - cmd.append("--inputs=" + ",".join(db_model.inputs)) - if db_model.outputs: - cmd.append("--outputs=" + ",".join(db_model.outputs)) + for fs_model in fs_models: + cmd.append("+fs_model") + cmd.append(f"--name={fs_model.name}") + + # Here fs_model.file is 
guaranteed to exist + # because we don't allow the user to pass a serialized FSModel + cmd.append(f"--file={fs_model.file}") + + cmd.append(f"--backend={fs_model.backend}") + cmd.append(f"--device={fs_model.device}") + cmd.append(f"--devices_per_node={fs_model.devices_per_node}") + cmd.append(f"--first_device={fs_model.first_device}") + if fs_model.batch_size: + cmd.append(f"--batch_size={fs_model.batch_size}") + if fs_model.min_batch_size: + cmd.append(f"--min_batch_size={fs_model.min_batch_size}") + if fs_model.min_batch_timeout: + cmd.append(f"--min_batch_timeout={fs_model.min_batch_timeout}") + if fs_model.tag: + cmd.append(f"--tag={fs_model.tag}") + if fs_model.inputs: + cmd.append("--inputs=" + ",".join(fs_model.inputs)) + if fs_model.outputs: + cmd.append("--outputs=" + ",".join(fs_model.outputs)) return cmd -def _build_db_script_cmd(db_scripts: t.List[DBScript]) -> t.List[str]: +def _build_fs_script_cmd(fs_scripts: t.List[FSScript]) -> t.List[str]: cmd = [] - for db_script in db_scripts: - cmd.append("+db_script") - cmd.append(f"--name={db_script.name}") - if db_script.func: - # Notice that here db_script.func is guaranteed to be a str + for fs_script in fs_scripts: + cmd.append("+fs_script") + cmd.append(f"--name={fs_script.name}") + if fs_script.func: + # Notice that here fs_script.func is guaranteed to be a str # because we don't allow the user to pass a serialized function - sanitized_func = db_script.func.replace("\n", "\\n") + sanitized_func = fs_script.func.replace("\n", "\\n") if not ( sanitized_func.startswith("'") and sanitized_func.endswith("'") @@ -235,9 +235,9 @@ def _build_db_script_cmd(db_scripts: t.List[DBScript]) -> t.List[str]: ): sanitized_func = '"' + sanitized_func + '"' cmd.append(f"--func={sanitized_func}") - elif db_script.file: - cmd.append(f"--file={db_script.file}") - cmd.append(f"--device={db_script.device}") - cmd.append(f"--devices_per_node={db_script.devices_per_node}") - cmd.append(f"--first_device={db_script.first_device}") 
+ elif fs_script.file: + cmd.append(f"--file={fs_script.file}") + cmd.append(f"--device={fs_script.device}") + cmd.append(f"--devices_per_node={fs_script.devices_per_node}") + cmd.append(f"--first_device={fs_script.first_device}") return cmd diff --git a/smartsim/_core/launcher/step/alpsStep.py b/smartsim/_core/launcher/step/alpsStep.py index eb7903af9..317774672 100644 --- a/smartsim/_core/launcher/step/alpsStep.py +++ b/smartsim/_core/launcher/step/alpsStep.py @@ -74,9 +74,9 @@ def get_launch_cmd(self) -> t.List[str]: aprun_cmd.extend(self.run_settings.format_env_vars()) aprun_cmd.extend(self.run_settings.format_run_args()) - if self.run_settings.colocated_db_settings: + if self.run_settings.colocated_fs_settings: # disable cpu binding as the entrypoint will set that - # for the application and database process now + # for the application and feature store process now aprun_cmd.extend(["--cc", "none"]) # Replace the command with the entrypoint wrapper script diff --git a/smartsim/_core/launcher/step/localStep.py b/smartsim/_core/launcher/step/localStep.py index 968152a41..ba0744d2f 100644 --- a/smartsim/_core/launcher/step/localStep.py +++ b/smartsim/_core/launcher/step/localStep.py @@ -54,7 +54,7 @@ def get_launch_cmd(self) -> t.List[str]: run_args = self.run_settings.format_run_args() cmd.extend(run_args) - if self.run_settings.colocated_db_settings: + if self.run_settings.colocated_fs_settings: # Replace the command with the entrypoint wrapper script if not (bash := shutil.which("bash")): raise RuntimeError("Unable to locate bash interpreter") diff --git a/smartsim/_core/launcher/step/lsfStep.py b/smartsim/_core/launcher/step/lsfStep.py index 0cb921e19..99a9ac61b 100644 --- a/smartsim/_core/launcher/step/lsfStep.py +++ b/smartsim/_core/launcher/step/lsfStep.py @@ -170,9 +170,9 @@ def get_launch_cmd(self) -> t.List[str]: jsrun_cmd.extend(self.run_settings.format_run_args()) - if self.run_settings.colocated_db_settings: + if 
self.run_settings.colocated_fs_settings: # disable cpu binding as the entrypoint will set that - # for the application and database process now + # for the application and feature store process now jsrun_cmd.extend(["--bind", "none"]) # Replace the command with the entrypoint wrapper script diff --git a/smartsim/_core/launcher/step/mpiStep.py b/smartsim/_core/launcher/step/mpiStep.py index 767486462..340992849 100644 --- a/smartsim/_core/launcher/step/mpiStep.py +++ b/smartsim/_core/launcher/step/mpiStep.py @@ -73,9 +73,9 @@ def get_launch_cmd(self) -> t.List[str]: # add mpi settings to command mpi_cmd.extend(self.run_settings.format_run_args()) - if self.run_settings.colocated_db_settings: + if self.run_settings.colocated_fs_settings: # disable cpu binding as the entrypoint will set that - # for the application and database process now + # for the application and feature store process now # mpi_cmd.extend(["--cpu-bind", "none"]) # Replace the command with the entrypoint wrapper script diff --git a/smartsim/_core/launcher/step/slurmStep.py b/smartsim/_core/launcher/step/slurmStep.py index 83f39cf09..0425f0fde 100644 --- a/smartsim/_core/launcher/step/slurmStep.py +++ b/smartsim/_core/launcher/step/slurmStep.py @@ -140,7 +140,7 @@ def get_launch_cmd(self) -> t.List[str]: srun_cmd += self.run_settings.format_run_args() - if self.run_settings.colocated_db_settings: + if self.run_settings.colocated_fs_settings: # Replace the command with the entrypoint wrapper script bash = shutil.which("bash") if not bash: diff --git a/smartsim/_core/launcher/step/step.py b/smartsim/_core/launcher/step/step.py index a32685b53..f6074c954 100644 --- a/smartsim/_core/launcher/step/step.py +++ b/smartsim/_core/launcher/step/step.py @@ -105,20 +105,20 @@ def get_colocated_launch_script(self) -> str: ) makedirs(osp.dirname(script_path), exist_ok=True) - db_settings = {} + fs_settings = {} if isinstance(self.step_settings, RunSettings): - db_settings = 
self.step_settings.colocated_db_settings or {} + fs_settings = self.step_settings.colocated_fs_settings or {} - # db log file causes write contention and kills performance so by + # fs log file causes write contention and kills performance so by # default we turn off logging unless user specified debug=True - if db_settings.get("debug", False): - db_log_file = self.get_step_file(ending="-db.log") + if fs_settings.get("debug", False): + fs_log_file = self.get_step_file(ending="-fs.log") else: - db_log_file = "/dev/null" + fs_log_file = "/dev/null" # write the colocated wrapper shell script to the directory for this # entity currently being prepped to launch - write_colocated_launch_script(script_path, db_log_file, db_settings) + write_colocated_launch_script(script_path, fs_log_file, fs_settings) return script_path # pylint: disable=no-self-use diff --git a/smartsim/_core/utils/__init__.py b/smartsim/_core/utils/__init__.py index 0a109de95..7cb08063b 100644 --- a/smartsim/_core/utils/__init__.py +++ b/smartsim/_core/utils/__init__.py @@ -25,4 +25,4 @@ # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. from .helpers import colorize, delete_elements, installed_redisai_backends -from .redis import check_cluster_status, create_cluster, db_is_active +from .redis import check_cluster_status, create_cluster, fs_is_active diff --git a/smartsim/_core/utils/helpers.py b/smartsim/_core/utils/helpers.py index 4c68eaa01..b3b684eac 100644 --- a/smartsim/_core/utils/helpers.py +++ b/smartsim/_core/utils/helpers.py @@ -47,27 +47,27 @@ _TSignalHandlerFn = t.Callable[[int, t.Optional["FrameType"]], object] -def unpack_db_identifier(db_id: str, token: str) -> t.Tuple[str, str]: - """Unpack the unformatted database identifier +def unpack_fs_identifier(fs_id: str, token: str) -> t.Tuple[str, str]: + """Unpack the unformatted feature store identifier and format for env variable suffix using the token - :param db_id: the unformatted database identifier eg. 
identifier_1 - :param token: character to use to construct the db suffix - :return: db id suffix and formatted db_id e.g. ("_identifier_1", "identifier_1") + :param fs_id: the unformatted feature store identifier eg. identifier_1 + :param token: character to use to construct the fs suffix + :return: fs id suffix and formatted fs_id e.g. ("_identifier_1", "identifier_1") """ - if db_id == "orchestrator": + if fs_id == "featurestore": return "", "" - db_name_suffix = token + db_id - return db_name_suffix, db_id + fs_name_suffix = token + fs_id + return fs_name_suffix, fs_id -def unpack_colo_db_identifier(db_id: str) -> str: - """Create database identifier suffix for colocated database +def unpack_colo_fs_identifier(fs_id: str) -> str: + """Create feature store identifier suffix for colocated feature store - :param db_id: the unformatted database identifier - :return: db suffix + :param fs_id: the unformatted feature store identifier + :return: fs suffix """ - return "_" + db_id if db_id else "" + return "_" + fs_id if fs_id else "" def create_short_id_str() -> str: diff --git a/smartsim/_core/utils/redis.py b/smartsim/_core/utils/redis.py index 41ee69cc4..8443f430d 100644 --- a/smartsim/_core/utils/redis.py +++ b/smartsim/_core/utils/redis.py @@ -35,7 +35,7 @@ from smartredis import Client from smartredis.error import RedisReplyError -from ...entity import DBModel, DBScript +from ...entity import FSModel, FSScript from ...error import SSInternalError from ...log import get_logger from ..config import CONFIG @@ -73,7 +73,7 @@ def create_cluster(hosts: t.List[str], ports: t.List[int]) -> None: # cov-wlm if returncode != 0: logger.error(out) logger.error(err) - raise SSInternalError("Database '--cluster create' command failed") + raise SSInternalError("Feature store '--cluster create' command failed") logger.debug(out) @@ -95,10 +95,10 @@ def check_cluster_status( if not cluster_nodes: raise SSInternalError( - "No cluster nodes have been set for database status check." 
+ "No cluster nodes have been set for feature store status check." ) - logger.debug("Beginning database cluster status check...") + logger.debug("Beginning feature store cluster status check...") while trials > 0: # wait for cluster to spin up time.sleep(5) @@ -117,16 +117,16 @@ def check_cluster_status( raise SSInternalError("Cluster setup could not be verified") -def db_is_active(hosts: t.List[str], ports: t.List[int], num_shards: int) -> bool: - """Check if a DB is running +def fs_is_active(hosts: t.List[str], ports: t.List[int], num_shards: int) -> bool: + """Check if a FS is running - if the DB is clustered, check cluster status, otherwise - just ping DB. + if the FS is clustered, check cluster status, otherwise + just ping FS. :param hosts: list of hosts :param ports: list of ports - :param num_shards: Number of DB shards - :return: Whether DB is running + :param num_shards: Number of FS shards + :return: Whether FS is running """ # if single shard if num_shards < 2: @@ -149,71 +149,71 @@ def db_is_active(hosts: t.List[str], ports: t.List[int], num_shards: int) -> boo return False -def set_ml_model(db_model: DBModel, client: Client) -> None: - logger.debug(f"Adding DBModel named {db_model.name}") +def set_ml_model(fs_model: FSModel, client: Client) -> None: + logger.debug(f"Adding FSModel named {fs_model.name}") - for device in db_model.devices: + for device in fs_model.devices: try: - if db_model.is_file: + if fs_model.is_file: client.set_model_from_file( - name=db_model.name, - model_file=str(db_model.file), - backend=db_model.backend, + name=fs_model.name, + model_file=str(fs_model.file), + backend=fs_model.backend, device=device, - batch_size=db_model.batch_size, - min_batch_size=db_model.min_batch_size, - min_batch_timeout=db_model.min_batch_timeout, - tag=db_model.tag, - inputs=db_model.inputs, - outputs=db_model.outputs, + batch_size=fs_model.batch_size, + min_batch_size=fs_model.min_batch_size, + min_batch_timeout=fs_model.min_batch_timeout, + 
tag=fs_model.tag, + inputs=fs_model.inputs, + outputs=fs_model.outputs, ) else: - if db_model.model is None: - raise ValueError(f"No model attacted to {db_model.name}") + if fs_model.model is None: + raise ValueError(f"No model attached to {fs_model.name}") client.set_model( - name=db_model.name, - model=db_model.model, - backend=db_model.backend, + name=fs_model.name, + model=fs_model.model, + backend=fs_model.backend, device=device, - batch_size=db_model.batch_size, - min_batch_size=db_model.min_batch_size, - min_batch_timeout=db_model.min_batch_timeout, - tag=db_model.tag, - inputs=db_model.inputs, - outputs=db_model.outputs, + batch_size=fs_model.batch_size, + min_batch_size=fs_model.min_batch_size, + min_batch_timeout=fs_model.min_batch_timeout, + tag=fs_model.tag, + inputs=fs_model.inputs, + outputs=fs_model.outputs, ) except RedisReplyError as error: # pragma: no cover - logger.error("Error while setting model on orchestrator.") + logger.error("Error while setting model on feature store.") raise error -def set_script(db_script: DBScript, client: Client) -> None: - logger.debug(f"Adding DBScript named {db_script.name}") +def set_script(fs_script: FSScript, client: Client) -> None: + logger.debug(f"Adding FSScript named {fs_script.name}") - for device in db_script.devices: + for device in fs_script.devices: try: - if db_script.is_file: + if fs_script.is_file: client.set_script_from_file( - name=db_script.name, file=str(db_script.file), device=device + name=fs_script.name, file=str(fs_script.file), device=device ) - elif db_script.script: - if isinstance(db_script.script, str): + elif fs_script.script: + if isinstance(fs_script.script, str): client.set_script( - name=db_script.name, script=db_script.script, device=device + name=fs_script.name, script=fs_script.script, device=device ) else: client.set_function( - name=db_script.name, function=db_script.script, device=device + name=fs_script.name, function=fs_script.script, device=device ) else: - raise 
ValueError(f"No script or file attached to {db_script.name}") + raise ValueError(f"No script or file attached to {fs_script.name}") except RedisReplyError as error: # pragma: no cover - logger.error("Error while setting model on orchestrator.") + logger.error("Error while setting model on feature store.") raise error -def shutdown_db_node(host_ip: str, port: int) -> t.Tuple[int, str, str]: # cov-wlm - """Send shutdown signal to DB node. +def shutdown_fs_node(host_ip: str, port: int) -> t.Tuple[int, str, str]: # cov-wlm + """Send shutdown signal to FS node. Should only be used in the case where cluster deallocation needs to occur manually. Usually, the SmartSim job manager diff --git a/smartsim/_core/utils/serialize.py b/smartsim/_core/utils/serialize.py index d4ec66eaf..42dcd0e8f 100644 --- a/smartsim/_core/utils/serialize.py +++ b/smartsim/_core/utils/serialize.py @@ -36,9 +36,9 @@ if t.TYPE_CHECKING: from smartsim._core.control.manifest import LaunchedManifest as _Manifest - from smartsim.database.orchestrator import Orchestrator - from smartsim.entity import DBNode, Ensemble, Model - from smartsim.entity.dbobject import DBModel, DBScript + from smartsim.database.orchestrator import FeatureStore + from smartsim.entity import FSNode, Ensemble, Model + from smartsim.entity.dbobject import FSModel, FSScript from smartsim.settings.base import BatchSettings, RunSettings @@ -62,8 +62,8 @@ def save_launch_manifest(manifest: _Manifest[TStepLaunchMetaData]) -> None: _dictify_model(model, *telemetry_metadata) for model, telemetry_metadata in manifest.models ], - "orchestrator": [ - _dictify_db(db, nodes_info) for db, nodes_info in manifest.databases + "featurestore": [ + _dictify_fs(fs, nodes_info) for fs, nodes_info in manifest.featurestores ], "ensemble": [ _dictify_ensemble(ens, member_info) @@ -104,9 +104,9 @@ def _dictify_model( err_file: str, telemetry_data_path: Path, ) -> t.Dict[str, t.Any]: - colo_settings = (model.run_settings.colocated_db_settings or {}).copy() 
- db_scripts = t.cast("t.List[DBScript]", colo_settings.pop("db_scripts", [])) - db_models = t.cast("t.List[DBModel]", colo_settings.pop("db_models", [])) + colo_settings = (model.run_settings.colocated_fs_settings or {}).copy() + fs_scripts = t.cast("t.List[FSScript]", colo_settings.pop("fs_scripts", [])) + fs_models = t.cast("t.List[FSModel]", colo_settings.pop("fs_models", [])) return { "name": model.name, "path": model.path, @@ -131,7 +131,7 @@ def _dictify_model( "Copy": [], } ), - "colocated_db": ( + "colocated_fs": ( { "settings": colo_settings, "scripts": [ @@ -141,7 +141,7 @@ def _dictify_model( "device": script.device, } } - for script in db_scripts + for script in fs_scripts ], "models": [ { @@ -150,7 +150,7 @@ def _dictify_model( "device": model.device, } } - for model in db_models + for model in fs_models ], } if colo_settings @@ -214,20 +214,20 @@ def _dictify_batch_settings(batch_settings: BatchSettings) -> t.Dict[str, t.Any] } -def _dictify_db( - db: Orchestrator, - nodes: t.Sequence[t.Tuple[DBNode, TStepLaunchMetaData]], +def _dictify_fs( + fs: FeatureStore, + nodes: t.Sequence[t.Tuple[FSNode, TStepLaunchMetaData]], ) -> t.Dict[str, t.Any]: - db_path = _utils.get_db_path() - if db_path: - db_type, _ = db_path.name.split("-", 1) + fs_path = _utils.get_fs_path() + if fs_path: + fs_type, _ = fs_path.name.split("-", 1) else: - db_type = "Unknown" + fs_type = "Unknown" return { - "name": db.name, - "type": db_type, - "interface": db._interfaces, # pylint: disable=protected-access + "name": fs.name, + "type": fs_type, + "interface": fs._interfaces, # pylint: disable=protected-access "shards": [ { **shard.to_dict(), @@ -235,14 +235,14 @@ def _dictify_db( "out_file": out_file, "err_file": err_file, "memory_file": ( - str(status_dir / "memory.csv") if db.telemetry.is_enabled else "" + str(status_dir / "memory.csv") if fs.telemetry.is_enabled else "" ), "client_file": ( - str(status_dir / "client.csv") if db.telemetry.is_enabled else "" + str(status_dir / 
"client.csv") if fs.telemetry.is_enabled else "" ), "client_count_file": ( str(status_dir / "client_count.csv") - if db.telemetry.is_enabled + if fs.telemetry.is_enabled else "" ), "telemetry_metadata": { @@ -252,7 +252,7 @@ def _dictify_db( "managed": managed, }, } - for dbnode, ( + for fsnode, ( step_id, task_id, managed, @@ -260,6 +260,6 @@ def _dictify_db( err_file, status_dir, ) in nodes - for shard in dbnode.get_launched_shard_info() + for shard in fsnode.get_launched_shard_info() ], } diff --git a/smartsim/_core/utils/telemetry/collector.py b/smartsim/_core/utils/telemetry/collector.py index 178126dec..4d0a79af3 100644 --- a/smartsim/_core/utils/telemetry/collector.py +++ b/smartsim/_core/utils/telemetry/collector.py @@ -95,8 +95,8 @@ class _DBAddress: def __init__(self, host: str, port: int) -> None: """Initialize the instance - :param host: host address for database connections - :param port: port number for database connections + :param host: host address for feature store connections + :param port: port number for feature store connections """ self.host = host.strip() if host else "" self.port = port @@ -115,7 +115,7 @@ def __str__(self) -> str: class DBCollector(Collector): - """A base class for collectors that retrieve statistics from an orchestrator""" + """A base class for collectors that retrieve statistics from a feature store""" def __init__(self, entity: JobEntity, sink: Sink) -> None: """Initialize the `DBCollector` @@ -131,7 +131,7 @@ def __init__(self, entity: JobEntity, sink: Sink) -> None: ) async def _configure_client(self) -> None: - """Configure the client connection to the target database""" + """Configure the client connection to the target feature store""" try: if not self._client: self._client = redisa.Redis( @@ -146,7 +146,7 @@ async def _configure_client(self) -> None: ) async def prepare(self) -> None: - """Initialization logic for the DB collector. Creates a database + """Initialization logic for the FS collector. 
Creates a feature store + connection then executes the `post_prepare` callback function.""" if self._client: return @@ -157,7 +157,7 @@ async def prepare(self) -> None: @abc.abstractmethod async def _post_prepare(self) -> None: """Hook function to enable subclasses to perform actions - after a db client is ready""" + after a fs client is ready""" @abc.abstractmethod async def _perform_collection( @@ -171,7 +171,7 @@ async def _post_prepare(self) -> None: """ async def collect(self) -> None: - """Execute database metric collection if the collector is enabled. Writes + """Execute feature store metric collection if the collector is enabled. Writes the resulting metrics to the associated output sink. Calling `collect` when `self.enabled` is `False` performs no actions.""" if not self.enabled: @@ -186,8 +186,8 @@ async def collect(self) -> None: return try: - # if we can't communicate w/the db, exit - if not await self._check_db(): + # if we can't communicate w/the fs, exit + if not await self._check_fs(): return all_metrics = await self._perform_collection() @@ -197,7 +197,7 @@ async def collect(self) -> None: logger.warning(f"Collect failed for {type(self).__name__}", exc_info=ex) async def shutdown(self) -> None: - """Execute cleanup of database client connections""" + """Execute cleanup of feature store client connections""" try: if self._client: logger.info( @@ -210,8 +210,8 @@ async def shutdown(self) -> None: f"An error occurred during {type(self).__name__} shutdown", exc_info=ex ) - async def _check_db(self) -> bool: - """Check if the target database is reachable. + async def _check_fs(self) -> bool: + """Check if the target feature store is reachable. :return: `True` if connection succeeds, `False` otherwise. 
""" @@ -219,7 +219,7 @@ async def _check_db(self) -> bool: if self._client: return await self._client.ping() except redisex.ConnectionError: - logger.warning(f"Cannot ping db {self._address}") + logger.warning(f"Cannot ping fs {self._address}") return False @@ -233,7 +233,7 @@ def __init__(self, entity: JobEntity, sink: Sink) -> None: async def _post_prepare(self) -> None: """Write column headers for a CSV formatted output sink after - the database connection is established""" + the feature store connection is established""" await self._sink.save("timestamp", *self._columns) async def _perform_collection( @@ -247,11 +247,11 @@ async def _perform_collection( if self._client is None: return [] - db_info = await self._client.info("memory") + fs_info = await self._client.info("memory") - used = float(db_info["used_memory"]) - peak = float(db_info["used_memory_peak"]) - total = float(db_info["total_system_memory"]) + used = float(fs_info["used_memory"]) + peak = float(fs_info["used_memory_peak"]) + total = float(fs_info["total_system_memory"]) value = (get_ts_ms(), used, peak, total) @@ -261,7 +261,7 @@ async def _perform_collection( class DBConnectionCollector(DBCollector): - """A `DBCollector` that collects database client-connection metrics""" + """A `DBCollector` that collects feature store client-connection metrics""" def __init__(self, entity: JobEntity, sink: Sink) -> None: super().__init__(entity, sink) @@ -269,7 +269,7 @@ def __init__(self, entity: JobEntity, sink: Sink) -> None: async def _post_prepare(self) -> None: """Write column headers for a CSV formatted output sink after - the database connection is established""" + the feature store connection is established""" await self._sink.save("timestamp", *self._columns) async def _perform_collection( @@ -306,7 +306,7 @@ def __init__(self, entity: JobEntity, sink: Sink) -> None: async def _post_prepare(self) -> None: """Write column headers for a CSV formatted output sink after - the database connection is 
established""" + the feature store connection is established""" await self._sink.save("timestamp", *self._columns) async def _perform_collection( @@ -457,9 +457,9 @@ def register_collectors(self, entity: JobEntity) -> None: """ collectors: t.List[Collector] = [] - # ONLY db telemetry is implemented at this time. This resolver must - # be updated when non-database or always-on collectors are introduced - if entity.is_db and entity.telemetry_on: + # ONLY fs telemetry is implemented at this time. This resolver must + # be updated when non-feature store or always-on collectors are introduced + if entity.is_fs and entity.telemetry_on: if mem_out := entity.collectors.get("memory", None): collectors.append(DBMemoryCollector(entity, FileSink(mem_out))) @@ -469,7 +469,7 @@ def register_collectors(self, entity: JobEntity) -> None: if num_out := entity.collectors.get("client_count", None): collectors.append(DBConnectionCountCollector(entity, FileSink(num_out))) else: - logger.debug(f"Collectors disabled for db {entity.name}") + logger.debug(f"Collectors disabled for fs {entity.name}") self.add_all(collectors) diff --git a/smartsim/_core/utils/telemetry/manifest.py b/smartsim/_core/utils/telemetry/manifest.py index e72a18fa0..33c2d8c2c 100644 --- a/smartsim/_core/utils/telemetry/manifest.py +++ b/smartsim/_core/utils/telemetry/manifest.py @@ -45,8 +45,8 @@ class Run: """the timestamp at the time the `Experiment.start` is called""" models: t.List[JobEntity] """models started in this run""" - orchestrators: t.List[JobEntity] - """orchestrators started in this run""" + featurestores: t.List[JobEntity] + """featurestores started in this run""" ensembles: t.List[JobEntity] """ensembles started in this run""" @@ -58,7 +58,7 @@ def flatten( :param filter_fn: optional boolean filter that returns True for entities to include in the result """ - entities = self.models + self.orchestrators + self.ensembles + entities = self.models + self.featurestores + self.ensembles if filter_fn: 
entities = [entity for entity in entities if filter_fn(entity)] return entities @@ -84,7 +84,7 @@ def load_entity( parent_keys = parent_keys.intersection(entity_dict.keys()) if parent_keys: container = "shards" if "shards" in parent_keys else "models" - child_type = "orchestrator" if container == "shards" else "model" + child_type = "featurestore" if container == "shards" else "model" for child_entity in entity_dict[container]: entity = JobEntity.from_manifest(child_type, child_entity, str(exp_dir)) entities.append(entity) @@ -111,7 +111,7 @@ def load_entities( """ persisted: t.Dict[str, t.List[JobEntity]] = { "model": [], - "orchestrator": [], + "featurestore": [], } for item in run[entity_type]: entities = Run.load_entity(entity_type, item, exp_dir) @@ -132,7 +132,7 @@ def load_run(raw_run: t.Dict[str, t.Any], exp_dir: pathlib.Path) -> "Run": # create an output mapping to hold the deserialized entities run_entities: t.Dict[str, t.List[JobEntity]] = { "model": [], - "orchestrator": [], + "featurestore": [], "ensemble": [], } @@ -152,7 +152,7 @@ def load_run(raw_run: t.Dict[str, t.Any], exp_dir: pathlib.Path) -> "Run": loaded_run = Run( raw_run["timestamp"], run_entities["model"], - run_entities["orchestrator"], + run_entities["featurestore"], run_entities["ensemble"], ) return loaded_run diff --git a/smartsim/_core/utils/telemetry/telemetry.py b/smartsim/_core/utils/telemetry/telemetry.py index 7b1288341..f00b4d435 100644 --- a/smartsim/_core/utils/telemetry/telemetry.py +++ b/smartsim/_core/utils/telemetry/telemetry.py @@ -440,7 +440,7 @@ def __init__(self, telemetry_monitor_args: TelemetryMonitorArgs): def _can_shutdown(self) -> bool: """Determines if the telemetry monitor can perform shutdown. An automatic shutdown will occur if there are no active jobs being monitored. 
- Managed jobs and databases are considered separately due to the way they + Managed jobs and feature stores are considered separately due to the way they are stored in the job manager :return: return True if capable of automatically shutting down @@ -453,20 +453,20 @@ def _can_shutdown(self) -> bool: unmanaged_jobs = ( list(self._action_handler.tracked_jobs) if self._action_handler else [] ) - # get an individual count of databases for logging - n_dbs: int = len( + # get an individual count of feature stores for logging + n_fss: int = len( [ job for job in managed_jobs + unmanaged_jobs - if isinstance(job, JobEntity) and job.is_db + if isinstance(job, JobEntity) and job.is_fs ] ) # if we have no jobs currently being monitored we can shutdown - n_jobs = len(managed_jobs) + len(unmanaged_jobs) - n_dbs - shutdown_ok = n_jobs + n_dbs == 0 + n_jobs = len(managed_jobs) + len(unmanaged_jobs) - n_fss + shutdown_ok = n_jobs + n_fss == 0 - logger.debug(f"{n_jobs} active job(s), {n_dbs} active db(s)") + logger.debug(f"{n_jobs} active job(s), {n_fss} active fs(s)") return shutdown_ok async def monitor(self) -> None: diff --git a/smartsim/_core/utils/telemetry/util.py b/smartsim/_core/utils/telemetry/util.py index 78f9c6db0..2189a5c78 100644 --- a/smartsim/_core/utils/telemetry/util.py +++ b/smartsim/_core/utils/telemetry/util.py @@ -55,7 +55,7 @@ def write_event( :param task_id: the task_id of a managed task :param step_id: the step_id of an unmanaged task :param entity_type: the SmartSimEntity subtype - (e.g. `orchestrator`, `ensemble`, `model`, `dbnode`, ...) + (e.g. `featurestore`, `ensemble`, `model`, `fsnode`, ...) 
:param event_type: the event subtype :param status_dir: path where the SmartSimEntity outputs are written :param detail: (optional) additional information to write with the event diff --git a/smartsim/database/__init__.py b/smartsim/database/__init__.py index 106f8e1e2..0801c682b 100644 --- a/smartsim/database/__init__.py +++ b/smartsim/database/__init__.py @@ -24,4 +24,4 @@ # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -from .orchestrator import Orchestrator +from .orchestrator import FeatureStore diff --git a/smartsim/database/orchestrator.py b/smartsim/database/orchestrator.py index d95ae465b..48bf7ca15 100644 --- a/smartsim/database/orchestrator.py +++ b/smartsim/database/orchestrator.py @@ -34,10 +34,10 @@ from smartredis.error import RedisReplyError from .._core.config import CONFIG -from .._core.utils import db_is_active -from .._core.utils.helpers import is_valid_cmd, unpack_db_identifier +from .._core.utils import fs_is_active +from .._core.utils.helpers import is_valid_cmd, unpack_fs_identifier from .._core.utils.network import get_ip_from_host -from ..entity import DBNode, EntityList, TelemetryConfiguration +from ..entity import FSNode, EntityList, TelemetryConfiguration from ..error import SmartSimError, SSConfigError, SSUnsupportedError from ..log import get_logger from ..servertype import CLUSTERED, STANDALONE @@ -111,7 +111,7 @@ def _get_single_command(run_command: str, batch: bool, single_cmd: bool) -> bool if run_command == "srun" and getenv("SLURM_HET_SIZE") is not None: msg = ( - "srun can not launch an orchestrator with single_cmd=True in " + "srun can not launch a FeatureStore with single_cmd=True in " + "a hetereogeneous job. Automatically switching to single_cmd=False." 
) logger.info(msg) @@ -122,7 +122,7 @@ def _get_single_command(run_command: str, batch: bool, single_cmd: bool) -> bool if run_command == "aprun": msg = ( - "aprun can not launch an orchestrator with batch=True and " + "aprun can not launch a FeatureStore with batch=True and " + "single_cmd=True. Automatically switching to single_cmd=False." ) logger.info(msg) @@ -134,12 +134,12 @@ def _get_single_command(run_command: str, batch: bool, single_cmd: bool) -> bool def _check_local_constraints(launcher: str, batch: bool) -> None: """Check that the local launcher is not launched with invalid batch config""" if launcher == "local" and batch: - msg = "Local orchestrator can not be launched with batch=True" + msg = "Local FeatureStore can not be launched with batch=True" raise SmartSimError(msg) -class Orchestrator(EntityList[DBNode]): - """The Orchestrator is an in-memory database that can be launched +class FeatureStore(EntityList[FSNode]): + """The FeatureStore is an in-memory feature store that can be launched alongside entities in SmartSim. Data can be transferred between entities by using one of the Python, C, C++ or Fortran clients within an entity. 
@@ -152,7 +152,7 @@ def __init__( interface: t.Union[str, t.List[str]] = "lo", launcher: str = "local", run_command: str = "auto", - db_nodes: int = 1, + fs_nodes: int = 1, batch: bool = False, hosts: t.Optional[t.Union[t.List[str], str]] = None, account: t.Optional[str] = None, @@ -163,16 +163,16 @@ def __init__( threads_per_queue: t.Optional[int] = None, inter_op_threads: t.Optional[int] = None, intra_op_threads: t.Optional[int] = None, - db_identifier: str = "orchestrator", + fs_identifier: str = "featurestore", **kwargs: t.Any, ) -> None: - """Initialize an ``Orchestrator`` reference for local launch + """Initialize a ``FeatureStore`` reference for local launch Extra configurations for RedisAI See https://oss.redis.com/redisai/configuration/ - :param path: path to location of ``Orchestrator`` directory + :param path: path to location of ``FeatureStore`` directory :param port: TCP/IP port :param interface: network interface(s) :param launcher: type of launcher being used, options are "slurm", "pbs", @@ -180,18 +180,18 @@ def __init__( an attempt will be made to find an available launcher on the system.
:param run_command: specify launch binary or detect automatically - :param db_nodes: number of database shards + :param fs_nodes: number of feature store shards :param batch: run as a batch workload :param hosts: specify hosts to launch on :param account: account to run batch on :param time: walltime for batch 'HH:MM:SS' format - :param alloc: allocation to launch database on + :param alloc: allocation to launch feature store on :param single_cmd: run all shards with one (MPMD) command :param threads_per_queue: threads per GPU device :param inter_op_threads: threads across CPU operations :param intra_op_threads: threads per CPU operation - :param db_identifier: an identifier to distinguish this orchestrator in - multiple-database experiments + :param fs_identifier: an identifier to distinguish this FeatureStore in + multiple-feature store experiments """ self.launcher, self.run_command = _autodetect(launcher, run_command) _check_run_command(self.launcher, self.run_command) @@ -215,11 +215,11 @@ def __init__( gpus_per_shard = int(kwargs.pop("gpus_per_shard", 0)) cpus_per_shard = int(kwargs.pop("cpus_per_shard", 4)) super().__init__( - name=db_identifier, + name=fs_identifier, path=str(path), port=port, interface=interface, - db_nodes=db_nodes, + fs_nodes=fs_nodes, batch=batch, launcher=self.launcher, run_command=self.run_command, @@ -252,7 +252,7 @@ def __init__( if self.launcher != "local": self.batch_settings = self._build_batch_settings( - db_nodes, + fs_nodes, alloc or "", batch, account or "", @@ -264,52 +264,52 @@ def __init__( self.set_hosts(hosts) elif not hosts and self.run_command == "mpirun": raise SmartSimError( - "hosts argument is required when launching Orchestrator with mpirun" + "hosts argument is required when launching FeatureStore with mpirun" ) self._reserved_run_args: t.Dict[t.Type[RunSettings], t.List[str]] = {} self._reserved_batch_args: t.Dict[t.Type[BatchSettings], t.List[str]] = {} self._fill_reserved() @property - def db_identifier(self) 
-> str: - """Return the DB identifier, which is common to a DB and all of its nodes + def fs_identifier(self) -> str: + """Return the FS identifier, which is common to a FS and all of its nodes - :return: DB identifier + :return: FS identifier """ return self.name @property def num_shards(self) -> int: - """Return the number of DB shards contained in the Orchestrator. - This might differ from the number of ``DBNode`` objects, as each - ``DBNode`` may start more than one shard (e.g. with MPMD). + """Return the number of FS shards contained in the FeatureStore. + This might differ from the number of ``FSNode`` objects, as each + ``FSNode`` may start more than one shard (e.g. with MPMD). - :returns: the number of DB shards contained in the Orchestrator + :returns: the number of FS shards contained in the FeatureStore """ return sum(node.num_shards for node in self.entities) @property - def db_nodes(self) -> int: - """Read only property for the number of nodes an ``Orchestrator`` is + def fs_nodes(self) -> int: + """Read only property for the number of nodes a ``FeatureStore`` is launched across. Notice that SmartSim currently assumes that each shard will be launched on its own node. Therefore this property is currently an alias to the ``num_shards`` attribute. - :returns: Number of database nodes + :returns: Number of feature store nodes """ return self.num_shards @property def hosts(self) -> t.List[str]: - """Return the hostnames of Orchestrator instance hosts + """Return the hostnames of FeatureStore instance hosts - Note that this will only be populated after the orchestrator + Note that this will only be populated after the FeatureStore has been launched by SmartSim.
- :return: the hostnames of Orchestrator instance hosts + :return: the hostnames of FeatureStore instance hosts """ if not self._hosts: - self._hosts = self._get_db_hosts() + self._hosts = self._get_fs_hosts() return self._hosts @property @@ -330,22 +330,22 @@ def reset_hosts(self) -> None: self.set_hosts(self._user_hostlist) def remove_stale_files(self) -> None: - """Can be used to remove database files of a previous launch""" + """Can be used to remove feature store files of a previous launch""" - for db in self.entities: - db.remove_stale_dbnode_files() + for fs in self.entities: + fs.remove_stale_fsnode_files() def get_address(self) -> t.List[str]: - """Return database addresses + """Return feature store addresses :return: addresses - :raises SmartSimError: If database address cannot be found or is not active + :raises SmartSimError: If feature store address cannot be found or is not active """ if not self._hosts: - raise SmartSimError("Could not find database address") + raise SmartSimError("Could not find feature store address") if not self.is_active(): - raise SmartSimError("Database is not active") + raise SmartSimError("Feature store is not active") return self._get_address() def _get_address(self) -> t.List[str]: @@ -355,20 +355,20 @@ def _get_address(self) -> t.List[str]: ] def is_active(self) -> bool: - """Check if the database is active + """Check if the feature store is active - :return: True if database is active, False otherwise + :return: True if feature store is active, False otherwise """ if not self._hosts: return False - return db_is_active(self._hosts, self.ports, self.num_shards) + return fs_is_active(self._hosts, self.ports, self.num_shards) @property def _rai_module(self) -> t.Tuple[str, ...]: """Get the RedisAI module from third-party installations - :return: Tuple of args to pass to the orchestrator exe + :return: Tuple of args to pass to the FeatureStore exe to load and configure the RedisAI """ module = ["--loadmodule", CONFIG.redisai] 
@@ -389,7 +389,7 @@ def _redis_conf(self) -> str: return CONFIG.database_conf def set_cpus(self, num_cpus: int) -> None: - """Set the number of CPUs available to each database shard + """Set the number of CPUs available to each feature store shard This effectively will determine how many cpus can be used for compute threads, background threads, and network I/O. @@ -406,19 +406,19 @@ def set_cpus(self, num_cpus: int) -> None: if hasattr(self.batch_settings, "set_cpus_per_task"): self.batch_settings.set_cpus_per_task(num_cpus) - for db in self.entities: - db.run_settings.set_cpus_per_task(num_cpus) - if db.is_mpmd and hasattr(db.run_settings, "mpmd"): - for mpmd in db.run_settings.mpmd: + for fs in self.entities: + fs.run_settings.set_cpus_per_task(num_cpus) + if fs.is_mpmd and hasattr(fs.run_settings, "mpmd"): + for mpmd in fs.run_settings.mpmd: mpmd.set_cpus_per_task(num_cpus) def set_walltime(self, walltime: str) -> None: - """Set the batch walltime of the orchestrator + """Set the batch walltime of the FeatureStore - Note: This will only effect orchestrators launched as a batch + Note: This will only affect FeatureStores launched as a batch :param walltime: amount of time e.g.
10 hours is 10:00:00 - :raises SmartSimError: if orchestrator isn't launching as batch + :raises SmartSimError: if FeatureStore isn't launching as batch """ if not self.batch: raise SmartSimError("Not running as batch, cannot set walltime") @@ -427,7 +427,7 @@ def set_walltime(self, walltime: str) -> None: self.batch_settings.set_walltime(walltime) def set_hosts(self, host_list: t.Union[t.List[str], str]) -> None: - """Specify the hosts for the ``Orchestrator`` to launch on + """Specify the hosts for the ``FeatureStore`` to launch on :param host_list: list of host (compute node names) :raises TypeError: if wrong type @@ -445,8 +445,8 @@ def set_hosts(self, host_list: t.Union[t.List[str], str]) -> None: self.batch_settings.set_hostlist(host_list) if self.launcher == "lsf": - for db in self.entities: - db.set_hosts(host_list) + for fs in self.entities: + fs.set_hosts(host_list) elif ( self.launcher == "pals" and isinstance(self.entities[0].run_settings, PalsMpiexecSettings) @@ -455,26 +455,26 @@ def set_hosts(self, host_list: t.Union[t.List[str], str]) -> None: # In this case, --hosts is a global option, set it to first run command self.entities[0].run_settings.set_hostlist(host_list) else: - for host, db in zip(host_list, self.entities): - if isinstance(db.run_settings, AprunSettings): + for host, fs in zip(host_list, self.entities): + if isinstance(fs.run_settings, AprunSettings): if not self.batch: - db.run_settings.set_hostlist([host]) + fs.run_settings.set_hostlist([host]) else: - db.run_settings.set_hostlist([host]) + fs.run_settings.set_hostlist([host]) - if db.is_mpmd and hasattr(db.run_settings, "mpmd"): - for i, mpmd_runsettings in enumerate(db.run_settings.mpmd, 1): + if fs.is_mpmd and hasattr(fs.run_settings, "mpmd"): + for i, mpmd_runsettings in enumerate(fs.run_settings.mpmd, 1): mpmd_runsettings.set_hostlist(host_list[i]) def set_batch_arg(self, arg: str, value: t.Optional[str] = None) -> None: - """Set a batch argument the orchestrator should launch 
with + """Set a batch argument the FeatureStore should launch with Some commonly used arguments such as --job-name are used by SmartSim and will not be allowed to be set. :param arg: batch argument to set e.g. "exclusive" :param value: batch param - set to None if no param value - :raises SmartSimError: if orchestrator not launching as batch + :raises SmartSimError: if FeatureStore not launching as batch """ if not hasattr(self, "batch_settings") or not self.batch_settings: raise SmartSimError("Not running as batch, cannot set batch_arg") @@ -482,13 +482,13 @@ def set_batch_arg(self, arg: str, value: t.Optional[str] = None) -> None: if arg in self._reserved_batch_args[type(self.batch_settings)]: logger.warning( f"Can not set batch argument {arg}: " - "it is a reserved keyword in Orchestrator" + "it is a reserved keyword in FeatureStore" ) else: self.batch_settings.batch_args[arg] = value def set_run_arg(self, arg: str, value: t.Optional[str] = None) -> None: - """Set a run argument the orchestrator should launch + """Set a run argument the FeatureStore should launch each node with (it will be passed to `jrun`) Some commonly used arguments are used @@ -501,24 +501,24 @@ def set_run_arg(self, arg: str, value: t.Optional[str] = None) -> None: if arg in self._reserved_run_args[type(self.entities[0].run_settings)]: logger.warning( f"Can not set batch argument {arg}: " - "it is a reserved keyword in Orchestrator" + "it is a reserved keyword in FeatureStore" ) else: - for db in self.entities: - db.run_settings.run_args[arg] = value - if db.is_mpmd and hasattr(db.run_settings, "mpmd"): - for mpmd in db.run_settings.mpmd: + for fs in self.entities: + fs.run_settings.run_args[arg] = value + if fs.is_mpmd and hasattr(fs.run_settings, "mpmd"): + for mpmd in fs.run_settings.mpmd: mpmd.run_args[arg] = value def enable_checkpoints(self, frequency: int) -> None: - """Sets the database's save configuration to save the DB every 'frequency' - seconds given that at least one write 
operation against the DB occurred in - that time. E.g., if `frequency` is 900, then the database will save to disk + """Sets the feature store's save configuration to save the fs every 'frequency' + seconds given that at least one write operation against the fs occurred in + that time. E.g., if `frequency` is 900, then the feature store will save to disk after 900 seconds if there is at least 1 change to the dataset. - :param frequency: the given number of seconds before the DB saves + :param frequency: the given number of seconds before the FS saves """ - self.set_db_conf("save", f"{frequency} 1") + self.set_fs_conf("save", f"{frequency} 1") def set_max_memory(self, mem: str) -> None: """Sets the max memory configuration. By default there is no memory limit. @@ -535,33 +535,33 @@ def set_max_memory(self, mem: str) -> None: :param mem: the desired max memory size e.g. 3gb :raises SmartSimError: If 'mem' is an invalid memory value - :raises SmartSimError: If database is not active + :raises SmartSimError: If feature store is not active """ - self.set_db_conf("maxmemory", mem) + self.set_fs_conf("maxmemory", mem) def set_eviction_strategy(self, strategy: str) -> None: - """Sets how the database will select what to remove when + """Sets how the feature store will select what to remove when 'maxmemory' is reached. The default is noeviction. :param strategy: The max memory policy to use e.g. "volatile-lru", "allkeys-lru", etc. :raises SmartSimError: If 'strategy' is an invalid maxmemory policy - :raises SmartSimError: If database is not active + :raises SmartSimError: If feature store is not active """ - self.set_db_conf("maxmemory-policy", strategy) + self.set_fs_conf("maxmemory-policy", strategy) def set_max_clients(self, clients: int = 50_000) -> None: """Sets the max number of connected clients at the same time. 
- When the number of DB shards contained in the orchestrator is + When the number of FS shards contained in the feature store is more than two, then every node will use two connections, one incoming and another outgoing. :param clients: the maximum number of connected clients """ - self.set_db_conf("maxclients", str(clients)) + self.set_fs_conf("maxclients", str(clients)) def set_max_message_size(self, size: int = 1_073_741_824) -> None: - """Sets the database's memory size limit for bulk requests, + """Sets the feature store's memory size limit for bulk requests, which are elements representing single strings. The default is 1 gigabyte. Message size must be greater than or equal to 1mb. The specified memory size should be an integer that represents @@ -570,16 +570,16 @@ def set_max_message_size(self, size: int = 1_073_741_824) -> None: :param size: maximum message size in bytes """ - self.set_db_conf("proto-max-bulk-len", str(size)) + self.set_fs_conf("proto-max-bulk-len", str(size)) - def set_db_conf(self, key: str, value: str) -> None: + def set_fs_conf(self, key: str, value: str) -> None: """Set any valid configuration at runtime without the need - to restart the database. All configuration parameters - that are set are immediately loaded by the database and + to restart the feature store. All configuration parameters + that are set are immediately loaded by the feature store and will take effect starting with the next command executed. 
:param key: the configuration parameter - :param value: the database configuration parameter's new value + :param value: the feature store configuration parameter's new value """ if self.is_active(): addresses = [] @@ -587,12 +587,12 @@ def set_db_conf(self, key: str, value: str) -> None: for port in self.ports: addresses.append(":".join([get_ip_from_host(host), str(port)])) - db_name, name = unpack_db_identifier(self.db_identifier, "_") + fs_name, name = unpack_fs_identifier(self.fs_identifier, "_") - environ[f"SSDB{db_name}"] = addresses[0] + environ[f"SSDB{fs_name}"] = addresses[0] - db_type = CLUSTERED if self.num_shards > 2 else STANDALONE - environ[f"SR_DB_TYPE{db_name}"] = db_type + fs_type = CLUSTERED if self.num_shards > 2 else STANDALONE + environ[f"SR_DB_TYPE{fs_name}"] = fs_type options = ConfigOptions.create_from_environment(name) client = Client(options) @@ -608,17 +608,17 @@ def set_db_conf(self, key: str, value: str) -> None: except TypeError: raise TypeError( "Incompatible function arguments. The key and value used for " - "setting the database configurations must be strings." + "setting the feature store configurations must be strings." ) from None else: raise SmartSimError( - "The SmartSim Orchestrator must be active in order to set the " - "database's configurations." + "The SmartSim FeatureStore must be active in order to set the " + "feature store's configurations." 
) @staticmethod def _build_batch_settings( - db_nodes: int, + fs_nodes: int, alloc: str, batch: bool, account: str, @@ -636,7 +636,7 @@ def _build_batch_settings( # on or if user specified batch=False (alloc will be found through env) if not alloc and batch: batch_settings = create_batch_settings( - launcher, nodes=db_nodes, time=time, account=account, **kwargs + launcher, nodes=fs_nodes, time=time, account=account, **kwargs ) return batch_settings @@ -647,12 +647,12 @@ def _build_run_settings( exe_args: t.List[t.List[str]], *, run_args: t.Optional[t.Dict[str, t.Any]] = None, - db_nodes: int = 1, + fs_nodes: int = 1, single_cmd: bool = True, **kwargs: t.Any, ) -> RunSettings: run_args = {} if run_args is None else run_args - mpmd_nodes = single_cmd and db_nodes > 1 + mpmd_nodes = single_cmd and fs_nodes > 1 if mpmd_nodes: run_settings = create_run_settings( @@ -702,7 +702,7 @@ def _build_run_settings_lsf( if gpus_per_shard is None: raise ValueError("Expected an integer number of gpus per shard") - # We always run the DB on cpus 0:cpus_per_shard-1 + # We always run the fs on cpus 0:cpus_per_shard-1 # and gpus 0:gpus_per_shard-1 for shard_id, args in enumerate(exe_args): host = shard_id @@ -711,8 +711,8 @@ def _build_run_settings_lsf( run_settings = JsrunSettings(exe, args, run_args=run_args.copy()) run_settings.set_binding("none") - # This makes sure output is written to orchestrator_0.out, - # orchestrator_1.out, and so on + # This makes sure output is written to featurestore_0.out, + # featurestore_1.out, and so on run_settings.set_individual_output("_%t") erf_sets = { @@ -739,91 +739,91 @@ def _build_run_settings_lsf( def _initialize_entities( self, *, - db_nodes: int = 1, + fs_nodes: int = 1, single_cmd: bool = True, port: int = 6379, **kwargs: t.Any, ) -> None: - db_nodes = int(db_nodes) - if db_nodes == 2: - raise SSUnsupportedError("Orchestrator does not support clusters of size 2") + fs_nodes = int(fs_nodes) + if fs_nodes == 2: + raise 
SSUnsupportedError("FeatureStore does not support clusters of size 2") - if self.launcher == "local" and db_nodes > 1: + if self.launcher == "local" and fs_nodes > 1: raise ValueError( - "Local Orchestrator does not support multiple database shards" + "Local FeatureStore does not support multiple feature store shards" ) - mpmd_nodes = (single_cmd and db_nodes > 1) or self.launcher == "lsf" + mpmd_nodes = (single_cmd and fs_nodes > 1) or self.launcher == "lsf" if mpmd_nodes: self._initialize_entities_mpmd( - db_nodes=db_nodes, single_cmd=single_cmd, port=port, **kwargs + fs_nodes=fs_nodes, single_cmd=single_cmd, port=port, **kwargs ) else: - cluster = db_nodes >= 3 + cluster = fs_nodes >= 3 - for db_id in range(db_nodes): - db_node_name = "_".join((self.name, str(db_id))) + for fs_id in range(fs_nodes): + fs_node_name = "_".join((self.name, str(fs_id))) - # create the exe_args list for launching multiple databases - # per node. also collect port range for dbnode + # create the exe_args list for launching multiple feature stores + # per node. 
also collect port range for fsnode start_script_args = self._get_start_script_args( - db_node_name, port, cluster + fs_node_name, port, cluster ) - # if only launching 1 db per command, we don't need a + # if only launching 1 fs per command, we don't need a # list of exe args lists run_settings = self._build_run_settings( sys.executable, [start_script_args], port=port, **kwargs ) - node = DBNode( - db_node_name, + node = FSNode( + fs_node_name, self.path, run_settings, [port], - [db_node_name + ".out"], - self.db_identifier, + [fs_node_name + ".out"], + self.fs_identifier, ) self.entities.append(node) self.ports = [port] def _initialize_entities_mpmd( - self, *, db_nodes: int = 1, port: int = 6379, **kwargs: t.Any + self, *, fs_nodes: int = 1, port: int = 6379, **kwargs: t.Any ) -> None: - cluster = db_nodes >= 3 + cluster = fs_nodes >= 3 mpmd_node_name = self.name + "_0" exe_args_mpmd: t.List[t.List[str]] = [] - for db_id in range(db_nodes): - db_shard_name = "_".join((self.name, str(db_id))) - # create the exe_args list for launching multiple databases - # per node. also collect port range for dbnode + for fs_id in range(fs_nodes): + fs_shard_name = "_".join((self.name, str(fs_id))) + # create the exe_args list for launching multiple feature stores + # per node. 
also collect port range for fsnode start_script_args = self._get_start_script_args( - db_shard_name, port, cluster + fs_shard_name, port, cluster ) exe_args = " ".join(start_script_args) exe_args_mpmd.append(sh_split(exe_args)) run_settings: t.Optional[RunSettings] = None if self.launcher == "lsf": run_settings = self._build_run_settings_lsf( - sys.executable, exe_args_mpmd, db_nodes=db_nodes, port=port, **kwargs + sys.executable, exe_args_mpmd, fs_nodes=fs_nodes, port=port, **kwargs ) - output_files = [f"{self.name}_{db_id}.out" for db_id in range(db_nodes)] + output_files = [f"{self.name}_{fs_id}.out" for fs_id in range(fs_nodes)] else: run_settings = self._build_run_settings( - sys.executable, exe_args_mpmd, db_nodes=db_nodes, port=port, **kwargs + sys.executable, exe_args_mpmd, fs_nodes=fs_nodes, port=port, **kwargs ) output_files = [mpmd_node_name + ".out"] if not run_settings: raise ValueError(f"Could not build run settings for {self.launcher}") - node = DBNode( + node = FSNode( mpmd_node_name, self.path, run_settings, [port], output_files, - db_identifier=self.db_identifier, + fs_identifier=self.fs_identifier, ) self.entities.append(node) self.ports = [port] @@ -846,13 +846,13 @@ def _get_start_script_args( cmd.append("+cluster") # is the shard part of a cluster return cmd - def _get_db_hosts(self) -> t.List[str]: + def _get_fs_hosts(self) -> t.List[str]: hosts = [] - for db in self.entities: - if not db.is_mpmd: - hosts.append(db.host) + for fs in self.entities: + if not fs.is_mpmd: + hosts.append(fs.host) else: - hosts.extend(db.hosts) + hosts.extend(fs.hosts) return hosts def _check_network_interface(self) -> None: diff --git a/smartsim/entity/__init__.py b/smartsim/entity/__init__.py index 40f03fcdd..7a8ec2cac 100644 --- a/smartsim/entity/__init__.py +++ b/smartsim/entity/__init__.py @@ -24,7 +24,7 @@ # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
-from .dbnode import DBNode +from .dbnode import FSNode from .dbobject import * from .ensemble import Ensemble from .entity import SmartSimEntity, TelemetryConfiguration diff --git a/smartsim/entity/dbnode.py b/smartsim/entity/dbnode.py index 485bbcd88..6f3010ba2 100644 --- a/smartsim/entity/dbnode.py +++ b/smartsim/entity/dbnode.py @@ -42,12 +42,12 @@ logger = get_logger(__name__) -class DBNode(SmartSimEntity): - """DBNode objects are the entities that make up the orchestrator. - Each database node can be launched in a cluster configuration - and take launch multiple databases per node. +class FSNode(SmartSimEntity): + """FSNode objects are the entities that make up the feature store. + Each feature store node can be launched in a cluster configuration + and can launch multiple feature stores per node. - To configure how each instance of the database operates, look + To configure how each instance of the feature store operates, look into the smartsimdb.conf. """ @@ -58,9 +58,9 @@ def __init__( self, run_settings: RunSettings, ports: t.List[int], output_files: t.List[str], - db_identifier: str = "", + fs_identifier: str = "", ) -> None: - """Initialize a database node within an orchestrator.""" + """Initialize a feature store node within a feature store.""" super().__init__(name, path, run_settings) self.ports = ports self._hosts: t.Optional[t.List[str]] = None @@ -72,7 +72,7 @@ def __init__( ): raise ValueError("output_files must be of type list[str]") self._output_files = output_files - self.db_identifier = db_identifier + self.fs_identifier = fs_identifier @property def num_shards(self) -> int: @@ -88,14 +88,14 @@ def host(self) -> str: (host,) = self.hosts except ValueError: raise ValueError( - f"Multiple hosts detected for this DB Node: {', '.join(self.hosts)}" + f"Multiple hosts detected for this FS Node: {', '.join(self.hosts)}" ) from None return host @property def hosts(self) -> t.List[str]: if not self._hosts: -
self._hosts = self._parse_fs_hosts() return self._hosts def clear_hosts(self) -> None: @@ -112,9 +112,9 @@ def is_mpmd(self) -> bool: def set_hosts(self, hosts: t.List[str]) -> None: self._hosts = [str(host) for host in hosts] - def remove_stale_dbnode_files(self) -> None: + def remove_stale_fsnode_files(self) -> None: """This function removes the .conf, .err, and .out files that - have the same names used by this dbnode that may have been + have the same names used by this fsnode that may have been created from a previous experiment execution. """ @@ -146,7 +146,7 @@ def _get_cluster_conf_filenames(self, port: int) -> t.List[str]: # cov-lsf This function should bu used if and only if ``_mpmd==True`` :param port: port number - :return: the dbnode configuration file name + :return: the fsnode configuration file name """ if self.num_shards == 1: return [f"nodes-{self.name}-{port}.conf"] @@ -182,7 +182,7 @@ def _parse_launched_shard_info_from_files( return cls._parse_launched_shard_info_from_iterable(ifstream, num_shards) def get_launched_shard_info(self) -> "t.List[LaunchedShardData]": - """Parse the launched database shard info from the output files + """Parse the launched feature store shard info from the output files :raises SmartSimError: if all shard info could not be found :return: The found launched shard info @@ -206,16 +206,16 @@ def get_launched_shard_info(self) -> "t.List[LaunchedShardData]": if len(ips) < self.num_shards: msg = ( - f"Failed to parse the launched DB shard information from file(s) " + f"Failed to parse the launched FS shard information from file(s) " f"{', '.join(output_files)}. Found the information for " - f"{len(ips)} out of {self.num_shards} DB shards." + f"{len(ips)} out of {self.num_shards} FS shards." 
) logger.error(msg) raise SmartSimError(msg) return ips - def _parse_db_hosts(self) -> t.List[str]: - """Parse the database hosts/IPs from the output files + def _parse_fs_hosts(self) -> t.List[str]: + """Parse the feature store hosts/IPs from the output files The IP address is preferred, but if hostname is only present then a lookup to /etc/hosts is done through the socket library. @@ -228,7 +228,7 @@ def _parse_db_hosts(self) -> t.List[str]: @dataclass(frozen=True) class LaunchedShardData: - """Data class to write and parse data about a launched database shard""" + """Data class to write and parse data about a launched feature store shard""" name: str hostname: str diff --git a/smartsim/entity/dbobject.py b/smartsim/entity/dbobject.py index 0f834d253..9be96e671 100644 --- a/smartsim/entity/dbobject.py +++ b/smartsim/entity/dbobject.py @@ -30,28 +30,28 @@ from .._core._install.builder import Device from ..error import SSUnsupportedError -__all__ = ["DBObject", "DBModel", "DBScript"] +__all__ = ["FSObject", "FSModel", "FSScript"] -_DBObjectFuncT = t.TypeVar("_DBObjectFuncT", str, bytes) +_FSObjectFuncT = t.TypeVar("_FSObjectFuncT", str, bytes) -class DBObject(t.Generic[_DBObjectFuncT]): - """Base class for ML objects residing on DB. Should not +class FSObject(t.Generic[_FSObjectFuncT]): + """Base class for ML objects residing on FS. Should not be instantiated. 
""" def __init__( self, name: str, - func: t.Optional[_DBObjectFuncT], + func: t.Optional[_FSObjectFuncT], file_path: t.Optional[str], device: str, devices_per_node: int, first_device: int, ) -> None: self.name = name - self.func: t.Optional[_DBObjectFuncT] = func + self.func: t.Optional[_FSObjectFuncT] = func self.file: t.Optional[Path] = ( None # Need to have this explicitly to check on it ) @@ -107,9 +107,9 @@ def _check_device(device: str) -> str: return device def _enumerate_devices(self) -> t.List[str]: - """Enumerate devices for a DBObject + """Enumerate devices for a FSObject - :param dbobject: DBObject to enumerate + :param FSObject: FSObject to enumerate :return: list of device names """ @@ -149,7 +149,7 @@ def _check_devices( raise ValueError(msg) -class DBScript(DBObject[str]): +class FSScript(FSObject[str]): def __init__( self, name: str, @@ -204,7 +204,7 @@ def __str__(self) -> str: return desc_str -class DBModel(DBObject[bytes]): +class FSModel(FSObject[bytes]): def __init__( self, name: str, @@ -221,7 +221,7 @@ def __init__( inputs: t.Optional[t.List[str]] = None, outputs: t.Optional[t.List[str]] = None, ) -> None: - """A TF, TF-lite, PT, or ONNX model to load into the DB at runtime + """A TF, TF-lite, PT, or ONNX model to load into the FS at runtime One of either model (in memory representation) or model_path (file) must be provided diff --git a/smartsim/entity/ensemble.py b/smartsim/entity/ensemble.py index ed971c6ae..ade83b491 100644 --- a/smartsim/entity/ensemble.py +++ b/smartsim/entity/ensemble.py @@ -40,7 +40,7 @@ ) from ..log import get_logger from ..settings.base import BatchSettings, RunSettings -from .dbobject import DBModel, DBScript +from .dbobject import FSModel, FSScript from .entity import SmartSimEntity from .entityList import EntityList from .model import Model @@ -195,10 +195,10 @@ def add_model(self, model: Model) -> None: f"Model {model.name} already exists in ensemble {self.name}" ) - if self._db_models: - 
self._extend_entity_db_models(model, self._db_models) - if self._db_scripts: - self._extend_entity_db_scripts(model, self._db_scripts) + if self._fs_models: + self._extend_entity_fs_models(model, self._fs_models) + if self._fs_scripts: + self._extend_entity_fs_scripts(model, self._fs_scripts) self.entities.append(model) @@ -350,10 +350,10 @@ def add_ml_model( inputs: t.Optional[t.List[str]] = None, outputs: t.Optional[t.List[str]] = None, ) -> None: - """A TF, TF-lite, PT, or ONNX model to load into the DB at runtime + """A TF, TF-lite, PT, or ONNX model to load into the fs at runtime - Each ML Model added will be loaded into an - orchestrator (converged or not) prior to the execution + Each ML Model added will be loaded into a + feature store (converged or not) prior to the execution of every entity belonging to this ensemble One of either model (in memory representation) or model_path (file) @@ -374,7 +374,7 @@ def add_ml_model( :param inputs: model inputs (TF only) :param outputs: model outupts (TF only) """ - db_model = DBModel( + fs_model = FSModel( name=name, backend=backend, model=model, @@ -391,19 +391,19 @@ def add_ml_model( ) dupe = next( ( - db_model.name - for ensemble_ml_model in self._db_models - if ensemble_ml_model.name == db_model.name + fs_model.name + for ensemble_ml_model in self._fs_models + if ensemble_ml_model.name == fs_model.name ), None, ) if dupe: raise SSUnsupportedError( - f'An ML Model with name "{db_model.name}" already exists' + f'An ML Model with name "{fs_model.name}" already exists' ) - self._db_models.append(db_model) + self._fs_models.append(fs_model) for entity in self.models: - self._extend_entity_db_models(entity, [db_model]) + self._extend_entity_fs_models(entity, [fs_model]) def add_script( self, @@ -417,7 +417,7 @@ def add_script( """TorchScript to launch with every entity belonging to this ensemble Each script added to the model will be loaded into an - orchestrator (converged or not) prior to the execution + feature 
store (converged or not) prior to the execution of every entity belonging to this ensemble Device selection is either "GPU" or "CPU". If many devices are @@ -436,7 +436,7 @@ def add_script( :param devices_per_node: number of devices on each host :param first_device: first device to use on each host """ - db_script = DBScript( + fs_script = FSScript( name=name, script=script, script_path=script_path, @@ -446,19 +446,19 @@ def add_script( ) dupe = next( ( - db_script.name - for ensemble_script in self._db_scripts - if ensemble_script.name == db_script.name + fs_script.name + for ensemble_script in self._fs_scripts + if ensemble_script.name == fs_script.name ), None, ) if dupe: raise SSUnsupportedError( - f'A Script with name "{db_script.name}" already exists' + f'A Script with name "{fs_script.name}" already exists' ) - self._db_scripts.append(db_script) + self._fs_scripts.append(fs_script) for entity in self.models: - self._extend_entity_db_scripts(entity, [db_script]) + self._extend_entity_fs_scripts(entity, [fs_script]) def add_function( self, @@ -471,10 +471,10 @@ def add_function( """TorchScript function to launch with every entity belonging to this ensemble Each script function to the model will be loaded into a - non-converged orchestrator prior to the execution + non-converged feature store prior to the execution of every entity belonging to this ensemble. - For converged orchestrators, the :meth:`add_script` method should be used. + For converged feature stores, the :meth:`add_script` method should be used. Device selection is either "GPU" or "CPU". If many devices are present, a number can be passed for specification e.g. "GPU:1". 
@@ -490,7 +490,7 @@ def add_function( :param devices_per_node: number of devices on each host :param first_device: first device to use on each host """ - db_script = DBScript( + fs_script = FSScript( name=name, script=function, device=device, @@ -499,39 +499,39 @@ def add_function( ) dupe = next( ( - db_script.name - for ensemble_script in self._db_scripts - if ensemble_script.name == db_script.name + fs_script.name + for ensemble_script in self._fs_scripts + if ensemble_script.name == fs_script.name ), None, ) if dupe: raise SSUnsupportedError( - f'A Script with name "{db_script.name}" already exists' + f'A Script with name "{fs_script.name}" already exists' ) - self._db_scripts.append(db_script) + self._fs_scripts.append(fs_script) for entity in self.models: - self._extend_entity_db_scripts(entity, [db_script]) + self._extend_entity_fs_scripts(entity, [fs_script]) @staticmethod - def _extend_entity_db_models(model: Model, db_models: t.List[DBModel]) -> None: + def _extend_entity_fs_models(model: Model, fs_models: t.List[FSModel]) -> None: """ Ensures that the Machine Learning model names being added to the Ensemble are unique. This static method checks if the provided ML model names already exist in the Ensemble. An SSUnsupportedError is raised if any duplicate names are - found. Otherwise, it appends the given list of DBModels to the Ensemble. + found. Otherwise, it appends the given list of FSModel to the Ensemble. :param model: SmartSim Model object. - :param db_models: List of DBModels to append to the Ensemble. + :param fs_models: List of FSModel to append to the Ensemble. 
""" - for add_ml_model in db_models: + for add_ml_model in fs_models: dupe = next( ( - db_model.name - for db_model in model.db_models - if db_model.name == add_ml_model.name + fs_model.name + for fs_model in model.fs_models + if fs_model.name == add_ml_model.name ), None, ) @@ -542,24 +542,24 @@ def _extend_entity_db_models(model: Model, db_models: t.List[DBModel]) -> None: model.add_ml_model_object(add_ml_model) @staticmethod - def _extend_entity_db_scripts(model: Model, db_scripts: t.List[DBScript]) -> None: + def _extend_entity_fs_scripts(model: Model, fs_scripts: t.List[FSScript]) -> None: """ Ensures that the script/function names being added to the Ensemble are unique. This static method checks if the provided script/function names already exist in the Ensemble. An SSUnsupportedError is raised if any duplicate names - are found. Otherwise, it appends the given list of DBScripts to the + are found. Otherwise, it appends the given list of FSScripts to the Ensemble. :param model: SmartSim Model object. - :param db_scripts: List of DBScripts to append to the Ensemble. + :param fs_scripts: List of FSScripts to append to the Ensemble. 
""" - for add_script in db_scripts: + for add_script in fs_scripts: dupe = next( ( add_script.name - for db_script in model.db_scripts - if db_script.name == add_script.name + for fs_script in model.fs_scripts + if fs_script.name == add_script.name ), None, ) diff --git a/smartsim/entity/entityList.py b/smartsim/entity/entityList.py index 6d958bda6..1045d1ad4 100644 --- a/smartsim/entity/entityList.py +++ b/smartsim/entity/entityList.py @@ -68,8 +68,8 @@ def __init__(self, name: str, path: str, **kwargs: t.Any) -> None: # --------------------------------------------------------------------- # self.entities: t.Sequence[_T_co] = [] - self._db_models: t.Sequence["smartsim.entity.DBModel"] = [] - self._db_scripts: t.Sequence["smartsim.entity.DBScript"] = [] + self._fs_models: t.Sequence["smartsim.entity.FSModel"] = [] + self._fs_scripts: t.Sequence["smartsim.entity.FSScript"] = [] # # <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< @@ -80,14 +80,14 @@ def _initialize_entities(self, **kwargs: t.Any) -> None: raise NotImplementedError @property - def db_models(self) -> t.Iterable["smartsim.entity.DBModel"]: + def fs_models(self) -> t.Iterable["smartsim.entity.FSModel"]: """Return an immutable collection of attached models""" - return (model for model in self._db_models) + return (model for model in self._fs_models) @property - def db_scripts(self) -> t.Iterable["smartsim.entity.DBScript"]: + def fs_scripts(self) -> t.Iterable["smartsim.entity.FSScript"]: """Return an immutable collection of attached scripts""" - return (script for script in self._db_scripts) + return (script for script in self._fs_scripts) @property def batch(self) -> bool: @@ -98,7 +98,7 @@ def batch(self) -> bool: if self.batch_settings: return True return False - # local orchestrator cannot launch with batches + # local feature store cannot launch with batches except AttributeError: return False @@ -133,8 +133,8 @@ def __init__(self, name: str, path: str, **kwargs: t.Any) -> 
None: super().__init__(name, path, **kwargs) # Change container types to be invariant ``list``s self.entities: t.List[_T] = list(self.entities) - self._db_models: t.List["smartsim.entity.DBModel"] = list(self._db_models) - self._db_scripts: t.List["smartsim.entity.DBScript"] = list(self._db_scripts) + self._fs_models: t.List["smartsim.entity.FSModel"] = list(self._fs_models) + self._fs_scripts: t.List["smartsim.entity.FSScript"] = list(self._fs_scripts) def _initialize_entities(self, **kwargs: t.Any) -> None: """Initialize the SmartSimEntity objects in the container""" diff --git a/smartsim/entity/model.py b/smartsim/entity/model.py index 3f78e042c..b8815dad2 100644 --- a/smartsim/entity/model.py +++ b/smartsim/entity/model.py @@ -39,7 +39,7 @@ from ..error import EntityExistsError, SSUnsupportedError from ..log import get_logger from ..settings.base import BatchSettings, RunSettings -from .dbobject import DBModel, DBScript +from .dbobject import FSModel, FSScript from .entity import SmartSimEntity from .files import EntityFiles @@ -75,33 +75,33 @@ def __init__( self.incoming_entities: t.List[SmartSimEntity] = [] self._key_prefixing_enabled = False self.batch_settings = batch_settings - self._db_models: t.List[DBModel] = [] - self._db_scripts: t.List[DBScript] = [] + self._fs_models: t.List[FSModel] = [] + self._fs_scripts: t.List[FSScript] = [] self.files: t.Optional[EntityFiles] = None @property - def db_models(self) -> t.Iterable[DBModel]: + def fs_models(self) -> t.Iterable[FSModel]: """Retrieve an immutable collection of attached models :return: Return an immutable collection of attached models """ - return (model for model in self._db_models) + return (model for model in self._fs_models) @property - def db_scripts(self) -> t.Iterable[DBScript]: + def fs_scripts(self) -> t.Iterable[FSScript]: """Retrieve an immutable collection attached of scripts :return: Return an immutable collection of attached scripts """ - return (script for script in self._db_scripts) + 
return (script for script in self._fs_scripts) @property def colocated(self) -> bool: - """Return True if this Model will run with a colocated Orchestrator + """Return True if this Model will run with a colocated FeatureStore - :return: Return True of the Model will run with a colocated Orchestrator + :return: Return True of the Model will run with a colocated FeatureStore """ - return bool(self.run_settings.colocated_db_settings) + return bool(self.run_settings.colocated_fs_settings) def register_incoming_entity(self, incoming_entity: SmartSimEntity) -> None: """Register future communication between entities. @@ -198,34 +198,34 @@ def print_attached_files(self) -> None: """Print a table of the attached files on std out""" print(self.attached_files_table) - def colocate_db(self, *args: t.Any, **kwargs: t.Any) -> None: - """An alias for ``Model.colocate_db_tcp``""" + def colocate_fs(self, *args: t.Any, **kwargs: t.Any) -> None: + """An alias for ``Model.colocate_fs_tcp``""" warnings.warn( ( - "`colocate_db` has been deprecated and will be removed in a \n" - "future release. Please use `colocate_db_tcp` or `colocate_db_uds`." + "`colocate_fs` has been deprecated and will be removed in a \n" + "future release. Please use `colocate_fs_tcp` or `colocate_fs_uds`." ), FutureWarning, ) - self.colocate_db_tcp(*args, **kwargs) + self.colocate_fs_tcp(*args, **kwargs) - def colocate_db_uds( + def colocate_fs_uds( self, unix_socket: str = "/tmp/redis.socket", socket_permissions: int = 755, - db_cpus: int = 1, + fs_cpus: int = 1, custom_pinning: t.Optional[t.Iterable[t.Union[int, t.Iterable[int]]]] = None, debug: bool = False, - db_identifier: str = "", + fs_identifier: str = "", **kwargs: t.Any, ) -> None: - """Colocate an Orchestrator instance with this Model over UDS. + """Colocate an FeatureStore instance with this Model over UDS. This method will initialize settings which add an unsharded - database to this Model instance. 
Only this Model will be able to communicate - with this colocated database by using Unix Domain sockets. + feature store to this Model instance. Only this Model will be able to communicate + with this colocated feature store by using Unix Domain sockets. - Extra parameters for the db can be passed through kwargs. This includes + Extra parameters for the fs can be passed through kwargs. This includes many performance, caching and inference settings. .. highlight:: python @@ -243,11 +243,11 @@ def colocate_db_uds( :param unix_socket: path to where the socket file will be created :param socket_permissions: permissions for the socketfile - :param db_cpus: number of cpus to use for orchestrator - :param custom_pinning: CPUs to pin the orchestrator to. Passing an empty + :param fs_cpus: number of cpus to use for FeatureStore + :param custom_pinning: CPUs to pin the FeatureStore to. Passing an empty iterable disables pinning - :param debug: launch Model with extra debug information about the colocated db - :param kwargs: additional keyword arguments to pass to the orchestrator database + :param debug: launch Model with extra debug information about the colocated fs + :param kwargs: additional keyword arguments to pass to the FeatureStore feature store """ if not re.match(r"^[a-zA-Z0-9.:\,_\-/]*$", unix_socket): @@ -263,30 +263,30 @@ def colocate_db_uds( } common_options = { - "cpus": db_cpus, + "cpus": fs_cpus, "custom_pinning": custom_pinning, "debug": debug, - "db_identifier": db_identifier, + "fs_identifier": fs_identifier, } - self._set_colocated_db_settings(uds_options, common_options, **kwargs) + self._set_colocated_fs_settings(uds_options, common_options, **kwargs) - def colocate_db_tcp( + def colocate_fs_tcp( self, port: int = 6379, ifname: t.Union[str, list[str]] = "lo", - db_cpus: int = 1, + fs_cpus: int = 1, custom_pinning: t.Optional[t.Iterable[t.Union[int, t.Iterable[int]]]] = None, debug: bool = False, - db_identifier: str = "", + fs_identifier: str = "", 
**kwargs: t.Any, ) -> None: - """Colocate an Orchestrator instance with this Model over TCP/IP. + """Colocate an FeatureStore instance with this Model over TCP/IP. This method will initialize settings which add an unsharded - database to this Model instance. Only this Model will be able to communicate - with this colocated database by using the loopback TCP interface. + feature store to this Model instance. Only this Model will be able to communicate + with this colocated feature store by using the loopback TCP interface. - Extra parameters for the db can be passed through kwargs. This includes + Extra parameters for the fs can be passed through kwargs. This includes many performance, caching and inference settings. .. highlight:: python @@ -302,25 +302,25 @@ def colocate_db_tcp( Generally these don't need to be changed. - :param port: port to use for orchestrator database - :param ifname: interface to use for orchestrator - :param db_cpus: number of cpus to use for orchestrator - :param custom_pinning: CPUs to pin the orchestrator to. Passing an empty + :param port: port to use for FeatureStore feature store + :param ifname: interface to use for FeatureStore + :param fs_cpus: number of cpus to use for FeatureStore + :param custom_pinning: CPUs to pin the FeatureStore to. 
Passing an empty iterable disables pinning - :param debug: launch Model with extra debug information about the colocated db - :param kwargs: additional keyword arguments to pass to the orchestrator database + :param debug: launch Model with extra debug information about the colocated fs + :param kwargs: additional keyword arguments to pass to the feature store """ tcp_options = {"port": port, "ifname": ifname} common_options = { - "cpus": db_cpus, + "cpus": fs_cpus, "custom_pinning": custom_pinning, "debug": debug, - "db_identifier": db_identifier, + "fs_identifier": fs_identifier, } - self._set_colocated_db_settings(tcp_options, common_options, **kwargs) + self._set_colocated_fs_settings(tcp_options, common_options, **kwargs) - def _set_colocated_db_settings( + def _set_colocated_fs_settings( self, connection_options: t.Mapping[str, t.Union[int, t.List[str], str]], common_options: t.Dict[ @@ -337,17 +337,17 @@ def _set_colocated_db_settings( ) -> None: """ Ingest the connection-specific options (UDS/TCP) and set the final settings - for the colocated database + for the colocated feature store """ if hasattr(self.run_settings, "mpmd") and len(self.run_settings.mpmd) > 0: raise SSUnsupportedError( - "Models colocated with databases cannot be run as a mpmd workload" + "Models colocated with feature stores cannot be run as a mpmd workload" ) - if hasattr(self.run_settings, "_prep_colocated_db"): + if hasattr(self.run_settings, "_prep_colocated_fs"): # pylint: disable-next=protected-access - self.run_settings._prep_colocated_db(common_options["cpus"]) + self.run_settings._prep_colocated_fs(common_options["cpus"]) if "limit_app_cpus" in kwargs: raise SSUnsupportedError( "RunSettings using the correct binding option for your launcher." 
) - # TODO list which db settings can be extras + # TODO list which fs settings can be extras custom_pinning_ = t.cast( t.Optional[t.Iterable[t.Union[int, t.Iterable[int]]]], common_options.get("custom_pinning"), @@ -365,7 +365,7 @@ def _set_colocated_db_settings( custom_pinning_, cpus_ ) - colo_db_config: t.Dict[ + colo_fs_config: t.Dict[ str, t.Union[ bool, @@ -374,14 +374,14 @@ def _set_colocated_db_settings( None, t.List[str], t.Iterable[t.Union[int, t.Iterable[int]]], - t.List[DBModel], - t.List[DBScript], + t.List[FSModel], + t.List[FSScript], t.Dict[str, t.Union[int, None]], t.Dict[str, str], ], ] = {} - colo_db_config.update(connection_options) - colo_db_config.update(common_options) + colo_fs_config.update(connection_options) + colo_fs_config.update(common_options) redis_ai_temp = { "threads_per_queue": kwargs.get("threads_per_queue", None), @@ -389,16 +389,16 @@ def _set_colocated_db_settings( "intra_op_parallelism": kwargs.get("intra_op_parallelism", None), } # redisai arguments for inference settings - colo_db_config["rai_args"] = redis_ai_temp - colo_db_config["extra_db_args"] = { + colo_fs_config["rai_args"] = redis_ai_temp + colo_fs_config["extra_fs_args"] = { k: str(v) for k, v in kwargs.items() if k not in redis_ai_temp } - self._check_db_objects_colo() - colo_db_config["db_models"] = self._db_models - colo_db_config["db_scripts"] = self._db_scripts + self._check_fs_objects_colo() + colo_fs_config["fs_models"] = self._fs_models + colo_fs_config["fs_scripts"] = self._fs_scripts - self.run_settings.colocated_db_settings = colo_db_config + self.run_settings.colocated_fs_settings = colo_fs_config @staticmethod def _create_pinning_string( @@ -482,10 +482,10 @@ def add_ml_model( inputs: t.Optional[t.List[str]] = None, outputs: t.Optional[t.List[str]] = None, ) -> None: - """A TF, TF-lite, PT, or ONNX model to load into the DB at runtime + """A TF, TF-lite, PT, or ONNX model to load into the fs at runtime Each ML Model added will be loaded into an - 
orchestrator (converged or not) prior to the execution + FeatureStore (converged or not) prior to the execution of this Model instance One of either model (in memory representation) or model_path (file) @@ -493,7 +493,7 @@ def add_ml_model( :param name: key to store model under :param backend: name of the backend (TORCH, TF, TFLITE, ONNX) - :param model: A model in memory (only supported for non-colocated orchestrators) + :param model: A model in memory (only supported for non-colocated feature stores) :param model_path: serialized model :param device: name of device for execution :param devices_per_node: The number of GPU devices available on the host. @@ -509,7 +509,7 @@ def add_ml_model( :param inputs: model inputs (TF only) :param outputs: model outupts (TF only) """ - db_model = DBModel( + fs_model = FSModel( name=name, backend=backend, model=model, @@ -524,7 +524,7 @@ def add_ml_model( inputs=inputs, outputs=outputs, ) - self.add_ml_model_object(db_model) + self.add_ml_model_object(fs_model) def add_script( self, @@ -538,7 +538,7 @@ def add_script( """TorchScript to launch with this Model instance Each script added to the model will be loaded into an - orchestrator (converged or not) prior to the execution + FeatureStore (converged or not) prior to the execution of this Model instance Device selection is either "GPU" or "CPU". If many devices are @@ -553,7 +553,7 @@ def add_script( must be provided :param name: key to store script under - :param script: TorchScript code (only supported for non-colocated orchestrators) + :param script: TorchScript code (only supported for non-colocated featurestores) :param script_path: path to TorchScript code :param device: device for script execution :param devices_per_node: The number of GPU devices available on the host. @@ -563,7 +563,7 @@ def add_script( This parameter only applies to GPU devices and will be ignored if device is specified as CPU. 
""" - db_script = DBScript( + fs_script = FSScript( name=name, script=script, script_path=script_path, @@ -571,7 +571,7 @@ def add_script( devices_per_node=devices_per_node, first_device=first_device, ) - self.add_script_object(db_script) + self.add_script_object(fs_script) def add_function( self, @@ -584,10 +584,10 @@ def add_function( """TorchScript function to launch with this Model instance Each script function to the model will be loaded into a - non-converged orchestrator prior to the execution + non-converged FeatureStore prior to the execution of this Model instance. - For converged orchestrators, the :meth:`add_script` method should be used. + For converged featurestores, the :meth:`add_script` method should be used. Device selection is either "GPU" or "CPU". If many devices are present, a number can be passed for specification e.g. "GPU:1". @@ -605,14 +605,14 @@ def add_function( This parameter only applies to GPU devices and will be ignored if device is specified as CPU. """ - db_script = DBScript( + fs_script = FSScript( name=name, script=function, device=device, devices_per_node=devices_per_node, first_device=first_device, ) - self.add_script_object(db_script) + self.add_script_object(fs_script) def __hash__(self) -> int: return hash(self.name) @@ -629,52 +629,52 @@ def __str__(self) -> str: # pragma: no cover entity_str = "Name: " + self.name + "\n" entity_str += "Type: " + self.type + "\n" entity_str += str(self.run_settings) + "\n" - if self._db_models: - entity_str += "DB Models: \n" + str(len(self._db_models)) + "\n" - if self._db_scripts: - entity_str += "DB Scripts: \n" + str(len(self._db_scripts)) + "\n" + if self._fs_models: + entity_str += "FS Models: \n" + str(len(self._fs_models)) + "\n" + if self._fs_scripts: + entity_str += "FS Scripts: \n" + str(len(self._fs_scripts)) + "\n" return entity_str - def add_ml_model_object(self, db_model: DBModel) -> None: - if not db_model.is_file and self.colocated: - err_msg = "ML model can not be set from 
memory for colocated databases.\n" + def add_ml_model_object(self, fs_model: FSModel) -> None: + if not fs_model.is_file and self.colocated: + err_msg = "ML model can not be set from memory for colocated feature stores.\n" err_msg += ( - f"Please store the ML model named {db_model.name} in binary format " + f"Please store the ML model named {fs_model.name} in binary format " ) err_msg += "and add it to the SmartSim Model as file." raise SSUnsupportedError(err_msg) - self._db_models.append(db_model) + self._fs_models.append(fs_model) - def add_script_object(self, db_script: DBScript) -> None: - if db_script.func and self.colocated: - if not isinstance(db_script.func, str): + def add_script_object(self, fs_script: FSScript) -> None: + if fs_script.func and self.colocated: + if not isinstance(fs_script.func, str): err_msg = ( - "Functions can not be set from memory for colocated databases.\n" - f"Please convert the function named {db_script.name} " + "Functions can not be set from memory for colocated feature stores.\n" + f"Please convert the function named {fs_script.name} " "to a string or store it as a text file and add it to the " "SmartSim Model with add_script." ) raise SSUnsupportedError(err_msg) - self._db_scripts.append(db_script) + self._fs_scripts.append(fs_script) - def _check_db_objects_colo(self) -> None: - for db_model in self._db_models: - if not db_model.is_file: + def _check_fs_objects_colo(self) -> None: + for fs_model in self._fs_models: + if not fs_model.is_file: err_msg = ( - "ML model can not be set from memory for colocated databases.\n" - f"Please store the ML model named {db_model.name} in binary " + "ML model can not be set from memory for colocated feature stores.\n" + f"Please store the ML model named {fs_model.name} in binary " "format and add it to the SmartSim Model as file." 
) raise SSUnsupportedError(err_msg) - for db_script in self._db_scripts: - if db_script.func: - if not isinstance(db_script.func, str): + for fs_script in self._fs_scripts: + if fs_script.func: + if not isinstance(fs_script.func, str): err_msg = ( "Functions can not be set from memory for colocated " - "databases.\nPlease convert the function named " - f"{db_script.name} to a string or store it as a text" + "feature stores.\nPlease convert the function named " + f"{fs_script.name} to a string or store it as a text" "file and add it to the SmartSim Model with add_script." ) raise SSUnsupportedError(err_msg) diff --git a/smartsim/error/errors.py b/smartsim/error/errors.py index 9a6954907..a67cf03f1 100644 --- a/smartsim/error/errors.py +++ b/smartsim/error/errors.py @@ -82,8 +82,8 @@ class SSReservedKeywordError(SmartSimError): class SSDBIDConflictError(SmartSimError): - """Raised in the event that a database identifier - is not unique when multiple databases are created + """Raised in the event that a feature store identifier + is not unique when multiple feature stores are created """ diff --git a/smartsim/experiment.py b/smartsim/experiment.py index 9f230b1a9..172bd837f 100644 --- a/smartsim/experiment.py +++ b/smartsim/experiment.py @@ -36,7 +36,7 @@ from smartsim.status import SmartSimStatus from ._core import Controller, Generator, Manifest -from .database import Orchestrator +from .database import FeatureStore from .entity import ( Ensemble, EntitySequence, @@ -85,8 +85,8 @@ class Experiment: The instances created by an Experiment represent executable code that is either user-specified, like the ``Model`` instance created - by ``Experiment.create_model``, or pre-configured, like the ``Orchestrator`` - instance created by ``Experiment.create_database``. + by ``Experiment.create_model``, or pre-configured, like the ``FeatureStore`` + instance created by ``Experiment.create_feature_store``. 
Experiment methods that accept a variable list of arguments, such as ``Experiment.start`` or ``Experiment.stop``, accept any number of the @@ -165,7 +165,7 @@ def __init__( self._control = Controller(launcher=launcher) self._launcher = launcher.lower() - self.db_identifiers: t.Set[str] = set() + self.fs_identifiers: t.Set[str] = set() self._telemetry_cfg = ExperimentTelemetryConfiguration() @_contextualize @@ -178,7 +178,7 @@ def start( ) -> None: """Start passed instances using Experiment launcher - Any instance ``Model``, ``Ensemble`` or ``Orchestrator`` + Any instance ``Model``, ``Ensemble`` or ``FeatureStore`` instance created by the Experiment can be passed as an argument to the start method. @@ -197,17 +197,17 @@ def start( .. highlight:: python .. code-block:: python - exp.start(model_1, model_2, db, ensemble, block=True) + exp.start(model_1, model_2, fs, ensemble, block=True) # alternatively - stage_1 = [model_1, model_2, db, ensemble] + stage_1 = [model_1, model_2, fs, ensemble] exp.start(*stage_1, block=True) If `block==True` the Experiment will poll the launched instances - at runtime until all non-database jobs have completed. Database + at runtime until all non-feature store jobs have completed. Feature store jobs *must* be killed by the user by passing them to ``Experiment.stop``. This allows for multiple stages of a workflow - to produce to and consume from the same Orchestrator database. + to produce to and consume from the same FeatureStore feature store. If `kill_on_interrupt=True`, then all jobs launched by this experiment are guaranteed to be killed when ^C (SIGINT) signal is @@ -215,7 +215,7 @@ def start( that all jobs launched by this experiment will be killed, and the zombie processes will need to be manually killed. 
- :param block: block execution until all non-database + :param block: block execution until all non-feature store jobs are finished :param summary: print a launch summary prior to launch :param kill_on_interrupt: flag for killing jobs when ^C (SIGINT) @@ -243,7 +243,7 @@ def stop( ) -> None: """Stop specific instances launched by this ``Experiment`` - Instances of ``Model``, ``Ensemble`` and ``Orchestrator`` + Instances of ``Model``, ``Ensemble`` and ``FeatureStore`` can all be passed as arguments to the stop method. Whichever launcher was specified at Experiment initialization @@ -258,7 +258,7 @@ def stop( exp.stop(model) # multiple - exp.stop(model_1, model_2, db, ensemble) + exp.stop(model_1, model_2, fs, ensemble) :param args: One or more SmartSimEntity or EntitySequence objects. :raises TypeError: if wrong type @@ -270,9 +270,9 @@ def stop( self._control.stop_entity(entity) for entity_list in stop_manifest.ensembles: self._control.stop_entity_list(entity_list) - dbs = stop_manifest.dbs - for db in dbs: - self._control.stop_db(db) + fss = stop_manifest.fss + for fs in fss: + self._control.stop_fs(fs) except SmartSimError as e: logger.error(e) raise @@ -295,7 +295,7 @@ def generate( directories will be symlinked, copied, or configured and written into the created directory for that instance. - Instances of ``Model``, ``Ensemble`` and ``Orchestrator`` + Instances of ``Model``, ``Ensemble`` and ``FeatureStore`` can all be passed as arguments to the generate method. :param tag: tag used in `to_configure` generator files @@ -358,8 +358,8 @@ def finished(self, entity: SmartSimEntity) -> bool: An instance of ``Model`` or ``Ensemble`` can be passed as an argument. - Passing ``Orchestrator`` will return an error as a - database deployment is never finished until stopped + Passing ``FeatureStore`` will return an error as a + feature store deployment is never finished until stopped by the user. 
:param entity: object launched by this ``Experiment`` @@ -394,7 +394,7 @@ def get_status( .. highlight:: python .. code-block:: python - statuses = exp.get_status(model, ensemble, orchestrator) + statuses = exp.get_status(model, ensemble, featurestore) complete = [s == smartsim.status.STATUS_COMPLETED for s in statuses] assert all(complete) @@ -551,24 +551,24 @@ def create_model( exp.generate(model) New in 0.4.0, ``Model`` instances can be colocated with an - Orchestrator database shard through ``Model.colocate_db``. This - will launch a single ``Orchestrator`` instance on each compute + FeatureStore feature store shard through ``Model.colocate_fs``. This + will launch a single ``FeatureStore`` instance on each compute host used by the (possibly distributed) application. This is useful for performant online inference or processing at runtime. New in 0.4.2, ``Model`` instances can now be colocated with - an Orchestrator database over either TCP or UDS using the - ``Model.colocate_db_tcp`` or ``Model.colocate_db_uds`` method - respectively. The original ``Model.colocate_db`` method is now - deprecated, but remains as an alias for ``Model.colocate_db_tcp`` + an FeatureStore feature store over either TCP or UDS using the + ``Model.colocate_fs_tcp`` or ``Model.colocate_fs_uds`` method + respectively. The original ``Model.colocate_fs`` method is now + deprecated, but remains as an alias for ``Model.colocate_fs_tcp`` for backward compatibility. :param name: name of the ``Model`` :param run_settings: defines how ``Model`` should be run :param params: ``Model`` parameters for writing into configuration files :param path: path to where the ``Model`` should be executed at runtime - :param enable_key_prefixing: If True, data sent to the ``Orchestrator`` + :param enable_key_prefixing: If True, data sent to the ``FeatureStore`` using SmartRedis from this ``Model`` will be prefixed with the ``Model`` name. 
:param batch_settings: Settings to run ``Model`` individually as a batch job. @@ -711,11 +711,11 @@ def create_batch_settings( raise @_contextualize - def create_database( + def create_feature_store( self, port: int = 6379, path: t.Optional[str] = None, - db_nodes: int = 1, + fs_nodes: int = 1, batch: bool = False, hosts: t.Optional[t.Union[t.List[str], str]] = None, run_command: str = "auto", @@ -724,30 +724,30 @@ def create_database( time: t.Optional[str] = None, queue: t.Optional[str] = None, single_cmd: bool = True, - db_identifier: str = "orchestrator", + fs_identifier: str = "featurestore", **kwargs: t.Any, - ) -> Orchestrator: - """Initialize an ``Orchestrator`` database + ) -> FeatureStore: + """Initialize a ``FeatureStore`` feature store - The ``Orchestrator`` database is a key-value store based + The ``FeatureStore`` feature store is a key-value store based on Redis that can be launched together with other ``Experiment`` created instances for online data storage. - When launched, ``Orchestrator`` can be used to communicate + When launched, ``FeatureStore`` can be used to communicate data between Fortran, Python, C, and C++ applications. Machine Learning models in Pytorch, Tensorflow, and ONNX (i.e. scikit-learn) - can also be stored within the ``Orchestrator`` database where they + can also be stored within the ``FeatureStore`` feature store where they can be called remotely and executed on CPU or GPU where - the database is hosted. + the feature store is hosted. - To enable a SmartSim ``Model`` to communicate with the database + To enable a SmartSim ``Model`` to communicate with the feature store the workload must utilize the SmartRedis clients. 
For more - information on the database, and SmartRedis clients see the + information on the feature store, and SmartRedis clients see the documentation at https://www.craylabs.org/docs/smartredis.html :param port: TCP/IP port - :param db_nodes: number of database shards + :param fs_nodes: number of feature store shards :param batch: run as a batch workload :param hosts: specify hosts to launch on :param run_command: specify launch binary or detect automatically @@ -756,21 +756,21 @@ :param time: walltime for batch 'HH:MM:SS' format :param queue: queue to run the batch on :param single_cmd: run all shards with one (MPMD) command - :param db_identifier: an identifier to distinguish this orchestrator in - multiple-database experiments + :param fs_identifier: an identifier to distinguish this feature store in + multiple-feature store experiments :raises SmartSimError: if detection of launcher or of run command fails :raises SmartSimError: if user indicated an incompatible run command for the launcher - :return: Orchestrator or derived class + :return: FeatureStore or derived class """ - self._append_to_db_identifier_list(db_identifier) - check_path = path or osp.join(self.exp_path, db_identifier) + self._append_to_fs_identifier_list(fs_identifier) + check_path = path or osp.join(self.exp_path, fs_identifier) entity_path: str = osp.abspath(check_path) - return Orchestrator( + return FeatureStore( port=port, path=entity_path, - db_nodes=db_nodes, + fs_nodes=fs_nodes, batch=batch, hosts=hosts, run_command=run_command, @@ -780,26 +780,26 @@ queue=queue, single_cmd=single_cmd, launcher=self._launcher, - db_identifier=db_identifier, + fs_identifier=fs_identifier, **kwargs, ) @_contextualize - def reconnect_orchestrator(self, checkpoint: str) -> Orchestrator: - """Reconnect to a running ``Orchestrator`` + def reconnect_feature_store(self, checkpoint: str) -> FeatureStore: + """Reconnect to a running ``FeatureStore`` - This method can 
be used to connect to a ``Orchestrator`` deployment + This method can be used to connect to a ``FeatureStore`` deployment that was launched by a previous ``Experiment``. This can be helpful in the case where separate runs of an ``Experiment`` - wish to use the same ``Orchestrator`` instance currently + wish to use the same ``FeatureStore`` instance currently running on a system. :param checkpoint: the `smartsim_db.dat` file created - when an ``Orchestrator`` is launched + when an ``FeatureStore`` is launched """ try: - orc = self._control.reload_saved_db(checkpoint) - return orc + feature_store = self._control.reload_saved_fs(checkpoint) + return feature_store except SmartSimError as e: logger.error(e) raise @@ -869,27 +869,27 @@ def _launch_summary(self, manifest: Manifest) -> None: if manifest.models: summary += f"Models: {len(manifest.models)}\n" - if self._control.orchestrator_active: - summary += "Database Status: active\n" - elif manifest.dbs: - summary += "Database Status: launching\n" + if self._control.feature_store_active: + summary += "Feature Store Status: active\n" + elif manifest.fss: + summary += "Feature Store Status: launching\n" else: - summary += "Database Status: inactive\n" + summary += "Feature Store Status: inactive\n" summary += f"\n{str(manifest)}" logger.info(summary) def _create_entity_dir(self, start_manifest: Manifest) -> None: - def create_entity_dir(entity: t.Union[Orchestrator, Model, Ensemble]) -> None: + def create_entity_dir(entity: t.Union[FeatureStore, Model, Ensemble]) -> None: if not os.path.isdir(entity.path): os.makedirs(entity.path) for model in start_manifest.models: create_entity_dir(model) - for orch in start_manifest.dbs: - create_entity_dir(orch) + for feature_store in start_manifest.fss: + create_entity_dir(feature_store) for ensemble in start_manifest.ensembles: create_entity_dir(ensemble) @@ -900,13 +900,13 @@ def create_entity_dir(entity: t.Union[Orchestrator, Model, Ensemble]) -> None: def __str__(self) -> str: 
return self.name - def _append_to_db_identifier_list(self, db_identifier: str) -> None: - """Check if db_identifier already exists when calling create_database""" - if db_identifier in self.db_identifiers: + def _append_to_fs_identifier_list(self, fs_identifier: str) -> None: + """Check if fs_identifier already exists when calling create_feature_store""" + if fs_identifier in self.fs_identifiers: logger.warning( - f"A database with the identifier {db_identifier} has already been made " - "An error will be raised if multiple databases are started " + f"A feature store with the identifier {fs_identifier} has already been made " + "An error will be raised if multiple Feature Stores are started " "with the same identifier" ) # Otherwise, add - self.db_identifiers.add(db_identifier) + self.fs_identifiers.add(fs_identifier) diff --git a/smartsim/ml/data.py b/smartsim/ml/data.py index f2c37fdc4..875fe90f0 100644 --- a/smartsim/ml/data.py +++ b/smartsim/ml/data.py @@ -76,12 +76,12 @@ def __init__( self._ds_name = form_name(self.list_name, "info") def publish(self, client: Client) -> None: - """Upload DataInfo information to Orchestrator + """Upload DataInfo information to FeatureStore The information is put on the DB as a DataSet, with strings stored as metastrings and integers stored as metascalars. - :param client: Client to connect to Database + :param client: Client to connect to Feature Store """ info_ds = Dataset(self._ds_name) info_ds.add_meta_string("sample_name", self.sample_name) @@ -92,13 +92,13 @@ def publish(self, client: Client) -> None: client.put_dataset(info_ds) def download(self, client: Client) -> None: - """Download DataInfo information from Orchestrator + """Download DataInfo information from FeatureStore The information retrieved from the DB is used to populate this object's members. If the information is not available on the DB, the object members are not modified. 
- :param client: Client to connect to Database + :param client: Client to connect to Feature Store """ try: info_ds = client.get_dataset(self._ds_name) @@ -133,7 +133,7 @@ class TrainingDataUploader: This class can be used to upload samples following a simple convention for naming. Once created, the function `publish_info` can be used - to put all details about the data set on the Orchestrator. A training + to put all details about the data set on the FeatureStore. A training process can thus access them and get all relevant information to download the batches which are uploaded. @@ -141,11 +141,11 @@ class TrainingDataUploader: and the data will be stored following the naming convention specified by the attributes of this class. - :param list_name: Name of the dataset as stored on the Orchestrator + :param list_name: Name of the dataset as stored on the FeatureStore :param sample_name: Name of samples tensor in uploaded Datasets :param target_name: Name of targets tensor (if needed) in uploaded Datasets :param num_classes: Number of classes of targets, if categorical - :param cluster: Whether the SmartSim Orchestrator is being run as a cluster + :param cluster: Whether the SmartSim FeatureStore is being run as a cluster :param address: Address of Redis DB as : :param rank: Rank of DataUploader in multi-process application (e.g. MPI rank). :param verbose: If output should be logged to screen. 
@@ -260,7 +260,7 @@ class DataDownloader: download, if a string is passed, it is used to download DataInfo data from DB, assuming it was stored with ``list_name=data_info_or_list_name`` :param list_name: Name of aggregation list used to upload data - :param cluster: Whether the Orchestrator will be run as a cluster + :param cluster: Whether the FeatureStore will be run as a cluster :param address: Address of Redis client as : :param replica_rank: When StaticDataDownloader is used distributedly, indicates the rank of this object diff --git a/smartsim/ml/tf/utils.py b/smartsim/ml/tf/utils.py index cf69b65e5..9e16a21dc 100644 --- a/smartsim/ml/tf/utils.py +++ b/smartsim/ml/tf/utils.py @@ -44,7 +44,7 @@ def freeze_model( smartredis.client.set_model_from_file() method. This utiliy function provides everything users need to take - a trained model and put it inside an ``orchestrator`` instance + a trained model and put it inside an ``featurestore`` instance :param model: TensorFlow or Keras model :param output_dir: output dir to save model file to @@ -86,7 +86,7 @@ def serialize_model(model: keras.Model) -> t.Tuple[str, t.List[str], t.List[str] smartredis.client.set_model() method. This utiliy function provides everything users need to take - a trained model and put it inside an ``orchestrator`` instance. + a trained model and put it inside an ``featurestore`` instance. 
:param model: TensorFlow or Keras model :return: serialized model, model input layer names, model output layer names diff --git a/smartsim/settings/alpsSettings.py b/smartsim/settings/alpsSettings.py index 54b9c7525..5de88d739 100644 --- a/smartsim/settings/alpsSettings.py +++ b/smartsim/settings/alpsSettings.py @@ -68,7 +68,7 @@ def make_mpmd(self, settings: RunSettings) -> None: :param settings: ``AprunSettings`` instance """ - if self.colocated_db_settings: + if self.colocated_fs_settings: raise SSUnsupportedError( "Colocated models cannot be run as a mpmd workload" ) diff --git a/smartsim/settings/base.py b/smartsim/settings/base.py index 6373b52fd..353119ce5 100644 --- a/smartsim/settings/base.py +++ b/smartsim/settings/base.py @@ -30,7 +30,7 @@ from smartsim.settings.containers import Container from .._core.utils.helpers import expand_exe_path, fmt_dict, is_valid_cmd -from ..entity.dbobject import DBModel, DBScript +from ..entity.dbobject import FSModel, FSScript from ..log import get_logger logger = get_logger(__name__) @@ -89,7 +89,7 @@ def __init__( self.container = container self._run_command = run_command self.in_batch = False - self.colocated_db_settings: t.Optional[ + self.colocated_fs_settings: t.Optional[ t.Dict[ str, t.Union[ @@ -99,8 +99,8 @@ def __init__( None, t.List[str], t.Iterable[t.Union[int, t.Iterable[int]]], - t.List[DBModel], - t.List[DBScript], + t.List[FSModel], + t.List[FSScript], t.Dict[str, t.Union[int, None]], t.Dict[str, str], ], @@ -579,8 +579,8 @@ def __str__(self) -> str: # pragma: no-cover string += f"\nRun Command: {self.run_command}" if self.run_args: string += f"\nRun Arguments:\n{fmt_dict(self.run_args)}" - if self.colocated_db_settings: - string += "\nCo-located Database: True" + if self.colocated_fs_settings: + string += "\nCo-located Feature Store: True" return string diff --git a/smartsim/settings/lsfSettings.py b/smartsim/settings/lsfSettings.py index bce0581c5..3ab050cc2 100644 --- a/smartsim/settings/lsfSettings.py 
+++ b/smartsim/settings/lsfSettings.py @@ -90,15 +90,15 @@ def set_cpus_per_rs(self, cpus_per_rs: int) -> None: :param cpus_per_rs: number of cpus to use per resource set or ALL_CPUS """ - if self.colocated_db_settings: - db_cpus = int(t.cast(int, self.colocated_db_settings.get("db_cpus", 0))) - if not db_cpus: - raise ValueError("db_cpus must be configured on colocated_db_settings") + if self.colocated_fs_settings: + fs_cpus = int(t.cast(int, self.colocated_fs_settings.get("fs_cpus", 0))) + if not fs_cpus: + raise ValueError("fs_cpus must be configured on colocated_fs_settings") - if cpus_per_rs < db_cpus: + if cpus_per_rs < fs_cpus: raise ValueError( f"Cannot set cpus_per_rs ({cpus_per_rs}) to less than " - + f"db_cpus ({db_cpus})" + + f"fs_cpus ({fs_cpus})" ) if isinstance(cpus_per_rs, str): self.run_args["cpu_per_rs"] = cpus_per_rs @@ -199,7 +199,7 @@ def make_mpmd(self, settings: RunSettings) -> None: :param settings: ``JsrunSettings`` instance """ - if self.colocated_db_settings: + if self.colocated_fs_settings: raise SSUnsupportedError( "Colocated models cannot be run as a mpmd workload" ) @@ -329,25 +329,25 @@ def __str__(self) -> str: string += "\nERF settings: " + pformat(self.erf_sets) return string - def _prep_colocated_db(self, db_cpus: int) -> None: + def _prep_colocated_fs(self, fs_cpus: int) -> None: cpus_per_flag_set = False for cpu_per_rs_flag in ["cpu_per_rs", "c"]: if run_arg_value := self.run_args.get(cpu_per_rs_flag, 0): cpus_per_flag_set = True cpu_per_rs = int(run_arg_value) - if cpu_per_rs < db_cpus: + if cpu_per_rs < fs_cpus: msg = ( f"{cpu_per_rs_flag} flag was set to {cpu_per_rs}, but " - f"colocated DB requires {db_cpus} CPUs per RS. Automatically " - f"setting {cpu_per_rs_flag} flag to {db_cpus}" + f"colocated db requires {fs_cpus} CPUs per RS. 
Automatically " + f"setting {cpu_per_rs_flag} flag to {fs_cpus}" ) logger.info(msg) - self.run_args[cpu_per_rs_flag] = db_cpus + self.run_args[cpu_per_rs_flag] = fs_cpus if not cpus_per_flag_set: - msg = f"Colocated DB requires {db_cpus} CPUs per RS. Automatically setting " - msg += f"--cpus_per_rs=={db_cpus}" + msg = f"Colocated fs requires {fs_cpus} CPUs per RS. Automatically setting " + msg += f"--cpus_per_rs=={fs_cpus}" logger.info(msg) - self.set_cpus_per_rs(db_cpus) + self.set_cpus_per_rs(fs_cpus) rs_per_host_set = False for rs_per_host_flag in ["rs_per_host", "r"]: @@ -357,13 +357,13 @@ def _prep_colocated_db(self, db_cpus: int) -> None: if rs_per_host != 1: msg = f"{rs_per_host_flag} flag was set to {rs_per_host}, " msg += ( - "but colocated DB requires running ONE resource set per host. " + "but colocated fs requires running ONE resource set per host. " ) msg += f"Automatically setting {rs_per_host_flag} flag to 1" logger.info(msg) self.run_args[rs_per_host_flag] = "1" if not rs_per_host_set: - msg = "Colocated DB requires one resource set per host. " + msg = "Colocated fs requires one resource set per host. 
" msg += " Automatically setting --rs_per_host==1" logger.info(msg) self.set_rs_per_host(1) diff --git a/smartsim/settings/mpiSettings.py b/smartsim/settings/mpiSettings.py index c64c66cbf..fd49f4373 100644 --- a/smartsim/settings/mpiSettings.py +++ b/smartsim/settings/mpiSettings.py @@ -97,7 +97,7 @@ def make_mpmd(self, settings: RunSettings) -> None: :param settings: MpirunSettings instance """ - if self.colocated_db_settings: + if self.colocated_fs_settings: raise SSUnsupportedError( "Colocated models cannot be run as a mpmd workload" ) diff --git a/smartsim/settings/pbsSettings.py b/smartsim/settings/pbsSettings.py index 09d48181a..9a54e7933 100644 --- a/smartsim/settings/pbsSettings.py +++ b/smartsim/settings/pbsSettings.py @@ -175,7 +175,7 @@ def set_resource(self, resource_name: str, value: t.Union[str, int]) -> None: :param value: value """ # TODO add error checking here - # TODO include option to overwrite place (warning for orchestrator?) + # TODO include option to overwrite place (warning for featurestore?) 
updated_dict = self.resources updated_dict.update({resource_name: value}) self._sanity_check_resources(updated_dict) diff --git a/smartsim/settings/slurmSettings.py b/smartsim/settings/slurmSettings.py index 6cb13c54a..25e21602a 100644 --- a/smartsim/settings/slurmSettings.py +++ b/smartsim/settings/slurmSettings.py @@ -90,7 +90,7 @@ def make_mpmd(self, settings: RunSettings) -> None: :param settings: SrunSettings instance """ - if self.colocated_db_settings: + if self.colocated_fs_settings: raise SSUnsupportedError( "Colocated models cannot be run as a mpmd workload" ) diff --git a/tests/backends/run_sklearn_onnx.py b/tests/backends/run_sklearn_onnx.py index f10c8c7fb..77683ee90 100644 --- a/tests/backends/run_sklearn_onnx.py +++ b/tests/backends/run_sklearn_onnx.py @@ -75,7 +75,7 @@ def run_model(client, model_name, device, model, model_input, in_name, out_names def run(device): - # connect a client to the database + # connect a client to the feature store client = Client(cluster=False) # linreg test diff --git a/tests/backends/run_torch.py b/tests/backends/run_torch.py index 6e9ba2859..83c8a9a8e 100644 --- a/tests/backends/run_torch.py +++ b/tests/backends/run_torch.py @@ -75,7 +75,7 @@ def calc_svd(input_tensor): def run(device): - # connect a client to the database + # connect a client to the feature store client = Client(cluster=False) # test the SVD function diff --git a/tests/backends/test_cli_mini_exp.py b/tests/backends/test_cli_mini_exp.py index f7563fc96..b1c508747 100644 --- a/tests/backends/test_cli_mini_exp.py +++ b/tests/backends/test_cli_mini_exp.py @@ -48,7 +48,7 @@ def test_cli_mini_exp_doesnt_error_out_with_dev_build( - local_db, + local_fs, test_dir, monkeypatch, ): @@ -58,23 +58,23 @@ def test_cli_mini_exp_doesnt_error_out_with_dev_build( """ @contextmanager - def _mock_make_managed_local_orc(*a, **kw): - (client_addr,) = local_db.get_address() + def _mock_make_managed_local_feature_store(*a, **kw): + (client_addr,) = local_fs.get_address() 
yield smartredis.Client(False, address=client_addr) monkeypatch.setattr( smartsim._core._cli.validate, - "_make_managed_local_orc", - _mock_make_managed_local_orc, + "_make_managed_local_feature_store", + _mock_make_managed_local_feature_store, ) backends = installed_redisai_backends() - (db_port,) = local_db.ports + (fs_port,) = local_fs.ports smartsim._core._cli.validate.test_install( # Shouldn't matter bc we are stubbing creation of orc # but best to give it "correct" vals for safety location=test_dir, - port=db_port, + port=fs_port, # Always test on CPU, heads don't always have GPU device=build.Device.CPU, # Test the backends the dev has installed diff --git a/tests/backends/test_dataloader.py b/tests/backends/test_dataloader.py index e377f5631..098f6ec4e 100644 --- a/tests/backends/test_dataloader.py +++ b/tests/backends/test_dataloader.py @@ -30,7 +30,7 @@ import numpy as np import pytest -from smartsim.database import Orchestrator +from smartsim.database import FeatureStore from smartsim.error.errors import SSInternalError from smartsim.experiment import Experiment from smartsim.log import get_logger @@ -171,12 +171,12 @@ def test_tf_dataloaders(test_dir, wlmutils): exp = Experiment( "test_tf_dataloaders", test_dir, launcher=wlmutils.get_test_launcher() ) - orc: Orchestrator = wlmutils.get_orchestrator() - exp.generate(orc) - exp.start(orc) + feature_store: FeatureStore = wlmutils.get_feature_store() + exp.generate(feature_store) + exp.start(feature_store) try: - os.environ["SSDB"] = orc.get_address()[0] + os.environ["SSDB"] = feature_store.get_address()[0] data_info = run_local_uploaders(mpi_size=2, format="tf") os.environ["SSKEYIN"] = "test_uploader_0,test_uploader_1" @@ -212,7 +212,7 @@ def test_tf_dataloaders(test_dir, wlmutils): except Exception as e: raise e finally: - exp.stop(orc) + exp.stop(feature_store) os.environ.pop("SSDB", "") os.environ.pop("SSKEYIN", "") os.environ.pop("SSKEYOUT", "") @@ -238,13 +238,13 @@ def 
test_torch_dataloaders(fileutils, test_dir, wlmutils): exp = Experiment( "test_tf_dataloaders", test_dir, launcher=wlmutils.get_test_launcher() ) - orc: Orchestrator = wlmutils.get_orchestrator() + feature_store: FeatureStore = wlmutils.get_feature_store() config_dir = fileutils.get_test_dir_path("ml") - exp.generate(orc) - exp.start(orc) + exp.generate(feature_store) + exp.start(feature_store) try: - os.environ["SSDB"] = orc.get_address()[0] + os.environ["SSDB"] = feature_store.get_address()[0] data_info = run_local_uploaders(mpi_size=2) os.environ["SSKEYIN"] = "test_uploader_0,test_uploader_1" @@ -294,7 +294,7 @@ def test_torch_dataloaders(fileutils, test_dir, wlmutils): except Exception as e: raise e finally: - exp.stop(orc) + exp.stop(feature_store) os.environ.pop("SSDB", "") os.environ.pop("SSKEYIN", "") os.environ.pop("SSKEYOUT", "") @@ -337,22 +337,22 @@ def test_wrong_dataloaders(test_dir, wlmutils): exp_path=test_dir, launcher=wlmutils.get_test_launcher(), ) - orc = wlmutils.get_orchestrator() - exp.generate(orc) - exp.start(orc) + feature_store = wlmutils.get_feature_store() + exp.generate(feature_store) + exp.start(feature_store) if shouldrun_tf: with pytest.raises(SSInternalError): _ = TFDataGenerator( data_info_or_list_name="test_data_list", - address=orc.get_address()[0], + address=feature_store.get_address()[0], cluster=False, max_fetch_trials=1, ) with pytest.raises(TypeError): _ = TFStaticDataGenerator( test_data_info_repr=1, - address=orc.get_address()[0], + address=feature_store.get_address()[0], cluster=False, max_fetch_trials=1, ) @@ -361,9 +361,9 @@ def test_wrong_dataloaders(test_dir, wlmutils): with pytest.raises(SSInternalError): torch_data_gen = TorchDataGenerator( data_info_or_list_name="test_data_list", - address=orc.get_address()[0], + address=feature_store.get_address()[0], cluster=False, ) torch_data_gen.init_samples(init_trials=1) - exp.stop(orc) + exp.stop(feature_store) diff --git a/tests/backends/test_dbmodel.py 
b/tests/backends/test_dbmodel.py index eb0198229..6b5831373 100644 --- a/tests/backends/test_dbmodel.py +++ b/tests/backends/test_dbmodel.py @@ -32,7 +32,7 @@ from smartsim import Experiment from smartsim._core.utils import installed_redisai_backends from smartsim.entity import Ensemble -from smartsim.entity.dbobject import DBModel +from smartsim.entity.dbobject import FSModel from smartsim.error.errors import SSUnsupportedError from smartsim.log import get_logger from smartsim.status import SmartSimStatus @@ -146,11 +146,11 @@ def save_torch_cnn(path, file_name): @pytest.mark.skipif(not should_run_tf, reason="Test needs TF to run") -def test_tf_db_model(fileutils, test_dir, wlmutils, mlutils): - """Test TensorFlow DB Models on remote DB""" +def test_tf_fs_model(fileutils, test_dir, wlmutils, mlutils): + """Test TensorFlow fs Models on remote fs""" # Set experiment name - exp_name = "test-tf-db-model" + exp_name = "test-tf-fs-model" # Retrieve parameters from testing environment test_launcher = wlmutils.get_test_launcher() @@ -172,10 +172,10 @@ def test_tf_db_model(fileutils, test_dir, wlmutils, mlutils): # Create Model smartsim_model = exp.create_model("smartsim_model", run_settings) - # Create database + # Create feature store host = wlmutils.choose_host(run_settings) - db = exp.create_database(port=test_port, interface=test_interface, hosts=host) - exp.generate(db) + fs = exp.create_feature_store(port=test_port, interface=test_interface, hosts=host) + exp.generate(fs) # Create and save ML model to filesystem model, inputs, outputs = create_tf_cnn() @@ -206,31 +206,31 @@ def test_tf_db_model(fileutils, test_dir, wlmutils, mlutils): ) logger.debug("The following ML models have been added:") - for db_model in smartsim_model._db_models: - logger.debug(db_model) + for fs_model in smartsim_model._fs_models: + logger.debug(fs_model) # Assert we have added both models - assert len(smartsim_model._db_models) == 2 + assert len(smartsim_model._fs_models) == 2 
exp.generate(smartsim_model) # Launch and check successful completion try: - exp.start(db, smartsim_model, block=True) + exp.start(fs, smartsim_model, block=True) statuses = exp.get_status(smartsim_model) assert all( stat == SmartSimStatus.STATUS_COMPLETED for stat in statuses ), f"Statuses: {statuses}" finally: - exp.stop(db) + exp.stop(fs) @pytest.mark.skipif(not should_run_pt, reason="Test needs PyTorch to run") -def test_pt_db_model(fileutils, test_dir, wlmutils, mlutils): - """Test PyTorch DB Models on remote DB""" +def test_pt_fs_model(fileutils, test_dir, wlmutils, mlutils): + """Test PyTorch fs Models on remote fs""" # Set experiment name - exp_name = "test-pt-db-model" + exp_name = "test-pt-fs-model" # Retrieve parameters from testing environment test_launcher = wlmutils.get_test_launcher() @@ -252,10 +252,10 @@ def test_pt_db_model(fileutils, test_dir, wlmutils, mlutils): # Create Model smartsim_model = exp.create_model("smartsim_model", run_settings) - # Create database + # Create feature store host = wlmutils.choose_host(run_settings) - db = exp.create_database(port=test_port, interface=test_interface, hosts=host) - exp.generate(db) + fs = exp.create_feature_store(port=test_port, interface=test_interface, hosts=host) + exp.generate(fs) # Create and save ML model to filesystem save_torch_cnn(test_dir, "model1.pt") @@ -273,31 +273,31 @@ def test_pt_db_model(fileutils, test_dir, wlmutils, mlutils): ) logger.debug("The following ML models have been added:") - for db_model in smartsim_model._db_models: - logger.debug(db_model) + for fs_model in smartsim_model._fs_models: + logger.debug(fs_model) # Assert we have added both models - assert len(smartsim_model._db_models) == 1 + assert len(smartsim_model._fs_models) == 1 exp.generate(smartsim_model) # Launch and check successful completion try: - exp.start(db, smartsim_model, block=True) + exp.start(fs, smartsim_model, block=True) statuses = exp.get_status(smartsim_model) assert all( stat == 
SmartSimStatus.STATUS_COMPLETED for stat in statuses ), f"Statuses: {statuses}" finally: - exp.stop(db) + exp.stop(fs) @pytest.mark.skipif(not should_run_tf, reason="Test needs TF to run") -def test_db_model_ensemble(fileutils, test_dir, wlmutils, mlutils): - """Test DBModels on remote DB, with an ensemble""" +def test_fs_model_ensemble(fileutils, test_dir, wlmutils, mlutils): + """Test FSModels on remote fs, with an ensemble""" # Set experiment name - exp_name = "test-db-model-ensemble" + exp_name = "test-fs-model-ensemble" # Retrieve parameters from testing environment test_launcher = wlmutils.get_test_launcher() @@ -324,10 +324,10 @@ def test_db_model_ensemble(fileutils, test_dir, wlmutils, mlutils): # Create Model smartsim_model = exp.create_model("smartsim_model", run_settings) - # Create database + # Create feature store host = wlmutils.choose_host(run_settings) - db = exp.create_database(port=test_port, interface=test_interface, hosts=host) - exp.generate(db) + fs = exp.create_feature_store(port=test_port, interface=test_interface, hosts=host) + exp.generate(fs) # Create and save ML model to filesystem model, inputs, outputs = create_tf_cnn() @@ -376,29 +376,29 @@ def test_db_model_ensemble(fileutils, test_dir, wlmutils, mlutils): ) # Assert we have added one model to the ensemble - assert len(smartsim_ensemble._db_models) == 1 + assert len(smartsim_ensemble._fs_models) == 1 # Assert we have added two models to each entity - assert all([len(entity._db_models) == 2 for entity in smartsim_ensemble]) + assert all([len(entity._fs_models) == 2 for entity in smartsim_ensemble]) exp.generate(smartsim_ensemble) # Launch and check successful completion try: - exp.start(db, smartsim_ensemble, block=True) + exp.start(fs, smartsim_ensemble, block=True) statuses = exp.get_status(smartsim_ensemble) assert all( stat == SmartSimStatus.STATUS_COMPLETED for stat in statuses ), f"Statuses: {statuses}" finally: - exp.stop(db) + exp.stop(fs) @pytest.mark.skipif(not 
should_run_tf, reason="Test needs TF to run") def test_colocated_db_model_tf(fileutils, test_dir, wlmutils, mlutils): - """Test DB Models on colocated DB (TensorFlow backend)""" + """Test fs Models on colocated fs (TensorFlow backend)""" # Set experiment name - exp_name = "test-colocated-db-model-tf" + exp_name = "test-colocated-fs-model-tf" # Retrieve parameters from testing environment test_launcher = wlmutils.get_test_launcher() @@ -419,8 +419,8 @@ def test_colocated_db_model_tf(fileutils, test_dir, wlmutils, mlutils): # Create colocated Model colo_model = exp.create_model("colocated_model", colo_settings) - colo_model.colocate_db_tcp( - port=test_port, db_cpus=1, debug=True, ifname=test_interface + colo_model.colocate_fs_tcp( + port=test_port, fs_cpus=1, debug=True, ifname=test_interface ) # Create and save ML model to filesystem @@ -450,7 +450,7 @@ def test_colocated_db_model_tf(fileutils, test_dir, wlmutils, mlutils): ) # Assert we have added both models - assert len(colo_model._db_models) == 2 + assert len(colo_model._fs_models) == 2 exp.generate(colo_model) @@ -466,11 +466,11 @@ def test_colocated_db_model_tf(fileutils, test_dir, wlmutils, mlutils): @pytest.mark.skipif(not should_run_pt, reason="Test needs PyTorch to run") -def test_colocated_db_model_pytorch(fileutils, test_dir, wlmutils, mlutils): - """Test DB Models on colocated DB (PyTorch backend)""" +def test_colocated_fs_model_pytorch(fileutils, test_dir, wlmutils, mlutils): + """Test fs Models on colocated fs (PyTorch backend)""" # Set experiment name - exp_name = "test-colocated-db-model-pytorch" + exp_name = "test-colocated-fs-model-pytorch" # Retrieve parameters from testing environment test_launcher = wlmutils.get_test_launcher() @@ -491,8 +491,8 @@ def test_colocated_db_model_pytorch(fileutils, test_dir, wlmutils, mlutils): # Create colocated SmartSim Model colo_model = exp.create_model("colocated_model", colo_settings) - colo_model.colocate_db_tcp( - port=test_port, db_cpus=1, debug=True, 
ifname=test_interface + colo_model.colocate_fs_tcp( + port=test_port, fs_cpus=1, debug=True, ifname=test_interface ) # Create and save ML model to filesystem @@ -510,7 +510,7 @@ def test_colocated_db_model_pytorch(fileutils, test_dir, wlmutils, mlutils): ) # Assert we have added both models - assert len(colo_model._db_models) == 1 + assert len(colo_model._fs_models) == 1 exp.generate(colo_model) @@ -526,13 +526,13 @@ def test_colocated_db_model_pytorch(fileutils, test_dir, wlmutils, mlutils): @pytest.mark.skipif(not should_run_tf, reason="Test needs TF to run") -def test_colocated_db_model_ensemble(fileutils, test_dir, wlmutils, mlutils): - """Test DBModel on colocated ensembles, first colocating DB, - then adding DBModel. +def test_colocated_fs_model_ensemble(fileutils, test_dir, wlmutils, mlutils): + """Test fsModel on colocated ensembles, first colocating fs, + then adding fsModel. """ # Set experiment name - exp_name = "test-colocated-db-model-ensemble" + exp_name = "test-colocated-fs-model-ensemble" # Retrieve parameters from testing environment test_launcher = wlmutils.get_test_launcher() @@ -556,20 +556,20 @@ def test_colocated_db_model_ensemble(fileutils, test_dir, wlmutils, mlutils): "colocated_ens", run_settings=colo_settings, replicas=2 ) - # Create a third model with a colocated database + # Create a third model with a colocated feature store colo_model = exp.create_model("colocated_model", colo_settings) - colo_model.colocate_db_tcp( - port=test_port, db_cpus=1, debug=True, ifname=test_interface + colo_model.colocate_fs_tcp( + port=test_port, fs_cpus=1, debug=True, ifname=test_interface ) # Create and save the ML models to the filesystem model_file, inputs, outputs = save_tf_cnn(test_dir, "model1.pb") model_file2, inputs2, outputs2 = save_tf_cnn(test_dir, "model2.pb") - # Colocate a database with the ensemble with two ensemble members + # Colocate a feature store with the ensemble with two ensemble members for i, entity in enumerate(colo_ensemble): - 
entity.colocate_db_tcp( - port=test_port + i + 1, db_cpus=1, debug=True, ifname=test_interface + entity.colocate_fs_tcp( + port=test_port + i + 1, fs_cpus=1, debug=True, ifname=test_interface ) # Add ML model to each ensemble member individual to test that they # do not conflict with models add to the Ensemble object @@ -627,13 +627,13 @@ def test_colocated_db_model_ensemble(fileutils, test_dir, wlmutils, mlutils): @pytest.mark.skipif(not should_run_tf, reason="Test needs TF to run") -def test_colocated_db_model_ensemble_reordered(fileutils, test_dir, wlmutils, mlutils): - """Test DBModel on colocated ensembles, first adding the DBModel to the - ensemble, then colocating DB. +def test_colocated_fs_model_ensemble_reordered(fileutils, test_dir, wlmutils, mlutils): + """Test fsModel on colocated ensembles, first adding the fsModel to the + ensemble, then colocating fs. """ # Set experiment name - exp_name = "test-colocated-db-model-ensemble-reordered" + exp_name = "test-colocated-fs-model-ensemble-reordered" # Retrieve parameters from testing environment test_launcher = wlmutils.get_test_launcher() @@ -676,10 +676,10 @@ def test_colocated_db_model_ensemble_reordered(fileutils, test_dir, wlmutils, ml outputs=outputs, ) - # Colocate a database with the first ensemble members + # Colocate a feature store with the first ensemble members for i, entity in enumerate(colo_ensemble): - entity.colocate_db_tcp( - port=test_port + i, db_cpus=1, debug=True, ifname=test_interface + entity.colocate_fs_tcp( + port=test_port + i, fs_cpus=1, debug=True, ifname=test_interface ) # Add ML models to each ensemble member to make sure they # do not conflict with other ML models @@ -698,10 +698,10 @@ def test_colocated_db_model_ensemble_reordered(fileutils, test_dir, wlmutils, ml # Add another ensemble member colo_ensemble.add_model(colo_model) - # Colocate a database with the new ensemble member - colo_model.colocate_db_tcp( + # Colocate a feature store with the new ensemble member + 
colo_model.colocate_fs_tcp( port=test_port + len(colo_ensemble) - 1, - db_cpus=1, + fs_cpus=1, debug=True, ifname=test_interface, ) @@ -731,11 +731,11 @@ def test_colocated_db_model_ensemble_reordered(fileutils, test_dir, wlmutils, ml @pytest.mark.skipif(not should_run_tf, reason="Test needs TF to run") -def test_colocated_db_model_errors(fileutils, test_dir, wlmutils, mlutils): - """Test error when colocated db model has no file.""" +def test_colocated_fs_model_errors(fileutils, test_dir, wlmutils, mlutils): + """Test error when colocated fs model has no file.""" # Set experiment name - exp_name = "test-colocated-db-model-error" + exp_name = "test-colocated-fs-model-error" # Retrieve parameters from testing environment test_launcher = wlmutils.get_test_launcher() @@ -756,8 +756,8 @@ def test_colocated_db_model_errors(fileutils, test_dir, wlmutils, mlutils): # Create colocated SmartSim Model colo_model = exp.create_model("colocated_model", colo_settings) - colo_model.colocate_db_tcp( - port=test_port, db_cpus=1, debug=True, ifname=test_interface + colo_model.colocate_fs_tcp( + port=test_port, fs_cpus=1, debug=True, ifname=test_interface ) # Get and save TF model @@ -782,10 +782,10 @@ def test_colocated_db_model_errors(fileutils, test_dir, wlmutils, mlutils): "colocated_ens", run_settings=colo_settings, replicas=2 ) - # Colocate a db with each ensemble member + # Colocate a fs with each ensemble member for i, entity in enumerate(colo_ensemble): - entity.colocate_db_tcp( - port=test_port + i, db_cpus=1, debug=True, ifname=test_interface + entity.colocate_fs_tcp( + port=test_port + i, fs_cpus=1, debug=True, ifname=test_interface ) # Check that an error is raised because in-memory models @@ -804,11 +804,11 @@ def test_colocated_db_model_errors(fileutils, test_dir, wlmutils, mlutils): # Check error is still thrown if an in-memory model is used # with a colocated deployment. 
This test varies by adding - # the SmartSIm model with a colocated database to the ensemble + # the SmartSim model with a colocated feature store to the ensemble # after the ML model was been added to the ensemble. colo_settings2 = exp.create_run_settings(exe=sys.executable, exe_args=test_script) - # Reverse order of DBModel and model + # Reverse order of fsModel and model colo_ensemble2 = exp.create_ensemble( "colocated_ens", run_settings=colo_settings2, replicas=2 ) @@ -824,9 +824,9 @@ def test_colocated_db_model_errors(fileutils, test_dir, wlmutils, mlutils): ) for i, entity in enumerate(colo_ensemble2): with pytest.raises(SSUnsupportedError): - entity.colocate_db_tcp( + entity.colocate_fs_tcp( port=test_port + i, - db_cpus=1, + fs_cpus=1, debug=True, ifname=test_interface, ) @@ -836,13 +836,13 @@ def test_colocated_db_model_errors(fileutils, test_dir, wlmutils, mlutils): @pytest.mark.skipif(not should_run_tf, reason="Test needs TensorFlow to run") -def test_inconsistent_params_db_model(): - """Test error when devices_per_node parameter>1 when devices is set to CPU in DBModel""" +def test_inconsistent_params_fs_model(): + """Test error when devices_per_node parameter>1 when devices is set to CPU in fsModel""" # Create and save ML model to filesystem model, inputs, outputs = create_tf_cnn() with pytest.raises(SSUnsupportedError) as ex: - DBModel( + FSModel( "cnn", "TF", model=model, @@ -860,11 +860,11 @@ def test_inconsistent_params_db_model(): @pytest.mark.skipif(not should_run_tf, reason="Test needs TF to run") -def test_db_model_ensemble_duplicate(fileutils, test_dir, wlmutils, mlutils): - """Test DBModels on remote DB, with an ensemble""" +def test_fs_model_ensemble_duplicate(fileutils, test_dir, wlmutils, mlutils): + """Test fsModels on remote fs, with an ensemble""" # Set experiment name - exp_name = "test-db-model-ensemble-duplicate" + exp_name = "test-fs-model-ensemble-duplicate" # Retrieve parameters from testing environment test_launcher = 
wlmutils.get_test_launcher() diff --git a/tests/backends/test_dbscript.py b/tests/backends/test_dbscript.py index 9d0b04c8e..b567800f7 100644 --- a/tests/backends/test_dbscript.py +++ b/tests/backends/test_dbscript.py @@ -32,7 +32,7 @@ from smartsim import Experiment from smartsim._core.utils import installed_redisai_backends -from smartsim.entity.dbobject import DBScript +from smartsim.entity.dbobject import FSScript from smartsim.error.errors import SSUnsupportedError from smartsim.log import get_logger from smartsim.settings import MpiexecSettings, MpirunSettings @@ -42,7 +42,7 @@ should_run = True -supported_dbs = ["uds", "tcp"] +supported_fss = ["uds", "tcp"] try: import torch @@ -57,11 +57,11 @@ def timestwo(x): @pytest.mark.skipif(not should_run, reason="Test needs Torch to run") -def test_db_script(fileutils, test_dir, wlmutils, mlutils): - """Test DB scripts on remote DB""" +def test_fs_script(fileutils, test_dir, wlmutils, mlutils): + """Test fs scripts on remote fs""" # Set experiment name - exp_name = "test-db-script" + exp_name = "test-fs-script" # Retrieve parameters from testing environment test_launcher = wlmutils.get_test_launcher() @@ -84,10 +84,10 @@ def test_db_script(fileutils, test_dir, wlmutils, mlutils): # Create the SmartSim Model smartsim_model = exp.create_model("smartsim_model", run_settings) - # Create the SmartSim database + # Create the SmartSim feature store host = wlmutils.choose_host(run_settings) - db = exp.create_database(port=test_port, interface=test_interface, hosts=host) - exp.generate(db, smartsim_model) + fs = exp.create_feature_store(port=test_port, interface=test_interface, hosts=host) + exp.generate(fs, smartsim_model) # Define the torch script string torch_script_str = "def negate(x):\n\treturn torch.neg(x)\n" @@ -120,23 +120,23 @@ def test_db_script(fileutils, test_dir, wlmutils, mlutils): ) # Assert we have all three scripts - assert len(smartsim_model._db_scripts) == 3 + assert len(smartsim_model._fs_scripts) == 3 # 
Launch and check successful completion try: - exp.start(db, smartsim_model, block=True) + exp.start(fs, smartsim_model, block=True) statuses = exp.get_status(smartsim_model) assert all([stat == SmartSimStatus.STATUS_COMPLETED for stat in statuses]) finally: - exp.stop(db) + exp.stop(fs) @pytest.mark.skipif(not should_run, reason="Test needs Torch to run") -def test_db_script_ensemble(fileutils, test_dir, wlmutils, mlutils): - """Test DB scripts on remote DB""" +def test_fs_script_ensemble(fileutils, test_dir, wlmutils, mlutils): + """Test fs scripts on remote fs""" # Set experiment name - exp_name = "test-db-script" + exp_name = "test-fs-script" # Retrieve parameters from testing environment test_launcher = wlmutils.get_test_launcher() @@ -158,16 +158,16 @@ def test_db_script_ensemble(fileutils, test_dir, wlmutils, mlutils): # Create Ensemble with two identical models ensemble = exp.create_ensemble( - "dbscript_ensemble", run_settings=run_settings, replicas=2 + "fsscript_ensemble", run_settings=run_settings, replicas=2 ) # Create SmartSim model smartsim_model = exp.create_model("smartsim_model", run_settings) - # Create SmartSim database + # Create SmartSim feature store host = wlmutils.choose_host(run_settings) - db = exp.create_database(port=test_port, interface=test_interface, hosts=host) - exp.generate(db) + fs = exp.create_feature_store(port=test_port, interface=test_interface, hosts=host) + exp.generate(fs) # Create the script string torch_script_str = "def negate(x):\n\treturn torch.neg(x)\n" @@ -212,27 +212,27 @@ def test_db_script_ensemble(fileutils, test_dir, wlmutils, mlutils): ) # Assert we have added both models to the ensemble - assert len(ensemble._db_scripts) == 2 + assert len(ensemble._fs_scripts) == 2 # Assert we have added all three models to entities in ensemble - assert all([len(entity._db_scripts) == 3 for entity in ensemble]) + assert all([len(entity._fs_scripts) == 3 for entity in ensemble]) exp.generate(ensemble) try: - exp.start(db, 
ensemble, block=True) + exp.start(fs, ensemble, block=True) statuses = exp.get_status(ensemble) assert all([stat == SmartSimStatus.STATUS_COMPLETED for stat in statuses]) finally: - exp.stop(db) + exp.stop(fs) @pytest.mark.skipif(not should_run, reason="Test needs Torch to run") -def test_colocated_db_script(fileutils, test_dir, wlmutils, mlutils): - """Test DB Scripts on colocated DB""" +def test_colocated_fs_script(fileutils, test_dir, wlmutils, mlutils): + """Test fs Scripts on colocated fs""" # Set the experiment name - exp_name = "test-colocated-db-script" + exp_name = "test-colocated-fs-script" # Retrieve parameters from testing environment test_launcher = wlmutils.get_test_launcher() @@ -252,10 +252,10 @@ def test_colocated_db_script(fileutils, test_dir, wlmutils, mlutils): colo_settings.set_nodes(1) colo_settings.set_tasks(1) - # Create model with colocated database + # Create model with colocated feature store colo_model = exp.create_model("colocated_model", colo_settings) - colo_model.colocate_db_tcp( - port=test_port, db_cpus=1, debug=True, ifname=test_interface + colo_model.colocate_fs_tcp( + port=test_port, fs_cpus=1, debug=True, ifname=test_interface ) # Create string for script creation @@ -279,12 +279,12 @@ def test_colocated_db_script(fileutils, test_dir, wlmutils, mlutils): ) # Assert we have added both models - assert len(colo_model._db_scripts) == 2 + assert len(colo_model._fs_scripts) == 2 exp.generate(colo_model) - for db_script in colo_model._db_scripts: - logger.debug(db_script) + for fs_script in colo_model._fs_scripts: + logger.debug(fs_script) try: exp.start(colo_model, block=True) @@ -295,13 +295,13 @@ def test_colocated_db_script(fileutils, test_dir, wlmutils, mlutils): @pytest.mark.skipif(not should_run, reason="Test needs Torch to run") -def test_colocated_db_script_ensemble(fileutils, test_dir, wlmutils, mlutils): - """Test DB Scripts on colocated DB from ensemble, first colocating DB, +def 
test_colocated_fs_script_ensemble(fileutils, test_dir, wlmutils, mlutils): + """Test fs Scripts on colocated fs from ensemble, first colocating fs, then adding script. """ # Set experiment name - exp_name = "test-colocated-db-script" + exp_name = "test-colocated-fs-script" # Retrieve parameters from testing environment test_launcher = wlmutils.get_test_launcher() @@ -329,13 +329,13 @@ def test_colocated_db_script_ensemble(fileutils, test_dir, wlmutils, mlutils): # Create a SmartSim model colo_model = exp.create_model("colocated_model", colo_settings) - # Colocate a db with each ensemble entity and add a script + # Colocate a fs with each ensemble entity and add a script # to each entity via file for i, entity in enumerate(colo_ensemble): entity.disable_key_prefixing() - entity.colocate_db_tcp( + entity.colocate_fs_tcp( port=test_port + i, - db_cpus=1, + fs_cpus=1, debug=True, ifname=test_interface, ) @@ -348,10 +348,10 @@ def test_colocated_db_script_ensemble(fileutils, test_dir, wlmutils, mlutils): first_device=0, ) - # Colocate a db with the non-ensemble Model - colo_model.colocate_db_tcp( + # Colocate a fs with the non-ensemble Model + colo_model.colocate_fs_tcp( port=test_port + len(colo_ensemble), - db_cpus=1, + fs_cpus=1, debug=True, ifname=test_interface, ) @@ -379,9 +379,9 @@ def test_colocated_db_script_ensemble(fileutils, test_dir, wlmutils, mlutils): ) # Assert we have added one model to the ensemble - assert len(colo_ensemble._db_scripts) == 1 + assert len(colo_ensemble._fs_scripts) == 1 # Assert we have added both models to each entity - assert all([len(entity._db_scripts) == 2 for entity in colo_ensemble]) + assert all([len(entity._fs_scripts) == 2 for entity in colo_ensemble]) exp.generate(colo_ensemble) @@ -395,12 +395,12 @@ def test_colocated_db_script_ensemble(fileutils, test_dir, wlmutils, mlutils): @pytest.mark.skipif(not should_run, reason="Test needs Torch to run") -def test_colocated_db_script_ensemble_reordered(fileutils, test_dir, wlmutils, 
mlutils): - """Test DB Scripts on colocated DB from ensemble, first adding the - script to the ensemble, then colocating the DB""" +def test_colocated_fs_script_ensemble_reordered(fileutils, test_dir, wlmutils, mlutils): + """Test fs Scripts on colocated fs from ensemble, first adding the + script to the ensemble, then colocating the fs""" # Set Experiment name - exp_name = "test-colocated-db-script-reord" + exp_name = "test-colocated-fs-script-reord" # Retrieve parameters from testing environment test_launcher = wlmutils.get_test_launcher() @@ -438,13 +438,13 @@ def test_colocated_db_script_ensemble_reordered(fileutils, test_dir, wlmutils, m first_device=0, ) - # Add a colocated database to the ensemble members + # Add a colocated feature store to the ensemble members # and then add a script via file for i, entity in enumerate(colo_ensemble): entity.disable_key_prefixing() - entity.colocate_db_tcp( + entity.colocate_fs_tcp( port=test_port + i, - db_cpus=1, + fs_cpus=1, debug=True, ifname=test_interface, ) @@ -457,10 +457,10 @@ def test_colocated_db_script_ensemble_reordered(fileutils, test_dir, wlmutils, m first_device=0, ) - # Add a colocated database to the non-ensemble SmartSim Model - colo_model.colocate_db_tcp( + # Add a colocated feature store to the non-ensemble SmartSim Model + colo_model.colocate_fs_tcp( port=test_port + len(colo_ensemble), - db_cpus=1, + fs_cpus=1, debug=True, ifname=test_interface, ) @@ -477,9 +477,9 @@ def test_colocated_db_script_ensemble_reordered(fileutils, test_dir, wlmutils, m ) # Assert we have added one model to the ensemble - assert len(colo_ensemble._db_scripts) == 1 + assert len(colo_ensemble._fs_scripts) == 1 # Assert we have added both models to each entity - assert all([len(entity._db_scripts) == 2 for entity in colo_ensemble]) + assert all([len(entity._fs_scripts) == 2 for entity in colo_ensemble]) exp.generate(colo_ensemble) @@ -493,11 +493,11 @@ def test_colocated_db_script_ensemble_reordered(fileutils, test_dir, 
wlmutils, m @pytest.mark.skipif(not should_run, reason="Test needs Torch to run") -def test_db_script_errors(fileutils, test_dir, wlmutils, mlutils): - """Test DB Scripts error when setting a serialized function on colocated DB""" +def test_fs_script_errors(fileutils, test_dir, wlmutils, mlutils): + """Test fs Scripts error when setting a serialized function on colocated fs""" # Set Experiment name - exp_name = "test-colocated-db-script" + exp_name = "test-colocated-fs-script" # Retrieve parameters from testing environment test_launcher = wlmutils.get_test_launcher() @@ -516,11 +516,11 @@ def test_db_script_errors(fileutils, test_dir, wlmutils, mlutils): colo_settings.set_nodes(1) colo_settings.set_tasks(1) - # Create a SmartSim model with a colocated database + # Create a SmartSim model with a colocated feature store colo_model = exp.create_model("colocated_model", colo_settings) - colo_model.colocate_db_tcp( + colo_model.colocate_fs_tcp( port=test_port, - db_cpus=1, + fs_cpus=1, debug=True, ifname=test_interface, ) @@ -542,17 +542,17 @@ def test_db_script_errors(fileutils, test_dir, wlmutils, mlutils): "colocated_ensemble", run_settings=colo_settings, replicas=2 ) - # Add a colocated database for each ensemble member + # Add a colocated feature store for each ensemble member for i, entity in enumerate(colo_ensemble): - entity.colocate_db_tcp( + entity.colocate_fs_tcp( port=test_port + i, - db_cpus=1, + fs_cpus=1, debug=True, ifname=test_interface, ) # Check that an exception is raised when adding an in-memory - # function to the ensemble with colocated databases + # function to the ensemble with colocated feature stores with pytest.raises(SSUnsupportedError): colo_ensemble.add_function( "test_func", @@ -578,31 +578,31 @@ def test_db_script_errors(fileutils, test_dir, wlmutils, mlutils): ) # Check that an error is raised when trying to add - # a colocated database to ensemble members that have + # a colocated feature store to ensemble members that have # an 
in-memory script for i, entity in enumerate(colo_ensemble): with pytest.raises(SSUnsupportedError): - entity.colocate_db_tcp( + entity.colocate_fs_tcp( port=test_port + i, - db_cpus=1, + fs_cpus=1, debug=True, ifname=test_interface, ) # Check that an error is raised when trying to add - # a colocated database to an Ensemble that has + # a colocated feature store to an Ensemble that has # an in-memory script with pytest.raises(SSUnsupportedError): colo_ensemble.add_model(colo_model) -def test_inconsistent_params_db_script(fileutils): - """Test error when devices_per_node>1 and when devices is set to CPU in DBScript constructor""" +def test_inconsistent_params_fs_script(fileutils): + """Test error when devices_per_node>1 and when devices is set to CPU in FSScript constructor""" torch_script = fileutils.get_test_conf_path("torchscript.py") with pytest.raises(SSUnsupportedError) as ex: - _ = DBScript( - name="test_script_db", + _ = FSScript( + name="test_script_fs", script_path=torch_script, device="CPU", devices_per_node=2, @@ -613,8 +613,8 @@ def test_inconsistent_params_db_script(fileutils): == "Cannot set devices_per_node>1 if CPU is specified under devices" ) with pytest.raises(SSUnsupportedError) as ex: - _ = DBScript( - name="test_script_db", + _ = FSScript( + name="test_script_fs", script_path=torch_script, device="CPU", devices_per_node=1, @@ -627,11 +627,11 @@ def test_inconsistent_params_db_script(fileutils): @pytest.mark.skipif(not should_run, reason="Test needs Torch to run") -def test_db_script_ensemble_duplicate(fileutils, test_dir, wlmutils, mlutils): - """Test DB scripts on remote DB""" +def test_fs_script_ensemble_duplicate(fileutils, test_dir, wlmutils, mlutils): + """Test fs scripts on remote fs""" # Set experiment name - exp_name = "test-db-script" + exp_name = "test-fs-script" # Retrieve parameters from testing environment test_launcher = wlmutils.get_test_launcher() @@ -653,7 +653,7 @@ def test_db_script_ensemble_duplicate(fileutils, test_dir, 
wlmutils, mlutils): # Create Ensemble with two identical models ensemble = exp.create_ensemble( - "dbscript_ensemble", run_settings=run_settings, replicas=2 + "fsscript_ensemble", run_settings=run_settings, replicas=2 ) # Create SmartSim model diff --git a/tests/backends/test_onnx.py b/tests/backends/test_onnx.py index 871c3f059..f642d09dc 100644 --- a/tests/backends/test_onnx.py +++ b/tests/backends/test_onnx.py @@ -80,7 +80,7 @@ def test_sklearn_onnx(test_dir, mlutils, wlmutils): exp = Experiment(exp_name, exp_path=test_dir, launcher=wlmutils.get_test_launcher()) test_device = mlutils.get_test_device() - db = wlmutils.get_orchestrator(nodes=1) + db = wlmutils.get_feature_store(nodes=1) db.set_path(test_dir) exp.start(db) diff --git a/tests/backends/test_tf.py b/tests/backends/test_tf.py index 92cd01695..e16800c2a 100644 --- a/tests/backends/test_tf.py +++ b/tests/backends/test_tf.py @@ -65,7 +65,7 @@ def test_keras_model(test_dir, mlutils, wlmutils): exp = Experiment(exp_name, exp_path=test_dir, launcher=wlmutils.get_test_launcher()) test_device = mlutils.get_test_device() - db = wlmutils.get_orchestrator(nodes=1) + db = wlmutils.get_feature_store(nodes=1) db.set_path(test_dir) exp.start(db) diff --git a/tests/backends/test_torch.py b/tests/backends/test_torch.py index a36037de4..94fc8793e 100644 --- a/tests/backends/test_torch.py +++ b/tests/backends/test_torch.py @@ -65,7 +65,7 @@ def test_torch_model_and_script(test_dir, mlutils, wlmutils): exp = Experiment(exp_name, exp_path=test_dir, launcher=wlmutils.get_test_launcher()) test_device = mlutils.get_test_device() - db = wlmutils.get_orchestrator(nodes=1) + db = wlmutils.get_feature_store(nodes=1) db.set_path(test_dir) exp.start(db) diff --git a/tests/full_wlm/test_generic_orc_launch_batch.py b/tests/full_wlm/test_generic_orc_launch_batch.py index b3a0ba57b..20e7261c7 100644 --- a/tests/full_wlm/test_generic_orc_launch_batch.py +++ b/tests/full_wlm/test_generic_orc_launch_batch.py @@ -39,176 +39,176 @@ if 
(pytest.test_launcher == "pbs") and (not pytest.has_aprun): pytestmark = pytest.mark.skip( - reason="Launching orchestrators in a batch job is not supported on PBS without ALPS" + reason="Launching feature stores in a batch job is not supported on PBS without ALPS" ) -def test_launch_orc_auto_batch(test_dir, wlmutils): - """test single node orchestrator""" +def test_launch_feature_store_auto_batch(test_dir, wlmutils): + """test single node feature store""" launcher = wlmutils.get_test_launcher() - exp_name = "test-launch-auto-orc-batch" + exp_name = "test-launch-auto-feature-store-batch" exp = Experiment(exp_name, launcher=launcher, exp_path=test_dir) # batch = False to launch on existing allocation network_interface = wlmutils.get_test_interface() - orc = exp.create_database( + feature_store = exp.create_feature_store( wlmutils.get_test_port(), batch=True, interface=network_interface, single_cmd=False, ) - orc.batch_settings.set_account(wlmutils.get_test_account()) + feature_store.batch_settings.set_account(wlmutils.get_test_account()) - orc.batch_settings.set_walltime("00:02:00") + feature_store.batch_settings.set_walltime("00:02:00") - exp.start(orc, block=True) - statuses = exp.get_status(orc) + exp.start(feature_store, block=True) + statuses = exp.get_status(feature_store) # don't use assert so that we don't leave an orphan process if SmartSimStatus.STATUS_FAILED in statuses: - exp.stop(orc) + exp.stop(feature_store) assert False - exp.stop(orc) - statuses = exp.get_status(orc) + exp.stop(feature_store) + statuses = exp.get_status(feature_store) assert all([stat == SmartSimStatus.STATUS_CANCELLED for stat in statuses]) -def test_launch_cluster_orc_batch_single(test_dir, wlmutils): - """test clustered 3-node orchestrator with single command""" +def test_launch_cluster_feature_store_batch_single(test_dir, wlmutils): + """test clustered 3-node feature store with single command""" # TODO detect number of nodes in allocation and skip if not sufficent launcher = 
wlmutils.get_test_launcher() - exp_name = "test-launch-auto-cluster-orc-batch-single" + exp_name = "test-launch-auto-cluster-feature-store-batch-single" exp = Experiment(exp_name, launcher=launcher, exp_path=test_dir) # batch = False to launch on existing allocation network_interface = wlmutils.get_test_interface() - orc = exp.create_database( + feature_store = exp.create_feature_store( wlmutils.get_test_port(), - db_nodes=3, + fs_nodes=3, batch=True, interface=network_interface, single_cmd=True, ) - orc.batch_settings.set_account(wlmutils.get_test_account()) + feature_store.batch_settings.set_account(wlmutils.get_test_account()) - orc.batch_settings.set_walltime("00:02:00") + feature_store.batch_settings.set_walltime("00:02:00") - exp.start(orc, block=True) - statuses = exp.get_status(orc) + exp.start(feature_store, block=True) + statuses = exp.get_status(feature_store) - # don't use assert so that orc we don't leave an orphan process + # don't use assert so that feature_store we don't leave an orphan process if SmartSimStatus.STATUS_FAILED in statuses: - exp.stop(orc) + exp.stop(feature_store) assert False - exp.stop(orc) - statuses = exp.get_status(orc) + exp.stop(feature_store) + statuses = exp.get_status(feature_store) assert all([stat == SmartSimStatus.STATUS_CANCELLED for stat in statuses]) -def test_launch_cluster_orc_batch_multi(test_dir, wlmutils): - """test clustered 3-node orchestrator""" +def test_launch_cluster_feature_store_batch_multi(test_dir, wlmutils): + """test clustered 3-node feature store""" # TODO detect number of nodes in allocation and skip if not sufficent launcher = wlmutils.get_test_launcher() - exp_name = "test-launch-auto-cluster-orc-batch-multi" + exp_name = "test-launch-auto-cluster-feature-store-batch-multi" exp = Experiment(exp_name, launcher=launcher, exp_path=test_dir) # batch = False to launch on existing allocation network_interface = wlmutils.get_test_interface() - orc = exp.create_database( + feature_store = 
exp.create_feature_store( wlmutils.get_test_port(), - db_nodes=3, + fs_nodes=3, batch=True, interface=network_interface, single_cmd=False, ) - orc.batch_settings.set_account(wlmutils.get_test_account()) + feature_store.batch_settings.set_account(wlmutils.get_test_account()) - orc.batch_settings.set_walltime("00:03:00") + feature_store.batch_settings.set_walltime("00:03:00") - exp.start(orc, block=True) - statuses = exp.get_status(orc) + exp.start(feature_store, block=True) + statuses = exp.get_status(feature_store) - # don't use assert so that orc we don't leave an orphan process + # don't use assert so that feature_store we don't leave an orphan process if SmartSimStatus.STATUS_FAILED in statuses: - exp.stop(orc) + exp.stop(feature_store) assert False - exp.stop(orc) - statuses = exp.get_status(orc) + exp.stop(feature_store) + statuses = exp.get_status(feature_store) assert all([stat == SmartSimStatus.STATUS_CANCELLED for stat in statuses]) -def test_launch_cluster_orc_reconnect(test_dir, wlmutils): - """test reconnecting to clustered 3-node orchestrator""" +def test_launch_cluster_feature_store_reconnect(test_dir, wlmutils): + """test reconnecting to clustered 3-node feature store""" p_test_dir = pathlib.Path(test_dir) launcher = wlmutils.get_test_launcher() - exp_name = "test-launch-cluster-orc-batch-reconect" + exp_name = "test-launch-cluster-feature-store-batch-reconect" exp_1_dir = p_test_dir / exp_name exp_1_dir.mkdir() exp = Experiment(exp_name, launcher=launcher, exp_path=str(exp_1_dir)) # batch = False to launch on existing allocation network_interface = wlmutils.get_test_interface() - orc = exp.create_database( - wlmutils.get_test_port(), db_nodes=3, batch=True, interface=network_interface + feature_store = exp.create_feature_store( + wlmutils.get_test_port(), fs_nodes=3, batch=True, interface=network_interface ) - orc.batch_settings.set_account(wlmutils.get_test_account()) + feature_store.batch_settings.set_account(wlmutils.get_test_account()) - 
orc.batch_settings.set_walltime("00:03:00") + feature_store.batch_settings.set_walltime("00:03:00") - exp.start(orc, block=True) + exp.start(feature_store, block=True) - statuses = exp.get_status(orc) + statuses = exp.get_status(feature_store) try: assert all(stat == SmartSimStatus.STATUS_RUNNING for stat in statuses) except Exception: - exp.stop(orc) + exp.stop(feature_store) raise - exp_name = "test-orc-cluster-orc-batch-reconnect-2nd" + exp_name = "test-feature_store-cluster-feature-store-batch-reconnect-2nd" exp_2_dir = p_test_dir / exp_name exp_2_dir.mkdir() exp_2 = Experiment(exp_name, launcher=launcher, exp_path=str(exp_2_dir)) try: - checkpoint = osp.join(orc.path, "smartsim_db.dat") - reloaded_orc = exp_2.reconnect_orchestrator(checkpoint) + checkpoint = osp.join(feature_store.path, "smartsim_db.dat") + reloaded_feature_store = exp_2.reconnect_feature_store(checkpoint) # let statuses update once time.sleep(5) - statuses = exp_2.get_status(reloaded_orc) + statuses = exp_2.get_status(reloaded_feature_store) assert all(stat == SmartSimStatus.STATUS_RUNNING for stat in statuses) except Exception: # Something went wrong! Let the experiment that started the DB # clean up the DB - exp.stop(orc) + exp.stop(feature_store) raise try: # Test experiment 2 can stop the DB - exp_2.stop(reloaded_orc) + exp_2.stop(reloaded_feature_store) assert all( stat == SmartSimStatus.STATUS_CANCELLED - for stat in exp_2.get_status(reloaded_orc) + for stat in exp_2.get_status(reloaded_feature_store) ) except Exception: # Something went wrong! 
Let the experiment that started the DB # clean up the DB - exp.stop(orc) + exp.stop(feature_store) raise else: # Ensure it is the same DB that Experiment 1 was tracking time.sleep(5) assert not any( - stat == SmartSimStatus.STATUS_RUNNING for stat in exp.get_status(orc) + stat == SmartSimStatus.STATUS_RUNNING for stat in exp.get_status(feature_store) ) diff --git a/tests/on_wlm/test_colocated_model.py b/tests/on_wlm/test_colocated_model.py index 97a47542d..f3892e324 100644 --- a/tests/on_wlm/test_colocated_model.py +++ b/tests/on_wlm/test_colocated_model.py @@ -33,12 +33,12 @@ from smartsim.status import SmartSimStatus if sys.platform == "darwin": - supported_dbs = ["tcp", "deprecated"] + supported_fss = ["tcp", "deprecated"] else: - supported_dbs = ["uds", "tcp", "deprecated"] + supported_fss = ["uds", "tcp", "deprecated"] -# Set to true if DB logs should be generated for debugging -DEBUG_DB = False +# Set to true if fs logs should be generated for debugging +DEBUG_fs = False # retrieved from pytest fixtures launcher = pytest.test_launcher @@ -46,18 +46,18 @@ pytestmark = pytest.mark.skip(reason="Not testing WLM integrations") -@pytest.mark.parametrize("db_type", supported_dbs) -def test_launch_colocated_model_defaults(fileutils, test_dir, coloutils, db_type): - """Test the launch of a model with a colocated database and local launcher""" +@pytest.mark.parametrize("fs_type", supported_fss) +def test_launch_colocated_model_defaults(fileutils, test_dir, coloutils, fs_type): + """Test the launch of a model with a colocated feature store and local launcher""" - db_args = {"debug": DEBUG_DB} + fs_args = {"debug": DEBUG_fs} exp = Experiment("colocated_model_defaults", launcher=launcher, exp_path=test_dir) colo_model = coloutils.setup_test_colo( - fileutils, db_type, exp, "send_data_local_smartredis.py", db_args, on_wlm=True + fileutils, fs_type, exp, "send_data_local_smartredis.py", fs_args, on_wlm=True ) exp.generate(colo_model) - assert 
colo_model.run_settings.colocated_db_settings["custom_pinning"] == "0" + assert colo_model.run_settings.colocated_fs_settings["custom_pinning"] == "0" exp.start(colo_model, block=True) statuses = exp.get_status(colo_model) assert all( @@ -72,22 +72,22 @@ def test_launch_colocated_model_defaults(fileutils, test_dir, coloutils, db_type ), f"Statuses: {statuses}" -@pytest.mark.parametrize("db_type", supported_dbs) -def test_colocated_model_disable_pinning(fileutils, test_dir, coloutils, db_type): +@pytest.mark.parametrize("fs_type", supported_fss) +def test_colocated_model_disable_pinning(fileutils, test_dir, coloutils, fs_type): exp = Experiment( "colocated_model_pinning_auto_1cpu", launcher=launcher, exp_path=test_dir ) - db_args = { - "db_cpus": 1, + fs_args = { + "fs_cpus": 1, "custom_pinning": [], - "debug": DEBUG_DB, + "debug": DEBUG_fs, } # Check to make sure that the CPU mask was correctly generated colo_model = coloutils.setup_test_colo( - fileutils, db_type, exp, "send_data_local_smartredis.py", db_args, on_wlm=True + fileutils, fs_type, exp, "send_data_local_smartredis.py", fs_args, on_wlm=True ) - assert colo_model.run_settings.colocated_db_settings["custom_pinning"] is None + assert colo_model.run_settings.colocated_fs_settings["custom_pinning"] is None exp.generate(colo_model) exp.start(colo_model, block=True) statuses = exp.get_status(colo_model) @@ -96,21 +96,21 @@ def test_colocated_model_disable_pinning(fileutils, test_dir, coloutils, db_type ), f"Statuses: {statuses}" -@pytest.mark.parametrize("db_type", supported_dbs) -def test_colocated_model_pinning_auto_2cpu(fileutils, test_dir, coloutils, db_type): +@pytest.mark.parametrize("fs_type", supported_fss) +def test_colocated_model_pinning_auto_2cpu(fileutils, test_dir, coloutils, fs_type): exp = Experiment( "colocated_model_pinning_auto_2cpu", launcher=launcher, exp_path=test_dir, ) - db_args = {"db_cpus": 2, "debug": DEBUG_DB} + fs_args = {"fs_cpus": 2, "debug": DEBUG_fs} # Check to make sure that 
the CPU mask was correctly generated colo_model = coloutils.setup_test_colo( - fileutils, db_type, exp, "send_data_local_smartredis.py", db_args, on_wlm=True + fileutils, fs_type, exp, "send_data_local_smartredis.py", fs_args, on_wlm=True ) - assert colo_model.run_settings.colocated_db_settings["custom_pinning"] == "0,1" + assert colo_model.run_settings.colocated_fs_settings["custom_pinning"] == "0,1" exp.generate(colo_model) exp.start(colo_model, block=True) statuses = exp.get_status(colo_model) @@ -119,8 +119,8 @@ def test_colocated_model_pinning_auto_2cpu(fileutils, test_dir, coloutils, db_ty ), f"Statuses: {statuses}" -@pytest.mark.parametrize("db_type", supported_dbs) -def test_colocated_model_pinning_range(fileutils, test_dir, coloutils, db_type): +@pytest.mark.parametrize("fs_type", supported_fss) +def test_colocated_model_pinning_range(fileutils, test_dir, coloutils, fs_type): # Check to make sure that the CPU mask was correctly generated # Assume that there are at least 4 cpus on the node @@ -130,12 +130,12 @@ def test_colocated_model_pinning_range(fileutils, test_dir, coloutils, db_type): exp_path=test_dir, ) - db_args = {"db_cpus": 4, "custom_pinning": range(4), "debug": DEBUG_DB} + fs_args = {"fs_cpus": 4, "custom_pinning": range(4), "debug": DEBUG_fs} colo_model = coloutils.setup_test_colo( - fileutils, db_type, exp, "send_data_local_smartredis.py", db_args, on_wlm=True + fileutils, fs_type, exp, "send_data_local_smartredis.py", fs_args, on_wlm=True ) - assert colo_model.run_settings.colocated_db_settings["custom_pinning"] == "0,1,2,3" + assert colo_model.run_settings.colocated_fs_settings["custom_pinning"] == "0,1,2,3" exp.generate(colo_model) exp.start(colo_model, block=True) statuses = exp.get_status(colo_model) @@ -144,8 +144,8 @@ def test_colocated_model_pinning_range(fileutils, test_dir, coloutils, db_type): ), f"Statuses: {statuses}" -@pytest.mark.parametrize("db_type", supported_dbs) -def test_colocated_model_pinning_list(fileutils, test_dir, 
coloutils, db_type): +@pytest.mark.parametrize("fs_type", supported_fss) +def test_colocated_model_pinning_list(fileutils, test_dir, coloutils, fs_type): # Check to make sure that the CPU mask was correctly generated # note we presume that this has more than 2 CPUs on the supercomputer node @@ -155,12 +155,12 @@ def test_colocated_model_pinning_list(fileutils, test_dir, coloutils, db_type): exp_path=test_dir, ) - db_args = {"db_cpus": 2, "custom_pinning": [0, 2]} + fs_args = {"fs_cpus": 2, "custom_pinning": [0, 2]} colo_model = coloutils.setup_test_colo( - fileutils, db_type, exp, "send_data_local_smartredis.py", db_args, on_wlm=True + fileutils, fs_type, exp, "send_data_local_smartredis.py", fs_args, on_wlm=True ) - assert colo_model.run_settings.colocated_db_settings["custom_pinning"] == "0,2" + assert colo_model.run_settings.colocated_fs_settings["custom_pinning"] == "0,2" exp.generate(colo_model) exp.start(colo_model, block=True) statuses = exp.get_status(colo_model) @@ -169,8 +169,8 @@ def test_colocated_model_pinning_list(fileutils, test_dir, coloutils, db_type): ), f"Statuses: {statuses}" -@pytest.mark.parametrize("db_type", supported_dbs) -def test_colocated_model_pinning_mixed(fileutils, test_dir, coloutils, db_type): +@pytest.mark.parametrize("fs_type", supported_fss) +def test_colocated_model_pinning_mixed(fileutils, test_dir, coloutils, fs_type): # Check to make sure that the CPU mask was correctly generated # note we presume that this at least 4 CPUs on the supercomputer node @@ -180,12 +180,12 @@ def test_colocated_model_pinning_mixed(fileutils, test_dir, coloutils, db_type): exp_path=test_dir, ) - db_args = {"db_cpus": 2, "custom_pinning": [range(2), 3]} + fs_args = {"fs_cpus": 2, "custom_pinning": [range(2), 3]} colo_model = coloutils.setup_test_colo( - fileutils, db_type, exp, "send_data_local_smartredis.py", db_args, on_wlm=True + fileutils, fs_type, exp, "send_data_local_smartredis.py", fs_args, on_wlm=True ) - assert 
colo_model.run_settings.colocated_db_settings["custom_pinning"] == "0,1,3" + assert colo_model.run_settings.colocated_fs_settings["custom_pinning"] == "0,1,3" exp.generate(colo_model) exp.start(colo_model, block=True) statuses = exp.get_status(colo_model) diff --git a/tests/on_wlm/test_containers_wlm.py b/tests/on_wlm/test_containers_wlm.py index b6054a78b..ede0817ef 100644 --- a/tests/on_wlm/test_containers_wlm.py +++ b/tests/on_wlm/test_containers_wlm.py @@ -60,12 +60,12 @@ def test_singularity_wlm_smartredis(fileutils, test_dir, wlmutils): "smartredis_ensemble_exchange", exp_path=test_dir, launcher=launcher ) - # create and start a database - orc = exp.create_database( + # create and start a feature store + feature_store = exp.create_feature_store( port=wlmutils.get_test_port(), interface=wlmutils.get_test_interface() ) - exp.generate(orc) - exp.start(orc, block=False) + exp.generate(feature_store) + exp.start(feature_store, block=False) container = Singularity(containerURI) rs = exp.create_run_settings( @@ -94,10 +94,10 @@ def test_singularity_wlm_smartredis(fileutils, test_dir, wlmutils): # get and confirm statuses statuses = exp.get_status(ensemble) if not all([stat == SmartSimStatus.STATUS_COMPLETED for stat in statuses]): - exp.stop(orc) + exp.stop(feature_store) assert False # client ensemble failed - # stop the orchestrator - exp.stop(orc) + # stop the feature store + exp.stop(feature_store) print(exp.summary()) diff --git a/tests/on_wlm/test_generic_orc_launch.py b/tests/on_wlm/test_generic_orc_launch.py index cacdd5be5..fc475a7e2 100644 --- a/tests/on_wlm/test_generic_orc_launch.py +++ b/tests/on_wlm/test_generic_orc_launch.py @@ -34,16 +34,16 @@ pytestmark = pytest.mark.skip(reason="Not testing WLM integrations") -def test_launch_orc_auto(test_dir, wlmutils): - """test single node orchestrator""" +def test_launch_feature_store_auto(test_dir, wlmutils): + """test single node feature store""" launcher = wlmutils.get_test_launcher() - exp_name = 
"test-launch-auto-orc" + exp_name = "test-launch-auto-feature_store" exp = Experiment(exp_name, launcher=launcher, exp_path=test_dir) # batch = False to launch on existing allocation network_interface = wlmutils.get_test_interface() - orc = exp.create_database( + feature_store = exp.create_feature_store( wlmutils.get_test_port(), batch=False, interface=network_interface, @@ -51,78 +51,78 @@ def test_launch_orc_auto(test_dir, wlmutils): hosts=wlmutils.get_test_hostlist(), ) - exp.start(orc, block=True) - statuses = exp.get_status(orc) + exp.start(feature_store, block=True) + statuses = exp.get_status(feature_store) # don't use assert so that we don't leave an orphan process if SmartSimStatus.STATUS_FAILED in statuses: - exp.stop(orc) + exp.stop(feature_store) assert False - exp.stop(orc) - statuses = exp.get_status(orc) + exp.stop(feature_store) + statuses = exp.get_status(feature_store) assert all([stat == SmartSimStatus.STATUS_CANCELLED for stat in statuses]) -def test_launch_cluster_orc_single(test_dir, wlmutils): - """test clustered 3-node orchestrator with single command""" +def test_launch_cluster_feature_store_single(test_dir, wlmutils): + """test clustered 3-node feature store with single command""" # TODO detect number of nodes in allocation and skip if not sufficent launcher = wlmutils.get_test_launcher() - exp_name = "test-launch-auto-cluster-orc-single" + exp_name = "test-launch-auto-cluster-feature_store-single" exp = Experiment(exp_name, launcher=launcher, exp_path=test_dir) # batch = False to launch on existing allocation network_interface = wlmutils.get_test_interface() - orc = exp.create_database( + feature_store = exp.create_feature_store( wlmutils.get_test_port(), - db_nodes=3, + fs_nodes=3, batch=False, interface=network_interface, single_cmd=True, hosts=wlmutils.get_test_hostlist(), ) - exp.start(orc, block=True) - statuses = exp.get_status(orc) + exp.start(feature_store, block=True) + statuses = exp.get_status(feature_store) - # don't use 
assert so that orc we don't leave an orphan process + # don't use assert so that we don't leave an orphan process
a/tests/on_wlm/test_het_job.py +++ b/tests/on_wlm/test_het_job.py @@ -63,19 +63,19 @@ def test_set_het_groups(monkeypatch, test_dir): rs.set_het_group([4]) -def test_orch_single_cmd(monkeypatch, wlmutils, test_dir): +def test_feature_store_single_cmd(monkeypatch, wlmutils, test_dir): """Test that single cmd is rejected in a heterogeneous job""" monkeypatch.setenv("SLURM_HET_SIZE", "1") - exp_name = "test-orch-single-cmd" + exp_name = "test-feature-store-single-cmd" exp = Experiment(exp_name, launcher="slurm", exp_path=test_dir) - orc = exp.create_database( + feature_store = exp.create_feature_store( wlmutils.get_test_port(), - db_nodes=3, + fs_nodes=3, batch=False, interface=wlmutils.get_test_interface(), single_cmd=True, hosts=wlmutils.get_test_hostlist(), ) - for node in orc: + for node in feature_store: assert node.is_mpmd == False diff --git a/tests/on_wlm/test_symlinking.py b/tests/on_wlm/test_symlinking.py index 246457d1c..058b56e74 100644 --- a/tests/on_wlm/test_symlinking.py +++ b/tests/on_wlm/test_symlinking.py @@ -134,8 +134,8 @@ def test_batch_orchestrator_symlinks(test_dir, wlmutils): launcher = wlmutils.get_test_launcher() exp = Experiment(exp_name, launcher=launcher, exp_path=test_dir) port = 2424 - db = exp.create_database( - db_nodes=3, + db = exp.create_feature_store( + fs_nodes=3, port=port, batch=True, interface=wlmutils.get_test_interface(), @@ -149,7 +149,7 @@ def test_batch_orchestrator_symlinks(test_dir, wlmutils): _should_be_symlinked(pathlib.Path(db.path, f"{db.name}.out"), False) _should_be_symlinked(pathlib.Path(db.path, f"{db.name}.err"), False) - for i in range(db.db_nodes): + for i in range(db.fs_nodes): _should_be_symlinked(pathlib.Path(db.path, f"{db.name}_{i}.out"), False) _should_be_symlinked(pathlib.Path(db.path, f"{db.name}_{i}.err"), False) _should_not_be_symlinked( diff --git a/tests/on_wlm/test_wlm_orc_config_settings.py b/tests/on_wlm/test_wlm_orc_config_settings.py index f9ab60609..fc661638a 100644 --- 
a/tests/on_wlm/test_wlm_orc_config_settings.py +++ b/tests/on_wlm/test_wlm_orc_config_settings.py @@ -40,55 +40,55 @@ pytestmark = pytest.mark.skip(reason="SmartRedis version is < 0.3.1") -def test_config_methods_on_wlm_single(dbutils, db): - """Test all configuration file edit methods on single node WLM db""" +def test_config_methods_on_wlm_single(fsutils, fs): + """Test all configuration file edit methods on single node WLM fs""" # test the happy path and ensure all configuration file edit methods # successfully execute when given correct key-value pairs - configs = dbutils.get_db_configs() + configs = fsutils.get_fs_configs() for setting, value in configs.items(): - config_set_method = dbutils.get_config_edit_method(db, setting) + config_set_method = fsutils.get_config_edit_method(fs, setting) config_set_method(value) - # ensure SmartSimError is raised when a clustered database's - # Orchestrator.set_db_conf is given invalid CONFIG key-value pairs - ss_error_configs = dbutils.get_smartsim_error_db_configs() + # ensure SmartSimError is raised when a clustered feature store's + # FeatureStore.set_fs_conf is given invalid CONFIG key-value pairs + ss_error_configs = fsutils.get_smartsim_error_fs_configs() for key, value_list in ss_error_configs.items(): for value in value_list: with pytest.raises(SmartSimError): - db.set_db_conf(key, value) + fs.set_fs_conf(key, value) - # ensure TypeError is raised when a clustered database's - # Orchestrator.set_db_conf is given invalid CONFIG key-value pairs - type_error_configs = dbutils.get_type_error_db_configs() + # ensure TypeError is raised when a clustered feature store's + # FeatureStore.set_fs_conf is given invalid CONFIG key-value pairs + type_error_configs = fsutils.get_type_error_fs_configs() for key, value_list in type_error_configs.items(): for value in value_list: with pytest.raises(TypeError): - db.set_db_conf(key, value) + fs.set_fs_conf(key, value) -def test_config_methods_on_wlm_cluster(dbutils, db_cluster): - 
"""Test all configuration file edit methods on an active clustered db""" +def test_config_methods_on_wlm_cluster(fsutils, fs_cluster): + """Test all configuration file edit methods on an active clustered fs""" # test the happy path and ensure all configuration file edit methods # successfully execute when given correct key-value pairs - configs = dbutils.get_db_configs() + configs = fsutils.get_fs_configs() for setting, value in configs.items(): - config_set_method = dbutils.get_config_edit_method(db_cluster, setting) + config_set_method = fsutils.get_config_edit_method(fs_cluster, setting) config_set_method(value) - # ensure SmartSimError is raised when a clustered database's - # Orchestrator.set_db_conf is given invalid CONFIG key-value pairs - ss_error_configs = dbutils.get_smartsim_error_db_configs() + # ensure SmartSimError is raised when a clustered feature store's + # FeatureStore.set_fs_conf is given invalid CONFIG key-value pairs + ss_error_configs = fsutils.get_smartsim_error_fs_configs() for key, value_list in ss_error_configs.items(): for value in value_list: with pytest.raises(SmartSimError): - db_cluster.set_db_conf(key, value) + fs_cluster.set_fs_conf(key, value) - # ensure TypeError is raised when a clustered database's - # Orchestrator.set_db_conf is given invalid CONFIG key-value pairs - type_error_configs = dbutils.get_type_error_db_configs() + # ensure TypeError is raised when a clustered feature store's + # FeatureStore.set_fs_conf is given invalid CONFIG key-value pairs + type_error_configs = fsutils.get_type_error_fs_configs() for key, value_list in type_error_configs.items(): for value in value_list: with pytest.raises(TypeError): - db_cluster.set_db_conf(key, value) + fs_cluster.set_fs_conf(key, value) diff --git a/tests/test_alps_settings.py b/tests/test_alps_settings.py index b3c4c3bdb..f96d0e60d 100644 --- a/tests/test_alps_settings.py +++ b/tests/test_alps_settings.py @@ -67,7 +67,7 @@ def test_aprun_add_mpmd(): def 
test_catch_colo_mpmd(): settings = AprunSettings("python") - settings.colocated_db_settings = {"port": 6379, "cpus": 1} + settings.colocated_fs_settings = {"port": 6379, "cpus": 1} settings_2 = AprunSettings("python") with pytest.raises(SSUnsupportedError): settings.make_mpmd(settings_2) diff --git a/tests/test_cli.py b/tests/test_cli.py index 710a9a659..bcec732e2 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -232,7 +232,7 @@ def test_cli_command_execution(capsys): exp_b_help = "this is my mock help text for build" exp_b_cmd = "build" - dbcli_exec = lambda x, y: mock_execute_custom(msg="Database", good=True) + dbcli_exec = lambda x, y: mock_execute_custom(msg="FeatureStore", good=True) build_exec = lambda x, y: mock_execute_custom(msg="Builder", good=True) menu = [ @@ -249,7 +249,7 @@ def test_cli_command_execution(capsys): captured = capsys.readouterr() # capture new output # show that `smart dbcli` calls the build parser and build execute function - assert "Database" in captured.out + assert "FeatureStore" in captured.out assert ret_val == 0 build_args = ["smart", exp_b_cmd] @@ -670,13 +670,13 @@ def mock_operation(*args, **kwargs) -> int: def test_cli_full_dbcli_execute(capsys, monkeypatch): """Ensure that the execute method of dbcli is called""" exp_retval = 0 - exp_output = "mocked-get_db_path utility" + exp_output = "mocked-get_fs_path utility" def mock_operation(*args, **kwargs) -> int: return exp_output - # mock out the internal get_db_path method so we don't actually do file system ops - monkeypatch.setattr(smartsim._core._cli.dbcli, "get_db_path", mock_operation) + # mock out the internal get_fs_path method so we don't actually do file system ops + monkeypatch.setattr(smartsim._core._cli.dbcli, "get_fs_path", mock_operation) command = "dbcli" cfg = MenuItemConfig(command, f"test {command} help text", dbcli_execute) @@ -703,7 +703,7 @@ def mock_operation(*args, **kwargs) -> int: print(exp_output) return exp_retval - # mock out the internal 
get_db_path method so we don't actually do file system ops + # mock out the internal get_fs_path method so we don't actually do file system ops monkeypatch.setattr(smartsim._core._cli.site, "get_install_path", mock_operation) command = "site" @@ -731,9 +731,9 @@ def mock_operation(*args, **kwargs) -> int: print(exp_output) return exp_retval - # mock out the internal get_db_path method so we don't actually do file system ops + # mock out the internal get_fs_path method so we don't actually do file system ops monkeypatch.setattr(smartsim._core._cli.build, "tabulate", mock_operation) - monkeypatch.setattr(smartsim._core._cli.build, "build_database", mock_operation) + monkeypatch.setattr(smartsim._core._cli.build, "build_feature_store", mock_operation) monkeypatch.setattr(smartsim._core._cli.build, "build_redis_ai", mock_operation) monkeypatch.setattr( smartsim._core._cli.build, "check_py_torch_version", mock_operation diff --git a/tests/test_collector_manager.py b/tests/test_collector_manager.py index 9d7933379..5590f8b9f 100644 --- a/tests/test_collector_manager.py +++ b/tests/test_collector_manager.py @@ -246,11 +246,11 @@ async def test_collector_manager_collect_filesink( @pytest.mark.asyncio async def test_collector_manager_collect_integration( - test_dir: str, mock_entity: MockCollectorEntityFunc, local_db, mock_sink + test_dir: str, mock_entity: MockCollectorEntityFunc, local_fs, mock_sink ) -> None: """Ensure that all collectors are executed and some metric is retrieved""" - entity1 = mock_entity(port=local_db.ports[0], name="e1", telemetry_on=True) - entity2 = mock_entity(port=local_db.ports[0], name="e2", telemetry_on=True) + entity1 = mock_entity(port=local_fs.ports[0], name="e1", telemetry_on=True) + entity2 = mock_entity(port=local_fs.ports[0], name="e2", telemetry_on=True) # todo: consider a MockSink so i don't have to save the last value in the collector sinks = [mock_sink(), mock_sink(), mock_sink()] @@ -339,20 +339,20 @@ async def snooze() -> None: 
pytest.param("model", True, id="models, telemetry enabled"), pytest.param("ensemble", False, id="ensemble"), pytest.param("ensemble", True, id="ensemble, telemetry enabled"), - pytest.param("orchestrator", False, id="orchestrator"), - pytest.param("orchestrator", True, id="orchestrator, telemetry enabled"), - pytest.param("dbnode", False, id="dbnode"), - pytest.param("dbnode", True, id="dbnode, telemetry enabled"), + pytest.param("featurestore", False, id="featurestore"), + pytest.param("featurestore", True, id="featurestore, telemetry enabled"), + pytest.param("fsnode", False, id="fsnode"), + pytest.param("fsnode", True, id="fsnode, telemetry enabled"), ], ) @pytest.mark.asyncio -async def test_collector_manager_find_nondb( +async def test_collector_manager_find_nonfs( mock_entity: MockCollectorEntityFunc, e_type: str, telemetry_on: bool, ) -> None: """Ensure that the number of collectors returned for entity types match expectations - NOTE: even orchestrator returns 0 mapped collectors because no collector output + NOTE: even featurestore returns 0 mapped collectors because no collector output paths are set on the entity""" entity = mock_entity(port=1234, name="e1", type=e_type, telemetry_on=telemetry_on) manager = CollectorManager(timeout_ms=10000) @@ -381,7 +381,7 @@ async def test_collector_manager_find_db(mock_entity: MockCollectorEntityFunc) - # 1. ensure DBConnectionCountCollector is mapped entity = mock_entity( - port=1234, name="entity1", type="orchestrator", telemetry_on=True + port=1234, name="entity1", type="featurestore", telemetry_on=True ) entity.collectors["client"] = "mock/path.csv" manager = CollectorManager() @@ -395,7 +395,7 @@ async def test_collector_manager_find_db(mock_entity: MockCollectorEntityFunc) - # 3. 
ensure DBConnectionCountCollector is mapped entity = mock_entity( - port=1234, name="entity1", type="orchestrator", telemetry_on=True + port=1234, name="entity1", type="featurestore", telemetry_on=True ) entity.collectors["client_count"] = "mock/path.csv" manager = CollectorManager() @@ -409,7 +409,7 @@ async def test_collector_manager_find_db(mock_entity: MockCollectorEntityFunc) - # ensure DbMemoryCollector is mapped entity = mock_entity( - port=1234, name="entity1", type="orchestrator", telemetry_on=True + port=1234, name="entity1", type="featurestore", telemetry_on=True ) entity.collectors["memory"] = "mock/path.csv" manager = CollectorManager() @@ -427,7 +427,7 @@ async def test_collector_manager_find_entity_disabled( mock_entity: MockCollectorEntityFunc, ) -> None: """Ensure that disabling telemetry on the entity results in no collectors""" - entity: JobEntity = mock_entity(port=1234, name="entity1", type="orchestrator") + entity: JobEntity = mock_entity(port=1234, name="entity1", type="featurestore") # set paths for all known collectors entity.collectors["client"] = "mock/path.csv" diff --git a/tests/test_collectors.py b/tests/test_collectors.py index fd2ed9080..09fac1484 100644 --- a/tests/test_collectors.py +++ b/tests/test_collectors.py @@ -171,12 +171,12 @@ async def test_dbmemcollector_collect( async def test_dbmemcollector_integration( mock_entity: MockCollectorEntityFunc, mock_sink: MockSink, - local_db: smartsim.experiment.Orchestrator, + local_fs: smartsim.experiment.FeatureStore, monkeypatch: pytest.MonkeyPatch, ) -> None: - """Integration test with a real orchestrator instance to ensure + """Integration test with a real feature store instance to ensure output data matches expectations and proper db client API uage""" - entity = mock_entity(port=local_db.ports[0], telemetry_on=True) + entity = mock_entity(port=local_fs.ports[0], telemetry_on=True) sink = mock_sink() collector = DBMemoryCollector(entity, sink) @@ -268,12 +268,12 @@ async def 
test_dbconn_count_collector_collect( async def test_dbconncollector_integration( mock_entity: MockCollectorEntityFunc, mock_sink: MockSink, - local_db: smartsim.experiment.Orchestrator, + local_fs: smartsim.experiment.FeatureStore, monkeypatch: pytest.MonkeyPatch, ) -> None: - """Integration test with a real orchestrator instance to ensure + """Integration test with a real feature store instance to ensure output data matches expectations and proper db client API uage""" - entity = mock_entity(port=local_db.ports[0], telemetry_on=True) + entity = mock_entity(port=local_fs.ports[0], telemetry_on=True) sink = mock_sink() collector = DBConnectionCollector(entity, sink) diff --git a/tests/test_colo_model_local.py b/tests/test_colo_model_local.py index fe347ee30..0fcb8a8c8 100644 --- a/tests/test_colo_model_local.py +++ b/tests/test_colo_model_local.py @@ -38,17 +38,17 @@ if sys.platform == "darwin": - supported_dbs = ["tcp", "deprecated"] + supported_fss = ["tcp", "deprecated"] else: - supported_dbs = ["uds", "tcp", "deprecated"] + supported_fss = ["uds", "tcp", "deprecated"] is_mac = sys.platform == "darwin" @pytest.mark.skipif(not is_mac, reason="MacOS-only test") def test_macosx_warning(fileutils, test_dir, coloutils): - db_args = {"custom_pinning": [1]} - db_type = "uds" # Test is insensitive to choice of db + fs_args = {"custom_pinning": [1]} + fs_type = "uds" # Test is insensitive to choice of fs exp = Experiment("colocated_model_defaults", launcher="local", exp_path=test_dir) with pytest.warns( @@ -57,42 +57,42 @@ def test_macosx_warning(fileutils, test_dir, coloutils): ): _ = coloutils.setup_test_colo( fileutils, - db_type, + fs_type, exp, "send_data_local_smartredis.py", - db_args, + fs_args, ) def test_unsupported_limit_app(fileutils, test_dir, coloutils): - db_args = {"limit_app_cpus": True} - db_type = "uds" # Test is insensitive to choice of db + fs_args = {"limit_app_cpus": True} + fs_type = "uds" # Test is insensitive to choice of fs exp = 
Experiment("colocated_model_defaults", launcher="local", exp_path=test_dir) with pytest.raises(SSUnsupportedError): coloutils.setup_test_colo( fileutils, - db_type, + fs_type, exp, "send_data_local_smartredis.py", - db_args, + fs_args, ) @pytest.mark.skipif(is_mac, reason="Unsupported on MacOSX") @pytest.mark.parametrize("custom_pinning", [1, "10", "#", 1.0, ["a"], [1.0]]) def test_unsupported_custom_pinning(fileutils, test_dir, coloutils, custom_pinning): - db_type = "uds" # Test is insensitive to choice of db - db_args = {"custom_pinning": custom_pinning} + fs_type = "uds" # Test is insensitive to choice of fs + fs_args = {"custom_pinning": custom_pinning} exp = Experiment("colocated_model_defaults", launcher="local", exp_path=test_dir) with pytest.raises(TypeError): coloutils.setup_test_colo( fileutils, - db_type, + fs_type, exp, "send_data_local_smartredis.py", - db_args, + fs_args, ) @@ -113,21 +113,21 @@ def test_create_pinning_string(pin_list, num_cpus, expected): assert Model._create_pinning_string(pin_list, num_cpus) == expected -@pytest.mark.parametrize("db_type", supported_dbs) +@pytest.mark.parametrize("fs_type", supported_fss) def test_launch_colocated_model_defaults( - fileutils, test_dir, coloutils, db_type, launcher="local" + fileutils, test_dir, coloutils, fs_type, launcher="local" ): - """Test the launch of a model with a colocated database and local launcher""" + """Test the launch of a model with a colocated feature store and local launcher""" - db_args = {} + fs_args = {} exp = Experiment("colocated_model_defaults", launcher=launcher, exp_path=test_dir) colo_model = coloutils.setup_test_colo( fileutils, - db_type, + fs_type, exp, "send_data_local_smartredis.py", - db_args, + fs_args, ) if is_mac: @@ -135,7 +135,7 @@ def test_launch_colocated_model_defaults( else: true_pinning = "0" assert ( - colo_model.run_settings.colocated_db_settings["custom_pinning"] == true_pinning + colo_model.run_settings.colocated_fs_settings["custom_pinning"] == 
true_pinning ) exp.generate(colo_model) exp.start(colo_model, block=True) @@ -150,31 +150,31 @@ def test_launch_colocated_model_defaults( ), f"Statuses {statuses}" -@pytest.mark.parametrize("db_type", supported_dbs) +@pytest.mark.parametrize("fs_type", supported_fss) def test_launch_multiple_colocated_models( - fileutils, test_dir, coloutils, wlmutils, db_type, launcher="local" + fileutils, test_dir, coloutils, wlmutils, fs_type, launcher="local" ): - """Test the concurrent launch of two models with a colocated database and local launcher""" + """Test the concurrent launch of two models with a colocated feature store and local launcher""" - db_args = {} + fs_args = {} exp = Experiment("multi_colo_models", launcher=launcher, exp_path=test_dir) colo_models = [ coloutils.setup_test_colo( fileutils, - db_type, + fs_type, exp, "send_data_local_smartredis.py", - db_args, + fs_args, colo_model_name="colo0", port=wlmutils.get_test_port(), ), coloutils.setup_test_colo( fileutils, - db_type, + fs_type, exp, "send_data_local_smartredis.py", - db_args, + fs_args, colo_model_name="colo1", port=wlmutils.get_test_port() + 1, ), @@ -190,58 +190,58 @@ def test_launch_multiple_colocated_models( assert all([stat == SmartSimStatus.STATUS_COMPLETED for stat in statuses]) -@pytest.mark.parametrize("db_type", supported_dbs) +@pytest.mark.parametrize("fs_type", supported_fss) def test_colocated_model_disable_pinning( - fileutils, test_dir, coloutils, db_type, launcher="local" + fileutils, test_dir, coloutils, fs_type, launcher="local" ): exp = Experiment( "colocated_model_pinning_auto_1cpu", launcher=launcher, exp_path=test_dir ) - db_args = { - "db_cpus": 1, + fs_args = { + "fs_cpus": 1, "custom_pinning": [], } # Check to make sure that the CPU mask was correctly generated colo_model = coloutils.setup_test_colo( fileutils, - db_type, + fs_type, exp, "send_data_local_smartredis.py", - db_args, + fs_args, ) - assert colo_model.run_settings.colocated_db_settings["custom_pinning"] is None + 
assert colo_model.run_settings.colocated_fs_settings["custom_pinning"] is None exp.generate(colo_model) exp.start(colo_model, block=True) statuses = exp.get_status(colo_model) assert all([stat == SmartSimStatus.STATUS_COMPLETED for stat in statuses]) -@pytest.mark.parametrize("db_type", supported_dbs) +@pytest.mark.parametrize("fs_type", supported_fss) def test_colocated_model_pinning_auto_2cpu( - fileutils, test_dir, coloutils, db_type, launcher="local" + fileutils, test_dir, coloutils, fs_type, launcher="local" ): exp = Experiment( "colocated_model_pinning_auto_2cpu", launcher=launcher, exp_path=test_dir ) - db_args = { - "db_cpus": 2, + fs_args = { + "fs_cpus": 2, } # Check to make sure that the CPU mask was correctly generated colo_model = coloutils.setup_test_colo( fileutils, - db_type, + fs_type, exp, "send_data_local_smartredis.py", - db_args, + fs_args, ) if is_mac: true_pinning = None else: true_pinning = "0,1" assert ( - colo_model.run_settings.colocated_db_settings["custom_pinning"] == true_pinning + colo_model.run_settings.colocated_fs_settings["custom_pinning"] == true_pinning ) exp.generate(colo_model) exp.start(colo_model, block=True) @@ -250,9 +250,9 @@ def test_colocated_model_pinning_auto_2cpu( @pytest.mark.skipif(is_mac, reason="unsupported on MacOSX") -@pytest.mark.parametrize("db_type", supported_dbs) +@pytest.mark.parametrize("fs_type", supported_fss) def test_colocated_model_pinning_range( - fileutils, test_dir, coloutils, db_type, launcher="local" + fileutils, test_dir, coloutils, fs_type, launcher="local" ): # Check to make sure that the CPU mask was correctly generated @@ -260,16 +260,16 @@ def test_colocated_model_pinning_range( "colocated_model_pinning_manual", launcher=launcher, exp_path=test_dir ) - db_args = {"db_cpus": 2, "custom_pinning": range(2)} + fs_args = {"fs_cpus": 2, "custom_pinning": range(2)} colo_model = coloutils.setup_test_colo( fileutils, - db_type, + fs_type, exp, "send_data_local_smartredis.py", - db_args, + fs_args, 
) - assert colo_model.run_settings.colocated_db_settings["custom_pinning"] == "0,1" + assert colo_model.run_settings.colocated_fs_settings["custom_pinning"] == "0,1" exp.generate(colo_model) exp.start(colo_model, block=True) statuses = exp.get_status(colo_model) @@ -277,9 +277,9 @@ def test_colocated_model_pinning_range( @pytest.mark.skipif(is_mac, reason="unsupported on MacOSX") -@pytest.mark.parametrize("db_type", supported_dbs) +@pytest.mark.parametrize("fs_type", supported_fss) def test_colocated_model_pinning_list( - fileutils, test_dir, coloutils, db_type, launcher="local" + fileutils, test_dir, coloutils, fs_type, launcher="local" ): # Check to make sure that the CPU mask was correctly generated @@ -287,16 +287,16 @@ def test_colocated_model_pinning_list( "colocated_model_pinning_manual", launcher=launcher, exp_path=test_dir ) - db_args = {"db_cpus": 1, "custom_pinning": [1]} + fs_args = {"fs_cpus": 1, "custom_pinning": [1]} colo_model = coloutils.setup_test_colo( fileutils, - db_type, + fs_type, exp, "send_data_local_smartredis.py", - db_args, + fs_args, ) - assert colo_model.run_settings.colocated_db_settings["custom_pinning"] == "1" + assert colo_model.run_settings.colocated_fs_settings["custom_pinning"] == "1" exp.generate(colo_model) exp.start(colo_model, block=True) statuses = exp.get_status(colo_model) @@ -311,4 +311,4 @@ def test_colo_uds_verifies_socket_file_name(test_dir, launcher="local"): colo_model = exp.create_model("wrong_uds_socket_name", colo_settings) with pytest.raises(ValueError): - colo_model.colocate_db_uds(unix_socket="this is not a valid name!") + colo_model.colocate_fs_uds(unix_socket="this is not a valid name!") diff --git a/tests/test_colo_model_lsf.py b/tests/test_colo_model_lsf.py index 5e1c449cc..0c051f9c0 100644 --- a/tests/test_colo_model_lsf.py +++ b/tests/test_colo_model_lsf.py @@ -47,29 +47,29 @@ class ExpectationMet(Exception): def show_expectation_met(*args, **kwargs): - raise ExpectationMet("mock._prep_colocated_db") + 
raise ExpectationMet("mock._prep_colocated_fs") def test_jsrun_prep(fileutils, coloutils, monkeypatch): """Ensure that JsrunSettings prep method is executed as expected""" monkeypatch.setattr(smartsim.settings.base, "expand_exe_path", lambda x: "/bin/{x}") # mock the prep method to raise an exception that short circuits test when goal is met - monkeypatch.setattr(JsrunSettings, "_prep_colocated_db", show_expectation_met) + monkeypatch.setattr(JsrunSettings, "_prep_colocated_fs", show_expectation_met) - db_args = {"custom_pinning": [1]} - db_type = "uds" # Test is insensitive to choice of db + fs_args = {"custom_pinning": [1]} + fs_type = "uds" # Test is insensitive to choice of fs exp = Experiment("colocated_model_lsf", launcher="lsf") - with pytest.raises(ExpectationMet, match="mock._prep_colocated_db") as ex: + with pytest.raises(ExpectationMet, match="mock._prep_colocated_fs") as ex: run_settings = JsrunSettings("foo") coloutils.setup_test_colo( fileutils, - db_type, + fs_type, exp, "send_data_local_smartredis.py", - db_args, + fs_args, colo_settings=run_settings, ) @@ -78,10 +78,10 @@ def test_non_js_run_prep(fileutils, coloutils, monkeypatch): """Ensure that RunSettings does not attempt to call a prep method""" monkeypatch.setattr(smartsim.settings.base, "expand_exe_path", lambda x: "/bin/{x}") # mock prep method to ensure that the exception isn't thrown w/non-JsrunSettings arg - monkeypatch.setattr(JsrunSettings, "_prep_colocated_db", show_expectation_met) + monkeypatch.setattr(JsrunSettings, "_prep_colocated_fs", show_expectation_met) - db_args = {"custom_pinning": [1]} - db_type = "tcp" # Test is insensitive to choice of db + fs_args = {"custom_pinning": [1]} + fs_type = "tcp" # Test is insensitive to choice of fs exp = Experiment("colocated_model_lsf", launcher="lsf") @@ -89,10 +89,10 @@ def test_non_js_run_prep(fileutils, coloutils, monkeypatch): colo_model: Model = coloutils.setup_test_colo( fileutils, - db_type, + fs_type, exp, 
"send_data_local_smartredis.py", - db_args, + fs_args, colo_settings=run_settings, ) @@ -119,14 +119,14 @@ def test_jsrun_prep_cpu_per_flag_set_check( exp_value, test_value, ): - """Ensure that _prep_colocated_db honors basic cpu_per_rs config and allows a + """Ensure that _prep_colocated_fs honors basic cpu_per_rs config and allows a valid input parameter to result in the correct output. If no expected input (or incorrect key) is given, the default should be returned using default config key""" monkeypatch.setattr(smartsim.settings.base, "expand_exe_path", lambda x: "/bin/{x}") - # excluding "db_cpus" should result in default value in comparison & output - db_args = {"custom_pinning": [1]} - db_type = "uds" # Test is insensitive to choice of db + # excluding "fs_cpus" should result in default value in comparison & output + fs_args = {"custom_pinning": [1]} + fs_type = "uds" # Test is insensitive to choice of fs exp = Experiment("colocated_model_lsf", launcher="lsf") @@ -135,10 +135,10 @@ def test_jsrun_prep_cpu_per_flag_set_check( colo_model: Model = coloutils.setup_test_colo( fileutils, - db_type, + fs_type, exp, "send_data_local_smartredis.py", - db_args, + fs_args, colo_settings=run_settings, ) @@ -151,14 +151,14 @@ def test_jsrun_prep_cpu_per_flag_set_check( pytest.param("cpu_per_rs", "cpu_per_rs", 11, 11, id="cpu_per_rs matches input"), pytest.param("c", "c", 22, 22, id="c matches input"), pytest.param( - "cpu_per_rs", "cpu_per_rsx", 3, 33, id="key typo: db_cpus out (not default)" + "cpu_per_rs", "cpu_per_rsx", 3, 33, id="key typo: fs_cpus out (not default)" ), pytest.param( - "cpu_per_rs", "cx", 3, 44, id="key typo: get db_cpus out (not default)" + "cpu_per_rs", "cx", 3, 44, id="key typo: get fs_cpus out (not default)" ), ], ) -def test_jsrun_prep_db_cpu_override( +def test_jsrun_prep_fs_cpu_override( fileutils, coloutils, monkeypatch, @@ -167,12 +167,12 @@ def test_jsrun_prep_db_cpu_override( exp_value, test_value, ): - """Ensure that both cpu_per_rs and c 
input config override db_cpus""" + """Ensure that both cpu_per_rs and c input config override fs_cpus""" monkeypatch.setattr(smartsim.settings.base, "expand_exe_path", lambda x: "/bin/{x}") - # setting "db_cpus" should result in non-default value in comparison & output - db_args = {"custom_pinning": [1], "db_cpus": 3} - db_type = "tcp" # Test is insensitive to choice of db + # setting "fs_cpus" should result in non-default value in comparison & output + fs_args = {"custom_pinning": [1], "fs_cpus": 3} + fs_type = "tcp" # Test is insensitive to choice of fs exp = Experiment("colocated_model_lsf", launcher="lsf") @@ -181,10 +181,10 @@ def test_jsrun_prep_db_cpu_override( colo_model: Model = coloutils.setup_test_colo( fileutils, - db_type, + fs_type, exp, "send_data_local_smartredis.py", - db_args, + fs_args, colo_settings=run_settings, ) @@ -195,14 +195,14 @@ def test_jsrun_prep_db_cpu_override( "exp_run_arg_key,run_arg_key,exp_value,test_value", [ pytest.param( - "cpu_per_rs", "cpu_per_rs", 8, 3, id="cpu_per_rs swaps to db_cpus" + "cpu_per_rs", "cpu_per_rs", 8, 3, id="cpu_per_rs swaps to fs_cpus" ), - pytest.param("c", "c", 8, 4, id="c swaps to db_cpus"), - pytest.param("cpu_per_rs", "cpu_per_rsx", 8, 5, id="key typo: db_cpus out"), - pytest.param("cpu_per_rs", "cx", 8, 6, id="key typo: get db_cpus out"), + pytest.param("c", "c", 8, 4, id="c swaps to fs_cpus"), + pytest.param("cpu_per_rs", "cpu_per_rsx", 8, 5, id="key typo: fs_cpus out"), + pytest.param("cpu_per_rs", "cx", 8, 6, id="key typo: get fs_cpus out"), ], ) -def test_jsrun_prep_db_cpu_replacement( +def test_jsrun_prep_fs_cpu_replacement( fileutils, coloutils, monkeypatch, @@ -211,12 +211,12 @@ def test_jsrun_prep_db_cpu_replacement( exp_value, test_value, ): - """Ensure that db_cpus default is used if user config suggests underutilizing resources""" + """Ensure that fs_cpus default is used if user config suggests underutilizing resources""" monkeypatch.setattr(smartsim.settings.base, "expand_exe_path", 
lambda x: "/bin/{x}") - # setting "db_cpus" should result in non-default value in comparison & output - db_args = {"custom_pinning": [1], "db_cpus": 8} - db_type = "uds" # Test is insensitive to choice of db + # setting "fs_cpus" should result in non-default value in comparison & output + fs_args = {"custom_pinning": [1], "fs_cpus": 8} + fs_type = "uds" # Test is insensitive to choice of fs exp = Experiment("colocated_model_lsf", launcher="lsf") @@ -225,10 +225,10 @@ def test_jsrun_prep_db_cpu_replacement( colo_model: Model = coloutils.setup_test_colo( fileutils, - db_type, + fs_type, exp, "send_data_local_smartredis.py", - db_args, + fs_args, colo_settings=run_settings, ) @@ -265,8 +265,8 @@ def test_jsrun_prep_rs_per_host( required to meet limitations (e.g. rs_per_host MUST equal 1)""" monkeypatch.setattr(smartsim.settings.base, "expand_exe_path", lambda x: "/bin/{x}") - db_args = {"custom_pinning": [1]} - db_type = "tcp" # Test is insensitive to choice of db + fs_args = {"custom_pinning": [1]} + fs_type = "tcp" # Test is insensitive to choice of fs exp = Experiment("colocated_model_lsf", launcher="lsf") @@ -275,12 +275,12 @@ def test_jsrun_prep_rs_per_host( colo_model: Model = coloutils.setup_test_colo( fileutils, - db_type, + fs_type, exp, "send_data_local_smartredis.py", - db_args, + fs_args, colo_settings=run_settings, ) - # NOTE: _prep_colocated_db sets this to a string & not an integer + # NOTE: _prep_colocated_fs sets this to a string & not an integer assert str(colo_model.run_settings.run_args[exp_run_arg_key]) == str(exp_value) diff --git a/tests/test_configs/telemetry/colocatedmodel.json b/tests/test_configs/telemetry/colocatedmodel.json index f3e93ac76..2ac14696c 100644 --- a/tests/test_configs/telemetry/colocatedmodel.json +++ b/tests/test_configs/telemetry/colocatedmodel.json @@ -33,7 +33,7 @@ "Configure": [], "Copy": [] }, - "colocated_db": { + "colocated_fs": { "settings": { "unix_socket": "/tmp/redis.socket", "socket_permissions": 755, @@ -41,13 
+41,13 @@ "cpus": 1, "custom_pinning": "0", "debug": false, - "db_identifier": "", + "fs_identifier": "", "rai_args": { "threads_per_queue": null, "inter_op_parallelism": null, "intra_op_parallelism": null }, - "extra_db_args": {} + "extra_fs_args": {} }, "scripts": [], "models": [] @@ -62,8 +62,8 @@ "err_file": "/tmp/my-exp/colocated_model/colocated_model.err" } ], - "orchestrator": [], + "featurestore": [], "ensemble": [] } ] -} +} \ No newline at end of file diff --git a/tests/test_configs/telemetry/db_and_model.json b/tests/test_configs/telemetry/db_and_model.json index 36edc7486..f56b28d7e 100644 --- a/tests/test_configs/telemetry/db_and_model.json +++ b/tests/test_configs/telemetry/db_and_model.json @@ -13,16 +13,16 @@ "run_id": "2ca19ad", "timestamp": 1699038647234488933, "model": [], - "orchestrator": [ + "featurestore": [ { - "name": "orchestrator", + "name": "featurestore", "type": "redis", "interface": [ "ipogif0" ], "shards": [ { - "name": "orchestrator_0", + "name": "featurestore_0", "hostname": "10.128.0.4", "port": 6780, "cluster": false, @@ -33,7 +33,7 @@ "client_count_file": null, "memory_file": "/path/to/some/mem.log", "telemetry_metadata": { - "status_dir": "/tmp/my-exp/.smartsim/telemetry/telemetry_db_and_model/2ca19ad/database/orchestrator/orchestrator_0", + "status_dir": "/tmp/my-exp/.smartsim/telemetry/telemetry_db_and_model/2ca19ad/database/featurestore/featurestore_0", "step_id": "4139111.27", "task_id": "1452", "managed": true @@ -71,7 +71,7 @@ "Configure": [], "Copy": [] }, - "colocated_db": {}, + "colocated_fs": {}, "telemetry_metadata": { "status_dir": "/tmp/my-exp/.smartsim/telemetry/telemetry_db_and_model/4b5507a/model/perroquet", "step_id": "4139111.28", @@ -82,8 +82,8 @@ "err_file": "/tmp/my-exp/perroquet/perroquet.err" } ], - "orchestrator": [], + "featurestore": [], "ensemble": [] } ] -} +} \ No newline at end of file diff --git a/tests/test_configs/telemetry/db_and_model_1run.json 
b/tests/test_configs/telemetry/db_and_model_1run.json index 44e32bfe4..59617f1f3 100644 --- a/tests/test_configs/telemetry/db_and_model_1run.json +++ b/tests/test_configs/telemetry/db_and_model_1run.json @@ -36,7 +36,7 @@ "Configure": [], "Copy": [] }, - "colocated_db": {}, + "colocated_fs": {}, "telemetry_metadata": { "status_dir": "/tmp/my-exp/.smartsim/telemetry/telemetry_db_and_model/4b5507a/model/perroquet", "step_id": "4139111.28", @@ -47,16 +47,16 @@ "err_file": "/tmp/my-exp/perroquet/perroquet.err" } ], - "orchestrator": [ + "featurestore": [ { - "name": "orchestrator", + "name": "featurestore", "type": "redis", "interface": [ "ipogif0" ], "shards": [ { - "name": "orchestrator_0", + "name": "featurestore_0", "hostname": "10.128.0.4", "port": 6780, "cluster": false, @@ -64,7 +64,7 @@ "out_file": "/path/to/some/file.out", "err_file": "/path/to/some/file.err", "telemetry_metadata": { - "status_dir": "/tmp/my-exp/.smartsim/telemetry/telemetry_db_and_model/2ca19ad/database/orchestrator/orchestrator_0", + "status_dir": "/tmp/my-exp/.smartsim/telemetry/telemetry_db_and_model/2ca19ad/database/featurestore/featurestore_0", "step_id": "4139111.27", "task_id": "1452", "managed": true @@ -76,4 +76,4 @@ "ensemble": [] } ] -} +} \ No newline at end of file diff --git a/tests/test_configs/telemetry/ensembles.json b/tests/test_configs/telemetry/ensembles.json index 67e53ca09..be6f087cb 100644 --- a/tests/test_configs/telemetry/ensembles.json +++ b/tests/test_configs/telemetry/ensembles.json @@ -13,7 +13,7 @@ "run_id": "d041b90", "timestamp": 1698679830384608928, "model": [], - "orchestrator": [], + "featurestore": [], "ensemble": [ { "name": "my-ens", @@ -326,4 +326,4 @@ ] } ] -} +} \ No newline at end of file diff --git a/tests/test_configs/telemetry/serialmodels.json b/tests/test_configs/telemetry/serialmodels.json index 40337eceb..cddc3a830 100644 --- a/tests/test_configs/telemetry/serialmodels.json +++ b/tests/test_configs/telemetry/serialmodels.json @@ -179,8 +179,8 
@@ "err_file": "/tmp/my-exp/perroquet_4/perroquet_4.err" } ], - "orchestrator": [], + "featurestore": [], "ensemble": [] } ] -} +} \ No newline at end of file diff --git a/tests/test_configs/telemetry/telemetry.json b/tests/test_configs/telemetry/telemetry.json index 916f5922b..535df556d 100644 --- a/tests/test_configs/telemetry/telemetry.json +++ b/tests/test_configs/telemetry/telemetry.json @@ -6,7 +6,7 @@ }, "runs": [ { - "run_id": "d999ad89-020f-4e6a-b834-dbd88658ce84", + "run_id": "d999ad89-020f-4e6a-b834-fsd88658ce84", "timestamp": 1697824072792854287, "model": [ { @@ -33,20 +33,20 @@ "Configure": [], "Copy": [] }, - "colocated_db": { + "colocated_fs": { "settings": { "port": 5757, "ifname": "lo", "cpus": 1, "custom_pinning": "0", "debug": false, - "db_identifier": "COLO", + "fs_identifier": "COLO", "rai_args": { "threads_per_queue": null, "inter_op_parallelism": null, "intra_op_parallelism": null }, - "extra_db_args": {} + "extra_fs_args": {} }, "scripts": [], "models": [ @@ -59,7 +59,7 @@ ] }, "telemetry_metadata": { - "status_dir": "/path/to/my-exp/.smartsim/telemetry/my-exp/d999ad89-020f-4e6a-b834-dbd88658ce84/model/my-model", + "status_dir": "/path/to/my-exp/.smartsim/telemetry/my-exp/d999ad89-020f-4e6a-b834-fsd88658ce84/model/my-model", "step_id": "4121050.30", "task_id": "25230", "managed": true @@ -68,61 +68,61 @@ "err_file": "/path/to/my-exp/my-model/my-model.err" } ], - "orchestrator": [], + "featurestore": [], "ensemble": [] }, { "run_id": "fd3cd1a8-cb8f-4f61-b847-73a8eb0881fa", "timestamp": 1697824102122439975, "model": [], - "orchestrator": [ + "featurestore": [ { - "name": "orchestrator", + "name": "featurestore", "type": "redis", "interface": [ "ipogif0" ], "shards": [ { - "name": "orchestrator_1", + "name": "featurestore_1", "hostname": "10.128.0.70", "port": 2424, "cluster": true, - "conf_file": "nodes-orchestrator_1-2424.conf", - "out_file": "/path/to/my-exp/orchestrator/orchestrator.out", - "err_file": 
"/path/to/my-exp/orchestrator/orchestrator.err", + "conf_file": "nodes-featurestore_1-2424.conf", + "out_file": "/path/to/my-exp/featurestore/featurestore.out", + "err_file": "/path/to/my-exp/featurestore/featurestore.err", "telemetry_metadata": { - "status_dir": "/path/to/my-exp/.smartsim/telemetry/my-exp/fd3cd1a8-cb8f-4f61-b847-73a8eb0881fa/database/orchestrator/orchestrator", + "status_dir": "/path/to/my-exp/.smartsim/telemetry/my-exp/fd3cd1a8-cb8f-4f61-b847-73a8eb0881fa/database/featurestore/featurestore", "step_id": "4121050.31+2", "task_id": "25241", "managed": true } }, { - "name": "orchestrator_2", + "name": "featurestore_2", "hostname": "10.128.0.71", "port": 2424, "cluster": true, - "conf_file": "nodes-orchestrator_2-2424.conf", - "out_file": "/path/to/my-exp/orchestrator/orchestrator.out", - "err_file": "/path/to/my-exp/orchestrator/orchestrator.err", + "conf_file": "nodes-featurestore_2-2424.conf", + "out_file": "/path/to/my-exp/featurestore/featurestore.out", + "err_file": "/path/to/my-exp/featurestore/featurestore.err", "telemetry_metadata": { - "status_dir": "/path/to/my-exp/.smartsim/telemetry/my-exp/fd3cd1a8-cb8f-4f61-b847-73a8eb0881fa/database/orchestrator/orchestrator", + "status_dir": "/path/to/my-exp/.smartsim/telemetry/my-exp/fd3cd1a8-cb8f-4f61-b847-73a8eb0881fa/database/featurestore/featurestore", "step_id": "4121050.31+2", "task_id": "25241", "managed": true } }, { - "name": "orchestrator_0", + "name": "featurestore_0", "hostname": "10.128.0.69", "port": 2424, "cluster": true, - "conf_file": "nodes-orchestrator_0-2424.conf", - "out_file": "/path/to/my-exp/orchestrator/orchestrator.out", - "err_file": "/path/to/my-exp/orchestrator/orchestrator.err", + "conf_file": "nodes-featurestore_0-2424.conf", + "out_file": "/path/to/my-exp/featurestore/featurestore.out", + "err_file": "/path/to/my-exp/featurestore/featurestore.err", "telemetry_metadata": { - "status_dir": 
"/path/to/my-exp/.smartsim/telemetry/my-exp/fd3cd1a8-cb8f-4f61-b847-73a8eb0881fa/database/orchestrator/orchestrator", + "status_dir": "/path/to/my-exp/.smartsim/telemetry/my-exp/fd3cd1a8-cb8f-4f61-b847-73a8eb0881fa/database/featurestore/featurestore", "step_id": "4121050.31+2", "task_id": "25241", "managed": true @@ -137,7 +137,7 @@ "run_id": "d65ae1df-cb5e-45d9-ab09-6fa641755997", "timestamp": 1697824127962219505, "model": [], - "orchestrator": [], + "featurestore": [], "ensemble": [ { "name": "my-ens", @@ -186,7 +186,7 @@ ], "Copy": [] }, - "colocated_db": {}, + "colocated_fs": {}, "telemetry_metadata": { "status_dir": "/path/to/my-exp/.smartsim/telemetry/my-exp/d65ae1df-cb5e-45d9-ab09-6fa641755997/ensemble/my-ens/my-ens_0", "step_id": "4121050.32", @@ -225,7 +225,7 @@ ], "Copy": [] }, - "colocated_db": {}, + "colocated_fs": {}, "telemetry_metadata": { "status_dir": "/path/to/my-exp/.smartsim/telemetry/my-exp/d65ae1df-cb5e-45d9-ab09-6fa641755997/ensemble/my-ens/my-ens_1", "step_id": "4121050.33", @@ -264,7 +264,7 @@ ], "Copy": [] }, - "colocated_db": {}, + "colocated_fs": {}, "telemetry_metadata": { "status_dir": "/path/to/my-exp/.smartsim/telemetry/my-exp/d65ae1df-cb5e-45d9-ab09-6fa641755997/ensemble/my-ens/my-ens_2", "step_id": "4121050.34", @@ -303,7 +303,7 @@ ], "Copy": [] }, - "colocated_db": {}, + "colocated_fs": {}, "telemetry_metadata": { "status_dir": "/path/to/my-exp/.smartsim/telemetry/my-exp/d65ae1df-cb5e-45d9-ab09-6fa641755997/ensemble/my-ens/my-ens_3", "step_id": "4121050.35", @@ -342,7 +342,7 @@ ], "Copy": [] }, - "colocated_db": {}, + "colocated_fs": {}, "telemetry_metadata": { "status_dir": "/path/to/my-exp/.smartsim/telemetry/my-exp/d65ae1df-cb5e-45d9-ab09-6fa641755997/ensemble/my-ens/my-ens_4", "step_id": "4121050.36", @@ -381,7 +381,7 @@ ], "Copy": [] }, - "colocated_db": {}, + "colocated_fs": {}, "telemetry_metadata": { "status_dir": "/path/to/my-exp/.smartsim/telemetry/my-exp/d65ae1df-cb5e-45d9-ab09-6fa641755997/ensemble/my-ens/my-ens_5", 
"step_id": "4121050.37", @@ -420,7 +420,7 @@ ], "Copy": [] }, - "colocated_db": {}, + "colocated_fs": {}, "telemetry_metadata": { "status_dir": "/path/to/my-exp/.smartsim/telemetry/my-exp/d65ae1df-cb5e-45d9-ab09-6fa641755997/ensemble/my-ens/my-ens_6", "step_id": "4121050.38", @@ -459,7 +459,7 @@ ], "Copy": [] }, - "colocated_db": {}, + "colocated_fs": {}, "telemetry_metadata": { "status_dir": "/path/to/my-exp/.smartsim/telemetry/my-exp/d65ae1df-cb5e-45d9-ab09-6fa641755997/ensemble/my-ens/my-ens_7", "step_id": "4121050.39", @@ -501,20 +501,20 @@ "Configure": [], "Copy": [] }, - "colocated_db": { + "colocated_fs": { "settings": { "port": 5757, "ifname": "lo", "cpus": 1, "custom_pinning": "0", "debug": false, - "db_identifier": "COLO", + "fs_identifier": "COLO", "rai_args": { "threads_per_queue": null, "inter_op_parallelism": null, "intra_op_parallelism": null }, - "extra_db_args": {} + "extra_fs_args": {} }, "scripts": [], "models": [ @@ -536,61 +536,61 @@ "err_file": "/path/to/my-exp/my-model/my-model.err" } ], - "orchestrator": [], + "featurestore": [], "ensemble": [] }, { "run_id": "b33a5d27-6822-4795-8e0e-cfea18551fa4", "timestamp": 1697835261956135240, "model": [], - "orchestrator": [ + "featurestore": [ { - "name": "orchestrator", + "name": "featurestore", "type": "redis", "interface": [ "ipogif0" ], "shards": [ { - "name": "orchestrator_0", + "name": "featurestore_0", "hostname": "10.128.0.2", "port": 2424, "cluster": true, - "conf_file": "nodes-orchestrator_0-2424.conf", - "out_file": "/path/to/my-exp/orchestrator/orchestrator.out", - "err_file": "/path/to/my-exp/orchestrator/orchestrator.err", + "conf_file": "nodes-featurestore_0-2424.conf", + "out_file": "/path/to/my-exp/featurestore/featurestore.out", + "err_file": "/path/to/my-exp/featurestore/featurestore.err", "telemetry_metadata": { - "status_dir": "/path/to/my-exp/.smartsim/telemetry/my-exp/b33a5d27-6822-4795-8e0e-cfea18551fa4/database/orchestrator/orchestrator", + "status_dir": 
"/path/to/my-exp/.smartsim/telemetry/my-exp/b33a5d27-6822-4795-8e0e-cfea18551fa4/database/featurestore/featurestore", "step_id": "4121904.1+2", "task_id": "28289", "managed": true } }, { - "name": "orchestrator_2", + "name": "featurestore_2", "hostname": "10.128.0.4", "port": 2424, "cluster": true, - "conf_file": "nodes-orchestrator_2-2424.conf", - "out_file": "/path/to/my-exp/orchestrator/orchestrator.out", - "err_file": "/path/to/my-exp/orchestrator/orchestrator.err", + "conf_file": "nodes-featurestore_2-2424.conf", + "out_file": "/path/to/my-exp/featurestore/featurestore.out", + "err_file": "/path/to/my-exp/featurestore/featurestore.err", "telemetry_metadata": { - "status_dir": "/path/to/my-exp/.smartsim/telemetry/my-exp/b33a5d27-6822-4795-8e0e-cfea18551fa4/database/orchestrator/orchestrator", + "status_dir": "/path/to/my-exp/.smartsim/telemetry/my-exp/b33a5d27-6822-4795-8e0e-cfea18551fa4/database/featurestore/featurestore", "step_id": "4121904.1+2", "task_id": "28289", "managed": true } }, { - "name": "orchestrator_1", + "name": "featurestore_1", "hostname": "10.128.0.3", "port": 2424, "cluster": true, - "conf_file": "nodes-orchestrator_1-2424.conf", - "out_file": "/path/to/my-exp/orchestrator/orchestrator.out", - "err_file": "/path/to/my-exp/orchestrator/orchestrator.err", + "conf_file": "nodes-featurestore_1-2424.conf", + "out_file": "/path/to/my-exp/featurestore/featurestore.out", + "err_file": "/path/to/my-exp/featurestore/featurestore.err", "telemetry_metadata": { - "status_dir": "/path/to/my-exp/.smartsim/telemetry/my-exp/b33a5d27-6822-4795-8e0e-cfea18551fa4/database/orchestrator/orchestrator", + "status_dir": "/path/to/my-exp/.smartsim/telemetry/my-exp/b33a5d27-6822-4795-8e0e-cfea18551fa4/database/featurestore/featurestore", "step_id": "4121904.1+2", "task_id": "28289", "managed": true @@ -605,7 +605,7 @@ "run_id": "45772df2-fd80-43fd-adf0-d5e319870182", "timestamp": 1697835287798613875, "model": [], - "orchestrator": [], + "featurestore": [], 
"ensemble": [ { "name": "my-ens", @@ -654,7 +654,7 @@ ], "Copy": [] }, - "colocated_db": {}, + "colocated_fs": {}, "telemetry_metadata": { "status_dir": "/path/to/my-exp/.smartsim/telemetry/my-exp/45772df2-fd80-43fd-adf0-d5e319870182/ensemble/my-ens/my-ens_0", "step_id": "4121904.2", @@ -693,7 +693,7 @@ ], "Copy": [] }, - "colocated_db": {}, + "colocated_fs": {}, "telemetry_metadata": { "status_dir": "/path/to/my-exp/.smartsim/telemetry/my-exp/45772df2-fd80-43fd-adf0-d5e319870182/ensemble/my-ens/my-ens_1", "step_id": "4121904.3", @@ -732,7 +732,7 @@ ], "Copy": [] }, - "colocated_db": {}, + "colocated_fs": {}, "telemetry_metadata": { "status_dir": "/path/to/my-exp/.smartsim/telemetry/my-exp/45772df2-fd80-43fd-adf0-d5e319870182/ensemble/my-ens/my-ens_2", "step_id": "4121904.4", @@ -771,7 +771,7 @@ ], "Copy": [] }, - "colocated_db": {}, + "colocated_fs": {}, "telemetry_metadata": { "status_dir": "/path/to/my-exp/.smartsim/telemetry/my-exp/45772df2-fd80-43fd-adf0-d5e319870182/ensemble/my-ens/my-ens_3", "step_id": "4121904.5", @@ -810,7 +810,7 @@ ], "Copy": [] }, - "colocated_db": {}, + "colocated_fs": {}, "telemetry_metadata": { "status_dir": "/path/to/my-exp/.smartsim/telemetry/my-exp/45772df2-fd80-43fd-adf0-d5e319870182/ensemble/my-ens/my-ens_4", "step_id": "4121904.6", @@ -849,7 +849,7 @@ ], "Copy": [] }, - "colocated_db": {}, + "colocated_fs": {}, "telemetry_metadata": { "status_dir": "/path/to/my-exp/.smartsim/telemetry/my-exp/45772df2-fd80-43fd-adf0-d5e319870182/ensemble/my-ens/my-ens_5", "step_id": "4121904.7", @@ -888,7 +888,7 @@ ], "Copy": [] }, - "colocated_db": {}, + "colocated_fs": {}, "telemetry_metadata": { "status_dir": "/path/to/my-exp/.smartsim/telemetry/my-exp/45772df2-fd80-43fd-adf0-d5e319870182/ensemble/my-ens/my-ens_6", "step_id": "4121904.8", @@ -927,7 +927,7 @@ ], "Copy": [] }, - "colocated_db": {}, + "colocated_fs": {}, "telemetry_metadata": { "status_dir": 
"/path/to/my-exp/.smartsim/telemetry/my-exp/45772df2-fd80-43fd-adf0-d5e319870182/ensemble/my-ens/my-ens_7", "step_id": "4121904.9", @@ -942,4 +942,4 @@ ] } ] -} +} \ No newline at end of file diff --git a/tests/test_containers.py b/tests/test_containers.py index 98fa5e1bb..18651183b 100644 --- a/tests/test_containers.py +++ b/tests/test_containers.py @@ -32,7 +32,7 @@ import pytest from smartsim import Experiment -from smartsim.database import Orchestrator +from smartsim.database import FeatureStore from smartsim.entity import Ensemble from smartsim.settings.containers import Singularity from smartsim.status import SmartSimStatus @@ -155,10 +155,10 @@ def test_singularity_smartredis(test_dir, fileutils, wlmutils): "smartredis_ensemble_exchange", exp_path=test_dir, launcher="local" ) - # create and start a database - orc = Orchestrator(port=wlmutils.get_test_port()) - exp.generate(orc) - exp.start(orc, block=False) + # create and start a feature store + feature_store = FeatureStore(port=wlmutils.get_test_port()) + exp.generate(feature_store) + exp.start(feature_store, block=False) container = Singularity(containerURI) @@ -187,10 +187,10 @@ def test_singularity_smartredis(test_dir, fileutils, wlmutils): # get and confirm statuses statuses = exp.get_status(ensemble) if not all([stat == SmartSimStatus.STATUS_COMPLETED for stat in statuses]): - exp.stop(orc) + exp.stop(feature_store) assert False # client ensemble failed - # stop the orchestrator - exp.stop(orc) + # stop the FeatureStore + exp.stop(feature_store) print(exp.summary()) diff --git a/tests/test_controller.py b/tests/test_controller.py index 149872708..34e20aabc 100644 --- a/tests/test_controller.py +++ b/tests/test_controller.py @@ -30,7 +30,7 @@ from smartsim._core.control.controller import Controller from smartsim._core.launcher.step import Step -from smartsim.database.orchestrator import Orchestrator +from smartsim.database.orchestrator import FeatureStore from smartsim.entity.ensemble import Ensemble 
from smartsim.settings.slurmSettings import SbatchSettings, SrunSettings @@ -40,7 +40,7 @@ bs = SbatchSettings() ens = Ensemble("ens", params={}, run_settings=rs, batch_settings=bs, replicas=3) -orc = Orchestrator(db_nodes=3, batch=True, launcher="slurm", run_command="srun") +feature_store = FeatureStore(fs_nodes=3, batch=True, launcher="slurm", run_command="srun") class MockStep(Step): @@ -58,7 +58,7 @@ def get_launch_cmd(self): "collection", [ pytest.param(ens, id="Ensemble"), - pytest.param(orc, id="Database"), + pytest.param(feature_store, id="FeatureStore"), ], ) def test_controller_batch_step_creation_preserves_entity_order(collection, monkeypatch): diff --git a/tests/test_controller_errors.py b/tests/test_controller_errors.py index f1e6fef9f..c3b5df4dc 100644 --- a/tests/test_controller_errors.py +++ b/tests/test_controller_errors.py @@ -29,7 +29,7 @@ from smartsim._core.control import Controller, Manifest from smartsim._core.launcher.step import Step -from smartsim.database import Orchestrator +from smartsim.database import FeatureStore from smartsim.entity import Model from smartsim.entity.ensemble import Ensemble from smartsim.error import SmartSimError, SSUnsupportedError @@ -47,15 +47,15 @@ ens = Ensemble("ensemble_name", params={}, run_settings=entity_settings, replicas=2) # Ensemble entity slightly different but with same name ens_2 = Ensemble("ensemble_name", params={}, run_settings=entity_settings, replicas=3) -orc = Orchestrator(db_nodes=3, batch=True, launcher="slurm", run_command="srun") +feature_store = FeatureStore(fs_nodes=3, batch=True, launcher="slurm", run_command="srun") -def test_finished_entity_orc_error(): - """Orchestrators are never 'finished', either run forever or stopped by user""" - orc = Orchestrator() +def test_finished_entity_feature_store_error(): + """FeatureStores are never 'finished', either run forever or stopped by user""" + feature_store = FeatureStore() cont = Controller(launcher="local") with pytest.raises(TypeError): 
- cont.finished(orc) + cont.finished(feature_store) def test_finished_entity_wrong_type(): @@ -100,26 +100,26 @@ def test_no_launcher(): cont.init_launcher(None) -def test_wrong_orchestrator(wlmutils): +def test_wrong_feature_store(wlmutils): # lo interface to avoid warning from SmartSim - orc = Orchestrator( + feature_store = FeatureStore( wlmutils.get_test_port(), - db_nodes=3, + fs_nodes=3, interface="lo", run_command="aprun", launcher="pbs", ) cont = Controller(launcher="local") - manifest = Manifest(orc) + manifest = Manifest(feature_store) with pytest.raises(SmartSimError): cont._launch("exp_name", "exp_path", manifest) -def test_bad_orc_checkpoint(): +def test_bad_feature_store_checkpoint(): checkpoint = "./bad-checkpoint" cont = Controller(launcher="local") with pytest.raises(FileNotFoundError): - cont.reload_saved_db(checkpoint) + cont.reload_saved_fs(checkpoint) class MockStep(Step): @@ -136,12 +136,12 @@ def get_launch_cmd(self): [ pytest.param(ens, id="Ensemble_running"), pytest.param(model, id="Model_running"), - pytest.param(orc, id="Orch_running"), + pytest.param(feature_store, id="Feature_store_running"), ], ) def test_duplicate_running_entity(test_dir, wlmutils, entity): """This test validates that users cannot reuse entity names - that are running in JobManager.jobs or JobManager.db_jobs + that are running in JobManager.jobs or JobManager.fs_jobs """ step_settings = RunSettings("echo") step = MockStep("mock-step", test_dir, step_settings) @@ -170,17 +170,17 @@ def test_restarting_entity(test_dir, wlmutils, entity): controller._launch_step(step, entity=entity) -def test_restarting_orch(test_dir, wlmutils): - """Validate restarting a completed Orchestrator job""" +def test_restarting_feature_store(test_dir, wlmutils): + """Validate restarting a completed FeatureStore job""" step_settings = RunSettings("echo") step = MockStep("mock-step", test_dir, step_settings) step.meta["status_dir"] = test_dir - orc.path = test_dir + feature_store.path = 
test_dir test_launcher = wlmutils.get_test_launcher() controller = Controller(test_launcher) - controller._jobs.add_job(orc.name, job_id="1234", entity=orc) - controller._jobs.move_to_completed(controller._jobs.db_jobs.get(orc.name)) - controller._launch_step(step, entity=orc) + controller._jobs.add_job(feature_store.name, job_id="1234", entity=feature_store) + controller._jobs.move_to_completed(controller._jobs.fs_jobs.get(feature_store.name)) + controller._launch_step(step, entity=feature_store) @pytest.mark.parametrize( diff --git a/tests/test_dbnode.py b/tests/test_dbnode.py index 227572ac9..231961c33 100644 --- a/tests/test_dbnode.py +++ b/tests/test_dbnode.py @@ -33,38 +33,38 @@ import pytest from smartsim import Experiment -from smartsim.database import Orchestrator -from smartsim.entity.dbnode import DBNode, LaunchedShardData +from smartsim.database import FeatureStore +from smartsim.entity.dbnode import FSNode, LaunchedShardData from smartsim.error.errors import SmartSimError # The tests in this file belong to the group_a group pytestmark = pytest.mark.group_a -def test_parse_db_host_error(): - orc = Orchestrator() - orc.entities[0].path = "not/a/path" - # Fail to obtain database hostname +def test_parse_fs_host_error(): + feature_store = FeatureStore() + feature_store.entities[0].path = "not/a/path" + # Fail to obtain feature store hostname with pytest.raises(SmartSimError): - orc.entities[0].host + feature_store.entities[0].host def test_hosts(test_dir, wlmutils): exp_name = "test_hosts" exp = Experiment(exp_name, exp_path=test_dir) - orc = Orchestrator(port=wlmutils.get_test_port(), interface="lo", launcher="local") - orc.set_path(test_dir) - exp.start(orc) + feature_store = FeatureStore(port=wlmutils.get_test_port(), interface="lo", launcher="local") + feature_store.set_path(test_dir) + exp.start(feature_store) hosts = [] try: - hosts = orc.hosts - assert len(hosts) == orc.db_nodes == 1 + hosts = feature_store.hosts + assert len(hosts) == 
feature_store.fs_nodes == 1 finally: - # stop the database even if there is an error raised - exp.stop(orc) - orc.remove_stale_files() + # stop the feature store even if there is an error raised + exp.stop(feature_store) + feature_store.remove_stale_files() def _random_shard_info(): @@ -91,7 +91,7 @@ def test_launched_shard_info_can_be_serialized(): @pytest.mark.parametrize("limit", [None, 1]) -def test_db_node_can_parse_launched_shard_info(limit): +def test_fs_node_can_parse_launched_shard_info(limit): rand_shards = [_random_shard_info() for _ in range(3)] with io.StringIO(textwrap.dedent("""\ This is some file like str @@ -100,7 +100,7 @@ def test_db_node_can_parse_launched_shard_info(limit): SMARTSIM_ORC_SHARD_INFO: {} ^^^^^^^^^^^^^^^^^^^^^^^ We should be able to parse the serialized - launched db info from this file if the line is + launched fs info from this file if the line is prefixed with this tag. Here are two more for good measure: @@ -109,28 +109,28 @@ def test_db_node_can_parse_launched_shard_info(limit): All other lines should be ignored. 
""").format(*(json.dumps(s.to_dict()) for s in rand_shards))) as stream: - parsed_shards = DBNode._parse_launched_shard_info_from_iterable(stream, limit) + parsed_shards = FSNode._parse_launched_shard_info_from_iterable(stream, limit) if limit is not None: rand_shards = rand_shards[:limit] assert rand_shards == parsed_shards def test_set_host(): - orc = Orchestrator() - orc.entities[0].set_hosts(["host"]) - assert orc.entities[0].host == "host" + feature_store = FeatureStore() + feature_store.entities[0].set_hosts(["host"]) + assert feature_store.entities[0].host == "host" @pytest.mark.parametrize("nodes, mpmd", [[3, False], [3, True], [1, False]]) -def test_db_id_and_name(mpmd, nodes, wlmutils): +def test_fs_id_and_name(mpmd, nodes, wlmutils): if nodes > 1 and wlmutils.get_test_launcher() not in pytest.wlm_options: - pytest.skip(reason="Clustered DB can only be checked on WLMs") - orc = Orchestrator( - db_identifier="test_db", - db_nodes=nodes, + pytest.skip(reason="Clustered fs can only be checked on WLMs") + feature_store = FeatureStore( + fs_identifier="test_fs", + fs_nodes=nodes, single_cmd=mpmd, launcher=wlmutils.get_test_launcher(), ) - for i, node in enumerate(orc.entities): - assert node.name == f"{orc.name}_{i}" - assert node.db_identifier == orc.db_identifier + for i, node in enumerate(feature_store.entities): + assert node.name == f"{feature_store.name}_{i}" + assert node.fs_identifier == feature_store.fs_identifier diff --git a/tests/test_experiment.py b/tests/test_experiment.py index 32c642eb4..bd609b530 100644 --- a/tests/test_experiment.py +++ b/tests/test_experiment.py @@ -35,7 +35,7 @@ from smartsim._core.config import CONFIG from smartsim._core.config.config import Config from smartsim._core.utils import serialize -from smartsim.database import Orchestrator +from smartsim.database import FeatureStore from smartsim.entity import Model from smartsim.error import SmartSimError from smartsim.error.errors import SSUnsupportedError @@ -248,21 +248,21 
@@ def test_error_on_cobalt() -> None: exp = Experiment("cobalt_exp", launcher="cobalt") -def test_default_orch_path( +def test_default_feature_store_path( monkeypatch: pytest.MonkeyPatch, test_dir: str, wlmutils: "conftest.WLMUtils" ) -> None: - """Ensure the default file structure is created for Orchestrator""" + """Ensure the default file structure is created for FeatureStore""" - exp_name = "default-orch-path" + exp_name = "default-feature-store-path" exp = Experiment(exp_name, launcher=wlmutils.get_test_launcher(), exp_path=test_dir) monkeypatch.setattr(exp._control, "start", lambda *a, **kw: ...) - db = exp.create_database( + db = exp.create_feature_store( port=wlmutils.get_test_port(), interface=wlmutils.get_test_interface() ) exp.start(db) - orch_path = pathlib.Path(test_dir) / db.name - assert orch_path.exists() - assert db.path == str(orch_path) + feature_store_path = pathlib.Path(test_dir) / db.name + assert feature_store_path.exists() + assert db.path == str(feature_store_path) def test_default_model_path( @@ -303,24 +303,24 @@ def test_default_ensemble_path( assert member.path == str(ensemble_path / member.name) -def test_user_orch_path( +def test_user_feature_store_path( monkeypatch: pytest.MonkeyPatch, test_dir: str, wlmutils: "conftest.WLMUtils" ) -> None: - """Ensure a relative path is used to created Orchestrator folder""" + """Ensure a relative path is used to created FeatureStore folder""" - exp_name = "default-orch-path" + exp_name = "default-feature-store-path" exp = Experiment(exp_name, launcher="local", exp_path=test_dir) monkeypatch.setattr(exp._control, "start", lambda *a, **kw: ...) 
- db = exp.create_database( + db = exp.create_feature_store( port=wlmutils.get_test_port(), interface=wlmutils.get_test_interface(), path="./testing_folder1234", ) exp.start(db) - orch_path = pathlib.Path(osp.abspath("./testing_folder1234")) - assert orch_path.exists() - assert db.path == str(orch_path) - shutil.rmtree(orch_path) + feature_store_path = pathlib.Path(osp.abspath("./testing_folder1234")) + assert feature_store_path.exists() + assert db.path == str(feature_store_path) + shutil.rmtree(feature_store_path) def test_default_model_with_path( diff --git a/tests/test_generator.py b/tests/test_generator.py index fd9a5b836..a9891a0e4 100644 --- a/tests/test_generator.py +++ b/tests/test_generator.py @@ -32,7 +32,7 @@ from smartsim import Experiment from smartsim._core.generation import Generator -from smartsim.database import Orchestrator +from smartsim.database import FeatureStore from smartsim.settings import RunSettings # The tests in this file belong to the group_a group @@ -123,21 +123,21 @@ def test_full_exp(fileutils, test_dir, wlmutils): script = fileutils.get_test_conf_path("sleep.py") model.attach_generator_files(to_copy=script) - orc = Orchestrator(wlmutils.get_test_port()) + feature_store = FeatureStore(wlmutils.get_test_port()) params = {"THERMO": [10, 20, 30], "STEPS": [10, 20, 30]} ensemble = exp.create_ensemble("test_ens", params=params, run_settings=rs) config = get_gen_file(fileutils, "in.atm") ensemble.attach_generator_files(to_configure=config) - exp.generate(orc, ensemble, model) + exp.generate(feature_store, ensemble, model) # test for ensemble assert osp.isdir(osp.join(test_dir, "test_ens/")) for i in range(9): assert osp.isdir(osp.join(test_dir, "test_ens/test_ens_" + str(i))) - # test for orc dir - assert osp.isdir(osp.join(test_dir, orc.name)) + # test for feature_store dir + assert osp.isdir(osp.join(test_dir, feature_store.name)) # test for model file assert osp.isdir(osp.join(test_dir, "model")) diff --git a/tests/test_indirect.py 
b/tests/test_indirect.py index 814302968..7766b5825 100644 --- a/tests/test_indirect.py +++ b/tests/test_indirect.py @@ -54,7 +54,7 @@ [ pytest.param("indirect.py", {"+name", "+command", "+entity_type", "+telemetry_dir", "+working_dir"}, id="no args"), pytest.param("indirect.py -c echo +entity_type ttt +telemetry_dir ddd +output_file ooo +working_dir www +error_file eee", {"+command"}, id="cmd typo"), - pytest.param("indirect.py -t orchestrator +command ccc +telemetry_dir ddd +output_file ooo +working_dir www +error_file eee", {"+entity_type"}, id="etype typo"), + pytest.param("indirect.py -t featurestore +command ccc +telemetry_dir ddd +output_file ooo +working_dir www +error_file eee", {"+entity_type"}, id="etype typo"), pytest.param("indirect.py -d /foo/bar +entity_type ttt +command ccc +output_file ooo +working_dir www +error_file eee", {"+telemetry_dir"}, id="dir typo"), pytest.param("indirect.py +entity_type ttt +telemetry_dir ddd +output_file ooo +working_dir www +error_file eee", {"+command"}, id="no cmd"), pytest.param("indirect.py +command ccc +telemetry_dir ddd +output_file ooo +working_dir www +error_file eee", {"+entity_type"}, id="no etype"), diff --git a/tests/test_interrupt.py b/tests/test_interrupt.py index c38ae0225..abb8e6dc0 100644 --- a/tests/test_interrupt.py +++ b/tests/test_interrupt.py @@ -75,9 +75,9 @@ def test_interrupt_blocked_jobs(test_dir): time.sleep(2) # allow time for jobs to be stopped active_jobs = exp._control._jobs.jobs - active_db_jobs = exp._control._jobs.db_jobs + active_fs_jobs = exp._control._jobs.fs_jobs completed_jobs = exp._control._jobs.completed - assert len(active_jobs) + len(active_db_jobs) == 0 + assert len(active_jobs) + len(active_fs_jobs) == 0 assert len(completed_jobs) == num_jobs @@ -120,7 +120,7 @@ def test_interrupt_multi_experiment_unblocked_jobs(test_dir): time.sleep(2) # allow time for jobs to be stopped for i, experiment in enumerate(experiments): active_jobs = experiment._control._jobs.jobs - 
active_db_jobs = experiment._control._jobs.db_jobs + active_fs_jobs = experiment._control._jobs.fs_jobs completed_jobs = experiment._control._jobs.completed - assert len(active_jobs) + len(active_db_jobs) == 0 + assert len(active_jobs) + len(active_fs_jobs) == 0 assert len(completed_jobs) == jobs_per_experiment[i] diff --git a/tests/test_launch_errors.py b/tests/test_launch_errors.py index 21b3184e5..1676b8029 100644 --- a/tests/test_launch_errors.py +++ b/tests/test_launch_errors.py @@ -28,7 +28,7 @@ import pytest from smartsim import Experiment -from smartsim.database import Orchestrator +from smartsim.database import FeatureStore from smartsim.error import SSUnsupportedError from smartsim.settings import JsrunSettings, RunSettings from smartsim.status import SmartSimStatus @@ -61,18 +61,18 @@ def test_model_failure(fileutils, test_dir): assert all([stat == SmartSimStatus.STATUS_FAILED for stat in statuses]) -def test_orchestrator_relaunch(test_dir, wlmutils): - """Test when users try to launch second orchestrator""" - exp_name = "test-orc-on-relaunch" +def test_feature_store_relaunch(test_dir, wlmutils): + """Test when users try to launch second FeatureStore""" + exp_name = "test-feature-store-on-relaunch" exp = Experiment(exp_name, launcher="local", exp_path=test_dir) - orc = Orchestrator(port=wlmutils.get_test_port(), db_identifier="orch_1") - orc.set_path(test_dir) - orc_1 = Orchestrator(port=wlmutils.get_test_port() + 1, db_identifier="orch_2") - orc_1.set_path(test_dir) + feature_store = FeatureStore(port=wlmutils.get_test_port(), fs_identifier="feature_store_1") + feature_store.set_path(test_dir) + feature_store_1 = FeatureStore(port=wlmutils.get_test_port() + 1, fs_identifier="feature_store_2") + feature_store_1.set_path(test_dir) try: - exp.start(orc) - exp.start(orc_1) + exp.start(feature_store) + exp.start(feature_store_1) finally: - exp.stop(orc) - exp.stop(orc_1) + exp.stop(feature_store) + exp.stop(feature_store_1) diff --git 
a/tests/test_lsf_settings.py b/tests/test_lsf_settings.py index fcb351648..64dbd001c 100644 --- a/tests/test_lsf_settings.py +++ b/tests/test_lsf_settings.py @@ -144,7 +144,7 @@ def test_jsrun_mpmd(): def test_catch_colo_mpmd(): settings = JsrunSettings("python") - settings.colocated_db_settings = {"port": 6379, "cpus": 1} + settings.colocated_fs_settings = {"port": 6379, "cpus": 1} settings_2 = JsrunSettings("python") with pytest.raises(SSUnsupportedError): settings.make_mpmd(settings_2) diff --git a/tests/test_manifest.py b/tests/test_manifest.py index c26868ebb..9475ea42f 100644 --- a/tests/test_manifest.py +++ b/tests/test_manifest.py @@ -40,8 +40,8 @@ from smartsim._core.control.manifest import ( _LaunchedManifestMetadata as LaunchedManifestMetadata, ) -from smartsim.database import Orchestrator -from smartsim.entity.dbobject import DBModel, DBScript +from smartsim.database import FeatureStore +from smartsim.entity.dbobject import FSModel, FSScript from smartsim.error import SmartSimError from smartsim.settings import RunSettings @@ -58,21 +58,21 @@ model_2 = exp.create_model("model_1", run_settings=rs) ensemble = exp.create_ensemble("ensemble", run_settings=rs, replicas=1) -orc = Orchestrator() -orc_1 = deepcopy(orc) -orc_1.name = "orc2" +feature_store = FeatureStore() +feature_store_1 = deepcopy(feature_store) +feature_store_1.name = "feature_store2" -db_script = DBScript("some-script", "def main():\n print('hello world')\n") -db_model = DBModel("some-model", "TORCH", b"some-model-bytes") +fs_script = FSScript("some-script", "def main():\n print('hello world')\n") +fs_model = FSModel("some-model", "TORCH", b"some-model-bytes") def test_separate(): - manifest = Manifest(model, ensemble, orc) + manifest = Manifest(model, ensemble, feature_store) assert manifest.models[0] == model assert len(manifest.models) == 1 assert manifest.ensembles[0] == ensemble assert len(manifest.ensembles) == 1 - assert manifest.dbs[0] == orc + assert manifest.fss[0] == feature_store 
def test_separate_type(): @@ -106,51 +106,51 @@ class Person: @pytest.mark.parametrize( - "patch, has_db_objects", + "patch, has_fs_objects", [ - pytest.param((), False, id="No DB Objects"), - pytest.param((model, "_db_models", [db_model]), True, id="Model w/ DB Model"), + pytest.param((), False, id="No fs Objects"), + pytest.param((model, "_fs_models", [fs_model]), True, id="Model w/ fs Model"), pytest.param( - (model, "_db_scripts", [db_script]), True, id="Model w/ DB Script" + (model, "_fs_scripts", [fs_script]), True, id="Model w/ fs Script" ), pytest.param( - (ensemble, "_db_models", [db_model]), True, id="Ensemble w/ DB Model" + (ensemble, "_fs_models", [fs_model]), True, id="Ensemble w/ fs Model" ), pytest.param( - (ensemble, "_db_scripts", [db_script]), True, id="Ensemble w/ DB Script" + (ensemble, "_fs_scripts", [fs_script]), True, id="Ensemble w/ fs Script" ), pytest.param( - (ensemble.entities[0], "_db_models", [db_model]), + (ensemble.entities[0], "_fs_models", [fs_model]), True, - id="Ensemble Member w/ DB Model", + id="Ensemble Member w/ fs Model", ), pytest.param( - (ensemble.entities[0], "_db_scripts", [db_script]), + (ensemble.entities[0], "_fs_scripts", [fs_script]), True, - id="Ensemble Member w/ DB Script", + id="Ensemble Member w/ fs Script", ), ], ) -def test_manifest_detects_db_objects(monkeypatch, patch, has_db_objects): +def test_manifest_detects_fs_objects(monkeypatch, patch, has_fs_objects): if patch: monkeypatch.setattr(*patch) - assert Manifest(model, ensemble).has_db_objects == has_db_objects + assert Manifest(model, ensemble).has_fs_objects == has_fs_objects def test_launched_manifest_transform_data(): models = [(model, 1), (model_2, 2)] ensembles = [(ensemble, [(m, i) for i, m in enumerate(ensemble.entities)])] - dbs = [(orc, [(n, i) for i, n in enumerate(orc.entities)])] + fss = [(feature_store, [(n, i) for i, n in enumerate(feature_store.entities)])] launched = LaunchedManifest( metadata=LaunchedManifestMetadata("name", "path", 
"launcher", "run_id"), models=models, ensembles=ensembles, - databases=dbs, + featurestores=fss, ) transformed = launched.map(lambda x: str(x)) assert transformed.models == tuple((m, str(i)) for m, i in models) assert transformed.ensembles[0][1] == tuple((m, str(i)) for m, i in ensembles[0][1]) - assert transformed.databases[0][1] == tuple((n, str(i)) for n, i in dbs[0][1]) + assert transformed.featurestores[0][1] == tuple((n, str(i)) for n, i in fss[0][1]) def test_launched_manifest_builder_correctly_maps_data(): @@ -158,12 +158,12 @@ def test_launched_manifest_builder_correctly_maps_data(): lmb.add_model(model, 1) lmb.add_model(model_2, 1) lmb.add_ensemble(ensemble, [i for i in range(len(ensemble.entities))]) - lmb.add_database(orc, [i for i in range(len(orc.entities))]) + lmb.add_feature_store(feature_store, [i for i in range(len(feature_store.entities))]) manifest = lmb.finalize() assert len(manifest.models) == 2 assert len(manifest.ensembles) == 1 - assert len(manifest.databases) == 1 + assert len(manifest.featurestores) == 1 def test_launced_manifest_builder_raises_if_lens_do_not_match(): @@ -171,7 +171,7 @@ def test_launced_manifest_builder_raises_if_lens_do_not_match(): with pytest.raises(ValueError): lmb.add_ensemble(ensemble, list(range(123))) with pytest.raises(ValueError): - lmb.add_database(orc, list(range(123))) + lmb.add_feature_store(feature_store, list(range(123))) def test_launched_manifest_builer_raises_if_attaching_data_to_empty_collection( diff --git a/tests/test_model.py b/tests/test_model.py index 64a68b299..de912d169 100644 --- a/tests/test_model.py +++ b/tests/test_model.py @@ -71,7 +71,7 @@ def test_catch_colo_mpmd_model(): # make it colocated which should raise and error with pytest.raises(SSUnsupportedError): - model.colocate_db() + model.colocate_fs() def test_attach_batch_settings_to_model(): diff --git a/tests/test_mpi_settings.py b/tests/test_mpi_settings.py index 7d8db6e75..40c3f4ce0 100644 --- a/tests/test_mpi_settings.py +++ 
b/tests/test_mpi_settings.py @@ -173,7 +173,7 @@ def test_mpi_add_mpmd(): def test_catch_colo_mpmd(): settings = _BaseMPISettings(*default_mpi_args, **default_mpi_kwargs) - settings.colocated_db_settings = {"port": 6379, "cpus": 1} + settings.colocated_fs_settings = {"port": 6379, "cpus": 1} settings_2 = _BaseMPISettings(*default_mpi_args, **default_mpi_kwargs) with pytest.raises(SSUnsupportedError): settings.make_mpmd(settings_2) diff --git a/tests/test_multidb.py b/tests/test_multidb.py index 13c8d86e7..959abb294 100644 --- a/tests/test_multidb.py +++ b/tests/test_multidb.py @@ -28,7 +28,7 @@ import pytest from smartsim import Experiment -from smartsim.database import Orchestrator +from smartsim.database import FeatureStore from smartsim.entity.entity import SmartSimEntity from smartsim.error.errors import SSDBIDConflictError from smartsim.log import get_logger @@ -40,7 +40,7 @@ logger = get_logger(__name__) -supported_dbs = ["uds", "tcp"] +supported_fss = ["uds", "tcp"] on_wlm = (pytest.test_launcher in pytest.wlm_options,) @@ -69,73 +69,73 @@ def check_not_failed(exp, *args): assert all(stat is not SmartSimStatus.STATUS_FAILED for stat in statuses) -@pytest.mark.parametrize("db_type", supported_dbs) -def test_db_identifier_standard_then_colo_error( - fileutils, wlmutils, coloutils, db_type, test_dir +@pytest.mark.parametrize("fs_type", supported_fss) +def test_fs_identifier_standard_then_colo_error( + fileutils, wlmutils, coloutils, fs_type, test_dir ): - """Test that it is possible to create_database then colocate_db_uds/colocate_db_tcp - with unique db_identifiers""" + """Test that it is possible to create_feature_store then colocate_fs_uds/colocate_fs_tcp + with unique fs_identifiers""" # Set experiment name - exp_name = "test_db_identifier_standard_then_colo" + exp_name = "test_fs_identifier_standard_then_colo" # Retrieve parameters from testing environment test_launcher = wlmutils.get_test_launcher() test_interface = wlmutils.get_test_interface() test_port 
= wlmutils.get_test_port() - test_script = fileutils.get_test_conf_path("smartredis/db_id_err.py") + test_script = fileutils.get_test_conf_path("smartredis/fs_id_err.py") # Create SmartSim Experiment exp = Experiment(exp_name, launcher=test_launcher, exp_path=test_dir) - # create regular database - orc = exp.create_database( + # create regular feature store + feature_store = exp.create_feature_store( port=test_port, interface=test_interface, - db_identifier="testdb_colo", + fs_identifier="testdb_colo", hosts=choose_host(wlmutils), ) - assert orc.name == "testdb_colo" + assert feature_store.name == "testdb_colo" - db_args = { + fs_args = { "port": test_port + 1, - "db_cpus": 1, + "fs_cpus": 1, "debug": True, - "db_identifier": "testdb_colo", + "fs_identifier": "testdb_colo", } smartsim_model = coloutils.setup_test_colo( - fileutils, db_type, exp, test_script, db_args, on_wlm=on_wlm + fileutils, fs_type, exp, test_script, fs_args, on_wlm=on_wlm ) assert ( - smartsim_model.run_settings.colocated_db_settings["db_identifier"] + smartsim_model.run_settings.colocated_fs_settings["fs_identifier"] == "testdb_colo" ) - with make_entity_context(exp, orc), make_entity_context(exp, smartsim_model): - exp.start(orc) + with make_entity_context(exp, feature_store), make_entity_context(exp, smartsim_model): + exp.start(feature_store) with pytest.raises(SSDBIDConflictError) as ex: exp.start(smartsim_model) assert ( - "has already been used. Pass in a unique name for db_identifier" + "has already been used. 
Pass in a unique name for fs_identifier" in ex.value.args[0] ) - check_not_failed(exp, orc) + check_not_failed(exp, feature_store) -@pytest.mark.parametrize("db_type", supported_dbs) -def test_db_identifier_colo_then_standard( - fileutils, wlmutils, coloutils, db_type, test_dir +@pytest.mark.parametrize("fs_type", supported_fss) +def test_fs_identifier_colo_then_standard( + fileutils, wlmutils, coloutils, fs_type, test_dir ): - """Test colocate_db_uds/colocate_db_tcp then create_database with database + """Test colocate_fs_uds/colocate_fs_tcp then create_feature_store with feature store identifiers. """ # Set experiment name - exp_name = "test_db_identifier_colo_then_standard" + exp_name = "test_fs_identifier_colo_then_standard" # Retrieve parameters from testing environment test_launcher = wlmutils.get_test_launcher() @@ -154,50 +154,50 @@ def test_db_identifier_colo_then_standard( # Create the SmartSim Model smartsim_model = exp.create_model("colocated_model", colo_settings) - db_args = { + fs_args = { "port": test_port, - "db_cpus": 1, + "fs_cpus": 1, "debug": True, - "db_identifier": "testdb_colo", + "fs_identifier": "testdb_colo", } smartsim_model = coloutils.setup_test_colo( fileutils, - db_type, + fs_type, exp, test_script, - db_args, + fs_args, on_wlm=on_wlm, ) assert ( - smartsim_model.run_settings.colocated_db_settings["db_identifier"] + smartsim_model.run_settings.colocated_fs_settings["fs_identifier"] == "testdb_colo" ) - # Create Database - orc = exp.create_database( + # Create feature store + feature_store = exp.create_feature_store( port=test_port + 1, interface=test_interface, - db_identifier="testdb_colo", + fs_identifier="testdb_colo", hosts=choose_host(wlmutils), ) - assert orc.name == "testdb_colo" + assert feature_store.name == "testdb_colo" - with make_entity_context(exp, orc), make_entity_context(exp, smartsim_model): + with make_entity_context(exp, feature_store), make_entity_context(exp, smartsim_model): exp.start(smartsim_model, 
block=True) - exp.start(orc) + exp.start(feature_store) - check_not_failed(exp, orc, smartsim_model) + check_not_failed(exp, feature_store, smartsim_model) -def test_db_identifier_standard_twice_not_unique(wlmutils, test_dir): - """Test uniqueness of db_identifier several calls to create_database, with non unique names, +def test_fs_identifier_standard_twice_not_unique(wlmutils, test_dir): + """Test uniqueness of fs_identifier several calls to create_feature_store, with non unique names, checking error is raised before exp start is called""" # Set experiment name - exp_name = "test_db_identifier_multiple_create_database_not_unique" + exp_name = "test_fs_identifier_multiple_create_feature_store_not_unique" # Retrieve parameters from testing environment test_launcher = wlmutils.get_test_launcher() @@ -207,42 +207,42 @@ def test_db_identifier_standard_twice_not_unique(wlmutils, test_dir): # Create SmartSim Experiment exp = Experiment(exp_name, launcher=test_launcher, exp_path=test_dir) - # CREATE DATABASE with db_identifier - orc = exp.create_database( + # CREATE feature store with fs_identifier + feature_store = exp.create_feature_store( port=test_port, interface=test_interface, - db_identifier="my_db", + fs_identifier="my_fs", hosts=choose_host(wlmutils), ) - assert orc.name == "my_db" + assert feature_store.name == "my_fs" - orc2 = exp.create_database( + feature_store2 = exp.create_feature_store( port=test_port + 1, interface=test_interface, - db_identifier="my_db", + fs_identifier="my_fs", hosts=choose_host(wlmutils, index=1), ) - assert orc2.name == "my_db" + assert feature_store2.name == "my_fs" - # CREATE DATABASE with db_identifier - with make_entity_context(exp, orc), make_entity_context(exp, orc2): - exp.start(orc) + # CREATE feature store with fs_identifier + with make_entity_context(exp, feature_store), make_entity_context(exp, feature_store2): + exp.start(feature_store) with pytest.raises(SSDBIDConflictError) as ex: - exp.start(orc2) + 
exp.start(feature_store2) assert ( - "has already been used. Pass in a unique name for db_identifier" + "has already been used. Pass in a unique name for fs_identifier" in ex.value.args[0] ) - check_not_failed(exp, orc) + check_not_failed(exp, feature_store) -def test_db_identifier_create_standard_once(test_dir, wlmutils): - """One call to create database with a database identifier""" +def test_fs_identifier_create_standard_once(test_dir, wlmutils): + """One call to create feature store with a feature store identifier""" # Set experiment name - exp_name = "test_db_identifier_create_standard_once" + exp_name = "test_fs_identifier_create_standard_once" # Retrieve parameters from testing environment test_launcher = wlmutils.get_test_launcher() @@ -252,22 +252,22 @@ def test_db_identifier_standard_twice_not_unique(wlmutils, test_dir): # Create the SmartSim Experiment exp = Experiment(exp_name, exp_path=test_dir, launcher=test_launcher) - # Create the SmartSim database - db = exp.create_database( + # Create the SmartSim feature store + fs = exp.create_feature_store( port=test_port, - db_nodes=1, + fs_nodes=1, interface=test_interface, - db_identifier="testdb_reg", + fs_identifier="testdb_reg", hosts=choose_host(wlmutils), ) - with make_entity_context(exp, db): - exp.start(db) + with make_entity_context(exp, fs): + exp.start(fs) - check_not_failed(exp, db) + check_not_failed(exp, fs) -def test_multidb_create_standard_twice(wlmutils, test_dir): - """Multiple calls to create database with unique db_identifiers""" +def test_multifs_create_standard_twice(wlmutils, test_dir): + """Multiple calls to create feature store with unique fs_identifiers""" # Retrieve parameters from testing environment test_launcher = wlmutils.get_test_launcher() @@ -276,36 +276,36 @@ def test_multidb_create_standard_twice(wlmutils, test_dir): # start a new Experiment for this section exp = Experiment( -
"test_multifs_create_standard_twice", exp_path=test_dir, launcher=test_launcher ) - # create and start an instance of the Orchestrator database - db = exp.create_database( + # create and start an instance of the FeatureStore feature store + fs = exp.create_feature_store( port=test_port, interface=test_interface, - db_identifier="testdb_reg", + fs_identifier="testdb_reg", hosts=choose_host(wlmutils, 1), ) - # create database with different db_id - db2 = exp.create_database( + # create feature store with different fs_id + fs2 = exp.create_feature_store( port=test_port + 1, interface=test_interface, - db_identifier="testdb_reg2", + fs_identifier="testdb_reg2", hosts=choose_host(wlmutils, 2), ) # launch - with make_entity_context(exp, db), make_entity_context(exp, db2): - exp.start(db, db2) + with make_entity_context(exp, fs), make_entity_context(exp, fs2): + exp.start(fs, fs2) - with make_entity_context(exp, db), make_entity_context(exp, db2): - exp.start(db, db2) + with make_entity_context(exp, fs), make_entity_context(exp, fs2): + exp.start(fs, fs2) -@pytest.mark.parametrize("db_type", supported_dbs) -def test_multidb_colo_once(fileutils, test_dir, wlmutils, coloutils, db_type): - """create one model with colocated database with db_identifier""" +@pytest.mark.parametrize("fs_type", supported_fss) +def test_multifs_colo_once(fileutils, test_dir, wlmutils, coloutils, fs_type): + """create one model with colocated feature store with fs_identifier""" # Retrieve parameters from testing environment test_launcher = wlmutils.get_test_launcher() @@ -315,7 +315,7 @@ def test_multidb_colo_once(fileutils, test_dir, wlmutils, coloutils, db_type): # start a new Experiment for this section exp = Experiment( - "test_multidb_colo_once", launcher=test_launcher, exp_path=test_dir + "test_multifs_colo_once", launcher=test_launcher, exp_path=test_dir ) # create run settings @@ -326,20 +326,20 @@ def test_multidb_colo_once(fileutils, test_dir, wlmutils, coloutils, db_type): # Create the 
SmartSim Model smartsim_model = exp.create_model("smartsim_model", run_settings) - db_args = { + fs_args = { "port": test_port + 1, - "db_cpus": 1, + "fs_cpus": 1, "debug": True, - "db_identifier": "testdb_colo", + "fs_identifier": "testdb_colo", } - # Create model with colocated database + # Create model with colocated feature store smartsim_model = coloutils.setup_test_colo( fileutils, - db_type, + fs_type, exp, test_script, - db_args, + fs_args, on_wlm=on_wlm, ) @@ -349,9 +349,9 @@ def test_multidb_colo_once(fileutils, test_dir, wlmutils, coloutils, db_type): check_not_failed(exp, smartsim_model) -@pytest.mark.parametrize("db_type", supported_dbs) -def test_multidb_standard_then_colo(fileutils, test_dir, wlmutils, coloutils, db_type): - """Create regular database then colocate_db_tcp/uds with unique db_identifiers""" +@pytest.mark.parametrize("fs_type", supported_fss) +def test_multifs_standard_then_colo(fileutils, test_dir, wlmutils, coloutils, fs_type): + """Create regular feature store then colocate_fs_tcp/uds with unique fs_identifiers""" # Retrieve parameters from testing environment test_port = wlmutils.get_test_port() @@ -362,43 +362,43 @@ def test_multidb_standard_then_colo(fileutils, test_dir, wlmutils, coloutils, db # start a new Experiment for this section exp = Experiment( - "test_multidb_standard_then_colo", exp_path=test_dir, launcher=test_launcher + "test_multifs_standard_then_colo", exp_path=test_dir, launcher=test_launcher ) - # create and generate an instance of the Orchestrator database - db = exp.create_database( + # create and generate an instance of the FeatureStore feature store + fs = exp.create_feature_store( port=test_port, interface=test_interface, - db_identifier="testdb_reg", + fs_identifier="testdb_reg", hosts=choose_host(wlmutils), ) - db_args = { + fs_args = { "port": test_port + 1, - "db_cpus": 1, + "fs_cpus": 1, "debug": True, - "db_identifier": "testdb_colo", + "fs_identifier": "testdb_colo", } - # Create model with colocated 
database + # Create model with colocated feature store smartsim_model = coloutils.setup_test_colo( fileutils, - db_type, + fs_type, exp, test_script, - db_args, + fs_args, on_wlm=on_wlm, ) - with make_entity_context(exp, db), make_entity_context(exp, smartsim_model): - exp.start(db) + with make_entity_context(exp, fs), make_entity_context(exp, smartsim_model): + exp.start(fs) exp.start(smartsim_model, block=True) - check_not_failed(exp, smartsim_model, db) + check_not_failed(exp, smartsim_model, fs) -@pytest.mark.parametrize("db_type", supported_dbs) -def test_multidb_colo_then_standard(fileutils, test_dir, wlmutils, coloutils, db_type): - """create regular database then colocate_db_tcp/uds with unique db_identifiers""" +@pytest.mark.parametrize("fs_type", supported_fss) +def test_multifs_colo_then_standard(fileutils, test_dir, wlmutils, coloutils, fs_type): + """create regular feature store then colocate_fs_tcp/uds with unique fs_identifiers""" # Retrieve parameters from testing environment test_port = wlmutils.get_test_port() @@ -409,48 +409,48 @@ def test_multidb_colo_then_standard(fileutils, test_dir, wlmutils, coloutils, db # start a new Experiment exp = Experiment( - "test_multidb_colo_then_standard", exp_path=test_dir, launcher=test_launcher + "test_multifs_colo_then_standard", exp_path=test_dir, launcher=test_launcher ) - db_args = { + fs_args = { "port": test_port, - "db_cpus": 1, + "fs_cpus": 1, "debug": True, - "db_identifier": "testdb_colo", + "fs_identifier": "testdb_colo", } - # Create model with colocated database + # Create model with colocated feature store smartsim_model = coloutils.setup_test_colo( - fileutils, db_type, exp, test_script, db_args, on_wlm=on_wlm + fileutils, fs_type, exp, test_script, fs_args, on_wlm=on_wlm ) - # create and start an instance of the Orchestrator database - db = exp.create_database( + # create and start an instance of the FeatureStore feature store + fs = exp.create_feature_store( port=test_port + 1, 
interface=test_interface, - db_identifier="testdb_reg", + fs_identifier="testdb_reg", hosts=choose_host(wlmutils), ) - with make_entity_context(exp, db), make_entity_context(exp, smartsim_model): - exp.start(db) + with make_entity_context(exp, fs), make_entity_context(exp, smartsim_model): + exp.start(fs) exp.start(smartsim_model, block=True) - check_not_failed(exp, db, smartsim_model) + check_not_failed(exp, fs, smartsim_model) @pytest.mark.skipif( pytest.test_launcher not in pytest.wlm_options, reason="Not testing WLM integrations", ) -@pytest.mark.parametrize("db_type", supported_dbs) -def test_launch_cluster_orc_single_dbid( - test_dir, coloutils, fileutils, wlmutils, db_type +@pytest.mark.parametrize("fs_type", supported_fss) +def test_launch_cluster_feature_store_single_dbid( + test_dir, coloutils, fileutils, wlmutils, fs_type ): - """test clustered 3-node orchestrator with single command with a database identifier""" + """test clustered 3-node FeatureStore with single command with a feature store identifier""" # TODO detect number of nodes in allocation and skip if not sufficent - exp_name = "test_launch_cluster_orc_single_dbid" + exp_name = "test_launch_cluster_feature_store_single_dbid" launcher = wlmutils.get_test_launcher() test_port = wlmutils.get_test_port() test_script = fileutils.get_test_conf_path("smartredis/multidbid.py") @@ -458,32 +458,32 @@ def test_launch_cluster_orc_single_dbid( # batch = False to launch on existing allocation network_interface = wlmutils.get_test_interface() - orc: Orchestrator = exp.create_database( + feature_store: FeatureStore = exp.create_feature_store( wlmutils.get_test_port(), - db_nodes=3, + fs_nodes=3, batch=False, interface=network_interface, single_cmd=True, hosts=wlmutils.get_test_hostlist(), - db_identifier="testdb_reg", + fs_identifier="testdb_reg", ) - db_args = { + fs_args = { "port": test_port, - "db_cpus": 1, + "fs_cpus": 1, "debug": True, - "db_identifier": "testdb_colo", + "fs_identifier": "testdb_colo", } 
- # Create model with colocated database + # Create model with colocated feature store smartsim_model = coloutils.setup_test_colo( - fileutils, db_type, exp, test_script, db_args, on_wlm=on_wlm + fileutils, fs_type, exp, test_script, fs_args, on_wlm=on_wlm ) - with make_entity_context(exp, orc), make_entity_context(exp, smartsim_model): - exp.start(orc, block=True) + with make_entity_context(exp, feature_store), make_entity_context(exp, smartsim_model): + exp.start(feature_store, block=True) exp.start(smartsim_model, block=True) - job_dict = exp._control._jobs.get_db_host_addresses() - assert len(job_dict[orc.entities[0].db_identifier]) == 3 + job_dict = exp._control._jobs.get_fs_host_addresses() + assert len(job_dict[feature_store.entities[0].fs_identifier]) == 3 - check_not_failed(exp, orc, smartsim_model) + check_not_failed(exp, feature_store, smartsim_model) diff --git a/tests/test_orc_config_settings.py b/tests/test_orc_config_settings.py index 365596496..dc49f9d6a 100644 --- a/tests/test_orc_config_settings.py +++ b/tests/test_orc_config_settings.py @@ -40,40 +40,40 @@ pytestmark = pytest.mark.group_b -def test_config_methods(dbutils, local_db): - """Test all configuration file edit methods on an active db""" +def test_config_methods(fsutils, local_fs): + """Test all configuration file edit methods on an active fs""" # test the happy path and ensure all configuration file edit methods # successfully execute when given correct key-value pairs - configs = dbutils.get_db_configs() + configs = fsutils.get_fs_configs() for setting, value in configs.items(): - config_set_method = dbutils.get_config_edit_method(local_db, setting) + config_set_method = fsutils.get_config_edit_method(local_fs, setting) config_set_method(value) - # ensure SmartSimError is raised when Orchestrator.set_db_conf + # ensure SmartSimError is raised when FeatureStore.set_fs_conf # is given invalid CONFIG key-value pairs - ss_error_configs = dbutils.get_smartsim_error_db_configs() + 
ss_error_configs = fsutils.get_smartsim_error_fs_configs() for key, value_list in ss_error_configs.items(): for value in value_list: with pytest.raises(SmartSimError): - local_db.set_db_conf(key, value) + local_fs.set_fs_conf(key, value) - # ensure TypeError is raised when Orchestrator.set_db_conf + # ensure TypeError is raised when FeatureStore.set_fs_conf # is given either a key or a value that is not a string - type_error_configs = dbutils.get_type_error_db_configs() + type_error_configs = fsutils.get_type_error_fs_configs() for key, value_list in type_error_configs.items(): for value in value_list: with pytest.raises(TypeError): - local_db.set_db_conf(key, value) + local_fs.set_fs_conf(key, value) -def test_config_methods_inactive(wlmutils, dbutils): +def test_config_methods_inactive(wlmutils, fsutils): """Ensure a SmartSimError is raised when trying to - set configurations on an inactive database + set configurations on an inactive feature store """ - db = wlmutils.get_orchestrator() - configs = dbutils.get_db_configs() + fs = wlmutils.get_feature_store() + configs = fsutils.get_fs_configs() for setting, value in configs.items(): - config_set_method = dbutils.get_config_edit_method(db, setting) + config_set_method = fsutils.get_config_edit_method(fs, setting) with pytest.raises(SmartSimError): config_set_method(value) diff --git a/tests/test_orchestrator.py b/tests/test_orchestrator.py index 54f86ad99..18d1e6cc1 100644 --- a/tests/test_orchestrator.py +++ b/tests/test_orchestrator.py @@ -31,7 +31,7 @@ import pytest from smartsim import Experiment -from smartsim.database import Orchestrator +from smartsim.database import FeatureStore from smartsim.error import SmartSimError from smartsim.error.errors import SSUnsupportedError @@ -43,62 +43,62 @@ import conftest -def test_orc_parameters() -> None: +def test_feature_store_parameters() -> None: threads_per_queue = 2 inter_op_threads = 2 intra_op_threads = 2 - db = Orchestrator( - db_nodes=1, + fs = FeatureStore( + 
fs_nodes=1, threads_per_queue=threads_per_queue, inter_op_threads=inter_op_threads, intra_op_threads=intra_op_threads, ) - assert db.queue_threads == threads_per_queue - assert db.inter_threads == inter_op_threads - assert db.intra_threads == intra_op_threads + assert fs.queue_threads == threads_per_queue + assert fs.inter_threads == inter_op_threads + assert fs.intra_threads == intra_op_threads - module_str = db._rai_module + module_str = fs._rai_module assert "THREADS_PER_QUEUE" in module_str assert "INTRA_OP_PARALLELISM" in module_str assert "INTER_OP_PARALLELISM" in module_str def test_is_not_active() -> None: - db = Orchestrator(db_nodes=1) - assert not db.is_active() + fs = FeatureStore(fs_nodes=1) + assert not fs.is_active() -def test_inactive_orc_get_address() -> None: - db = Orchestrator() +def test_inactive_feature_store_get_address() -> None: + fs = FeatureStore() with pytest.raises(SmartSimError): - db.get_address() + fs.get_address() -def test_orc_active_functions(test_dir: str, wlmutils: "conftest.WLMUtils") -> None: - exp_name = "test_orc_active_functions" +def test_feature_store_active_functions(test_dir: str, wlmutils: "conftest.WLMUtils") -> None: + exp_name = "test_feature_store_active_functions" exp = Experiment(exp_name, launcher="local", exp_path=test_dir) - db = Orchestrator(port=wlmutils.get_test_port()) - db.set_path(test_dir) + fs = FeatureStore(port=wlmutils.get_test_port()) + fs.set_path(test_dir) - exp.start(db) + exp.start(fs) - # check if the orchestrator is active - assert db.is_active() + # check if the FeatureStore is active + assert fs.is_active() - # check if the orchestrator can get the address - correct_address = db.get_address() == ["127.0.0.1:" + str(wlmutils.get_test_port())] + # check if the FeatureStore can get the address + correct_address = fs.get_address() == ["127.0.0.1:" + str(wlmutils.get_test_port())] if not correct_address: - exp.stop(db) + exp.stop(fs) assert False - exp.stop(db) + exp.stop(fs) - assert not 
db.is_active() + assert not fs.is_active() - # check if orchestrator.get_address() raises an exception + # check if FeatureStore.get_address() raises an exception with pytest.raises(SmartSimError): - db.get_address() + fs.get_address() def test_multiple_interfaces(test_dir: str, wlmutils: "conftest.WLMUtils") -> None: @@ -112,125 +112,125 @@ def test_multiple_interfaces(test_dir: str, wlmutils: "conftest.WLMUtils") -> No net_if_addrs = ["lo", net_if_addrs[0]] - db = Orchestrator(port=wlmutils.get_test_port(), interface=net_if_addrs) - db.set_path(test_dir) + fs = FeatureStore(port=wlmutils.get_test_port(), interface=net_if_addrs) + fs.set_path(test_dir) - exp.start(db) + exp.start(fs) - # check if the orchestrator is active - assert db.is_active() + # check if the FeatureStore is active + assert fs.is_active() - # check if the orchestrator can get the address - correct_address = db.get_address() == ["127.0.0.1:" + str(wlmutils.get_test_port())] + # check if the FeatureStore can get the address + correct_address = fs.get_address() == ["127.0.0.1:" + str(wlmutils.get_test_port())] if not correct_address: - exp.stop(db) + exp.stop(fs) assert False - exp.stop(db) + exp.stop(fs) -def test_catch_local_db_errors() -> None: - # local database with more than one node not allowed +def test_catch_local_feature_store_errors() -> None: + # local feature store with more than one node not allowed with pytest.raises(SSUnsupportedError): - db = Orchestrator(db_nodes=2) + fs = FeatureStore(fs_nodes=2) - # Run command for local orchestrator not allowed + # Run command for local FeatureStore not allowed with pytest.raises(SmartSimError): - db = Orchestrator(run_command="srun") + fs = FeatureStore(run_command="srun") - # Batch mode for local orchestrator is not allowed + # Batch mode for local FeatureStore is not allowed with pytest.raises(SmartSimError): - db = Orchestrator(batch=True) + fs = FeatureStore(batch=True) ##### PBS ###### def test_pbs_set_run_arg(wlmutils: 
"conftest.WLMUtils") -> None: - orc = Orchestrator( + feature_store = FeatureStore( wlmutils.get_test_port(), - db_nodes=3, + fs_nodes=3, batch=False, interface="lo", launcher="pbs", run_command="aprun", ) - orc.set_run_arg("account", "ACCOUNT") + feature_store.set_run_arg("account", "ACCOUNT") assert all( - [db.run_settings.run_args["account"] == "ACCOUNT" for db in orc.entities] + [fs.run_settings.run_args["account"] == "ACCOUNT" for fs in feature_store.entities] ) - orc.set_run_arg("pes-per-numa-node", "5") + feature_store.set_run_arg("pes-per-numa-node", "5") assert all( - ["pes-per-numa-node" not in db.run_settings.run_args for db in orc.entities] + ["pes-per-numa-node" not in fs.run_settings.run_args for fs in feature_store.entities] ) def test_pbs_set_batch_arg(wlmutils: "conftest.WLMUtils") -> None: - orc = Orchestrator( + feature_store = FeatureStore( wlmutils.get_test_port(), - db_nodes=3, + fs_nodes=3, batch=False, interface="lo", launcher="pbs", run_command="aprun", ) with pytest.raises(SmartSimError): - orc.set_batch_arg("account", "ACCOUNT") + feature_store.set_batch_arg("account", "ACCOUNT") - orc2 = Orchestrator( + feature_store2 = FeatureStore( wlmutils.get_test_port(), - db_nodes=3, + fs_nodes=3, batch=True, interface="lo", launcher="pbs", run_command="aprun", ) - orc2.set_batch_arg("account", "ACCOUNT") - assert orc2.batch_settings.batch_args["account"] == "ACCOUNT" - orc2.set_batch_arg("N", "another_name") - assert "N" not in orc2.batch_settings.batch_args + feature_store2.set_batch_arg("account", "ACCOUNT") + assert feature_store2.batch_settings.batch_args["account"] == "ACCOUNT" + feature_store2.set_batch_arg("N", "another_name") + assert "N" not in feature_store2.batch_settings.batch_args ##### Slurm ###### def test_slurm_set_run_arg(wlmutils: "conftest.WLMUtils") -> None: - orc = Orchestrator( + feature_store = FeatureStore( wlmutils.get_test_port(), - db_nodes=3, + fs_nodes=3, batch=False, interface="lo", launcher="slurm", 
run_command="srun", ) - orc.set_run_arg("account", "ACCOUNT") + feature_store.set_run_arg("account", "ACCOUNT") assert all( - [db.run_settings.run_args["account"] == "ACCOUNT" for db in orc.entities] + [fs.run_settings.run_args["account"] == "ACCOUNT" for fs in feature_store.entities] ) def test_slurm_set_batch_arg(wlmutils: "conftest.WLMUtils") -> None: - orc = Orchestrator( + feature_store = FeatureStore( wlmutils.get_test_port(), - db_nodes=3, + fs_nodes=3, batch=False, interface="lo", launcher="slurm", run_command="srun", ) with pytest.raises(SmartSimError): - orc.set_batch_arg("account", "ACCOUNT") + feature_store.set_batch_arg("account", "ACCOUNT") - orc2 = Orchestrator( + feature_store2 = FeatureStore( wlmutils.get_test_port(), - db_nodes=3, + fs_nodes=3, batch=True, interface="lo", launcher="slurm", run_command="srun", ) - orc2.set_batch_arg("account", "ACCOUNT") - assert orc2.batch_settings.batch_args["account"] == "ACCOUNT" + feature_store2.set_batch_arg("account", "ACCOUNT") + assert feature_store2.batch_settings.batch_args["account"] == "ACCOUNT" @pytest.mark.parametrize( @@ -240,98 +240,98 @@ def test_slurm_set_batch_arg(wlmutils: "conftest.WLMUtils") -> None: pytest.param(False, id="Multiple `srun`s"), ], ) -def test_orc_results_in_correct_number_of_shards(single_cmd: bool) -> None: +def test_feature_store_results_in_correct_number_of_shards(single_cmd: bool) -> None: num_shards = 5 - orc = Orchestrator( + feature_store = FeatureStore( port=12345, launcher="slurm", run_command="srun", - db_nodes=num_shards, + fs_nodes=num_shards, batch=False, single_cmd=single_cmd, ) if single_cmd: - assert len(orc.entities) == 1 - (node,) = orc.entities + assert len(feature_store.entities) == 1 + (node,) = feature_store.entities assert len(node.run_settings.mpmd) == num_shards - 1 else: - assert len(orc.entities) == num_shards - assert all(node.run_settings.mpmd == [] for node in orc.entities) + assert len(feature_store.entities) == num_shards + assert 
all(node.run_settings.mpmd == [] for node in feature_store.entities) assert ( - orc.num_shards == orc.db_nodes == sum(node.num_shards for node in orc.entities) + feature_store.num_shards == feature_store.fs_nodes == sum(node.num_shards for node in feature_store.entities) ) ###### LSF ###### -def test_catch_orc_errors_lsf(wlmutils: "conftest.WLMUtils") -> None: +def test_catch_feature_store_errors_lsf(wlmutils: "conftest.WLMUtils") -> None: with pytest.raises(SSUnsupportedError): - orc = Orchestrator( + feature_store = FeatureStore( wlmutils.get_test_port(), - db_nodes=2, - db_per_host=2, + fs_nodes=2, + fs_per_host=2, batch=False, launcher="lsf", run_command="jsrun", ) - orc = Orchestrator( + feature_store = FeatureStore( wlmutils.get_test_port(), - db_nodes=3, + fs_nodes=3, batch=False, hosts=["batch", "host1", "host2"], launcher="lsf", run_command="jsrun", ) with pytest.raises(SmartSimError): - orc.set_batch_arg("P", "MYPROJECT") + feature_store.set_batch_arg("P", "MYPROJECT") def test_lsf_set_run_args(wlmutils: "conftest.WLMUtils") -> None: - orc = Orchestrator( + feature_store = FeatureStore( wlmutils.get_test_port(), - db_nodes=3, + fs_nodes=3, batch=True, hosts=["batch", "host1", "host2"], launcher="lsf", run_command="jsrun", ) - orc.set_run_arg("l", "gpu-gpu") - assert all(["l" not in db.run_settings.run_args for db in orc.entities]) + feature_store.set_run_arg("l", "gpu-gpu") + assert all(["l" not in fs.run_settings.run_args for fs in feature_store.entities]) def test_lsf_set_batch_args(wlmutils: "conftest.WLMUtils") -> None: - orc = Orchestrator( + feature_store = FeatureStore( wlmutils.get_test_port(), - db_nodes=3, + fs_nodes=3, batch=True, hosts=["batch", "host1", "host2"], launcher="lsf", run_command="jsrun", ) - assert orc.batch_settings.batch_args["m"] == '"batch host1 host2"' - orc.set_batch_arg("D", "102400000") - assert orc.batch_settings.batch_args["D"] == "102400000" + assert feature_store.batch_settings.batch_args["m"] == '"batch host1 host2"' 
+ feature_store.set_batch_arg("D", "102400000") + assert feature_store.batch_settings.batch_args["D"] == "102400000" -def test_orc_telemetry(test_dir: str, wlmutils: "conftest.WLMUtils") -> None: - """Ensure the default behavior for an orchestrator is to disable telemetry""" - db = Orchestrator(port=wlmutils.get_test_port()) - db.set_path(test_dir) +def test_feature_store_telemetry(test_dir: str, wlmutils: "conftest.WLMUtils") -> None: + """Ensure the default behavior for an FeatureStore is to disable telemetry""" + fs = FeatureStore(port=wlmutils.get_test_port()) + fs.set_path(test_dir) # default is disabled - assert not db.telemetry.is_enabled + assert not fs.telemetry.is_enabled # ensure updating value works as expected - db.telemetry.enable() - assert db.telemetry.is_enabled + fs.telemetry.enable() + assert fs.telemetry.is_enabled # toggle back - db.telemetry.disable() - assert not db.telemetry.is_enabled + fs.telemetry.disable() + assert not fs.telemetry.is_enabled # toggle one more time - db.telemetry.enable() - assert db.telemetry.is_enabled + fs.telemetry.enable() + assert fs.telemetry.is_enabled diff --git a/tests/test_output_files.py b/tests/test_output_files.py index f3830051c..4491ace39 100644 --- a/tests/test_output_files.py +++ b/tests/test_output_files.py @@ -33,7 +33,7 @@ from smartsim._core.config import CONFIG from smartsim._core.control.controller import Controller, _AnonymousBatchJob from smartsim._core.launcher.step import Step -from smartsim.database.orchestrator import Orchestrator +from smartsim.database.orchestrator import FeatureStore from smartsim.entity.ensemble import Ensemble from smartsim.entity.model import Model from smartsim.settings.base import RunSettings @@ -50,7 +50,7 @@ batch_rs = SrunSettings("echo", ["spam", "eggs"]) ens = Ensemble("ens", params={}, run_settings=rs, batch_settings=bs, replicas=3) -orc = Orchestrator(db_nodes=3, batch=True, launcher="slurm", run_command="srun") +feature_store = FeatureStore(fs_nodes=3, 
batch=True, launcher="slurm", run_command="srun") model = Model("test_model", params={}, path="", run_settings=rs) batch_model = Model( "batch_test_model", params={}, path="", run_settings=batch_rs, batch_settings=bs @@ -115,7 +115,7 @@ def test_get_output_files_with_create_job_step(test_dir): @pytest.mark.parametrize( "entity", - [pytest.param(ens, id="ensemble"), pytest.param(orc, id="orchestrator")], + [pytest.param(ens, id="ensemble"), pytest.param(feature_store, id="featurestore")], ) def test_get_output_files_with_create_batch_job_step(entity, test_dir): """Testing output files through _create_batch_job_step""" diff --git a/tests/test_reconnect_orchestrator.py b/tests/test_reconnect_orchestrator.py index 6ce93c6f9..4629c18b4 100644 --- a/tests/test_reconnect_orchestrator.py +++ b/tests/test_reconnect_orchestrator.py @@ -30,7 +30,7 @@ import pytest from smartsim import Experiment -from smartsim.database import Orchestrator +from smartsim.database import FeatureStore from smartsim.status import SmartSimStatus # The tests in this file belong to the group_b group @@ -39,22 +39,22 @@ first_dir = "" -# TODO ensure database is shutdown +# TODO ensure feature store is shutdown # use https://stackoverflow.com/questions/22627659/run-code-before-and-after-each-test-in-py-test -def test_local_orchestrator(test_dir, wlmutils): - """Test launching orchestrator locally""" +def test_local_feature_store(test_dir, wlmutils): + """Test launching feature store locally""" global first_dir - exp_name = "test-orc-launch-local" + exp_name = "test-feature-store-launch-local" exp = Experiment(exp_name, launcher="local", exp_path=test_dir) first_dir = test_dir - orc = Orchestrator(port=wlmutils.get_test_port()) - orc.set_path(osp.join(test_dir, "orchestrator")) + feature_store = FeatureStore(port=wlmutils.get_test_port()) + feature_store.set_path(osp.join(test_dir, "feature_store")) - exp.start(orc) - statuses = exp.get_status(orc) + exp.start(feature_store) + statuses = 
exp.get_status(feature_store) assert [stat != SmartSimStatus.STATUS_FAILED for stat in statuses] # simulate user shutting down main thread @@ -62,22 +62,23 @@ def test_local_orchestrator(test_dir, wlmutils): exp._control._launcher.task_manager.actively_monitoring = False -def test_reconnect_local_orc(test_dir): - """Test reconnecting to orchestrator from first experiment""" +def test_reconnect_local_feature_store(test_dir): + """Test reconnecting to feature store from first experiment""" global first_dir # start new experiment - exp_name = "test-orc-local-reconnect-2nd" + exp_name = "test-feature-store-local-reconnect-2nd" exp_2 = Experiment(exp_name, launcher="local", exp_path=test_dir) checkpoint = osp.join(first_dir, "orchestrator", "smartsim_db.dat") - reloaded_orc = exp_2.reconnect_orchestrator(checkpoint) + + reloaded_feature_store = exp_2.reconnect_feature_store(checkpoint) # let statuses update once time.sleep(5) - statuses = exp_2.get_status(reloaded_orc) + statuses = exp_2.get_status(reloaded_feature_store) for stat in statuses: if stat == SmartSimStatus.STATUS_FAILED: - exp_2.stop(reloaded_orc) + exp_2.stop(reloaded_feature_store) assert False - exp_2.stop(reloaded_orc) + exp_2.stop(reloaded_feature_store) diff --git a/tests/test_serialize.py b/tests/test_serialize.py index b2dc0b7a7..f3447d5e5 100644 --- a/tests/test_serialize.py +++ b/tests/test_serialize.py @@ -36,7 +36,7 @@ from smartsim._core._cli import utils from smartsim._core.control.manifest import LaunchedManifestBuilder from smartsim._core.utils import serialize -from smartsim.database.orchestrator import Orchestrator +from smartsim.database.orchestrator import FeatureStore _CFG_TM_ENABLED_ATTR = "telemetry_enabled" @@ -144,10 +144,10 @@ def test_started_entities_are_serialized(test_dir, manifest_json): exp.stop(hello_world_model, spam_eggs_model, hello_ensemble) -def test_serialzed_database_does_not_break_if_using_a_non_standard_install(monkeypatch): - monkeypatch.setattr(utils, 
"get_db_path", lambda: None) - db = Orchestrator() - dict_ = serialize._dictify_db(db, []) +def test_serialzed_feature_store_does_not_break_if_using_a_non_standard_install(monkeypatch): + monkeypatch.setattr(utils, "get_fs_path", lambda: None) + fs = FeatureStore() + dict_ = serialize._dictify_fs(fs, []) assert dict_["type"] == "Unknown" diff --git a/tests/test_slurm_parser.py b/tests/test_slurm_parser.py index b5f7cf32a..a49d9b198 100644 --- a/tests/test_slurm_parser.py +++ b/tests/test_slurm_parser.py @@ -231,12 +231,12 @@ def test_parse_sacct_step_id_2(): "extern|119225.extern|\n" "m1-119225.0|119225.0|\n" "m2-119225.1|119225.1|\n" - "orchestrator_0-119225.2|119225.2|\n" + "featurestore_0-119225.2|119225.2|\n" "n1-119225.3|119225.3|" ) step_id = "119225.2" parsed_step_id = slurmParser.parse_step_id_from_sacct( - output, "orchestrator_0-119225.2" + output, "featurestore_0-119225.2" ) assert step_id == parsed_step_id diff --git a/tests/test_slurm_settings.py b/tests/test_slurm_settings.py index d9d820244..aa915cded 100644 --- a/tests/test_slurm_settings.py +++ b/tests/test_slurm_settings.py @@ -79,7 +79,7 @@ def test_update_env(): def test_catch_colo_mpmd(): srun = SrunSettings("python") - srun.colocated_db_settings = {"port": 6379, "cpus": 1} + srun.colocated_fs_settings = {"port": 6379, "cpus": 1} srun_2 = SrunSettings("python") # should catch the user trying to make rs mpmd that already are colocated diff --git a/tests/test_smartredis.py b/tests/test_smartredis.py index a2aac654b..00ea341b4 100644 --- a/tests/test_smartredis.py +++ b/tests/test_smartredis.py @@ -29,7 +29,7 @@ from smartsim import Experiment from smartsim._core.utils import installed_redisai_backends -from smartsim.database import Orchestrator +from smartsim.database import FeatureStore from smartsim.entity import Ensemble, Model from smartsim.status import SmartSimStatus @@ -70,10 +70,10 @@ def test_exchange(fileutils, test_dir, wlmutils): "smartredis_ensemble_exchange", exp_path=test_dir, 
launcher="local" ) - # create and start a database - orc = Orchestrator(port=wlmutils.get_test_port()) - exp.generate(orc) - exp.start(orc, block=False) + # create and start a feature store + feature_store = FeatureStore(port=wlmutils.get_test_port()) + exp.generate(feature_store) + exp.start(feature_store, block=False) rs = exp.create_run_settings("python", "producer.py --exchange") params = {"mult": [1, -10]} @@ -100,8 +100,8 @@ def test_exchange(fileutils, test_dir, wlmutils): try: assert all([stat == SmartSimStatus.STATUS_COMPLETED for stat in statuses]) finally: - # stop the orchestrator - exp.stop(orc) + # stop the FeatureStore + exp.stop(feature_store) def test_consumer(fileutils, test_dir, wlmutils): @@ -116,10 +116,10 @@ def test_consumer(fileutils, test_dir, wlmutils): "smartredis_ensemble_consumer", exp_path=test_dir, launcher="local" ) - # create and start a database - orc = Orchestrator(port=wlmutils.get_test_port()) - exp.generate(orc) - exp.start(orc, block=False) + # create and start a feature store + feature_store = FeatureStore(port=wlmutils.get_test_port()) + exp.generate(feature_store) + exp.start(feature_store, block=False) rs_prod = exp.create_run_settings("python", "producer.py") rs_consumer = exp.create_run_settings("python", "consumer.py") @@ -149,5 +149,5 @@ def test_consumer(fileutils, test_dir, wlmutils): try: assert all([stat == SmartSimStatus.STATUS_COMPLETED for stat in statuses]) finally: - # stop the orchestrator - exp.stop(orc) + # stop the FeatureStore + exp.stop(feature_store) diff --git a/tests/test_symlinking.py b/tests/test_symlinking.py index 2b70e3e9f..a91263654 100644 --- a/tests/test_symlinking.py +++ b/tests/test_symlinking.py @@ -32,7 +32,7 @@ from smartsim import Experiment from smartsim._core.config import CONFIG from smartsim._core.control.controller import Controller, _AnonymousBatchJob -from smartsim.database.orchestrator import Orchestrator +from smartsim.database.orchestrator import FeatureStore from 
smartsim.entity.ensemble import Ensemble from smartsim.entity.model import Model from smartsim.settings.base import RunSettings @@ -49,7 +49,7 @@ batch_rs = SrunSettings("echo", ["spam", "eggs"]) ens = Ensemble("ens", params={}, run_settings=rs, batch_settings=bs, replicas=3) -orc = Orchestrator(db_nodes=3, batch=True, launcher="slurm", run_command="srun") +feature_store = FeatureStore(fs_nodes=3, batch=True, launcher="slurm", run_command="srun") model = Model("test_model", params={}, path="", run_settings=rs) batch_model = Model( "batch_test_model", params={}, path="", run_settings=batch_rs, batch_settings=bs @@ -92,7 +92,7 @@ def symlink_with_create_job_step(test_dir, entity): "entity", [ pytest.param(ens, id="ensemble"), - pytest.param(orc, id="orchestrator"), + pytest.param(feature_store, id="featurestore"), pytest.param(anon_batch_model, id="model"), ], ) @@ -221,15 +221,15 @@ def test_non_batch_model_symlinks(test_dir): _should_not_be_symlinked(pathlib.Path(exp.exp_path, "smartsim_params.txt")) -def test_non_batch_orchestrator_symlinks(test_dir): - exp = Experiment("test-non-batch-orc", exp_path=test_dir) +def test_non_batch_feature_store_symlinks(test_dir): + exp = Experiment("test-non-batch-feature-store", exp_path=test_dir) - db = exp.create_database(interface="lo") + db = exp.create_feature_store(interface="lo") exp.generate(db) exp.start(db, block=True) exp.stop(db) - for i in range(db.db_nodes): + for i in range(db.fs_nodes): _should_be_symlinked(pathlib.Path(db.path, f"{db.name}_{i}.out"), False) _should_be_symlinked(pathlib.Path(db.path, f"{db.name}_{i}.err"), False) diff --git a/tests/test_telemetry_monitor.py b/tests/test_telemetry_monitor.py index ebeeaee48..6120d6486 100644 --- a/tests/test_telemetry_monitor.py +++ b/tests/test_telemetry_monitor.py @@ -299,8 +299,8 @@ def test_load_manifest(fileutils: FileUtils, test_dir: str, config: cfg.Config): assert len(manifest.runs[0].models) == 1 assert len(manifest.runs[2].models) == 8 # 8 models in 
ensemble - assert len(manifest.runs[0].orchestrators) == 0 - assert len(manifest.runs[1].orchestrators) == 3 # 3 shards in db + assert len(manifest.runs[0].featurestores) == 0 + assert len(manifest.runs[1].featurestores) == 3 # 3 shards in fs def test_load_manifest_colo_model(fileutils: FileUtils): @@ -335,9 +335,9 @@ def test_load_manifest_serial_models(fileutils: FileUtils): assert len(manifest.runs[0].models) == 5 -def test_load_manifest_db_and_models(fileutils: FileUtils): +def test_load_manifest_fs_and_models(fileutils: FileUtils): """Ensure that the runtime manifest loads correctly when containing models & - orchestrator across 2 separate runs""" + feature store across 2 separate runs""" # NOTE: for regeneration, this manifest can use `test_telemetry_colo` sample_manifest_path = fileutils.get_test_conf_path("telemetry/db_and_model.json") sample_manifest = pathlib.Path(sample_manifest_path) @@ -349,19 +349,19 @@ def test_load_manifest_db_and_models(fileutils: FileUtils): assert manifest.launcher == "Slurm" assert len(manifest.runs) == 2 - assert len(manifest.runs[0].orchestrators) == 1 + assert len(manifest.runs[0].featurestores) == 1 assert len(manifest.runs[1].models) == 1 # verify collector paths from manifest are deserialized to collector config - assert manifest.runs[0].orchestrators[0].collectors["client"] - assert manifest.runs[0].orchestrators[0].collectors["memory"] + assert manifest.runs[0].featurestores[0].collectors["client"] + assert manifest.runs[0].featurestores[0].collectors["memory"] # verify collector paths missing from manifest are empty - assert not manifest.runs[0].orchestrators[0].collectors["client_count"] + assert not manifest.runs[0].featurestores[0].collectors["client_count"] -def test_load_manifest_db_and_models_1run(fileutils: FileUtils): +def test_load_manifest_fs_and_models_1run(fileutils: FileUtils): """Ensure that the runtime manifest loads correctly when containing models & - orchestrator in a single run""" + featurestore in a 
single run""" # NOTE: for regeneration, this manifest can use `test_telemetry_colo` sample_manifest_path = fileutils.get_test_conf_path( "telemetry/db_and_model_1run.json" @@ -375,21 +375,21 @@ def test_load_manifest_db_and_models_1run(fileutils: FileUtils): assert manifest.launcher == "Slurm" assert len(manifest.runs) == 1 - assert len(manifest.runs[0].orchestrators) == 1 + assert len(manifest.runs[0].featurestores) == 1 assert len(manifest.runs[0].models) == 1 @pytest.mark.parametrize( - ["task_id", "step_id", "etype", "exp_isorch", "exp_ismanaged"], + ["task_id", "step_id", "etype", "exp_isfeature_store", "exp_ismanaged"], [ - pytest.param("123", "", "model", False, False, id="unmanaged, non-orch"), - pytest.param("456", "123", "ensemble", False, True, id="managed, non-orch"), - pytest.param("789", "987", "orchestrator", True, True, id="managed, orch"), - pytest.param("987", "", "orchestrator", True, False, id="unmanaged, orch"), + pytest.param("123", "", "model", False, False, id="unmanaged, non-feature_store"), + pytest.param("456", "123", "ensemble", False, True, id="managed, non-feature_store"), + pytest.param("789", "987", "featurestore", True, True, id="managed, feature_store"), + pytest.param("987", "", "featurestore", True, False, id="unmanaged, feature_store"), ], ) def test_persistable_computed_properties( - task_id: str, step_id: str, etype: str, exp_isorch: bool, exp_ismanaged: bool + task_id: str, step_id: str, etype: str, exp_isfeature_store: bool, exp_ismanaged: bool ): name = f"test-{etype}-{uuid.uuid4()}" timestamp = get_ts_ms() @@ -407,7 +407,7 @@ def test_persistable_computed_properties( persistable = persistables[0] if persistables else None assert persistable.is_managed == exp_ismanaged - assert persistable.is_db == exp_isorch + assert persistable.is_fs == exp_isfeature_store def test_deserialize_ensemble(fileutils: FileUtils): @@ -459,17 +459,17 @@ def test_shutdown_conditions__has_monitored_job(test_dir: str): telmon._action_handler = 
mani_handler assert not telmon._can_shutdown() - assert not bool(mani_handler.job_manager.db_jobs) + assert not bool(mani_handler.job_manager.fs_jobs) assert bool(mani_handler.job_manager.jobs) -def test_shutdown_conditions__has_db(test_dir: str): - """Show that an event handler w/a monitored db cannot shutdown""" +def test_shutdown_conditions__has_fs(test_dir: str): + """Show that an event handler w/a monitored fs cannot shutdown""" job_entity1 = JobEntity() job_entity1.name = "xyz" job_entity1.step_id = "123" job_entity1.task_id = "" - job_entity1.type = "orchestrator" # <---- make entity appear as db + job_entity1.type = "featurestore" # <---- make entity appear as fs mani_handler = ManifestEventHandler("xyz") ## TODO: see next comment and combine an add_job method on manieventhandler @@ -486,7 +486,7 @@ def test_shutdown_conditions__has_db(test_dir: str): telmon._action_handler = mani_handler # replace w/mock handler assert not telmon._can_shutdown() - assert bool([j for j in mani_handler._tracked_jobs.values() if j.is_db]) + assert bool([j for j in mani_handler._tracked_jobs.values() if j.is_fs]) assert not bool(mani_handler.job_manager.jobs) @@ -554,10 +554,10 @@ def is_alive(self) -> bool: ], ) @pytest.mark.asyncio -async def test_auto_shutdown__has_db( +async def test_auto_shutdown__has_fs( test_dir: str, cooldown_ms: int, task_duration_ms: int ): - """Ensure that the cooldown timer is respected with a running db""" + """Ensure that the cooldown timer is respected with a running fs""" class FauxObserver: """Mock for the watchdog file system event listener""" @@ -575,10 +575,10 @@ def is_alive(self) -> bool: return True entity = JobEntity() - entity.name = "db_0" + entity.name = "fs_0" entity.step_id = "123" entity.task_id = "" - entity.type = "orchestrator" + entity.type = "featurestore" entity.telemetry_on = True entity.status_dir = test_dir @@ -611,8 +611,8 @@ def is_alive(self) -> bool: def test_telemetry_single_model(fileutils, test_dir, wlmutils, 
config): - """Test that it is possible to create_database then colocate_db_uds/colocate_db_tcp - with unique db_identifiers""" + """Test that it is possible to create_feature_store then colocate_fs_uds/colocate_fs_tcp + with unique fs_identifiers""" # Set experiment name exp_name = "telemetry_single_model" @@ -775,15 +775,15 @@ def test_telemetry_serial_models_nonblocking( assert len(stop_events) == 5 -def test_telemetry_db_only_with_generate(test_dir, wlmutils, monkeypatch, config): +def test_telemetry_fs_only_with_generate(test_dir, wlmutils, monkeypatch, config): """ - Test telemetry with only a database running + Test telemetry with only a feature store running """ with monkeypatch.context() as ctx: ctx.setattr(cfg.Config, "telemetry_frequency", 1) # Set experiment name - exp_name = "telemetry_db_with_generate" + exp_name = "telemetry_fs_with_generate" # Retrieve parameters from testing environment test_launcher = wlmutils.get_test_launcher() @@ -793,14 +793,14 @@ def test_telemetry_db_only_with_generate(test_dir, wlmutils, monkeypatch, config # Create SmartSim Experiment exp = Experiment(exp_name, launcher=test_launcher, exp_path=test_dir) - # create regular database - orc = exp.create_database(port=test_port, interface=test_interface) - exp.generate(orc) + # create regular feature store + feature_store = exp.create_feature_store(port=test_port, interface=test_interface) + exp.generate(feature_store) telemetry_output_path = pathlib.Path(test_dir) / config.telemetry_subdir try: - exp.start(orc, block=True) + exp.start(feature_store, block=True) snooze_blocking(telemetry_output_path, max_delay=10, post_data_delay=1) @@ -810,24 +810,24 @@ def test_telemetry_db_only_with_generate(test_dir, wlmutils, monkeypatch, config assert len(start_events) == 1 assert len(stop_events) <= 1 finally: - exp.stop(orc) + exp.stop(feature_store) snooze_blocking(telemetry_output_path, max_delay=10, post_data_delay=1) - assert exp.get_status(orc)[0] == SmartSimStatus.STATUS_CANCELLED 
+ assert exp.get_status(feature_store)[0] == SmartSimStatus.STATUS_CANCELLED stop_events = list(telemetry_output_path.rglob("stop.json")) assert len(stop_events) == 1 -def test_telemetry_db_only_without_generate(test_dir, wlmutils, monkeypatch, config): +def test_telemetry_fs_only_without_generate(test_dir, wlmutils, monkeypatch, config): """ - Test telemetry with only a non-generated database running + Test telemetry with only a non-generated feature store running """ with monkeypatch.context() as ctx: ctx.setattr(cfg.Config, "telemetry_frequency", 1) # Set experiment name - exp_name = "telemetry_db_only_without_generate" + exp_name = "telemetry_fs_only_without_generate" # Retrieve parameters from testing environment test_launcher = wlmutils.get_test_launcher() @@ -837,12 +837,12 @@ def test_telemetry_db_only_without_generate(test_dir, wlmutils, monkeypatch, con # Create SmartSim Experiment exp = Experiment(exp_name, launcher=test_launcher, exp_path=test_dir) - # create regular database - orc = exp.create_database(port=test_port, interface=test_interface) + # create regular feature store + feature_store = exp.create_feature_store(port=test_port, interface=test_interface) telemetry_output_path = pathlib.Path(test_dir) / config.telemetry_subdir try: - exp.start(orc) + exp.start(feature_store) snooze_blocking(telemetry_output_path, max_delay=10, post_data_delay=1) @@ -852,18 +852,18 @@ def test_telemetry_db_only_without_generate(test_dir, wlmutils, monkeypatch, con assert len(start_events) == 1 assert len(stop_events) == 0 finally: - exp.stop(orc) + exp.stop(feature_store) snooze_blocking(telemetry_output_path, max_delay=10, post_data_delay=1) - assert exp.get_status(orc)[0] == SmartSimStatus.STATUS_CANCELLED + assert exp.get_status(feature_store)[0] == SmartSimStatus.STATUS_CANCELLED stop_events = list(telemetry_output_path.rglob("stop.json")) assert len(stop_events) == 1 -def test_telemetry_db_and_model(fileutils, test_dir, wlmutils, monkeypatch, config): +def 
test_telemetry_fs_and_model(fileutils, test_dir, wlmutils, monkeypatch, config): """ - Test telemetry with only a database and a model running + Test telemetry with only a feature store and a model running """ with monkeypatch.context() as ctx: @@ -881,11 +881,11 @@ def test_telemetry_db_and_model(fileutils, test_dir, wlmutils, monkeypatch, conf # Create SmartSim Experiment exp = Experiment(exp_name, launcher=test_launcher, exp_path=test_dir) - # create regular database - orc = exp.create_database(port=test_port, interface=test_interface) - exp.generate(orc) + # create regular feature store + feature_store = exp.create_feature_store(port=test_port, interface=test_interface) + exp.generate(feature_store) try: - exp.start(orc) + exp.start(feature_store) # create run settings app_settings = exp.create_run_settings(sys.executable, test_script) @@ -897,12 +897,12 @@ def test_telemetry_db_and_model(fileutils, test_dir, wlmutils, monkeypatch, conf exp.generate(smartsim_model) exp.start(smartsim_model, block=True) finally: - exp.stop(orc) + exp.stop(feature_store) telemetry_output_path = pathlib.Path(test_dir) / config.telemetry_subdir snooze_blocking(telemetry_output_path, max_delay=10, post_data_delay=1) - assert exp.get_status(orc)[0] == SmartSimStatus.STATUS_CANCELLED + assert exp.get_status(feature_store)[0] == SmartSimStatus.STATUS_CANCELLED assert exp.get_status(smartsim_model)[0] == SmartSimStatus.STATUS_COMPLETED start_events = list(telemetry_output_path.rglob("database/**/start.json")) @@ -996,7 +996,7 @@ def test_telemetry_colo(fileutils, test_dir, wlmutils, coloutils, monkeypatch, c start_events = list(telemetry_output_path.rglob("start.json")) stop_events = list(telemetry_output_path.rglob("stop.json")) - # the colodb does NOT show up as a unique entity in the telemetry + # the colofs does NOT show up as a unique entity in the telemetry assert len(start_events) == 1 assert len(stop_events) == 1 @@ -1198,13 +1198,13 @@ def test_multistart_experiment( 
rs_m.set_tasks(1) model = exp.create_model("my-model", run_settings=rs_m) - db = exp.create_database( - db_nodes=1, + fs = exp.create_feature_store( + fs_nodes=1, port=wlmutils.get_test_port(), interface=wlmutils.get_test_interface(), ) - exp.generate(db, ens, model, overwrite=True) + exp.generate(fs, ens, model, overwrite=True) with monkeypatch.context() as ctx: ctx.setattr(cfg.Config, "telemetry_frequency", 1) @@ -1215,20 +1215,20 @@ def test_multistart_experiment( # track PID to see that telmon cooldown avoids restarting process tm_pid = exp._control._telemetry_monitor.pid - exp.start(db, block=False) + exp.start(fs, block=False) # check that same TM proc is active assert tm_pid == exp._control._telemetry_monitor.pid try: exp.start(ens, block=True, summary=True) finally: - exp.stop(db) + exp.stop(fs) assert tm_pid == exp._control._telemetry_monitor.pid - time.sleep(3) # time for telmon to write db stop event + time.sleep(3) # time for telmon to write fs stop event telemetry_output_path = pathlib.Path(test_dir) / config.telemetry_subdir - db_start_events = list(telemetry_output_path.rglob("database/**/start.json")) - assert len(db_start_events) == 1 + fs_start_events = list(telemetry_output_path.rglob("database/**/start.json")) + assert len(fs_start_events) == 1 m_start_events = list(telemetry_output_path.rglob("model/**/start.json")) assert len(m_start_events) == 1 @@ -1302,7 +1302,7 @@ def _faux_updates(_self: WLMLauncher, _names: t.List[str]) -> t.List[StepInfo]: job_entity.step_id = "faux-step-id" job_entity.task_id = 1234 job_entity.status_dir = test_dir - job_entity.type = "orchestrator" + job_entity.type = "featurestore" job = Job(job_entity.name, job_entity.step_id, job_entity, "slurm", True) From 254bca0dcab284bc81b6eec3b8d4a34dc74cbdd0 Mon Sep 17 00:00:00 2001 From: Julia Putko Date: Thu, 9 May 2024 17:39:52 -0500 Subject: [PATCH 02/11] orch to feature store bug fix --- smartsim/_core/control/controller.py | 8 ++++---- tests/on_wlm/test_symlinking.py | 
2 +- tests/test_reconnect_orchestrator.py | 2 +- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/smartsim/_core/control/controller.py b/smartsim/_core/control/controller.py index 82785fedd..155e93f0a 100644 --- a/smartsim/_core/control/controller.py +++ b/smartsim/_core/control/controller.py @@ -814,12 +814,12 @@ def reload_saved_fs(self, checkpoint_file: str) -> FeatureStore: err_message = ( "The SmartSim feature store checkpoint is incomplete or corrupted. " ) - if not "db" in fs_config: + if not "fs" in fs_config: raise SmartSimError( err_message + "Could not find the featurestore object." ) - if not "db_jobs" in fs_config: + if not "fs_jobs" in fs_config: raise SmartSimError( err_message + "Could not find feature store job objects." ) @@ -828,11 +828,11 @@ def reload_saved_fs(self, checkpoint_file: str) -> FeatureStore: raise SmartSimError( err_message + "Could not find feature store job objects." ) - feature_store: FeatureStore = fs_config["db"] + feature_store: FeatureStore = fs_config["fs"] # TODO check that each fs_object is running - job_steps = zip(fs_config["db_jobs"].values(), fs_config["steps"]) + job_steps = zip(fs_config["fs_jobs"].values(), fs_config["steps"]) try: for fs_job, step in job_steps: self._jobs.fs_jobs[fs_job.ename] = fs_job diff --git a/tests/on_wlm/test_symlinking.py b/tests/on_wlm/test_symlinking.py index 058b56e74..df9647342 100644 --- a/tests/on_wlm/test_symlinking.py +++ b/tests/on_wlm/test_symlinking.py @@ -129,7 +129,7 @@ def test_batch_model_symlinks(test_dir, wlmutils): _should_not_be_symlinked(pathlib.Path(test_model.path, f"{test_model.name}.sh")) -def test_batch_orchestrator_symlinks(test_dir, wlmutils): +def test_batch_feature_store_symlinks(test_dir, wlmutils): exp_name = "test-batch-orc" launcher = wlmutils.get_test_launcher() exp = Experiment(exp_name, launcher=launcher, exp_path=test_dir) diff --git a/tests/test_reconnect_orchestrator.py b/tests/test_reconnect_orchestrator.py index 4629c18b4..889876f00 
100644 --- a/tests/test_reconnect_orchestrator.py +++ b/tests/test_reconnect_orchestrator.py @@ -69,7 +69,7 @@ def test_reconnect_local_feature_store(test_dir): exp_name = "test-feature-store-local-reconnect-2nd" exp_2 = Experiment(exp_name, launcher="local", exp_path=test_dir) - checkpoint = osp.join(first_dir, "orchestrator", "smartsim_db.dat") + checkpoint = osp.join(first_dir, "feature_store", "smartsim_db.dat") reloaded_feature_store = exp_2.reconnect_feature_store(checkpoint) From 7ec2fa686b6aa5361ef2f2d457f27dd26106bc17 Mon Sep 17 00:00:00 2001 From: Julia Putko Date: Thu, 16 May 2024 19:58:41 -0500 Subject: [PATCH 03/11] merge with develop, and update merged changed to feature store --- .github/workflows/changelog.yml | 2 +- .gitignore | 6 +- .wci.yml | 4 +- Makefile | 4 +- README.md | 6 +- conftest.py | 404 +++-- doc/_static/version_names.json | 4 +- doc/api/smartsim_api.rst | 26 + doc/changelog.md | 31 +- doc/conf.py | 2 +- doc/dragon.rst | 169 +++ doc/experiment.rst | 97 +- doc/index.rst | 1 + doc/installation_instructions/basic.rst | 28 +- .../platform/olcf-summit.rst | 2 +- doc/run_settings.rst | 33 +- doc/testing.rst | 10 +- .../experiment_doc_examples/exp.py | 8 +- docker-compose.yml | 2 +- docker/docs/dev/Dockerfile | 3 + docker/prod/Dockerfile | 2 +- pyproject.toml | 11 +- setup.cfg | 1 + setup.py | 5 +- smartsim/_core/__init__.py | 4 +- smartsim/_core/_cli/build.py | 45 +- smartsim/_core/_cli/cli.py | 10 +- smartsim/_core/_cli/scripts/dragon_install.py | 232 +++ smartsim/_core/_cli/teardown.py | 74 + smartsim/_core/_cli/utils.py | 5 + smartsim/_core/_cli/validate.py | 12 +- smartsim/_core/_install/buildenv.py | 4 +- smartsim/_core/_install/builder.py | 9 + smartsim/_core/config/config.py | 64 +- smartsim/_core/control/controller.py | 73 +- smartsim/_core/control/job.py | 21 +- smartsim/_core/control/manifest.py | 8 + smartsim/_core/control/previewrenderer.py | 192 +++ smartsim/_core/entrypoints/dragon.py | 351 +++++ 
smartsim/_core/entrypoints/dragon_client.py | 203 +++ smartsim/_core/entrypoints/redis.py | 8 +- .../_core/entrypoints/telemetrymonitor.py | 2 + smartsim/_core/generation/generator.py | 4 +- smartsim/_core/launcher/__init__.py | 2 + smartsim/_core/launcher/colocated.py | 3 +- smartsim/_core/launcher/dragon/__init__.py | 25 + .../_core/launcher/dragon/dragonBackend.py | 734 +++++++++ .../_core/launcher/dragon/dragonConnector.py | 532 +++++++ .../_core/launcher/dragon/dragonLauncher.py | 321 ++++ .../_core/launcher/dragon/dragonSockets.py | 158 ++ smartsim/_core/launcher/launcher.py | 10 + smartsim/_core/launcher/pbs/pbsLauncher.py | 19 +- smartsim/_core/launcher/pbs/pbsParser.py | 23 +- smartsim/_core/launcher/step/__init__.py | 1 + smartsim/_core/launcher/step/dragonStep.py | 248 +++ smartsim/_core/launcher/step/step.py | 4 +- smartsim/_core/schemas/__init__.py | 41 + smartsim/_core/schemas/dragonRequests.py | 90 ++ smartsim/_core/schemas/dragonResponses.py | 73 + smartsim/_core/schemas/utils.py | 124 ++ smartsim/_core/utils/__init__.py | 9 +- smartsim/_core/utils/helpers.py | 125 +- smartsim/_core/utils/network.py | 35 + smartsim/_core/utils/redis.py | 6 +- smartsim/_core/utils/security.py | 302 ++++ smartsim/_core/utils/telemetry/manifest.py | 29 +- smartsim/_core/utils/telemetry/telemetry.py | 44 +- smartsim/_core/utils/telemetry/util.py | 3 +- smartsim/database/orchestrator.py | 47 +- smartsim/entity/dbnode.py | 8 +- smartsim/entity/dbobject.py | 4 +- smartsim/entity/ensemble.py | 2 + smartsim/entity/entityList.py | 18 +- smartsim/error/__init__.py | 1 + smartsim/error/errors.py | 10 + smartsim/experiment.py | 75 +- smartsim/log.py | 3 +- smartsim/ml/data.py | 16 +- smartsim/settings/__init__.py | 2 + smartsim/settings/dragonRunSettings.py | 78 + smartsim/settings/settings.py | 14 +- smartsim/settings/slurmSettings.py | 2 +- .../preview/plain_text/activeinfra.template | 9 + .../preview/plain_text/base.template | 52 + .../preview/plain_text/clientconfig.template 
| 7 + .../plain_text/clientconfig_debug.template | 29 + .../plain_text/clientconfig_info.template | 19 + .../plain_text/clientconfigcolo.template | 7 + .../clientconfigcolo_debug.template | 37 + .../plain_text/clientconfigcolo_info.template | 22 + .../preview/plain_text/ensemble.template | 7 + .../plain_text/ensemble_debug.template | 62 + .../preview/plain_text/ensemble_info.template | 51 + .../preview/plain_text/experiment.template | 5 + .../preview/plain_text/model.template | 7 + .../preview/plain_text/model_debug.template | 114 ++ .../preview/plain_text/model_info.template | 54 + .../preview/plain_text/orchestrator.template | 7 + .../plain_text/orchestrator_debug.template | 33 + .../plain_text/orchestrator_info.template | 11 + tests/backends/test_cli_mini_exp.py | 7 +- tests/backends/test_dataloader.py | 71 +- tests/backends/test_dbmodel.py | 127 +- tests/backends/test_dbscript.py | 76 +- tests/backends/test_onnx.py | 39 +- tests/backends/test_tf.py | 22 +- tests/backends/test_torch.py | 24 +- tests/full_wlm/test_generic_batch_launch.py | 9 +- .../full_wlm/test_generic_orc_launch_batch.py | 37 +- tests/{on_wlm => full_wlm}/test_symlinking.py | 5 + tests/on_wlm/test_containers_wlm.py | 5 +- tests/on_wlm/test_dragon.py | 94 ++ tests/on_wlm/test_dragon_entrypoint.py | 295 ++++ tests/on_wlm/test_preview_wlm.py | 409 +++++ tests/on_wlm/test_simple_entity_launch.py | 54 +- tests/on_wlm/test_wlm_orc_config_settings.py | 19 +- tests/test_collector_manager.py | 10 +- tests/test_collector_sink.py | 2 +- tests/test_collectors.py | 18 +- tests/test_config.py | 28 + tests/test_configs/mpi/mpi_hello.c | 35 + .../multidbid_colo_env_vars_only.py | 52 + tests/test_containers.py | 28 +- tests/test_controller_errors.py | 7 +- tests/test_dbnode.py | 22 +- tests/test_dragon_backend.py | 453 ++++++ tests/test_dragon_installer.py | 471 ++++++ tests/test_dragon_launcher.py | 523 +++++++ tests/test_experiment.py | 2 + tests/test_fixtures.py | 56 + tests/test_multidb.py | 13 +- 
tests/test_orc_config_settings.py | 16 +- tests/test_orchestrator.py | 64 +- tests/test_pbs_parser.py | 20 + tests/test_preview.py | 1330 +++++++++++++++++ tests/test_schema_utils.py | 217 +++ tests/test_smartredis.py | 57 +- tests/test_telemetry_monitor.py | 15 +- tests/utils/test_network.py | 30 + tests/utils/test_security.py | 234 +++ 140 files changed, 10258 insertions(+), 694 deletions(-) create mode 100644 doc/dragon.rst create mode 100644 smartsim/_core/_cli/scripts/dragon_install.py create mode 100644 smartsim/_core/_cli/teardown.py create mode 100644 smartsim/_core/control/previewrenderer.py create mode 100644 smartsim/_core/entrypoints/dragon.py create mode 100644 smartsim/_core/entrypoints/dragon_client.py create mode 100644 smartsim/_core/launcher/dragon/__init__.py create mode 100644 smartsim/_core/launcher/dragon/dragonBackend.py create mode 100644 smartsim/_core/launcher/dragon/dragonConnector.py create mode 100644 smartsim/_core/launcher/dragon/dragonLauncher.py create mode 100644 smartsim/_core/launcher/dragon/dragonSockets.py create mode 100644 smartsim/_core/launcher/step/dragonStep.py create mode 100644 smartsim/_core/schemas/__init__.py create mode 100644 smartsim/_core/schemas/dragonRequests.py create mode 100644 smartsim/_core/schemas/dragonResponses.py create mode 100644 smartsim/_core/schemas/utils.py create mode 100644 smartsim/_core/utils/security.py create mode 100644 smartsim/settings/dragonRunSettings.py create mode 100644 smartsim/templates/templates/preview/plain_text/activeinfra.template create mode 100644 smartsim/templates/templates/preview/plain_text/base.template create mode 100644 smartsim/templates/templates/preview/plain_text/clientconfig.template create mode 100644 smartsim/templates/templates/preview/plain_text/clientconfig_debug.template create mode 100644 smartsim/templates/templates/preview/plain_text/clientconfig_info.template create mode 100644 smartsim/templates/templates/preview/plain_text/clientconfigcolo.template 
create mode 100644 smartsim/templates/templates/preview/plain_text/clientconfigcolo_debug.template create mode 100644 smartsim/templates/templates/preview/plain_text/clientconfigcolo_info.template create mode 100644 smartsim/templates/templates/preview/plain_text/ensemble.template create mode 100644 smartsim/templates/templates/preview/plain_text/ensemble_debug.template create mode 100644 smartsim/templates/templates/preview/plain_text/ensemble_info.template create mode 100644 smartsim/templates/templates/preview/plain_text/experiment.template create mode 100644 smartsim/templates/templates/preview/plain_text/model.template create mode 100644 smartsim/templates/templates/preview/plain_text/model_debug.template create mode 100644 smartsim/templates/templates/preview/plain_text/model_info.template create mode 100644 smartsim/templates/templates/preview/plain_text/orchestrator.template create mode 100644 smartsim/templates/templates/preview/plain_text/orchestrator_debug.template create mode 100644 smartsim/templates/templates/preview/plain_text/orchestrator_info.template rename tests/{on_wlm => full_wlm}/test_symlinking.py (97%) create mode 100644 tests/on_wlm/test_dragon.py create mode 100644 tests/on_wlm/test_dragon_entrypoint.py create mode 100644 tests/on_wlm/test_preview_wlm.py create mode 100755 tests/test_configs/mpi/mpi_hello.c create mode 100644 tests/test_configs/smartredis/multidbid_colo_env_vars_only.py create mode 100644 tests/test_dragon_backend.py create mode 100644 tests/test_dragon_installer.py create mode 100644 tests/test_dragon_launcher.py create mode 100644 tests/test_fixtures.py create mode 100644 tests/test_preview.py create mode 100644 tests/test_schema_utils.py create mode 100644 tests/utils/test_network.py create mode 100644 tests/utils/test_security.py diff --git a/.github/workflows/changelog.yml b/.github/workflows/changelog.yml index 3346206d1..cd4ab58fa 100644 --- a/.github/workflows/changelog.yml +++ b/.github/workflows/changelog.yml @@ 
-46,4 +46,4 @@ jobs: uses: dangoslen/changelog-enforcer@v3.6.0 with: changeLogPath: './doc/changelog.md' - missingUpdateErrorMessage: 'changelog.md has not been updated' \ No newline at end of file + missingUpdateErrorMessage: 'changelog.md has not been updated' diff --git a/.gitignore b/.gitignore index 24e061563..77b91d586 100644 --- a/.gitignore +++ b/.gitignore @@ -11,6 +11,7 @@ tests/test_output # Dependencies smartsim/_core/.third-party +smartsim/_core/.dragon # Docs _build @@ -22,14 +23,13 @@ venv/ .venv/ env/ .env/ +**/.env # written upon install smartsim/version.py -smartsim/_core/bin/*-server -smartsim/_core/bin/*-cli - # created upon install +smartsim/_core/bin smartsim/_core/lib # optional dev tools diff --git a/.wci.yml b/.wci.yml index 265d59579..6194f1939 100644 --- a/.wci.yml +++ b/.wci.yml @@ -22,8 +22,8 @@ language: Python release: - version: 0.6.2 - date: 2024-02-16 + version: 0.7.0 + date: 2024-05-14 documentation: general: https://www.craylabs.org/docs/overview.html diff --git a/Makefile b/Makefile index f71f2a0b3..bddbda722 100644 --- a/Makefile +++ b/Makefile @@ -150,11 +150,11 @@ tutorials-dev: @docker compose build tutorials-dev @docker run -p 8888:8888 smartsim-tutorials:dev-latest -# help: tutorials-prod - Build and start a docker container to run the tutorials (v0.6.2) +# help: tutorials-prod - Build and start a docker container to run the tutorials (v0.7.0) .PHONY: tutorials-prod tutorials-prod: @docker compose build tutorials-prod - @docker run -p 8888:8888 smartsim-tutorials:v0.6.2 + @docker run -p 8888:8888 smartsim-tutorials:v0.7.0 # help: diff --git a/README.md b/README.md index cfd8d4271..c0986042e 100644 --- a/README.md +++ b/README.md @@ -174,13 +174,17 @@ system with which it has a corresponding `RunSettings` class. If one can be foun ## Experiments on HPC Systems SmartSim integrates with common HPC schedulers providing batch and interactive -launch capabilities for all applications. 
+launch capabilities for all applications: - Slurm - LSF - PBSPro - Local (for laptops/single node, no batch) +In addition, on Slurm and PBS systems, [Dragon](https://dragonhpc.github.io/dragon/doc/_build/html/index.html) +can be used as a launcher. Please refer to the documentation for instructions on +how to insall it on your system and use it in SmartSim. + ### Interactive Launch Example diff --git a/conftest.py b/conftest.py index 89191ee5b..4e8a33451 100644 --- a/conftest.py +++ b/conftest.py @@ -27,31 +27,41 @@ from __future__ import annotations import asyncio +from collections import defaultdict +from dataclasses import dataclass import json import os import pathlib import shutil +import subprocess import signal +import socket import sys import tempfile +import time import typing as t import uuid import warnings from subprocess import run +import time import psutil import pytest import smartsim from smartsim import Experiment +from smartsim._core.launcher.dragon.dragonConnector import DragonConnector +from smartsim._core.launcher.dragon.dragonLauncher import DragonLauncher from smartsim._core.config import CONFIG from smartsim._core.config.config import Config from smartsim._core.utils.telemetry.telemetry import JobEntity from smartsim.database import FeatureStore from smartsim.entity import Model -from smartsim.error import SSConfigError +from smartsim.error import SSConfigError, SSInternalError +from smartsim.log import get_logger from smartsim.settings import ( AprunSettings, + DragonRunSettings, JsrunSettings, MpiexecSettings, MpirunSettings, @@ -60,6 +70,8 @@ SrunSettings, ) +logger = get_logger(__name__) + # pylint: disable=redefined-outer-name,invalid-name,global-statement # Globals, yes, but its a testing file @@ -70,15 +82,17 @@ test_num_gpus = CONFIG.test_num_gpus test_nic = CONFIG.test_interface test_alloc_specs_path = os.getenv("SMARTSIM_TEST_ALLOC_SPEC_SHEET_PATH", None) -test_port = CONFIG.test_port +test_ports = CONFIG.test_ports test_account 
= CONFIG.test_account or "" test_batch_resources: t.Dict[t.Any, t.Any] = CONFIG.test_batch_resources +test_output_dirs = 0 +mpi_app_exe = None +built_mpi_app = False # Fill this at runtime if needed test_hostlist = None has_aprun = shutil.which("aprun") is not None - def get_account() -> str: return test_account @@ -98,9 +112,7 @@ def print_test_configuration() -> None: print("TEST_ALLOC_SPEC_SHEET_PATH:", test_alloc_specs_path) print("TEST_DIR:", test_output_root) print("Test output will be located in TEST_DIR if there is a failure") - print( - "TEST_PORTS:", ", ".join(str(port) for port in range(test_port, test_port + 3)) - ) + print("TEST_PORTS:", ", ".join(str(port) for port in test_ports)) if test_batch_resources: print("TEST_BATCH_RESOURCES: ") print(json.dumps(test_batch_resources, indent=2)) @@ -108,7 +120,7 @@ def print_test_configuration() -> None: def pytest_configure() -> None: pytest.test_launcher = test_launcher - pytest.wlm_options = ["slurm", "pbs", "lsf", "pals"] + pytest.wlm_options = ["slurm", "pbs", "lsf", "pals", "dragon"] account = get_account() pytest.test_account = account pytest.test_device = test_device @@ -125,6 +137,14 @@ def pytest_sessionstart( if os.path.isdir(test_output_root): shutil.rmtree(test_output_root) os.makedirs(test_output_root) + while not os.path.isdir(test_output_root): + time.sleep(0.1) + + if CONFIG.dragon_server_path is None: + dragon_server_path = os.path.join(test_output_root, "dragon_server") + os.makedirs(dragon_server_path) + os.environ["SMARTSIM_DRAGON_SERVER_PATH"] = dragon_server_path + print_test_configuration() @@ -136,12 +156,62 @@ def pytest_sessionfinish( returning the exit status to the system. 
""" if exitstatus == 0: - shutil.rmtree(test_output_root) + cleanup_attempts = 5 + while cleanup_attempts > 0: + try: + shutil.rmtree(test_output_root) + except OSError as e: + cleanup_attempts -= 1 + time.sleep(1) + if not cleanup_attempts: + raise + else: + break else: - # kill all spawned processes in case of error + # kill all spawned processes + if CONFIG.test_launcher == "dragon": + time.sleep(5) kill_all_test_spawned_processes() +def build_mpi_app() -> t.Optional[pathlib.Path]: + global built_mpi_app + built_mpi_app = True + cc = shutil.which("cc") + if cc is None: + cc = shutil.which("gcc") + if cc is None: + return None + + path_to_src = pathlib.Path(FileUtils().get_test_conf_path("mpi")) + path_to_out = pathlib.Path(test_output_root) / "apps" / "mpi_app" + os.makedirs(path_to_out.parent, exist_ok=True) + cmd = [cc, str(path_to_src / "mpi_hello.c"), "-o", str(path_to_out)] + proc = subprocess.Popen(cmd) + proc.wait(timeout=1) + if proc.returncode == 0: + return path_to_out + else: + return None + +@pytest.fixture(scope="session") +def mpi_app_path() -> t.Optional[pathlib.Path]: + """Return path to MPI app if it was built + + return None if it could not or will not be built + """ + if not CONFIG.test_mpi: + return None + + # if we already tried to build, return what we have + if built_mpi_app: + return mpi_app_exe + + # attempt to build, set global + mpi_app_exe = build_mpi_app() + return mpi_app_exe + + def kill_all_test_spawned_processes() -> None: # in case of test failure, clean up all spawned processes pid = os.getpid() @@ -157,6 +227,7 @@ def kill_all_test_spawned_processes() -> None: print("Not all processes were killed after test") + def get_hostlist() -> t.Optional[t.List[str]]: global test_hostlist if not test_hostlist: @@ -227,7 +298,23 @@ def _reset(): ) -@pytest.fixture +def _find_free_port(ports: t.Collection[int]) -> int: + with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as sock: + for port in ports: + try: + sock.bind(("127.0.0.1", 
port)) + except socket.error: + continue + else: + _, port_ = sock.getsockname() + return int(port_) + raise SSInternalError( + "Could not find a free port out of a options: " + f"{', '.join(str(port) for port in sorted(ports))}" + ) + + +@pytest.fixture(scope="session") def wlmutils() -> t.Type[WLMUtils]: return WLMUtils @@ -244,7 +331,9 @@ def get_test_launcher() -> str: @staticmethod def get_test_port() -> int: - return test_port + # TODO: Ideally this should find a free port on the correct host(s), + # but this is good enough for now + return _find_free_port(test_ports) @staticmethod def get_test_account() -> str: @@ -273,6 +362,12 @@ def get_base_run_settings( run_args.update(kwargs) settings = RunSettings(exe, args, run_command="srun", run_args=run_args) return settings + if test_launcher == "dragon": + run_args = {"nodes": nodes} + run_args = {"ntasks": ntasks} + run_args.update(kwargs) + settings = DragonRunSettings(exe, args, run_args=run_args) + return settings if test_launcher == "pbs": if shutil.which("aprun"): run_command = "aprun" @@ -314,6 +409,11 @@ def get_run_settings( run_args = {"nodes": nodes, "ntasks": ntasks, "time": "00:10:00"} run_args.update(kwargs) return SrunSettings(exe, args, run_args=run_args) + if test_launcher == "dragon": + run_args = {"nodes": nodes} + run_args.update(kwargs) + settings = DragonRunSettings(exe, args, run_args=run_args) + return settings if test_launcher == "pbs": if shutil.which("aprun"): run_args = {"pes": ntasks} @@ -339,53 +439,6 @@ def get_run_settings( return RunSettings(exe, args) - @staticmethod - def get_feature_store(nodes: int = 1, batch: bool = False) -> FeatureStore: - if test_launcher == "pbs": - if not shutil.which("aprun"): - hostlist = get_hostlist() - else: - hostlist = None - return FeatureStore( - fs_nodes=nodes, - port=test_port, - batch=batch, - interface=test_nic, - launcher=test_launcher, - hosts=hostlist, - ) - if test_launcher == "pals": - hostlist = get_hostlist() - return FeatureStore( - 
fs_nodes=nodes, - port=test_port, - batch=batch, - interface=test_nic, - launcher=test_launcher, - hosts=hostlist, - ) - if test_launcher == "slurm": - return FeatureStore( - fs_nodes=nodes, - port=test_port, - batch=batch, - interface=test_nic, - launcher=test_launcher, - ) - if test_launcher == "lsf": - return FeatureStore( - fs_nodes=nodes, - port=test_port, - batch=batch, - cpus_per_shard=4, - gpus_per_shard=2 if test_device == "GPU" else 0, - project=get_account(), - interface=test_nic, - launcher=test_launcher, - ) - - return FeatureStore(port=test_port, interface="lo") - @staticmethod def choose_host(rs: RunSettings) -> t.Optional[str]: if isinstance(rs, (MpirunSettings, MpiexecSettings)): @@ -396,65 +449,6 @@ def choose_host(rs: RunSettings) -> t.Optional[str]: return None -@pytest.fixture -def local_fs( - request: t.Any, wlmutils: t.Type[WLMUtils], test_dir: str -) -> t.Generator[FeatureStore, None, None]: - """Yield fixture for startup and teardown of an local feature_store""" - - exp_name = request.function.__name__ - exp = Experiment(exp_name, launcher="local", exp_path=test_dir) - fs = FeatureStore(port=wlmutils.get_test_port(), interface="lo") - fs.set_path(test_dir) - exp.start(fs) - - yield fs - # pass or fail, the teardown code below is ran after the - # completion of a test case that uses this fixture - exp.stop(fs) - - -@pytest.fixture -def fs( - request: t.Any, wlmutils: t.Type[WLMUtils], test_dir: str -) -> t.Generator[FeatureStore, None, None]: - """Yield fixture for startup and teardown of an feature_store""" - launcher = wlmutils.get_test_launcher() - - exp_name = request.function.__name__ - exp = Experiment(exp_name, launcher=launcher, exp_path=test_dir) - fs = wlmutils.get_feature_store() - fs.set_path(test_dir) - exp.start(fs) - - yield fs - # pass or fail, the teardown code below is ran after the - # completion of a test case that uses this fixture - exp.stop(fs) - - -@pytest.fixture -def fs_cluster( - test_dir: str, wlmutils: 
t.Type[WLMUtils], request: t.Any -) -> t.Generator[FeatureStore, None, None]: - """ - Yield fixture for startup and teardown of a clustered feature_store. - This should only be used in on_wlm and full_wlm tests. - """ - launcher = wlmutils.get_test_launcher() - - exp_name = request.function.__name__ - exp = Experiment(exp_name, launcher=launcher, exp_path=test_dir) - fs = wlmutils.get_feature_store(nodes=3) - fs.set_path(test_dir) - exp.start(fs) - - yield fs - # pass or fail, the teardown code below is ran after the - # completion of a test case that uses this fixture - exp.stop(fs) - - @pytest.fixture(scope="function", autouse=True) def environment_cleanup(monkeypatch: pytest.MonkeyPatch) -> None: for key in os.environ.keys(): @@ -464,6 +458,14 @@ def environment_cleanup(monkeypatch: pytest.MonkeyPatch) -> None: monkeypatch.delenv("SSKEYOUT", raising=False) +@pytest.fixture(scope="function", autouse=True) +def check_output_dir() -> None: + global test_output_dirs + assert os.path.isdir(test_output_root) + assert len(os.listdir(test_output_root)) >= test_output_dirs + test_output_dirs = len(os.listdir(test_output_root)) + + @pytest.fixture def fsutils() -> t.Type[FSUtils]: return FSUtils @@ -653,7 +655,7 @@ def setup_test_colo( fs_args: t.Dict[str, t.Any], colo_settings: t.Optional[RunSettings] = None, colo_model_name: str = "colocated_model", - port: int = test_port, + port: t.Optional[int] = None, on_wlm: bool = False, ) -> Model: """Setup feature store needed for the colo pinning tests""" @@ -669,10 +671,11 @@ def setup_test_colo( if on_wlm: colo_settings.set_tasks(1) colo_settings.set_nodes(1) + colo_model = exp.create_model(colo_model_name, colo_settings) if fs_type in ["tcp", "deprecated"]: - fs_args["port"] = port + fs_args["port"] = port if port is not None else _find_free_port(test_ports) fs_args["ifname"] = "lo" if fs_type == "uds" and colo_model_name is not None: tmp_dir = tempfile.gettempdir() @@ -696,6 +699,21 @@ def setup_test_colo( return colo_model 
+@pytest.fixture(scope="function") +def global_dragon_teardown() -> None: + """Connect to a dragon server started at the path indicated by + the environment variable SMARTSIM_DRAGON_SERVER_PATH and + force its shutdown to bring down the runtime and allow a subsequent + allocation of a new runtime. + """ + if test_launcher != "dragon" or CONFIG.dragon_server_path is None: + return + logger.debug(f"Tearing down Dragon infrastructure, server path: {CONFIG.dragon_server_path}") + dragon_connector = DragonConnector() + dragon_connector.ensure_connected() + dragon_connector.cleanup() + + @pytest.fixture def config() -> Config: return CONFIG @@ -856,3 +874,151 @@ def num_calls(self) -> int: @property def details(self) -> t.List[t.Tuple[t.Tuple[t.Any, ...], t.Dict[str, t.Any]]]: return self._details + +## Reuse feature store across tests + +feature_store_registry: t.DefaultDict[str, t.Optional[FeatureStore]] = defaultdict(lambda: None) + +@pytest.fixture(scope="function") +def local_experiment(test_dir: str) -> smartsim.Experiment: + """Create a default experiment that uses the requested launcher""" + name = pathlib.Path(test_dir).stem + return smartsim.Experiment(name, exp_path=test_dir, launcher="local") + +@pytest.fixture(scope="function") +def wlm_experiment(test_dir: str, wlmutils: WLMUtils) -> smartsim.Experiment: + """Create a default experiment that uses the requested launcher""" + name = pathlib.Path(test_dir).stem + return smartsim.Experiment( + name, + exp_path=test_dir, + launcher=wlmutils.get_test_launcher() + ) + +def _cleanup_fs(name: str) -> None: + global feature_store_registry + fs = feature_store_registry[name] + if fs and fs.is_active(): + exp = Experiment("cleanup") + try: + fs = exp.reconnect_feature_store(fs.checkpoint_file) + exp.stop(fs) + except: + pass + +@dataclass +class DBConfiguration: + name: str + launcher: str + num_nodes: int + interface: t.Union[str,t.List[str]] + hostlist: t.Optional[t.List[str]] + port: int + +@dataclass +class 
PrepareFeatureStoreOutput: + featurestore: t.Optional[FeatureStore] # The actual feature store object + new_fs: bool # True if a new feature store was created when calling prepare_fs + +# Reuse feature stores +@pytest.fixture(scope="session") +def local_fs() -> t.Generator[DBConfiguration, None, None]: + name = "local_fs_fixture" + config = DBConfiguration( + name, + "local", + 1, + "lo", + None, + _find_free_port(tuple(reversed(test_ports))), + ) + yield config + _cleanup_fs(name) + +@pytest.fixture(scope="session") +def single_fs(wlmutils: WLMUtils) -> t.Generator[DBConfiguration, None, None]: + hostlist = wlmutils.get_test_hostlist() + hostlist = hostlist[-1:] if hostlist is not None else None + name = "single_fs_fixture" + config = DBConfiguration( + name, + wlmutils.get_test_launcher(), + 1, + wlmutils.get_test_interface(), + hostlist, + _find_free_port(tuple(reversed(test_ports))) + ) + yield config + _cleanup_fs(name) + + +@pytest.fixture(scope="session") +def clustered_fs(wlmutils: WLMUtils) -> t.Generator[DBConfiguration, None, None]: + hostlist = wlmutils.get_test_hostlist() + hostlist = hostlist[-4:-1] if hostlist is not None else None + name = "clustered_fs_fixture" + config = DBConfiguration( + name, + wlmutils.get_test_launcher(), + 3, + wlmutils.get_test_interface(), + hostlist, + _find_free_port(tuple(reversed(test_ports))), + ) + yield config + _cleanup_fs(name) + + +@pytest.fixture +def register_new_fs() -> t.Callable[[DBConfiguration], FeatureStore]: + def _register_new_fs( + config: DBConfiguration + ) -> FeatureStore: + exp_path = pathlib.Path(test_output_root, config.name) + exp_path.mkdir(exist_ok=True) + exp = Experiment( + config.name, + exp_path=str(exp_path), + launcher=config.launcher, + ) + feature_store = exp.create_feature_store( + port=config.port, + batch=False, + interface=config.interface, + hosts=config.hostlist, + fs_nodes=config.num_nodes + ) + exp.generate(feature_store, overwrite=True) + exp.start(feature_store) + global 
feature_store_registry + feature_store_registry[config.name] = feature_store + return feature_store + return _register_new_fs + + +@pytest.fixture(scope="function") +def prepare_fs( + register_new_fs: t.Callable[ + [DBConfiguration], + FeatureStore + ] +) -> t.Callable[ + [DBConfiguration], + PrepareFeatureStoreOutput +]: + def _prepare_fs(fs_config: DBConfiguration) -> PrepareFeatureStoreOutput: + global feature_store_registry + fs = feature_store_registry[fs_config.name] + + new_fs = False + fs_up = False + + if fs: + fs_up = fs.is_active() + + if not fs_up or fs is None: + fs = register_new_fs(fs_config) + new_fs = True + + return PrepareFeatureStoreOutput(fs, new_fs) + return _prepare_fs diff --git a/doc/_static/version_names.json b/doc/_static/version_names.json index bbe3b332d..bc095f84a 100644 --- a/doc/_static/version_names.json +++ b/doc/_static/version_names.json @@ -1,7 +1,8 @@ { "version_names":[ "develop (unstable)", - "0.6.2 (stable)", + "0.7.0 (stable)", + "0.6.2", "0.6.1", "0.6.0", "0.5.1", @@ -14,6 +15,7 @@ "version_urls": [ "https://www.craylabs.org/develop/overview.html", "https://www.craylabs.org/docs/overview.html", + "https://www.craylabs.org/docs/versions/0.6.2/overview.html", "https://www.craylabs.org/docs/versions/0.6.1/overview.html", "https://www.craylabs.org/docs/versions/0.6.0/overview.html", "https://www.craylabs.org/docs/versions/0.5.1/overview.html", diff --git a/doc/api/smartsim_api.rst b/doc/api/smartsim_api.rst index bb6a02b8e..d9615e04c 100644 --- a/doc/api/smartsim_api.rst +++ b/doc/api/smartsim_api.rst @@ -25,6 +25,7 @@ Experiment Experiment.finished Experiment.get_status Experiment.reconnect_orchestrator + Experiment.preview Experiment.summary Experiment.telemetry @@ -59,6 +60,7 @@ Types of Settings: MpiexecSettings OrterunSettings JsrunSettings + DragonRunSettings SbatchSettings QsubBatchSettings BsubBatchSettings @@ -162,6 +164,28 @@ and within batch launches (e.g., ``QsubBatchSettings``) :members: +.. 
_dragonsettings_api: + +DragonRunSettings +----------------- + +``DragonRunSettings`` can be used on systems that support Slurm or +PBS, if Dragon is available in the Python environment (see `_dragon_install` +for instructions on how to install it through ``smart``). + +``DragonRunSettings`` can be used in interactive sessions (on allocation) +and within batch launches (i.e. ``SbatchSettings`` or ``QsubBatchSettings``, +for Slurm and PBS sessions, respectively). + +.. autosummary:: + DragonRunSettings.set_nodes + DragonRunSettings.set_tasks_per_node + +.. autoclass:: DragonRunSettings + :inherited-members: + :undoc-members: + :members: + .. _jsrun_api: @@ -409,6 +433,8 @@ Orchestrator Orchestrator.set_max_message_size Orchestrator.set_db_conf Orchestrator.telemetry + Orchestrator.checkpoint_file + Orchestrator.batch Orchestrator ------------ diff --git a/doc/changelog.md index d95670f2c..73ea36511 100644 --- a/doc/changelog.md +++ b/doc/changelog.md @@ -9,12 +9,18 @@ Jump to: ## SmartSim -### Development branch +### 0.7.0 -To be released at some future point in time +Released on 14 May, 2024 Description +- Improve Dragon server shutdown +- Add dragon runtime installer +- Add launcher based on Dragon +- Reuse Orchestrators within the testing suite to improve performance. +- Fix building of documentation +- Preview entities on experiment before start - Update authentication in release workflow - Auto-generate type-hints into documentation - Auto-post release PR to develop @@ -58,6 +64,25 @@ Description Detailed Notes +- The Dragon server will now terminate any process which is still running + when a request for an immediate shutdown is sent. ([SmartSim-PR582](https://github.com/CrayLabs/SmartSim/pull/582)) +- Add `--dragon` option to `smart build`. Install appropriate Dragon + runtime from Dragon GitHub release assets.
+ ([SmartSim-PR580](https://github.com/CrayLabs/SmartSim/pull/580)) +- Add new launcher, based on [Dragon](https://dragonhpc.github.io/dragon/doc/_build/html/index.html). + The new launcher is compatible with the Slurm and PBS schedulers and can + be selected by specifying ``launcher="dragon"`` when creating an `Experiment`, + or by using ``DragonRunSettings`` to launch a job. The Dragon launcher + is at an early stage of development: early adopters are referred to the + dedicated documentation section to learn more about it. ([SmartSim-PR580](https://github.com/CrayLabs/SmartSim/pull/580)) +- Tests may now request a given configuration and will reconnect to + the existing orchestrator instead of building up and tearing down + a new one each test. + ([SmartSim-PR567](https://github.com/CrayLabs/SmartSim/pull/567)) +- Manually ensure that typing_extensions==4.6.1 in Dockerfile used to build + docs. This fixes the deploy_dev_docs Github action ([SmartSim-PR564](https://github.com/CrayLabs/SmartSim/pull/564)) +- Added preview functionality to Experiment, including preview of all entities, active infrastructure and + client configuration. ([SmartSim-PR525](https://github.com/CrayLabs/SmartSim/pull/525)) - Replace the developer created token with the GH_TOKEN environment variable. ([SmartSim-PR570](https://github.com/CrayLabs/SmartSim/pull/570)) - Add extension to auto-generate function type-hints into documentation. @@ -102,7 +127,7 @@ Detailed Notes Torch will unconditionally try to link in this library, however fails because the linking flags are incorrect. ([SmartSim-PR538](https://github.com/CrayLabs/SmartSim/pull/538)) -- Change type_extension and pydantic versions in readthedocs +- Change typing\_extensions and pydantic versions in readthedocs environment to enable docs build. 
([SmartSim-PR537](https://github.com/CrayLabs/SmartSim/pull/537)) - Promote devices to a dedicated Enum type throughout the SmartSim diff --git a/doc/conf.py b/doc/conf.py index 71d109b5c..932bce013 100644 --- a/doc/conf.py +++ b/doc/conf.py @@ -29,7 +29,7 @@ import smartsim version = smartsim.__version__ except ImportError: - version = "0.6.2" + version = "0.7.0" # The full version, including alpha/beta/rc tags release = version diff --git a/doc/dragon.rst b/doc/dragon.rst new file mode 100644 index 000000000..0bf6a8ea3 --- /dev/null +++ b/doc/dragon.rst @@ -0,0 +1,169 @@ +****** +Dragon +****** + +======== +Overview +======== + +Dragon is a composable distributed run-time targeting HPC workflows. In SmartSim, +Dragon can be used as a launcher, within a Slurm or PBS allocation or batch job. +The SmartSim team collaborates with the Dragon team to develop an efficient +launcher which will enable fast, interactive, and customized execution of +complex workflows on large HPC systems. As Dragon is scheduler-agnostic, +the same SmartSim script using Dragon as a launcher can be run indifferently +on a Slurm or PBS system. Support for additional schedulers is coming soon. + +.. warning:: + The Dragon launcher is currently in its early development stage and should be treated as + a prototype implementation. Your assistance is invaluable in identifying any issues + encountered during usage and suggesting missing features for implementation. Please + provide feedback in the form of a created issue on the + `SmartSim issues GitHub page `_. + The :ref:`Known Issues section` is also a good starting + point when troubleshooting workflows run using the Dragon launcher. + +===== +Usage +===== +To use Dragon, you need to install it in your current Python environment. This can +be accomplished by providing the ``--dragon`` flag to the ``smart build`` command, as +detailed in the :ref:`Dragon Install `. 
Note that specifying the device +configuration is also required for a proper build. + +After installation, specify Dragon as the launcher when creating an ``Experiment``: + +.. code-block:: python + + exp = Experiment(name="dragon-example", launcher="dragon") + +Dragon introduces its own run settings class, ``DragonRunSettings``, which allows users to +specify nodes and tasks per node for a ``Model``. For instance, continuing from the previous +example: + +.. code-block:: python + + # Because "dragon" was specified as the launcher during Experiment initialization, + # create_run_settings will return a DragonRunSettings object + rs = exp.create_run_settings(exe="mpi_app", + exe_args=["--option", "value"], + env_vars={"MYVAR": "VALUE"}) + # Above we specify the executable (exe), executable arguments (exe_args) + # and environment variables (env_vars) + + # Sets the number of nodes for this job + rs.set_nodes(4) + # Set the tasks per node for this job + rs.set_tasks_per_node(3) + # Initialize the Model and pass in the DragonRunSettings object + mpi_app = exp.create_model("MPI_APP", run_settings=rs) + # Start the Model + exp.start(mpi_app) + +SmartSim supports ``DragonRunSettings`` with ``Model``, ``Ensemble`` and ``Orchestrator`` entities. +In the next sections, we detail how Dragon is integrated into SmartSim. + +For more information on HPC launchers, visit the :ref:`Run Settings` page. + +================= +The Dragon Server +================= + +Dragon can initiate processes on any available resource within an allocation. To facilitate +this, SmartSim initializes the Dragon infrastructure whenever a ``Model`` is launched and maintains +it until the parent ``Experiment`` concludes. To facilitate interaction with processes managed by +Dragon, SmartSim establishes a command server within the Dragon infrastructure. This server, +known as the `Dragon Server`, is responsible for executing commands to start or stop processes +and to query their status. 
+ +Sharing the Dragon Server across Experiments +============================================ + +Currently, SmartSim supports only one Dragon server per allocation. Consequently, +if multiple Experiments need to run within the same allocation, the Dragon server +must be shared among them. By default, the server starts from a subdirectory +of the ``Experiment`` path, where it creates a configuration file. +To enable server sharing, users can specify a custom path +from which the server should be launched. This can be achieved by setting the +environment variable ``SMARTSIM_DRAGON_SERVER_PATH`` to an existing absolute path. +Each ``Experiment`` will then search for the configuration file in the specified path +and initiate a new server instance only if the file is not found. + +Dragon's High-Speed Transport Agents +==================================== + +On systems equipped with the HPE Slingshot interconnect, Dragon utilizes High-Speed +Transport Agents (HSTA) by default for internal messaging within the infrastructure +launched by SmartSim. On systems without the HPE Slingshot interconnect, +TCP agents are employed. To specify the use of TCP agents, users must set the environment +variable ``SMARTSIM_DRAGON_TRANSPORT`` to ``tcp`` prior to executing the Experiment. +To specify HSTA, ``SMARTSIM_DRAGON_TRANSPORT`` can be set to ``hsta`` or left unset. + +============= +Communication +============= + +SmartSim and the Dragon Server communicate using `ZeroMQ `_. + +Similar to other communication protocols, defining timeouts for send and receive operations +is crucial in SmartSim. SmartSim configures default timeouts that have been tested on various +systems, such as Polaris, Perlmutter, and other HPE Cray EX and Apollo systems. +However, if you encounter failed communication attempts, adjusting the timeouts may +be necessary. 
You can adjust these timeouts by setting the corresponding environment variables: + +- **Server Start-up Timeout**: This timeout specifies the duration the SmartSim ``Experiment`` + waits when the server is initially started. It must accommodate the time required for + Dragon to set up the infrastructure, which varies based on the system's workload manager + response time. The default timeout is `"300000"` milliseconds (i.e., five minutes), and you can override + it using the ``SMARTSIM_DRAGON_STARTUP_TIMEOUT`` environment variable. + +- **Server Send and Receive Timeout**: This timeout dictates how long SmartSim and the Dragon + server wait to send or receive a message. The default timeout is `"30000"` milliseconds (i.e., 30 seconds), + and you can modify it using the ``SMARTSIM_DRAGON_TIMEOUT`` environment variable. + +Setting any timeout to "-1" will result in an infinite waiting time, causing the execution to +block until the communication is completed, potentially hanging indefinitely if issues occur. + +It's important to note that all communications are secured with `elliptic curve cryptography `_. +SmartSim generates the necessary key-pairs and stores them in the user's home directory by +default. However, you can specify an alternative absolute path using the ``SMARTSIM_KEY_PATH`` +environment variable. + +.. _dragon_known_issues: + +============ +Known issues +============ + +As previously noted, the integration of SmartSim with Dragon is still in its early +development stage, and there are known issues that may result in unexpected behavior +during runs: + +- **Incomplete cleanup of Dragon resources**: When SmartSim exits, it attempts to properly + shut down the Dragon infrastructure to clean up associated resources, such as shared memory + segments, and terminate all processes. 
However, in rare cases, if the execution is + abruptly interrupted (e.g., by terminating SmartSim with ``SIGKILL``), the cleanup process + may be incomplete, leaving processes like the Dragon overlay network active on the node + where SmartSim was executed (which could be a login node, particularly on Slurm systems). + If this occurs, you can use the following command to address the issue: + + .. code-block:: + + smart teardown --dragon + + This command will terminate all Dragon-related processes, release shared memory segments, + but also terminate all Python processes associated with your username. + +- **Dragon server not starting**: This issue may arise due to two main reasons: + + 1. *HSTA not available on the system*: Try setting the environment variable + ``SMARTSIM_DRAGON_TRANSPORT`` to ``tcp``. + 2. *System or Workload Manager too busy*: Attempt to mitigate this by setting the environment + variable ``SMARTSIM_DRAGON_STARTUP_TIMEOUT`` to a larger value or ``"-1"``. + +- **MPI-based applications hanging**: To run MPI-based applications on Dragon, Cray PMI or + Cray PALS must be available on the system. This limitation is currently being addressed. + + +Interested users can learn more about the Dragon project at the external +`Dragon documentation page `_. \ No newline at end of file diff --git a/doc/experiment.rst b/doc/experiment.rst index 73ba08812..716df1228 100644 --- a/doc/experiment.rst +++ b/doc/experiment.rst @@ -52,14 +52,18 @@ SmartSim supports launching AI-enabled workflows on a wide variety of systems, i Linux machine or on HPC machines with a job scheduler (e.g. Slurm, PBS Pro, and LSF). When creating a SmartSim ``Experiment``, the user has the opportunity to specify the `launcher` type or defer to automatic `launcher` selection. `Launcher` selection determines how SmartSim translates entity configurations into system calls to launch, -manage, and monitor. Currently, SmartSim supports 5 `launchers`: +manage, and monitor. 
Currently, SmartSim supports 7 `launcher` options: 1. ``local`` **[default]**: for single-node, workstation, or laptop 2. ``slurm``: for systems using the Slurm scheduler 3. ``pbs``: for systems using the PBS Pro scheduler 4. ``pals``: for systems using the PALS scheduler 5. ``lsf``: for systems using the LSF scheduler -6. ``auto``: have SmartSim auto-detect the launcher to use +6. ``dragon``: if Dragon is installed in the current Python environment, see :ref:`Dragon Install ` +7. ``auto``: have SmartSim auto-detect the launcher to use (will not detect ``dragon``) + +The :ref:`Dragon-based launcher ` can be run on PBS- or Slurm-based systems +(MPI applications are supported only when Cray PMI or Cray PALS are available). If the systems `launcher` cannot be found or no `launcher` argument is provided, the default value of `"local"` will be assigned which will start all ``Experiment`` launched entities on the @@ -126,6 +130,9 @@ the ``Experiment`` post-creation methods. * - ``get_status`` - ``exp.get_status(*args)`` - Retrieve Entity Status + * - ``preview`` + - ``exp.preview(*args, ...)`` + - Preview an Entity .. _orchestrator_exp_docs: @@ -329,6 +336,9 @@ Example *Generating* - the ``Orchestrator`` output directory - the ``Model`` output directory + *Previewing* + - the ``Orchestrator`` contents + - the ``Model`` contents *Starting* - an in-memory database (standalone ``Orchestrator``) - an application (``Model``) @@ -354,7 +364,7 @@ Initializing .. literalinclude:: tutorials/doc_examples/experiment_doc_examples/exp.py :language: python :linenos: - :lines: 1-7 + :lines: 1-8 We also initialize a SmartSim :ref:`logger`. We will use the logger to log the ``Experiment`` summary. @@ -369,7 +379,7 @@ Initializing .. literalinclude:: tutorials/doc_examples/experiment_doc_examples/exp.py :language: python :linenos: - :lines: 9-10 + :lines: 10-11 .. compound:: Before invoking the factory method to create a ``Model``, @@ -384,7 +394,7 @@ Initializing .. 
literalinclude:: tutorials/doc_examples/experiment_doc_examples/exp.py :language: python :linenos: - :lines: 12-13 + :lines: 13-14 After creating the ``RunSettings`` object, initialize the ``Model`` object by passing the `name` and `settings` to ``create_model``. @@ -392,7 +402,7 @@ Initializing .. literalinclude:: tutorials/doc_examples/experiment_doc_examples/exp.py :language: python :linenos: - :lines: 14-15 + :lines: 15-16 Generating ========== @@ -405,7 +415,7 @@ Generating .. literalinclude:: tutorials/doc_examples/experiment_doc_examples/exp.py :language: python :linenos: - :lines: 17-18 + :lines: 18-19 `Overwrite=True` instructs SmartSim to overwrite entity contents if files and subdirectories already exist within the ``Experiment`` directory. @@ -418,6 +428,73 @@ Generating The ``Experiment.generate`` call places the `.err` and `.out` log files in the entity subdirectories within the main ``Experiment`` directory. +Previewing +========== +.. compound:: + Optionally, users can preview an ``Experiment`` entity. The ``Experiment.preview`` method displays the entity summaries during runtime + to offer additional insight into the launch details. Any instance of a ``Model``, ``Ensemble``, or ``Orchestrator`` created by the + ``Experiment`` can be passed as an argument to the preview method. Additionally, users may specify the name of a file to write preview data to + via the ``output_filename`` argument, as well as the text format through the ``output_format`` argument. Users can also specify how verbose + the preview is via the ``verbosity_level`` argument. + + The following options are available when configuring preview: + + * `verbosity_level="info"` instructs preview to display user-defined fields and entities. + * `verbosity_level="debug"` instructs preview to display user-defined field and entities and auto-generated fields. 
+ * `verbosity_level="developer"` instructs preview to display user-defined field and entities, auto-generated fields, and run commands. + * `output_format="plain_text"` sets the output format. The only accepted output format is 'plain_text'. + * `output_filename="test_name.txt"` specifies name of file and extension to write preview data to. If no output filename is set, the preview will be output to stdout. + + In the example below, we preview the ``Orchestrator`` and ``Model`` entities by passing their instances to ``Experiment.preview``: + + .. literalinclude:: tutorials/doc_examples/experiment_doc_examples/exp.py + :language: python + :linenos: + :lines: 21-22 + +When executed, the preview logs the following in stdout: + +:: + + === Experiment Overview === + + Experiment Name: example-experiment + Experiment Path: absolute/path/to/SmartSim/example-experiment + Launcher: local + + === Entity Preview === + + == Orchestrators == + + = Database Identifier: orchestrator = + Path: absolute/path/to/SmartSim/example-experiment/orchestrator + Shards: 1 + TCP/IP Port(s): + 6379 + Network Interface: ib0 + Type: redis + Executable: absolute/path/to/SmartSim/smartsim/_core/bin/redis-server + + == Models == + + = Model Name: hello_world = + Path: absolute/path/to/SmartSim/example-experiment/hello_world + Executable: /bin/echo + Executable Arguments: + Hello + World + Client Configuration: + Database Identifier: orchestrator + Database Backend: redis + TCP/IP Port(s): + 6379 + Type: Standalone + Outgoing Key Collision Prevention (Key Prefixing): + Tensors: Off + Datasets: Off + ML Models/Torch Scripts: Off + Aggregation Lists: Off + Starting ======== .. compound:: @@ -428,7 +505,7 @@ Starting .. literalinclude:: tutorials/doc_examples/experiment_doc_examples/exp.py :language: python :linenos: - :lines: 20-21 + :lines: 24-25 Stopping ======== @@ -439,7 +516,7 @@ Stopping .. 
literalinclude:: tutorials/doc_examples/experiment_doc_examples/exp.py :language: python :linenos: - :lines: 23-26 + :lines: 27-28 Notice that we use the ``Experiment.summary`` function to print the summary of the workflow. @@ -454,4 +531,4 @@ When you run the experiment, the following output will appear:: .. note:: Failure to tear down the ``Orchestrator`` at the end of an ``Experiment`` may lead to ``Orchestrator`` launch failures if another ``Experiment`` is - started on the same node. \ No newline at end of file + started on the same node. diff --git a/doc/index.rst b/doc/index.rst index 7e7d9c2d6..4c64712b2 100644 --- a/doc/index.rst +++ b/doc/index.rst @@ -37,6 +37,7 @@ orchestrator ss_logger ml_features + dragon api/smartsim_api .. toctree:: diff --git a/doc/installation_instructions/basic.rst b/doc/installation_instructions/basic.rst index 905519f6f..02c17e1fd 100644 --- a/doc/installation_instructions/basic.rst +++ b/doc/installation_instructions/basic.rst @@ -237,6 +237,28 @@ to building SmartSim with GPU support is to specify a different ``device`` backends look for the CUDA Toolkit and cuDNN libraries. Please see the :ref:`Platform Installation Section ` section for guidance. + +.. _dragon_install: + +Dragon Install +-------------- + +`Dragon `_ is +an HPC-native library for distributed computing. SmartSim can use Dragon as a +launcher on systems with Slurm or PBS as schedulers. To install the correct +version of Dragon, you can add the ``--dragon`` option to ``smart build``. +For example, to install dragon alongside the RedisAI CPU backends, you can run + +.. code-block:: bash + + # run one of the following + smart build --device cpu --dragon # install Dragon, PT and TF for cpu + smart build --device cpu --onnx --dragon # install Dragon and all backends (PT, TF, ONNX) on cpu + +.. note:: + Dragon is only supported on Linux systems. For further information, you + can read :ref:`the dedicated documentation page `. 
+ ========== SmartRedis ========== @@ -300,7 +322,7 @@ source remains at the site of the clone instead of in site-packages. pip install -e .[dev,ml] # for bash users pip install -e .\[dev,ml\] # for zsh users -Use the now installed ``smart`` cli to install the machine learning runtimes. +Use the now installed ``smart`` cli to install the machine learning runtimes and dragon. .. tabs:: @@ -309,8 +331,8 @@ Use the now installed ``smart`` cli to install the machine learning runtimes. .. code-block:: bash # run one of the following - smart build --device cpu --onnx # install with cpu-only support - smart build --device gpu --onnx # install with both cpu and gpu support + smart build --device cpu --onnx --dragon # install with cpu-only support + smart build --device gpu --onnx --dragon # install with both cpu and gpu support .. tab:: MacOS (Intel x64) diff --git a/doc/installation_instructions/platform/olcf-summit.rst b/doc/installation_instructions/platform/olcf-summit.rst index 6268584cc..236d15054 100644 --- a/doc/installation_instructions/platform/olcf-summit.rst +++ b/doc/installation_instructions/platform/olcf-summit.rst @@ -19,7 +19,7 @@ into problems. .. code-block:: bash # setup Python and build environment - export ENV_NAME=smartsim-0.6.2 + export ENV_NAME=smartsim-0.7.0 git clone https://github.com/CrayLabs/SmartRedis.git smartredis git clone https://github.com/CrayLabs/SmartSim.git smartsim conda config --prepend channels https://ftp.osuosl.org/pub/open-ce/1.6.1/ diff --git a/doc/run_settings.rst b/doc/run_settings.rst index 449b61ea4..ed12df8cb 100644 --- a/doc/run_settings.rst +++ b/doc/run_settings.rst @@ -176,6 +176,13 @@ for each job scheduler. Users may replace `mpirun` with `mpiexec` or `orterun`. + + .. note:: + SmartSim will look for an allocation by accessing the associated WLM job ID environment variable. If an allocation + is present, the entity will be launched on the reserved compute resources. 
A user may also specify the allocation ID + when initializing a run settings object via the `alloc` argument. If an allocation is specified, the entity receiving + these run parameters will launch on that allocation. + .. group-tab:: PBS Pro The PBS Pro `launcher` supports the :ref:`AprunSettings API ` as well as the :ref:`MpirunSettings API `, :ref:`MpiexecSettings API ` and :ref:`OrterunSettings API ` that each can be used to run executables @@ -304,8 +311,24 @@ for each job scheduler. Users may replace `mpirun` with `mpiexec` or `orterun`. -.. note:: - SmartSim will look for an allocation by accessing the associated WLM job ID environment variable. If an allocation - is present, the entity will be launched on the reserved compute resources. A user may also specify the allocation ID - when initializing a run settings object via the `alloc` argument. If an allocation is specified, the entity receiving - these run parameters will launch on that allocation. \ No newline at end of file + .. group-tab:: Dragon + The Dragon `launcher` does not need any launch binary. Below we step through initializing a ``DragonRunSettings`` instance on a Slurm- + or PBS-based machine. + + **DragonRunSettings** + + Run a job with the `dragon` launcher. + + .. code-block:: python + + from smartsim import Experiment + + # Initialize the experiment and provide launcher dragon + exp = Experiment("name-of-experiment", launcher="dragon") + + # Initialize a DragonRunSettings object + run_settings = exp.create_run_settings(exe="echo", exe_args="Hello World") + # Set the number of nodes for this job + run_settings.set_nodes(4) + # Set the number of tasks per node for this job + run_settings.set_tasks_per_node(10) diff --git a/doc/testing.rst b/doc/testing.rst index ccb2db3c2..08cce5d36 100644 --- a/doc/testing.rst +++ b/doc/testing.rst @@ -66,20 +66,20 @@ of the tests located within the ``on_wlm`` directory. 
To run the ``on_wlm`` test suite, users will have to be on a system with one of the supported workload managers. Additionally, users will -need to obtain an allocation of **at least 4 nodes**. +need to obtain an allocation of **at least 8 nodes**. Examples of how to obtain allocations on systems with the launchers: .. code:: bash # for slurm (with srun) - salloc -N 4 -A account --exclusive -t 00:10:00 + salloc -N 8 -A account --exclusive -t 00:10:00 # for PBSPro (with aprun) - qsub -l select=4 -l place=scatter -l walltime=00:10:00 -q queue + qsub -l select=8 -l place=scatter -l walltime=00:10:00 -q queue # for LSF (with jsrun) - bsub -Is -W 00:30 -nnodes 4 -P project $SHELL + bsub -Is -W 00:30 -nnodes 8 -P project $SHELL Values for queue, account, or project should be substituted appropriately. @@ -119,7 +119,7 @@ A full example on an internal SLURM system .. code:: bash - salloc -N 4 -A account --exclusive -t 03:00:00 + salloc -N 8 -A account --exclusive -t 03:00:00 export SMARTSIM_TEST_LAUNCHER=slurm export SMARTSIM_TEST_INTERFACE=ipogif0 export SMARTSIM_TEST_DEVICE=gpu diff --git a/doc/tutorials/doc_examples/experiment_doc_examples/exp.py b/doc/tutorials/doc_examples/experiment_doc_examples/exp.py index 738b767d3..3d5fb4c17 100644 --- a/doc/tutorials/doc_examples/experiment_doc_examples/exp.py +++ b/doc/tutorials/doc_examples/experiment_doc_examples/exp.py @@ -1,4 +1,5 @@ from smartsim import Experiment +from smartsim._core.control.previewrenderer import Verbosity from smartsim.log import get_logger # Initialize an Experiment @@ -17,10 +18,13 @@ # Generate the output directory exp.generate(standalone_feature_store, model, overwrite=True) -# Launch the Feature Store then Model instance +# Preview the experiment +exp.preview(standalone_feature_store, model, verbosity_level=Verbosity.DEBUG) + +# Launch the Orchestrator then Model instance exp.start(standalone_feature_store, model) # Clobber the Feature Store exp.stop(standalone_feature_store) # Log the summary of 
the Experiment -smartsim_logger.info(exp.summary()) \ No newline at end of file +smartsim_logger.info(exp.summary()) diff --git a/docker-compose.yml b/docker-compose.yml index 466ee68c9..f5be4e338 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -18,7 +18,7 @@ services: - "8888:8888" tutorials-prod: - image: smartsim-tutorials:v0.6.2 + image: smartsim-tutorials:v0.7.0 build: context: . dockerfile: ./docker/prod/Dockerfile diff --git a/docker/docs/dev/Dockerfile b/docker/docs/dev/Dockerfile index 49bbb833c..e9db9c342 100644 --- a/docker/docs/dev/Dockerfile +++ b/docker/docs/dev/Dockerfile @@ -58,4 +58,7 @@ RUN git clone https://github.com/CrayLabs/SmartDashboard.git --branch develop -- RUN python -m pip install -r doc/requirements-doc.txt \ && NO_CHECKS=1 SMARTSIM_SUFFIX=dev python -m pip install . +# Note this is needed to ensure that the Sphinx builds. Can be removed with newer Tensorflow +RUN python -m pip install typing_extensions==4.6.1 + RUN make docs diff --git a/docker/prod/Dockerfile b/docker/prod/Dockerfile index c4e86d603..325ace923 100644 --- a/docker/prod/Dockerfile +++ b/docker/prod/Dockerfile @@ -46,7 +46,7 @@ COPY --chown=craylabs:root ./tutorials/ /home/craylabs/tutorials/ USER craylabs RUN export PATH=/home/craylabs/.local/bin:$PATH && \ echo "export PATH=/home/craylabs/.local/bin:$PATH" >> /home/craylabs/.bashrc && \ - python -m pip install smartsim[ml]==0.6.2 jupyter jupyterlab matplotlib && \ + python -m pip install smartsim[ml]==0.7.0 jupyter jupyterlab matplotlib && \ smart build --device cpu -v && \ chown craylabs:root -R /home/craylabs/.local && \ rm -rf ~/.cache/pip diff --git a/pyproject.toml b/pyproject.toml index fe87141de..91164a68b 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -45,8 +45,15 @@ exclude = ''' | build | dist | setup.py + | .*\.py ) ''' +force-exclude = ''' +( + .*\.dragon/* +) +''' + [tool.pytest.ini_options] log_cli = true @@ -61,6 +68,7 @@ markers = [ # supress circular import warning profile = "black" 
skip = ["tests/test_configs/circular_config"] +skip_glob="smartsim/_core/.dragon/*" [tool.coverage.run] source = ["smartsim"] @@ -78,7 +86,7 @@ namespace_packages = true files = [ "smartsim" ] -plugins = [] +plugins = ["pydantic.mypy"] ignore_errors = false # Dynamic typing @@ -124,6 +132,7 @@ module = [ "torch", "smartsim.ml.torch.*", # must solve/ignore inheritance issues "watchdog", + "dragon.*", ] ignore_missing_imports = true ignore_errors = true diff --git a/setup.cfg b/setup.cfg index ba6606f7f..742386d2c 100644 --- a/setup.cfg +++ b/setup.cfg @@ -62,6 +62,7 @@ include = smartsim* exclude = .third-party + .dragon tests doc smartredis diff --git a/setup.py b/setup.py index f377051b1..3928d72a0 100644 --- a/setup.py +++ b/setup.py @@ -139,7 +139,6 @@ def finalize_options(self): class SmartSimBuild(build_py): - def run(self): feature_store_builder = builder.FeatureStoreBuilder( build_env(), build_env.MALLOC, build_env.JOBS @@ -174,7 +173,11 @@ def has_ext_modules(_placeholder): "tqdm>=4.50.2", "filelock>=3.4.2", "protobuf~=3.20", + "jinja2>=3.1.2", "watchdog>=4.0.0", + "pydantic==1.10.14", + "pyzmq>=25.1.2", + "pygithub>=2.3.0", ] # Add SmartRedis at specific version diff --git a/smartsim/_core/__init__.py b/smartsim/_core/__init__.py index bbc108f48..490078770 100644 --- a/smartsim/_core/__init__.py +++ b/smartsim/_core/__init__.py @@ -24,7 +24,7 @@ # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
-from .control import Controller, Manifest +from .control import Controller, Manifest, previewrenderer from .generation import Generator -__all__ = ["Controller", "Manifest", "Generator"] +__all__ = ["Controller", "Manifest", "Generator", "previewrenderer"] diff --git a/smartsim/_core/_cli/build.py b/smartsim/_core/_cli/build.py index 0e53c7181..ea5f2177c 100644 --- a/smartsim/_core/_cli/build.py +++ b/smartsim/_core/_cli/build.py @@ -33,6 +33,7 @@ from tabulate import tabulate +from smartsim._core._cli.scripts.dragon_install import install_dragon from smartsim._core._cli.utils import SMART_LOGGER_FORMAT, color_bool, pip from smartsim._core._install import builder from smartsim._core._install.buildenv import ( @@ -358,12 +359,27 @@ def _format_incompatible_python_env_message( ) +def _configure_keydb_build(versions: Versioner) -> None: + """Configure the redis versions to be used during the build operation""" + versions.REDIS = Version_("6.2.0") + versions.REDIS_URL = "https://github.com/EQ-Alpha/KeyDB" + versions.REDIS_BRANCH = "v6.2.0" + + CONFIG.conf_path = Path(CONFIG.core_path, "config", "keydb.conf") + if not CONFIG.conf_path.resolve().is_file(): + raise SSConfigError( + "Database configuration file at REDIS_CONF could not be found" + ) + + +# pylint: disable-next=too-many-statements def execute( args: argparse.Namespace, _unparsed_args: t.Optional[t.List[str]] = None, / ) -> int: verbose = args.v keydb = args.keydb device = Device(args.device.lower()) + is_dragon_requested = args.dragon # torch and tf build by default pt = not args.no_pt # pylint: disable=invalid-name tf = not args.no_tf # pylint: disable=invalid-name @@ -375,7 +391,7 @@ def execute( logger.info("Checking requested versions...") versions = Versioner() - logger.info("Checking for build tools...") + logger.debug("Checking for build tools...") if verbose: logger.info("Build Environment:") @@ -384,14 +400,7 @@ def execute( print(tabulate(env, headers=env_vars, tablefmt="github"), "\n") if keydb: 
- versions.REDIS = Version_("6.2.0") - versions.REDIS_URL = "https://github.com/EQ-Alpha/KeyDB" - versions.REDIS_BRANCH = "v6.2.0" - CONFIG.conf_path = Path(CONFIG.core_path, "config", "keydb.conf") - if not CONFIG.conf_path.resolve().is_file(): - raise SSConfigError( - "Database configuration file at REDIS_CONF could not be found" - ) + _configure_keydb_build(versions) if verbose: fs_name: DbEngine = "KEYDB" if keydb else "REDIS" @@ -400,6 +409,17 @@ def execute( version_names = list(vers.keys()) print(tabulate(vers, headers=version_names, tablefmt="github"), "\n") + if is_dragon_requested: + install_to = CONFIG.core_path / ".dragon" + return_code = install_dragon(install_to) + + if return_code == 0: + logger.info("Dragon installation complete") + elif return_code == 1: + logger.info("Dragon installation not supported on platform") + else: + logger.warning("Dragon installation failed") + try: if not args.only_python_packages: # REDIS/KeyDB @@ -457,6 +477,12 @@ def configure_parser(parser: argparse.ArgumentParser) -> None: choices=[device.value for device in Device], help="Device to build ML runtimes for", ) + parser.add_argument( + "--dragon", + action="store_true", + default=False, + help="Install the dragon runtime", + ) parser.add_argument( "--only_python_packages", action="store_true", @@ -499,7 +525,6 @@ def configure_parser(parser: argparse.ArgumentParser) -> None: default=False, help="Build KeyDB instead of Redis", ) - parser.add_argument( "--no_torch_with_mkl", dest="torch_with_mkl", diff --git a/smartsim/_core/_cli/cli.py b/smartsim/_core/_cli/cli.py index 3cad573d1..3d5c6e066 100644 --- a/smartsim/_core/_cli/cli.py +++ b/smartsim/_core/_cli/cli.py @@ -39,6 +39,8 @@ from smartsim._core._cli.info import execute as info_execute from smartsim._core._cli.plugin import plugins from smartsim._core._cli.site import execute as site_execute +from smartsim._core._cli.teardown import configure_parser as teardown_parser +from smartsim._core._cli.teardown import 
execute as teardown_execute from smartsim._core._cli.utils import MenuItemConfig from smartsim._core._cli.validate import configure_parser as validate_parser from smartsim._core._cli.validate import execute as validate_execute @@ -106,7 +108,7 @@ def default_cli() -> SmartCli: menu = [ MenuItemConfig( "build", - "Build SmartSim dependencies (Redis, RedisAI, ML runtimes)", + "Build SmartSim dependencies (Redis, RedisAI, Dragon, ML runtimes)", build_execute, build_parser, ), @@ -142,6 +144,12 @@ def default_cli() -> SmartCli: "Display information about the current SmartSim installation", info_execute, ), + MenuItemConfig( + "teardown", + "Clean up allocated resources after an experiment terminates", + teardown_execute, + teardown_parser, + ), ] return SmartCli(menu) diff --git a/smartsim/_core/_cli/scripts/dragon_install.py b/smartsim/_core/_cli/scripts/dragon_install.py new file mode 100644 index 000000000..466c390bd --- /dev/null +++ b/smartsim/_core/_cli/scripts/dragon_install.py @@ -0,0 +1,232 @@ +import os +import pathlib +import sys +import typing as t + +from github import Github +from github.GitReleaseAsset import GitReleaseAsset + +from smartsim._core._cli.utils import pip +from smartsim._core._install.builder import WebTGZ +from smartsim._core.config import CONFIG +from smartsim._core.utils.helpers import check_platform, is_crayex_platform +from smartsim.error.errors import SmartSimCLIActionCancelled +from smartsim.log import get_logger + +logger = get_logger(__name__) + + +def create_dotenv(dragon_root_dir: pathlib.Path) -> None: + """Create a .env file with required environment variables for the Dragon runtime""" + dragon_root = str(dragon_root_dir) + dragon_inc_dir = str(dragon_root_dir / "include") + dragon_lib_dir = str(dragon_root_dir / "lib") + dragon_bin_dir = str(dragon_root_dir / "bin") + + dragon_vars = { + "DRAGON_BASE_DIR": dragon_root, + "DRAGON_ROOT_DIR": dragon_root, # note: same as base_dir + "DRAGON_INCLUDE_DIR": dragon_inc_dir, + 
"DRAGON_LIB_DIR": dragon_lib_dir, + "DRAGON_VERSION": dragon_pin(), + "PATH": dragon_bin_dir, + "LD_LIBRARY_PATH": dragon_lib_dir, + } + + lines = [f"{k}={v}\n" for k, v in dragon_vars.items()] + + if not CONFIG.dragon_dotenv.parent.exists(): + CONFIG.dragon_dotenv.parent.mkdir(parents=True) + + with CONFIG.dragon_dotenv.open("w", encoding="utf-8") as dotenv: + dotenv.writelines(lines) + + +def python_version() -> str: + """Return a formatted string used to filter release assets + for the current python version""" + return f"py{sys.version_info.major}.{sys.version_info.minor}" + + +def dragon_pin() -> str: + """Return a string indicating the pinned major/minor version of the dragon + package to install""" + return "0.9" + + +def _platform_filter(asset_name: str) -> bool: + """Return True if the asset name matches naming standard for current + platform (Cray or non-Cray). Otherwise, returns False. + + :param asset_name: A value to inspect for keywords indicating a Cray EX asset + :returns: True if supplied value is correct for current platform""" + key = "crayex" + is_cray = key in asset_name.lower() + if is_crayex_platform(): + return is_cray + return not is_cray + + +def _version_filter(asset_name: str) -> bool: + """Return true if the supplied value contains a python version match + + :param asset_name: A value to inspect for keywords indicating a python version + :returns: True if supplied value is correct for current python version""" + return python_version() in asset_name + + +def _pin_filter(asset_name: str) -> bool: + """Return true if the supplied value contains a dragon version pin match + + :param asset_name: A value to inspect for keywords indicating a dragon version + :returns: True if supplied value is correct for current dragon version""" + return f"dragon-{dragon_pin()}" in asset_name + + +def _get_release_assets() -> t.Collection[GitReleaseAsset]: + """Retrieve a collection of available assets for all releases that satisfy + the dragon version pin 
+ + :returns: A collection of release assets""" + git = Github() + + dragon_repo = git.get_repo("DragonHPC/dragon") + + if dragon_repo is None: + raise SmartSimCLIActionCancelled("Unable to locate dragon repo") + + # find any releases matching our pinned version requirement + tags = [tag for tag in dragon_repo.get_tags() if dragon_pin() in tag.name] + # repo.get_latest_release fails if only pre-release results are returned + pin_releases = list(dragon_repo.get_release(tag.name) for tag in tags) + releases = sorted(pin_releases, key=lambda r: r.published_at, reverse=True) + + # take the most recent release for the given pin + assets = releases[0].assets + + return assets + + +def filter_assets(assets: t.Collection[GitReleaseAsset]) -> t.Optional[GitReleaseAsset]: + """Filter the available release assets so that HSTA agents are used + when run on a Cray EX platform + + :param assets: The collection of dragon release assets to filter + :returns: An asset meeting platform & version filtering requirements""" + # Expect cray & non-cray assets that require a filter, e.g. + # 'dragon-0.8-py3.9.4.1-bafaa887f.tar.gz', + # 'dragon-0.8-py3.9.4.1-CRAYEX-ac132fe95.tar.gz' + asset = next( + ( + asset + for asset in assets + if _version_filter(asset.name) + and _platform_filter(asset.name) + and _pin_filter(asset.name) + ), + None, + ) + return asset + + +def retrieve_asset_info() -> GitReleaseAsset: + """Find a release asset that meets all necessary filtering criteria + + :param dragon_pin: identify the dragon version to install (e.g. 
dragon-0.8) + :returns: A GitHub release asset""" + assets = _get_release_assets() + asset = filter_assets(assets) + + platform_result = check_platform() + if not platform_result.is_cray: + logger.warning("Installing Dragon without HSTA support") + for msg in platform_result.failures: + logger.warning(msg) + + if asset is None: + raise SmartSimCLIActionCancelled("No dragon runtime asset available to install") + + logger.debug(f"Retrieved asset metadata: {asset}") + return asset + + +def retrieve_asset(working_dir: pathlib.Path, asset: GitReleaseAsset) -> pathlib.Path: + """Retrieve the physical file associated to a given GitHub release asset + + :param working_dir: location in file system where assets should be written + :param asset: GitHub release asset to retrieve + :returns: path to the downloaded asset""" + if working_dir.exists() and list(working_dir.rglob("*.whl")): + return working_dir + + archive = WebTGZ(asset.browser_download_url) + archive.extract(working_dir) + + logger.debug(f"Retrieved {asset.browser_download_url} to {working_dir}") + return working_dir + + +def install_package(asset_dir: pathlib.Path) -> int: + """Install the package found in `asset_dir` into the current python environment + + :param asset_dir: path to a decompressed archive contents for a release asset""" + wheels = asset_dir.rglob("*.whl") + wheel_path = next(wheels, None) + if not wheel_path: + logger.error(f"No wheel found for package in {asset_dir}") + return 1 + + create_dotenv(wheel_path.parent) + + while wheel_path is not None: + logger.info(f"Installing package: {wheel_path.absolute()}") + + try: + pip("install", "--force-reinstall", str(wheel_path)) + wheel_path = next(wheels, None) + except Exception: + logger.error(f"Unable to install from {asset_dir}") + return 1 + + return 0 + + +def cleanup( + archive_path: t.Optional[pathlib.Path] = None, +) -> None: + """Delete the downloaded asset and any files extracted during installation + + :param archive_path: path to a 
downloaded archive for a release asset""" + if archive_path: + archive_path.unlink(missing_ok=True) + logger.debug(f"Deleted archive: {archive_path}") + + +def install_dragon(extraction_dir: t.Union[str, os.PathLike[str]]) -> int: + """Retrieve a dragon runtime appropriate for the current platform + and install to the current python environment + :param extraction_dir: path for download and extraction of assets + :returns: Integer return code, 0 for success, non-zero on failures""" + if sys.platform == "darwin": + logger.debug(f"Dragon not supported on platform: {sys.platform}") + return 1 + + extraction_dir = pathlib.Path(extraction_dir) + filename: t.Optional[pathlib.Path] = None + asset_dir: t.Optional[pathlib.Path] = None + + try: + asset_info = retrieve_asset_info() + asset_dir = retrieve_asset(extraction_dir, asset_info) + + return install_package(asset_dir) + except Exception as ex: + logger.error("Unable to install dragon runtime", exc_info=ex) + finally: + cleanup(filename) + + return 2 + + +if __name__ == "__main__": + sys.exit(install_dragon(CONFIG.core_path / ".dragon")) diff --git a/smartsim/_core/_cli/teardown.py b/smartsim/_core/_cli/teardown.py new file mode 100644 index 000000000..a3f181145 --- /dev/null +++ b/smartsim/_core/_cli/teardown.py @@ -0,0 +1,74 @@ +# BSD 2-Clause License +# +# Copyright (c) 2021-2024, Hewlett Packard Enterprise +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. 
+# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import argparse +import os +import subprocess +import typing as t + +from smartsim._core.config import CONFIG + + +def configure_parser(parser: argparse.ArgumentParser) -> None: + """Builds the parser for the command""" + parser.add_argument( + "--dragon", + action="store_true", + default=False, + help="Terminate Dragon environment resources if" + "any remain after experiment completion", + ) + + +def _do_dragon_teardown() -> int: + """Run dragon-cleanup script to destroy all remaining dragon resources""" + env = os.environ.copy() + dragon_cleanup = next(CONFIG.core_path.rglob("dragon-cleanup"), None) + if dragon_cleanup is None: + print("dragon-cleanup not found. Skipping cleanup") + return 0 + + # Use popen to avoid `dragon-cleanup` doing a kill on all + # python processes, terminating `smart teardown`, and the + # subprocess handling `dragon-cleanup`. 
Child processes using + # subprocess.run are killed and cleanup is interrupted + with subprocess.Popen( + [str(dragon_cleanup.absolute())], + env=env, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + ) as process: + process.wait() + return process.returncode + + +def execute( + args: argparse.Namespace, _unparsed_args: t.Optional[t.List[str]] = None, / +) -> int: + if args.dragon: + return _do_dragon_teardown() + + return 0 diff --git a/smartsim/_core/_cli/utils.py b/smartsim/_core/_cli/utils.py index af0aba417..6c2a40911 100644 --- a/smartsim/_core/_cli/utils.py +++ b/smartsim/_core/_cli/utils.py @@ -84,6 +84,11 @@ def clean(core_path: Path, _all: bool = False) -> int: if build_temp.is_dir(): shutil.rmtree(build_temp, ignore_errors=True) + dragon_temp = core_path / ".dragon" + if dragon_temp.is_dir(): + shutil.rmtree(dragon_temp, ignore_errors=True) + logger.info("Successfully removed dragon installation") + lib_path = core_path / "lib" if lib_path.is_dir(): # remove RedisAI diff --git a/smartsim/_core/_cli/validate.py b/smartsim/_core/_cli/validate.py index 9ddd559f8..709968c11 100644 --- a/smartsim/_core/_cli/validate.py +++ b/smartsim/_core/_cli/validate.py @@ -30,7 +30,6 @@ import multiprocessing as mp import os import os.path -import socket import tempfile import typing as t from types import TracebackType @@ -42,6 +41,7 @@ from smartsim._core._cli.utils import SMART_LOGGER_FORMAT from smartsim._core._install.builder import Device from smartsim._core.utils.helpers import installed_redisai_backends +from smartsim._core.utils.network import find_free_port from smartsim.log import get_logger logger = get_logger("Smart", fmt=SMART_LOGGER_FORMAT) @@ -152,8 +152,8 @@ def test_install( ) -> None: exp = Experiment("ValidationExperiment", exp_path=location, launcher="local") exp.telemetry.disable() + port = find_free_port() if port is None else port - port = _find_free_port() if port is None else port with _make_managed_local_feature_store(exp, port) as client: 
logger.info("Verifying Tensor Transfer") client.put_tensor("plain-tensor", np.ones((1, 1, 3, 3))) @@ -206,14 +206,6 @@ def _make_managed_local_feature_store( exp.stop(feature_store) -def _find_free_port() -> int: - """A 'good enough' way to find an open port to bind to""" - with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as sock: - sock.bind(("0.0.0.0", 0)) - _, port = sock.getsockname() - return int(port) - - def _test_tf_install(client: Client, tmp_dir: str, device: Device) -> None: recv_conn, send_conn = mp.Pipe(duplex=False) # Build the model in a subproc so that keras does not hog the gpu diff --git a/smartsim/_core/_install/buildenv.py b/smartsim/_core/_install/buildenv.py index 64d1cc3f4..4835d2cfe 100644 --- a/smartsim/_core/_install/buildenv.py +++ b/smartsim/_core/_install/buildenv.py @@ -270,8 +270,8 @@ class Versioner: PYTHON_MIN = Version_("3.9.0") # Versions - SMARTSIM = Version_(get_env("SMARTSIM_VERSION", "0.6.2")) - SMARTREDIS = Version_(get_env("SMARTREDIS_VERSION", "0.5.2")) + SMARTSIM = Version_(get_env("SMARTSIM_VERSION", "0.7.0")) + SMARTREDIS = Version_(get_env("SMARTREDIS_VERSION", "0.5.3")) SMARTSIM_SUFFIX = get_env("SMARTSIM_SUFFIX", "") # Redis diff --git a/smartsim/_core/_install/builder.py b/smartsim/_core/_install/builder.py index f32802074..cdef30bf9 100644 --- a/smartsim/_core/_install/builder.py +++ b/smartsim/_core/_install/builder.py @@ -825,6 +825,15 @@ def _extract_download(self, download_path: Path, target: _PathLike) -> None: zip_file.extractall(target) +class WebTGZ(_WebTGZ): + def __init__(self, url: str) -> None: + self._url = url + + @property + def url(self) -> str: + return self._url + + @dataclass(frozen=True) class _PTArchive(_WebZip, _RAIBuildDependency): architecture: Architecture diff --git a/smartsim/_core/config/config.py b/smartsim/_core/config/config.py index fc3d41bb0..374457f3a 100644 --- a/smartsim/_core/config/config.py +++ b/smartsim/_core/config/config.py @@ -89,6 +89,7 @@ # - Default: None +# 
pylint: disable-next=too-many-public-methods class Config: def __init__(self) -> None: # SmartSim/smartsim/_core @@ -99,6 +100,7 @@ def __init__(self) -> None: self.lib_path = Path(dependency_path, "lib").resolve() self.bin_path = Path(dependency_path, "bin").resolve() self.conf_path = Path(dependency_path, "config", "redis.conf") + self.conf_dir = Path(self.core_path, "config") @property def redisai(self) -> str: @@ -152,6 +154,30 @@ def database_file_parse_trials(self) -> int: def database_file_parse_interval(self) -> int: return int(os.getenv("SMARTSIM_DB_FILE_PARSE_INTERVAL", "2")) + @property + def dragon_dotenv(self) -> Path: + """Returns the path to a .env file containing dragon environment variables""" + return self.conf_dir / "dragon" / ".env" + + @property + def dragon_server_path(self) -> t.Optional[str]: + return os.getenv( + "SMARTSIM_DRAGON_SERVER_PATH", + os.getenv("SMARTSIM_DRAGON_SERVER_PATH_EXP", None), + ) + + @property + def dragon_server_timeout(self) -> int: + return int(os.getenv("SMARTSIM_DRAGON_TIMEOUT", "30000")) + + @property + def dragon_server_startup_timeout(self) -> int: + return int(os.getenv("SMARTSIM_DRAGON_STARTUP_TIMEOUT", "300000")) + + @property + def dragon_transport(self) -> str: + return os.getenv("SMARTSIM_DRAGON_TRANSPORT", "hsta") + @property def log_level(self) -> str: return os.environ.get("SMARTSIM_LOG_LEVEL", "info") @@ -177,8 +203,14 @@ def test_num_gpus(self) -> int: # pragma: no cover return int(os.environ.get("SMARTSIM_TEST_NUM_GPUS") or 1) @property - def test_port(self) -> int: # pragma: no cover - return int(os.environ.get("SMARTSIM_TEST_PORT", 6780)) + def test_ports(self) -> t.Sequence[int]: # pragma: no cover + min_required_ports = 25 + first_port = int(os.environ.get("SMARTSIM_TEST_PORT", 6780)) + num_ports = max( + int(os.environ.get("SMARTSIM_TEST_NUM_PORTS", min_required_ports)), + min_required_ports, + ) + return range(first_port, first_port + num_ports) @property def test_batch_resources(self) -> 
t.Dict[t.Any, t.Any]: # pragma: no cover @@ -219,6 +251,11 @@ def test_account(self) -> t.Optional[str]: # pragma: no cover # no account by default return os.environ.get("SMARTSIM_TEST_ACCOUNT", None) + @property + def test_mpi(self) -> bool: # pragma: no cover + # By default, test MPI app if it compiles + return int(os.environ.get("SMARTSIM_TEST_MPI", "1")) > 0 + @property def telemetry_frequency(self) -> int: return int(os.environ.get("SMARTSIM_TELEMETRY_FREQUENCY", 5)) @@ -235,6 +272,29 @@ def telemetry_cooldown(self) -> int: def telemetry_subdir(self) -> str: return ".smartsim/telemetry" + @property + def dragon_default_subdir(self) -> str: + return ".smartsim/dragon" + + @property + def dragon_log_filename(self) -> str: + return "dragon_config.log" + + @property + def smartsim_key_path(self) -> str: + """Path to a root directory used for persistence of key files. Default + value `$HOME/.smartsim/keys`. User-overrideable by setting the environment + variable `SMARTSIM_KEY_PATH`. + + :returns: The configured key path. 
+ """ + default_path = Path.home() / ".smartsim" / "keys" + return os.environ.get("SMARTSIM_KEY_PATH", str(default_path)) + + @property + def dragon_pin(self) -> str: + return "0.9" + @lru_cache(maxsize=128, typed=False) def get_config() -> Config: diff --git a/smartsim/_core/control/controller.py b/smartsim/_core/control/controller.py index 155e93f0a..568bb9ec3 100644 --- a/smartsim/_core/control/controller.py +++ b/smartsim/_core/control/controller.py @@ -27,6 +27,7 @@ from __future__ import annotations import itertools +import os import os.path as osp import pathlib import pickle @@ -36,7 +37,6 @@ import threading import time import typing as t -from os import environ from smartredis import Client, ConfigOptions @@ -67,7 +67,13 @@ from ...servertype import CLUSTERED, STANDALONE from ...status import TERMINAL_STATUSES, SmartSimStatus from ..config import CONFIG -from ..launcher import LocalLauncher, LSFLauncher, PBSLauncher, SlurmLauncher +from ..launcher import ( + DragonLauncher, + LocalLauncher, + LSFLauncher, + PBSLauncher, + SlurmLauncher, +) from ..launcher.launcher import Launcher from ..utils import check_cluster_status, create_cluster, serialize from .controller_utils import _AnonymousBatchJob, _look_up_launched_data @@ -118,6 +124,10 @@ def start( The controller will start the job-manager thread upon execution of all jobs. 
""" + # launch a telemetry monitor to track job progress + if CONFIG.telemetry_enabled: + self._start_telemetry_monitor(exp_path) + self._jobs.kill_on_interrupt = kill_on_interrupt # register custom signal handler for ^C (SIGINT) @@ -134,16 +144,17 @@ def start( launched.map(_look_up_launched_data(self._launcher)) ) - # launch a telemetry monitor to track job progress - if CONFIG.telemetry_enabled: - self._start_telemetry_monitor(exp_path) - # block until all non-feature store jobs are complete if block: # poll handles its own keyboard interrupt as # it may be called separately self.poll(5, True, kill_on_interrupt=kill_on_interrupt) + @property + def active_feature_store_jobs(self) -> t.Dict[str, Job]: + """Return active feature store jobs.""" + return {**self._jobs.fs_jobs} + @property def feature_store_active(self) -> bool: with JM_LOCK: @@ -331,6 +342,7 @@ def init_launcher(self, launcher: str) -> None: "pals": PBSLauncher, "lsf": LSFLauncher, "local": LocalLauncher, + "dragon": DragonLauncher, } if launcher is not None: @@ -727,7 +739,7 @@ def _prep_entity_client_env(self, entity: Model) -> None: entity.run_settings.update_env(client_env) - def _save_feature_store(self, featurestore: FeatureStore) -> None: + def _save_feature_store(self, feature_store: FeatureStore) -> None: """Save the FeatureStore object via pickle This function saves the feature store information to a pickle @@ -736,14 +748,27 @@ def _save_feature_store(self, featurestore: FeatureStore) -> None: :param featurestore: FeatureStore configuration to be saved """ - dat_file = "/".join((featurestore.path, "smartsim_db.dat")) - fs_jobs = self._jobs.fs_jobs - feature_store_data = {"fs": featurestore, "fs_jobs": fs_jobs} - steps = [] - for fs_job in fs_jobs.values(): - steps.append(self._launcher.step_mapping[fs_job.name]) - feature_store_data["steps"] = steps - with open(dat_file, "wb") as pickle_file: + + if not feature_store.is_active(): + raise Exception("Feature store is not running") + + # 
Extract only the fs_jobs associated with this particular featurestore + if feature_store.batch: + job_names = [feature_store.name] + else: + job_names = [fsnode.name for fsnode in feature_store.entities] + fs_jobs = { + name: job for name, job in self._jobs.fs_jobs.items() if name in job_names + } + + # Extract the associated steps + steps = [ + self._launcher.step_mapping[fs_job.name] for fs_job in fs_jobs.values() + ] + + feature_store_data = {"fs": feature_store, "fs_jobs": fs_jobs, "steps": steps} + + with open(feature_store.checkpoint_file, "wb") as pickle_file: pickle.dump(feature_store_data, pickle_file) def _feature_store_launch_wait(self, featurestore: FeatureStore) -> None: @@ -774,8 +799,7 @@ def _feature_store_launch_wait(self, featurestore: FeatureStore) -> None: statuses = self.get_entity_list_status(featurestore) if all(stat == SmartSimStatus.STATUS_RUNNING for stat in statuses): ready = True - # TODO remove in favor of by node status check - time.sleep(CONFIG.jm_interval) + # TODO: Add a node status check elif any(stat in TERMINAL_STATUSES for stat in statuses): self.stop_fs(featurestore) msg = "FeatureStore failed during startup" @@ -793,14 +817,14 @@ def _feature_store_launch_wait(self, featurestore: FeatureStore) -> None: # launch explicitly raise - def reload_saved_fs(self, checkpoint_file: str) -> FeatureStore: + def reload_saved_fs( + self, checkpoint_file: t.Union[str, os.PathLike[str]] + ) -> FeatureStore: with JM_LOCK: - if self.feature_store_active: - raise SmartSimError("FeatureStore exists and is active") if not osp.exists(checkpoint_file): raise FileNotFoundError( - f"The SmartSim feature store config file {checkpoint_file} " + f"The SmartSim feature store config file {os.fspath(checkpoint_file)} " "cannot be found." 
) @@ -836,7 +860,7 @@ def reload_saved_fs(self, checkpoint_file: str) -> FeatureStore: try: for fs_job, step in job_steps: self._jobs.fs_jobs[fs_job.ename] = fs_job - self._launcher.step_mapping[fs_job.name] = step + self._launcher.add_step_to_mapping_table(fs_job.name, step) if step.task_id: self._launcher.task_manager.add_existing(int(step.task_id)) except LauncherError as e: @@ -865,9 +889,9 @@ def _set_fsobjects(self, manifest: Manifest) -> None: if not fs_is_active(hosts=hosts, ports=ports, num_shards=len(fs_addresses)): raise SSInternalError("Cannot set FS Objects, FS is not running") - environ[f"SSDB{fs_name}"] = fs_addresses[0] + os.environ[f"SSDB{fs_name}"] = fs_addresses[0] - environ[f"SR_DB_TYPE{fs_name}"] = ( + os.environ[f"SR_DB_TYPE{fs_name}"] = ( CLUSTERED if len(fs_addresses) > 1 else STANDALONE ) @@ -908,7 +932,6 @@ def _start_telemetry_monitor(self, exp_dir: str) -> None: self._telemetry_monitor is None or self._telemetry_monitor.returncode is not None ): - logger.debug("Starting telemetry monitor process") cmd = [ sys.executable, diff --git a/smartsim/_core/control/job.py b/smartsim/_core/control/job.py index 1c72e6b46..7a9db0927 100644 --- a/smartsim/_core/control/job.py +++ b/smartsim/_core/control/job.py @@ -136,6 +136,7 @@ def _map_standard_metadata( entity_dict: t.Dict[str, t.Any], entity: "JobEntity", exp_dir: str, + raw_experiment: t.Dict[str, t.Any], ) -> None: """Map universal properties from a runtime manifest onto a `JobEntity` @@ -143,13 +144,20 @@ def _map_standard_metadata( :param entity_dict: The raw dictionary deserialized from manifest JSON :param entity: The entity instance to modify :param exp_dir: The path to the experiment working directory + :param raw_experiment: The raw experiment dictionary deserialized from + manifest JSON """ metadata = entity_dict["telemetry_metadata"] status_dir = pathlib.Path(metadata.get("status_dir")) + is_dragon = raw_experiment["launcher"].lower() == "dragon" # all entities contain shared 
properties that identify the task entity.type = entity_type - entity.name = entity_dict["name"] + entity.name = ( + entity_dict["name"] + if not is_dragon + else entity_dict["telemetry_metadata"]["step_id"] + ) entity.step_id = str(metadata.get("step_id") or "") entity.task_id = str(metadata.get("task_id") or "") entity.timestamp = int(entity_dict.get("timestamp", "0")) @@ -158,17 +166,24 @@ def _map_standard_metadata( @classmethod def from_manifest( - cls, entity_type: str, entity_dict: t.Dict[str, t.Any], exp_dir: str + cls, + entity_type: str, + entity_dict: t.Dict[str, t.Any], + exp_dir: str, + raw_experiment: t.Dict[str, t.Any], ) -> "JobEntity": """Instantiate a `JobEntity` from the dictionary deserialized from manifest JSON :param entity_type: The type of the associated `SmartSimEntity` :param entity_dict: The raw dictionary deserialized from manifest JSON :param exp_dir: The path to the experiment working directory + :param raw_experiment: raw experiment deserialized from manifest JSON """ entity = JobEntity() - cls._map_standard_metadata(entity_type, entity_dict, entity, exp_dir) + cls._map_standard_metadata( + entity_type, entity_dict, entity, exp_dir, raw_experiment + ) cls._map_fs_metadata(entity_dict, entity) return entity diff --git a/smartsim/_core/control/manifest.py b/smartsim/_core/control/manifest.py index 8358d1512..7508ed4f2 100644 --- a/smartsim/_core/control/manifest.py +++ b/smartsim/_core/control/manifest.py @@ -105,6 +105,14 @@ def all_entity_lists(self) -> t.List[EntitySequence[SmartSimEntity]]: return _all_entity_lists + @property + def has_deployable(self) -> bool: + """ + Return True if the manifest contains entities that + must be physically deployed + """ + return bool(self._deployables) + @staticmethod def _check_names(deployables: t.List[t.Any]) -> None: used = [] diff --git a/smartsim/_core/control/previewrenderer.py b/smartsim/_core/control/previewrenderer.py new file mode 100644 index 000000000..4c6e31adb --- /dev/null +++ 
b/smartsim/_core/control/previewrenderer.py @@ -0,0 +1,192 @@ +# BSD 2-Clause License +# +# Copyright (c) 2021-2023, Hewlett Packard Enterprise +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ + +import pathlib +import typing as t +from enum import Enum + +import jinja2 +import jinja2.utils as u +from jinja2 import pass_eval_context + +from ..._core.config import CONFIG +from ..._core.control import Manifest +from ...error.errors import PreviewFormatError +from ...log import get_logger +from .job import Job + +logger = get_logger(__name__) + +if t.TYPE_CHECKING: + from smartsim import Experiment + + +class Format(str, Enum): + PLAINTEXT = "plain_text" + + +class Verbosity(str, Enum): + INFO = "info" + DEBUG = "debug" + DEVELOPER = "developer" + + +@pass_eval_context +def as_toggle(_eval_ctx: u.F, value: bool) -> str: + """Return "On" if value returns True, + and "Off" is value returns False. + """ + return "On" if value else "Off" + + +@pass_eval_context +def get_ifname(_eval_ctx: u.F, value: t.List[str]) -> str: + """Extract Network Interface from feature store run settings.""" + if value: + for val in value: + if "ifname=" in val: + output = val.split("=")[-1] + return output + return "" + + +@pass_eval_context +def get_fstype(_eval_ctx: u.F, value: str) -> str: + """Extract data base type.""" + if value: + if "-cli" in value: + fs_type, _ = value.split("/")[-1].split("-", 1) + return fs_type + return "" + + +@pass_eval_context +def is_list(_eval_ctx: u.F, value: str) -> bool: + """Return True if item is of type list, and False + otherwise, to determine how Jinja template should + render an item. + """ + return isinstance(value, list) + + +def render_to_file(content: str, filename: str) -> None: + """Output preview to a file if an output filename + is specified. + + :param content: The rendered preview. + :param filename: The name of the file to write the preview to. 
+ """ + filename = find_available_filename(filename) + + with open(filename, "w", encoding="utf-8") as prev_file: + prev_file.write(content) + + +def render( + exp: "Experiment", + manifest: t.Optional[Manifest] = None, + verbosity_level: Verbosity = Verbosity.INFO, + output_format: Format = Format.PLAINTEXT, + output_filename: t.Optional[str] = None, + active_fsjobs: t.Optional[t.Dict[str, Job]] = None, +) -> str: + """ + Render the template from the supplied entities. + :param experiment: the experiment to be previewed. + :param manifest: the manifest to be previewed. + :param verbosity_level: the verbosity level + :param output_format: the output format. + """ + + verbosity_level = Verbosity(verbosity_level) + + _check_output_format(output_format) + + loader = jinja2.PackageLoader( + "smartsim.templates.templates.preview", output_format.value + ) + env = jinja2.Environment(loader=loader, autoescape=True) + + env.filters["as_toggle"] = as_toggle + env.filters["get_ifname"] = get_ifname + env.filters["get_fstype"] = get_fstype + env.filters["is_list"] = is_list + env.globals["Verbosity"] = Verbosity + + tpl_path = "base.template" + + tpl = env.get_template(tpl_path) + + if verbosity_level == Verbosity.INFO: + logger.warning( + "Only showing user set parameters. Some internal entity " + "fields are truncated. To view truncated fields: use verbosity_level " + "'developer' or 'debug.'" + ) + + rendered_preview = tpl.render( + exp_entity=exp, + active_fsjobs=active_fsjobs, + manifest=manifest, + config=CONFIG, + verbosity_level=verbosity_level, + ) + + if output_filename: + render_to_file( + rendered_preview, + output_filename, + ) + else: + logger.info(rendered_preview) + return rendered_preview + + +def find_available_filename(filename: str) -> str: + """Iterate through potentially unique names until one is found that does + not already exist. Return an unused name variation + + :param filename: The name of the file to write the preview to. 
+ """ + + path = pathlib.Path(filename) + candidate_path = pathlib.Path(filename) + index = 1 + + while candidate_path.exists(): + candidate_path = path.with_name(f"{path.stem}_{index:02}.txt") + index += 1 + return str(candidate_path) + + +def _check_output_format(output_format: Format) -> None: + """ + Check that a valid file output format is given. + """ + if not output_format == Format.PLAINTEXT: + raise PreviewFormatError(f"The only valid output format currently available \ +is {Format.PLAINTEXT.value}") diff --git a/smartsim/_core/entrypoints/dragon.py b/smartsim/_core/entrypoints/dragon.py new file mode 100644 index 000000000..2bfde74f2 --- /dev/null +++ b/smartsim/_core/entrypoints/dragon.py @@ -0,0 +1,351 @@ +# BSD 2-Clause License +# +# Copyright (c) 2021-2024, Hewlett Packard Enterpris +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import argparse +import dataclasses +import json +import os +import signal +import socket +import sys +import textwrap +import time +import typing as t +from types import FrameType + +import zmq +import zmq.auth.thread + +from smartsim._core.config import get_config +from smartsim._core.launcher.dragon import dragonSockets +from smartsim._core.launcher.dragon.dragonBackend import DragonBackend +from smartsim._core.schemas import ( + DragonBootstrapRequest, + DragonBootstrapResponse, + DragonShutdownRequest, +) +from smartsim._core.utils.network import get_best_interface_and_address +from smartsim.log import ContextThread, get_logger + +""" +Dragon server entrypoint script +""" + +logger = get_logger("Dragon Server") + +# kill is not catchable +SIGNALS = [signal.SIGINT, signal.SIGQUIT, signal.SIGTERM, signal.SIGABRT] + +SHUTDOWN_INITIATED = False + + +@dataclasses.dataclass +class DragonEntrypointArgs: + launching_address: str + interface: str + + +def handle_signal(signo: int, _frame: t.Optional[FrameType] = None) -> None: + if not signo: + logger.info("Received signal with no signo") + else: + logger.info(f"Received signal {signo}") + cleanup() + + +def get_log_path() -> str: + config = get_config() + return config.dragon_log_filename + + +def print_summary(network_interface: str, ip_address: str) -> None: + zmq_config = {"interface": network_interface, "address": ip_address} + + log_path = get_log_path() + with open(log_path, "w", 
encoding="utf-8") as dragon_config_log: + dragon_config_log.write( + textwrap.dedent(f"""\ + -------- Dragon Configuration -------- + IPADDRESS: {ip_address} + NETWORK: {network_interface} + HOSTNAME: {socket.gethostname()} + DRAGON_SERVER_CONFIG: {json.dumps(zmq_config)} + -------------------------------------- + """), + ) + + +def start_updater( + backend: DragonBackend, updater: t.Optional[ContextThread] +) -> ContextThread: + """Start the ``DragonBackend`` updater thread. + + If ``updater`` is not None, then it is first checked and if it + alive, no other thread is started. + + :param backend: The dragon backend for which the thread will be started + :param updater: An existing updater thread that might have to be replaced + :return: Running updater thread + """ + # If the updater was started, check if it completed or died + if updater is not None: + updater.join(0.1) + # If it's alive, there is nothing to do + if updater.is_alive(): + return updater + updater = ContextThread(name="DragonBackend", daemon=True, target=backend.update) + updater.start() + return updater + + +def is_updater_healthy(backend: DragonBackend) -> bool: + """Check if the backend has been updated recently. + + The acceptable delay is defined as the server timeout plus the backend's cooldown + period. If the server timeout is set to `-1`, then the acceptable delay is set to + one minute plus the cooldown period. + + :param backend: The backend for which the updater's health is checked + :return: Whether the backend was updated recently + """ + server_timeout = get_config().dragon_server_timeout / 1000 + acceptable_delay = backend.cooldown_period + ( + 60.0 if server_timeout == -1 else server_timeout + ) + + heartbeat_delay = backend.current_time - backend.last_heartbeat + if heartbeat_delay > acceptable_delay: + logger.debug( + f"Updater inactive for {heartbeat_delay:.2f} seconds, will request restart." 
+ ) + return False + return True + + +def updater_fallback(backend: DragonBackend, updater: ContextThread) -> ContextThread: + """Check if updater has updated the backend recently, if not, check its status + and start a new one if it is not alive. + :param backend: The dragon backend for which the udpater's health must be checked + :param updater: The updater thread which has to be checked and (possibly) replaced + :return: Running updater thread + """ + if is_updater_healthy(backend): + return updater + return start_updater(backend, updater) + + +# pylint: disable-next=too-many-statements +def run( + zmq_context: "zmq.Context[t.Any]", + dragon_head_address: str, + dragon_pid: int, +) -> None: + logger.debug(f"Opening socket {dragon_head_address}") + dragon_head_socket = dragonSockets.get_secure_socket(zmq_context, zmq.REP, True) + dragon_head_socket.bind(dragon_head_address) + dragon_backend = DragonBackend(pid=dragon_pid) + + backend_updater = start_updater(dragon_backend, None) + server = dragonSockets.as_server(dragon_head_socket) + + logger.debug(f"Listening to {dragon_head_address}") + + while not dragon_backend.should_shutdown: + try: + req = server.recv() + logger.debug(f"Received {type(req).__name__} {req}") + except zmq.Again: + backend_updater = updater_fallback(dragon_backend, backend_updater) + continue + + resp = dragon_backend.process_request(req) + + logger.debug(f"Sending {type(resp).__name__} {resp}") + try: + server.send(resp) + except zmq.Again: + logger.error("Could not send response back to launcher.") + backend_updater = updater_fallback(dragon_backend, backend_updater) + + # We can only check the heartbeat if the backend has not shut down + if not dragon_backend.should_shutdown: + logger.debug(f"Listening to {dragon_head_address}") + backend_updater = updater_fallback(dragon_backend, backend_updater) + + if SHUTDOWN_INITIATED: + dragon_backend.process_request(DragonShutdownRequest()) + + logger.info("Backend shutdown has been requested") + + 
if backend_updater.is_alive(): + backend_updater.join(1) + + if not dragon_backend.frontend_shutdown: + logger.info("Frontend will have to be shut down externally") + while True: + logger.info("Waiting for external shutdown") + time.sleep(5) + + +def execute_entrypoint(args: DragonEntrypointArgs) -> int: + if_config = get_best_interface_and_address() + interface = if_config.interface + address = if_config.address + if not interface: + raise ValueError("Net interface could not be determined") + dragon_head_address = f"tcp://{address}" + + smartsim_config = get_config() + if args.launching_address: + zmq_context = zmq.Context() + zmq_context.setsockopt( + zmq.SNDTIMEO, value=smartsim_config.dragon_server_timeout + ) + zmq_context.setsockopt( + zmq.RCVTIMEO, value=smartsim_config.dragon_server_timeout + ) + zmq_context.setsockopt(zmq.REQ_CORRELATE, 1) + zmq_context.setsockopt(zmq.REQ_RELAXED, 1) + + if str(args.launching_address).split(":", maxsplit=1)[0] == dragon_head_address: + address = "localhost" + dragon_head_address = "tcp://localhost:5555" + else: + dragon_head_address += ":5555" + + zmq_authenticator = dragonSockets.get_authenticator(zmq_context, timeout=-1) + + logger.debug("Getting launcher socket") + launcher_socket = dragonSockets.get_secure_socket(zmq_context, zmq.REQ, False) + + logger.debug(f"Connecting launcher socket to: {args.launching_address}") + launcher_socket.connect(args.launching_address) + client = dragonSockets.as_client(launcher_socket) + + logger.debug( + f"Sending bootstrap request to launcher_socket with {dragon_head_address}" + ) + client.send(DragonBootstrapRequest(address=dragon_head_address)) + response = client.recv() + + logger.debug(f"Received bootstrap response: {response}") + if not isinstance(response, DragonBootstrapResponse): + raise ValueError( + "Could not receive connection confirmation from launcher. Aborting." 
+ ) + + print_summary(interface, dragon_head_address) + + try: + logger.debug("Executing event loop") + run( + zmq_context=zmq_context, + dragon_head_address=dragon_head_address, + dragon_pid=response.dragon_pid, + ) + except Exception as e: + logger.error(f"Dragon server failed with {e}", exc_info=True) + return os.EX_SOFTWARE + finally: + if zmq_authenticator is not None and zmq_authenticator.is_alive(): + zmq_authenticator.stop() + + logger.info("Shutting down! Bye bye!") + + return 0 + + +def remove_config_log() -> None: + """Remove the Dragon `config_log` file from the file system. Used to + clean up after a dragon environment is shutdown to eliminate an + unnecessary attempt to connect to a stopped ZMQ server.""" + log_path = get_log_path() + if os.path.exists(log_path): + os.remove(log_path) + + +def cleanup() -> None: + global SHUTDOWN_INITIATED # pylint: disable=global-statement + logger.debug("Cleaning up") + remove_config_log() + SHUTDOWN_INITIATED = True + + +def register_signal_handlers() -> None: + # make sure to register the cleanup before the start + # the process so our signaller will be able to stop + # the feature store process. 
+ for sig in SIGNALS: + signal.signal(sig, handle_signal) + + +def parse_arguments(args: t.List[str]) -> DragonEntrypointArgs: + parser = argparse.ArgumentParser( + prefix_chars="+", description="SmartSim Dragon Head Process" + ) + parser.add_argument( + "+launching_address", + type=str, + help="Address of launching process if a ZMQ connection can be established", + required=True, + ) + parser.add_argument( + "+interface", + type=str, + help="Network Interface name", + required=False, + ) + args_ = parser.parse_args(args) + + if not args_.launching_address: + raise ValueError("Empty launching address supplied.") + + return DragonEntrypointArgs(args_.launching_address, args_.interface) + + +def main(args_: t.List[str]) -> int: + """Execute the dragon entrypoint as a module""" + os.environ["PYTHONUNBUFFERED"] = "1" + logger.info("Dragon server started") + + args = parse_arguments(args_) + register_signal_handlers() + + try: + return_code = execute_entrypoint(args) + return return_code + except Exception: + logger.error( + "An unexpected error occurred in the Dragon entrypoint.", exc_info=True + ) + finally: + cleanup() + + return -1 + + +if __name__ == "__main__": + sys.exit(main(sys.argv[1:])) diff --git a/smartsim/_core/entrypoints/dragon_client.py b/smartsim/_core/entrypoints/dragon_client.py new file mode 100644 index 000000000..e998ddce1 --- /dev/null +++ b/smartsim/_core/entrypoints/dragon_client.py @@ -0,0 +1,203 @@ +# BSD 2-Clause License +# +# Copyright (c) 2021-2024, Hewlett Packard Enterpris +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# 2. 
Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import argparse +import dataclasses +import json +import os +import signal +import sys +import time +import typing as t +from pathlib import Path +from types import FrameType + +import zmq + +from smartsim._core.launcher.dragon.dragonConnector import DragonConnector +from smartsim._core.schemas import ( + DragonHandshakeRequest, + DragonRequest, + DragonShutdownRequest, + request_registry, +) +from smartsim.log import get_logger + +""" +Dragon client entrypoint script, used to start a server, send requests to it +and then shut it down. 
+""" + +logger = get_logger("Dragon Client") + +SIGNALS = [signal.SIGINT, signal.SIGQUIT, signal.SIGTERM, signal.SIGABRT] + + +@dataclasses.dataclass +class DragonClientEntrypointArgs: + submit: Path + + +def cleanup() -> None: + """Cleanup resources""" + logger.debug("Cleaning up") + + +def parse_requests(request_filepath: Path) -> t.List[DragonRequest]: + """Parse serialized requests from file + + :param request_filepath: Path to file with serialized requests + :return: Deserialized requests + """ + requests: t.List[DragonRequest] = [] + try: + with open(request_filepath, "r", encoding="utf-8") as request_file: + req_strings = json.load(fp=request_file) + except FileNotFoundError as e: + logger.error( + "Could not find file with run requests," + f"please check whether {request_filepath} exists." + ) + raise e from None + except json.JSONDecodeError as e: + logger.error(f"Could not decode request file {request_filepath}.") + raise e from None + + requests = [request_registry.from_string(req_str) for req_str in req_strings] + + return requests + + +def parse_arguments(args: t.List[str]) -> DragonClientEntrypointArgs: + """Parse arguments used to run entrypoint script + + :param args: Arguments without name of executable + :raises ValueError: If the request file is not specified + :return: Parsed arguments + """ + parser = argparse.ArgumentParser( + prefix_chars="+", + description="SmartSim Dragon Client Process, to be used in batch scripts", + ) + parser.add_argument("+submit", type=str, help="Path to request file", required=True) + args_ = parser.parse_args(args) + + if not args_.submit: + raise ValueError("Request file not provided.") + + return DragonClientEntrypointArgs(submit=Path(args_.submit)) + + +def handle_signal(signo: int, _frame: t.Optional[FrameType] = None) -> None: + """Handle signals sent to this process + + :param signo: Signal number + :param _frame: Frame, defaults to None + """ + if not signo: + logger.info("Received signal with no signo") + 
else: + logger.info(f"Received signal {signo}") + cleanup() + + +def register_signal_handlers() -> None: + """Register signal handlers prior to execution""" + # make sure to register the cleanup before the start + # the process so our signaller will be able to stop + # the server process. + for sig in SIGNALS: + signal.signal(sig, handle_signal) + + +def execute_entrypoint(args: DragonClientEntrypointArgs) -> int: + """Execute the entrypoint with specified arguments + + :param args: Parsed arguments + :return: Return code + """ + + try: + requests = parse_requests(args.submit) + except Exception: + logger.error("Dragon client failed to parse request file", exc_info=True) + return os.EX_OSFILE + + requests.append(DragonShutdownRequest(immediate=False, frontend_shutdown=True)) + + connector = DragonConnector() + + for request in requests: + response = connector.send_request(request) + if response.error_message is not None: + logger.error(response.error_message) + + logger.info("Terminated sending requests, waiting for Dragon Server to complete") + + if not connector.can_monitor: + logger.error( + "Could not get Dragon Server PID and will not be able to monitor it." 
+ ) + return os.EX_IOERR + + while True: + try: + time.sleep(5) + connector.send_request(DragonHandshakeRequest()) + except zmq.error.Again: + logger.debug("Could not reach server, assuming backend has shut down") + break + + logger.info("Client has finished.") + + return os.EX_OK + + +def main(args_: t.List[str]) -> int: + """Execute the dragon client entrypoint as a module""" + + os.environ["PYTHONUNBUFFERED"] = "1" + logger.info("Dragon client started") + + args = parse_arguments(args_) + register_signal_handlers() + + try: + return execute_entrypoint(args) + except Exception: + logger.error( + "An unexpected error occurred in the Dragon client entrypoint", + exc_info=True, + ) + finally: + cleanup() + + return os.EX_SOFTWARE + + +if __name__ == "__main__": + + sys.exit(main(sys.argv[1:])) diff --git a/smartsim/_core/entrypoints/redis.py b/smartsim/_core/entrypoints/redis.py index 216130629..995c6faa0 100644 --- a/smartsim/_core/entrypoints/redis.py +++ b/smartsim/_core/entrypoints/redis.py @@ -37,7 +37,6 @@ from smartsim._core.utils.network import current_ip from smartsim.entity.dbnode import LaunchedShardData -from smartsim.error import SSInternalError from smartsim.log import get_logger logger = get_logger(__name__) @@ -111,6 +110,7 @@ def main(args: argparse.Namespace) -> int: *build_cluster_args(shard_data), *build_bind_args(src_addr, *bind_addrs), ] + print_summary(cmd, args.ifname, shard_data) try: @@ -119,9 +119,10 @@ def main(args: argparse.Namespace) -> int: for line in iter(process.stdout.readline, b""): print(line.decode("utf-8").rstrip(), flush=True) - except Exception as e: + except Exception: cleanup() - raise SSInternalError("Feature store process starter raised an exception") from e + logger.error("Feature store process starter raised an exception", exc_info=True) + return 1 return 0 @@ -179,6 +180,7 @@ def cleanup() -> None: action="store_true", help="Specify if this feature store shard is part of a cluster", ) + args_ = parser.parse_args() # 
make sure to register the cleanup before the start diff --git a/smartsim/_core/entrypoints/telemetrymonitor.py b/smartsim/_core/entrypoints/telemetrymonitor.py index 27582ac77..5ed1a0c91 100644 --- a/smartsim/_core/entrypoints/telemetrymonitor.py +++ b/smartsim/_core/entrypoints/telemetrymonitor.py @@ -27,6 +27,7 @@ import asyncio import logging import os +import os.path import pathlib import signal import sys @@ -155,6 +156,7 @@ def configure_logger(logger_: logging.Logger, log_level_: int, exp_dir: str) -> # Must register cleanup before the main loop is running def cleanup_telemetry_monitor(_signo: int, _frame: t.Optional[FrameType]) -> None: """Create an enclosure on `manifest_observer` to avoid global variables""" + logger.info("Shutdown signal received by telemetry monitor entrypoint") telemetry_monitor.cleanup() register_signal_handlers(cleanup_telemetry_monitor) diff --git a/smartsim/_core/generation/generator.py b/smartsim/_core/generation/generator.py index 934f285eb..f41db343e 100644 --- a/smartsim/_core/generation/generator.py +++ b/smartsim/_core/generation/generator.py @@ -140,7 +140,7 @@ def _gen_exp_dir(self) -> None: ) if not path.isdir(self.gen_path): # keep exists ok for race conditions on NFS - pathlib.Path(self.gen_path).mkdir(exist_ok=True) + pathlib.Path(self.gen_path).mkdir(exist_ok=True, parents=True) else: logger.log( level=self.log_level, msg="Working in previously created experiment" @@ -168,7 +168,7 @@ def _gen_feature_store_dir(self, feature_store_list: t.List[FeatureStore]) -> No # Always remove feature store files if present. 
if path.isdir(feature_store_path): shutil.rmtree(feature_store_path, ignore_errors=True) - pathlib.Path(feature_store_path).mkdir(exist_ok=self.overwrite) + pathlib.Path(feature_store_path).mkdir(exist_ok=self.overwrite, parents=True) def _gen_entity_list_dir(self, entity_lists: t.List[Ensemble]) -> None: """Generate directories for Ensemble instances diff --git a/smartsim/_core/launcher/__init__.py b/smartsim/_core/launcher/__init__.py index 0c4001cd4..d78909641 100644 --- a/smartsim/_core/launcher/__init__.py +++ b/smartsim/_core/launcher/__init__.py @@ -24,6 +24,7 @@ # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +from .dragon.dragonLauncher import DragonLauncher from .launcher import Launcher from .local.local import LocalLauncher from .lsf.lsfLauncher import LSFLauncher @@ -32,6 +33,7 @@ __all__ = [ "Launcher", + "DragonLauncher", "LocalLauncher", "LSFLauncher", "PBSLauncher", diff --git a/smartsim/_core/launcher/colocated.py b/smartsim/_core/launcher/colocated.py index 4a1393082..9f307968b 100644 --- a/smartsim/_core/launcher/colocated.py +++ b/smartsim/_core/launcher/colocated.py @@ -227,7 +227,8 @@ def _build_fs_script_cmd(fs_scripts: t.List[FSScript]) -> t.List[str]: if fs_script.func: # Notice that here fs_script.func is guaranteed to be a str # because we don't allow the user to pass a serialized function - sanitized_func = fs_script.func.replace("\n", "\\n") + func = fs_script.func + sanitized_func = func.replace("\n", "\\n") if not ( sanitized_func.startswith("'") and sanitized_func.endswith("'") diff --git a/smartsim/_core/launcher/dragon/__init__.py b/smartsim/_core/launcher/dragon/__init__.py new file mode 100644 index 000000000..efe03908e --- /dev/null +++ b/smartsim/_core/launcher/dragon/__init__.py @@ -0,0 +1,25 @@ +# BSD 2-Clause License +# +# Copyright (c) 2021-2024, Hewlett Packard Enterprise +# All rights reserved. 
+# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. diff --git a/smartsim/_core/launcher/dragon/dragonBackend.py b/smartsim/_core/launcher/dragon/dragonBackend.py new file mode 100644 index 000000000..245660662 --- /dev/null +++ b/smartsim/_core/launcher/dragon/dragonBackend.py @@ -0,0 +1,734 @@ +# BSD 2-Clause License +# +# Copyright (c) 2021-2024, Hewlett Packard Enterprise +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# 2. 
Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +import collections +import functools +import itertools +import time +import typing as t +from dataclasses import dataclass, field +from enum import Enum +from threading import RLock + +from tabulate import tabulate + +# pylint: disable=import-error +# isort: off +import dragon.infrastructure.connection as dragon_connection +import dragon.infrastructure.policy as dragon_policy +import dragon.native.group_state as dragon_group_state +import dragon.native.process as dragon_process +import dragon.native.process_group as dragon_process_group +import dragon.native.machine as dragon_machine + +# pylint: enable=import-error +# isort: on +from ...._core.config import get_config +from ...._core.schemas import ( + DragonHandshakeRequest, + DragonHandshakeResponse, + DragonRequest, + DragonResponse, + DragonRunRequest, + DragonRunResponse, + DragonShutdownRequest, + DragonShutdownResponse, + DragonStopRequest, + DragonStopResponse, + DragonUpdateStatusRequest, + DragonUpdateStatusResponse, +) 
+from ...._core.utils.helpers import create_short_id_str +from ....log import get_logger +from ....status import TERMINAL_STATUSES, SmartSimStatus + +logger = get_logger(__name__) + + +class DragonStatus(str, Enum): + ERROR = str(dragon_group_state.Error()) + RUNNING = str(dragon_group_state.Running()) + + def __str__(self) -> str: + return self.value + + +@dataclass +class ProcessGroupInfo: + status: SmartSimStatus + """Status of step""" + process_group: t.Optional[dragon_process_group.ProcessGroup] = None + """Internal Process Group object, None for finished or not started steps""" + puids: t.Optional[t.List[t.Optional[int]]] = None # puids can be None + """List of Process UIDS belonging to the ProcessGroup""" + return_codes: t.Optional[t.List[int]] = None + """List of return codes of completed processes""" + hosts: t.List[str] = field(default_factory=list) + """List of hosts on which the Process Group """ + redir_workers: t.Optional[dragon_process_group.ProcessGroup] = None + """Workers used to redirect stdout and stderr to file""" + + @property + def smartsim_info(self) -> t.Tuple[SmartSimStatus, t.Optional[t.List[int]]]: + """Information needed by SmartSim Launcher and Job Manager""" + return (self.status, self.return_codes) + + def __str__(self) -> str: + if self.process_group is not None and self.redir_workers is not None: + msg = [f"Active Group ({self.status})"] + if self.puids is not None: + msg.append(f"Number processes: {len(self.puids)}") + else: + msg = [f"Inactive Group ({self.status})"] + + if self.hosts is not None: + msg.append(f"Hosts: {','.join(self.hosts)}") + if self.return_codes is not None: + msg.append(f"{self.return_codes}") + + return ", ".join(msg) + + +# Thanks to Colin Wahl from HPE HPC Dragon Team +def redir_worker(io_conn: dragon_connection.Connection, file_path: str) -> None: + """Read stdout/stderr from the Dragon connection. 
+ + :param io_conn: Dragon connection to stdout or stderr + :param file_path: path to file to write to + """ + while io_conn is None or not io_conn.readable: + time.sleep(0.1) + try: + with open(file_path, "a", encoding="utf-8") as file_to_write: + while True: + output = io_conn.recv() + print(output, flush=True, file=file_to_write, end="") + except EOFError: + pass + except Exception as e: + print(e) + finally: + try: + io_conn.close() + except Exception as e: + print(e) + + +class DragonBackend: + """The DragonBackend class is the main interface between + SmartSim and Dragon. It is not intended to be user-facing, + and will only be called by the Dragon entry-point script or + by threads spawned by it. + """ + + def __init__(self, pid: int) -> None: + self._pid = pid + """PID of dragon executable which launched this server""" + self._group_infos: t.Dict[str, ProcessGroupInfo] = {} + """ProcessGroup execution state information""" + self._queue_lock = RLock() + """Lock that needs to be acquired to access internal queues""" + self._step_ids = (f"{create_short_id_str()}-{id}" for id in itertools.count()) + """Incremental ID to assign to new steps prior to execution""" + + self._initialize_hosts() + self._queued_steps: "collections.OrderedDict[str, DragonRunRequest]" = ( + collections.OrderedDict() + ) + """Steps waiting for execution""" + self._stop_requests: t.Deque[DragonStopRequest] = collections.deque() + """Stop requests which have not been processed yet""" + self._running_steps: t.List[str] = [] + """List of currently running steps""" + self._completed_steps: t.List[str] = [] + """List of completed steps""" + self._last_beat: float = 0.0 + """Time at which the last heartbeat was set""" + self._heartbeat() + self._last_update_time = self._last_beat + """Time at which the status update was printed the last time""" + self._shutdown_requested = False + """Whether the shutdown was requested to this server""" + self._can_shutdown = False + """Whether the server can 
 shut down"""
+        self._frontend_shutdown: bool = False
+        """Whether the server frontend should shut down when the backend does"""
+        self._shutdown_initiation_time: t.Optional[float] = None
+        """The time at which the server initiated shutdown"""
+        smartsim_config = get_config()
+        self._cooldown_period = (
+            smartsim_config.telemetry_frequency * 2 + 5
+            if smartsim_config.telemetry_enabled
+            else 5
+        )
+        """Time in seconds needed by the server to complete shutdown"""
+
+        self._view = DragonBackendView(self)
+        logger.debug(self._view.host_desc)
+
+    @property
+    def hosts(self) -> list[str]:
+        with self._queue_lock:
+            return self._hosts
+
+    @property
+    def allocated_hosts(self) -> dict[str, str]:
+        with self._queue_lock:
+            return self._allocated_hosts
+
+    @property
+    def free_hosts(self) -> t.Deque[str]:
+        with self._queue_lock:
+            return self._free_hosts
+
+    @property
+    def group_infos(self) -> dict[str, ProcessGroupInfo]:
+        with self._queue_lock:
+            return self._group_infos
+
+    def _initialize_hosts(self) -> None:
+        with self._queue_lock:
+            self._hosts: t.List[str] = sorted(
+                dragon_machine.Node(node).hostname
+                for node in dragon_machine.System().nodes
+            )
+            """List of hosts available in allocation"""
+            self._free_hosts: t.Deque[str] = collections.deque(self._hosts)
+            """List of hosts on which steps can be launched"""
+            self._allocated_hosts: t.Dict[str, str] = {}
+            """Mapping of hosts on which a step is already running to step ID"""
+
+    def __str__(self) -> str:
+        return self.status_message
+
+    @property
+    def status_message(self) -> str:
+        """Message with status of available nodes and history of launched jobs.
+ + :returns: Status message + """ + return ( + "Dragon server backend update\n" + f"{self._view.host_table}\n{self._view.step_table}" + ) + + def _heartbeat(self) -> None: + self._last_beat = self.current_time + + @property + def cooldown_period(self) -> int: + """Time (in seconds) the server will wait before shutting down + + when exit conditions are met (see ``should_shutdown()`` for further details). + """ + return self._cooldown_period + + @property + def _has_cooled_down(self) -> bool: + if self._shutdown_initiation_time is None: + logger.debug(f"Starting cooldown period of {self._cooldown_period} seconds") + self._shutdown_initiation_time = self.current_time + return ( + self.current_time - self._shutdown_initiation_time > self._cooldown_period + ) + + @property + def frontend_shutdown(self) -> bool: + """Whether the frontend will have to shutdown once the backend does + + If False, the frontend will wait for an external signal to stop. + """ + return self._frontend_shutdown + + @property + def last_heartbeat(self) -> float: + """Time (in seconds) at which the last heartbeat was set""" + return self._last_beat + + @property + def should_shutdown(self) -> bool: + """Whether the server should shut down + + A server should shut down if a DragonShutdownRequest was received + and it requested immediate shutdown, or if it did not request immediate + shutdown, but all jobs have been executed. + In both cases, a cooldown period may need to be waited before shutdown. + """ + if self._shutdown_requested and self._can_shutdown: + return self._has_cooled_down + return False + + @property + def current_time(self) -> float: + """Current time for DragonBackend object, in seconds since the Epoch""" + return time.time() + + def _can_honor(self, request: DragonRunRequest) -> t.Tuple[bool, t.Optional[str]]: + """Check if request can be honored with resources available in the allocation. 
+ + Currently only checks for total number of nodes, + in the future it will also look at other constraints + such as memory, accelerators, and so on. + """ + if request.nodes > len(self._hosts): + message = f"Cannot satisfy request. Requested {request.nodes} nodes, " + message += f"but only {len(self._hosts)} nodes are available." + return False, message + if self._shutdown_requested: + message = "Cannot satisfy request, server is shutting down." + return False, message + return True, None + + def _allocate_step( + self, step_id: str, request: DragonRunRequest + ) -> t.Optional[t.List[str]]: + + num_hosts: int = request.nodes + with self._queue_lock: + if num_hosts <= 0 or num_hosts > len(self._free_hosts): + return None + to_allocate = [] + for _ in range(num_hosts): + host = self._free_hosts.popleft() + self._allocated_hosts[host] = step_id + to_allocate.append(host) + return to_allocate + + @staticmethod + def _create_redirect_workers( + global_policy: dragon_policy.Policy, + policies: t.List[dragon_policy.Policy], + puids: t.List[int], + out_file: t.Optional[str], + err_file: t.Optional[str], + ) -> dragon_process_group.ProcessGroup: + grp_redir = dragon_process_group.ProcessGroup( + restart=False, policy=global_policy, pmi_enabled=False + ) + for pol, puid in zip(policies, puids): + proc = dragon_process.Process(None, ident=puid) + if out_file: + grp_redir.add_process( + nproc=1, + template=dragon_process.ProcessTemplate( + target=redir_worker, + args=(proc.stdout_conn, out_file), + stdout=dragon_process.Popen.DEVNULL, + policy=pol, + ), + ) + if err_file: + grp_redir.add_process( + nproc=1, + template=dragon_process.ProcessTemplate( + target=redir_worker, + args=(proc.stderr_conn, err_file), + stdout=dragon_process.Popen.DEVNULL, + policy=pol, + ), + ) + + return grp_redir + + def _stop_steps(self) -> None: + self._heartbeat() + with self._queue_lock: + while len(self._stop_requests) > 0: + request = self._stop_requests.popleft() + step_id = request.step_id 
+ if step_id not in self._group_infos: + logger.error(f"Requested to stop non-existing step {step_id}") + continue + + logger.debug(f"Stopping step {step_id}") + if request.step_id in self._queued_steps: + self._queued_steps.pop(step_id) + else: + # Technically we could just terminate, but what if + # the application intercepts that and ignores it? + proc_group = self._group_infos[step_id].process_group + if ( + proc_group is not None + and proc_group.status == DragonStatus.RUNNING + ): + try: + proc_group.kill() + except dragon_process_group.DragonProcessGroupError: + try: + proc_group.stop() + except dragon_process_group.DragonProcessGroupError: + logger.error("Process group already stopped") + redir_group = self._group_infos[step_id].redir_workers + if redir_group is not None: + try: + redir_group.join(0.1) + redir_group = None + except Exception as e: + logger.error(e) + + self._group_infos[step_id].status = SmartSimStatus.STATUS_CANCELLED + self._group_infos[step_id].return_codes = [-9] + + def _start_steps(self) -> None: + self._heartbeat() + with self._queue_lock: + started = [] + for step_id, request in self._queued_steps.items(): + hosts = self._allocate_step(step_id, self._queued_steps[step_id]) + if not hosts: + continue + + logger.debug(f"Step id {step_id} allocated on {hosts}") + + global_policy = dragon_policy.Policy( + placement=dragon_policy.Policy.Placement.HOST_NAME, + host_name=hosts[0], + ) + grp = dragon_process_group.ProcessGroup( + restart=False, pmi_enabled=request.pmi_enabled, policy=global_policy + ) + + policies = [] + for node_name in hosts: + local_policy = dragon_policy.Policy( + placement=dragon_policy.Policy.Placement.HOST_NAME, + host_name=node_name, + ) + policies.extend([local_policy] * request.tasks_per_node) + tmp_proc = dragon_process.ProcessTemplate( + target=request.exe, + args=request.exe_args, + cwd=request.path, + env={**request.current_env, **request.env}, + stdout=dragon_process.Popen.PIPE, + 
stderr=dragon_process.Popen.PIPE, + policy=local_policy, + ) + grp.add_process(nproc=request.tasks_per_node, template=tmp_proc) + + try: + grp.init() + grp.start() + grp_status = SmartSimStatus.STATUS_RUNNING + except Exception as e: + logger.error(e) + grp_status = SmartSimStatus.STATUS_FAILED + + puids = None + try: + puids = list( + set(grp.puids + [puid for puid, retcode in grp.inactive_puids]) + ) + self._group_infos[step_id] = ProcessGroupInfo( + process_group=grp, + puids=puids, + return_codes=[], + status=grp_status, + hosts=hosts, + ) + self._running_steps.append(step_id) + started.append(step_id) + except Exception as e: + logger.error(e) + + if ( + puids is not None + and len(puids) == len(policies) + and grp_status == SmartSimStatus.STATUS_RUNNING + ): + redir_grp = DragonBackend._create_redirect_workers( + global_policy, + policies, + puids, + request.output_file, + request.error_file, + ) + try: + redir_grp.init() + redir_grp.start() + except Exception as e: + raise IOError( + f"Could not redirect stdout and stderr for PUIDS {puids}" + ) from e + self._group_infos[step_id].redir_workers = redir_grp + elif puids is not None and grp_status == SmartSimStatus.STATUS_RUNNING: + logger.error("Cannot redirect workers: some PUIDS are missing") + + if started: + logger.debug(f"{started=}") + + for step_id in started: + try: + self._queued_steps.pop(step_id) + except KeyError: + logger.error( + f"Tried to allocate the same step twice, step id {step_id}" + ) + except Exception as e: + logger.error(e) + + def _refresh_statuses(self) -> None: + self._heartbeat() + with self._queue_lock: + terminated = [] + for step_id in self._running_steps: + group_info = self._group_infos[step_id] + grp = group_info.process_group + if grp is None: + group_info.status = SmartSimStatus.STATUS_FAILED + group_info.return_codes = [-1] + elif group_info.status not in TERMINAL_STATUSES: + if grp.status == str(DragonStatus.RUNNING): + group_info.status = SmartSimStatus.STATUS_RUNNING + 
else: + puids = group_info.puids + if puids is not None and all( + puid is not None for puid in puids + ): + try: + group_info.return_codes = [ + dragon_process.Process(None, ident=puid).returncode + for puid in puids + ] + except (ValueError, TypeError) as e: + logger.error(e) + group_info.return_codes = [-1 for _ in puids] + else: + group_info.return_codes = [0] + if not group_info.status == SmartSimStatus.STATUS_CANCELLED: + group_info.status = ( + SmartSimStatus.STATUS_FAILED + if any(group_info.return_codes) + or grp.status == DragonStatus.ERROR + else SmartSimStatus.STATUS_COMPLETED + ) + + if group_info.status in TERMINAL_STATUSES: + terminated.append(step_id) + + if terminated: + logger.debug(f"{terminated=}") + + for step_id in terminated: + self._running_steps.remove(step_id) + self._completed_steps.append(step_id) + group_info = self._group_infos[step_id] + if group_info is not None: + for host in group_info.hosts: + logger.debug(f"Releasing host {host}") + try: + self._allocated_hosts.pop(host) + except KeyError: + logger.error(f"Tried to free a non-allocated host: {host}") + self._free_hosts.append(host) + group_info.process_group = None + group_info.redir_workers = None + + def _update_shutdown_status(self) -> None: + self._heartbeat() + with self._queue_lock: + self._can_shutdown |= ( + all( + grp_info.status in TERMINAL_STATUSES + and grp_info.process_group is None + and grp_info.redir_workers is None + for grp_info in self._group_infos.values() + ) + and self._shutdown_requested + ) + + def _should_print_status(self) -> bool: + if self.current_time - self._last_update_time > 10: + self._last_update_time = self.current_time + return True + return False + + def _update(self) -> None: + self._stop_steps() + self._start_steps() + self._refresh_statuses() + self._update_shutdown_status() + + def _kill_all_running_jobs(self) -> None: + with self._queue_lock: + for step_id, group_info in self._group_infos.items(): + if group_info.status not in 
TERMINAL_STATUSES: + self._stop_requests.append(DragonStopRequest(step_id=step_id)) + + def update(self) -> None: + """Update internal data structures, queues, and job statuses""" + logger.debug("Dragon Backend update thread started") + while not self.should_shutdown: + try: + self._update() + time.sleep(0.1) + except Exception as e: + logger.error(e) + if self._should_print_status(): + try: + logger.debug(str(self)) + except ValueError as e: + logger.error(e) + + logger.debug("Dragon Backend update thread stopping") + + @functools.singledispatchmethod + # Deliberately suppressing errors so that overloads have the same signature + # pylint: disable-next=no-self-use + def process_request(self, request: DragonRequest) -> DragonResponse: + """Process an incoming DragonRequest""" + raise TypeError(f"Unsure how to process a `{type(request)}` request") + + @process_request.register + def _(self, request: DragonRunRequest) -> DragonRunResponse: + step_id = next(self._step_ids) + with self._queue_lock: + honorable, err = self._can_honor(request) + if not honorable: + self._group_infos[step_id] = ProcessGroupInfo( + status=SmartSimStatus.STATUS_FAILED, return_codes=[-1] + ) + else: + self._queued_steps[step_id] = request + self._group_infos[step_id] = ProcessGroupInfo( + status=SmartSimStatus.STATUS_NEVER_STARTED + ) + return DragonRunResponse(step_id=step_id, error_message=err) + + @process_request.register + def _(self, request: DragonUpdateStatusRequest) -> DragonUpdateStatusResponse: + with self._queue_lock: + return DragonUpdateStatusResponse( + statuses={ + step_id: self._group_infos[step_id].smartsim_info + for step_id in request.step_ids + if step_id in self._group_infos + } + ) + + @process_request.register + def _(self, request: DragonStopRequest) -> DragonStopResponse: + with self._queue_lock: + self._stop_requests.append(request) + return DragonStopResponse() + + @process_request.register + # Deliberately suppressing errors so that overloads have the same 
signature + # pylint: disable-next=no-self-use,unused-argument + def _(self, request: DragonHandshakeRequest) -> DragonHandshakeResponse: + return DragonHandshakeResponse(dragon_pid=self._pid) + + @process_request.register + # Deliberately suppressing errors so that overloads have the same signature + # pylint: disable-next=no-self-use,unused-argument + def _(self, request: DragonShutdownRequest) -> DragonShutdownResponse: + self._shutdown_requested = True + self._update_shutdown_status() + if request.immediate: + self._kill_all_running_jobs() + self._frontend_shutdown = request.frontend_shutdown + return DragonShutdownResponse() + + +class DragonBackendView: + def __init__(self, backend: DragonBackend): + self._backend = backend + + @property + def host_desc(self) -> str: + hosts = self._backend.hosts + num_hosts = len(hosts) + host_string = str(num_hosts) + (" hosts" if num_hosts != 1 else " host") + return f"{host_string} available for execution: {hosts}" + + @staticmethod + def _proc_group_info_table_line( + step_id: str, proc_group_info: ProcessGroupInfo + ) -> t.List[str]: + table_line = [step_id, f"{proc_group_info.status.value}"] + + if proc_group_info.hosts is not None: + table_line.append(f"{','.join(proc_group_info.hosts)}") + else: + table_line.append("") + + if proc_group_info.return_codes is not None: + table_line.append( + f"{','.join(str(ret) for ret in proc_group_info.return_codes)}" + ) + else: + table_line.append("") + + if proc_group_info.puids is not None: + table_line.append(f"{len(proc_group_info.puids)}") + else: + table_line.append("") + + return table_line + + @property + def step_table(self) -> str: + """Table representation of all jobs which have been started on the server.""" + headers = ["Step", "Status", "Hosts", "Return codes", "Num procs"] + + group_infos = self._backend.group_infos + + colalign = ( + ["left", "left", "left", "center", "center"] + if len(group_infos) > 0 + else None + ) + values = [ + 
self._proc_group_info_table_line(step, group_info) + for step, group_info in group_infos.items() + ] + + return tabulate( + values, + headers, + disable_numparse=True, + tablefmt="github", + colalign=colalign, + ) + + @property + def host_table(self) -> str: + """Table representation of current state of nodes available + + in the allocation. + """ + headers = ["Host", "Status"] + hosts = self._backend.hosts + free_hosts = self._backend.free_hosts + + def _host_table_line(host: str) -> list[str]: + return [host, "Free" if host in free_hosts else "Busy"] + + colalign = ["left", "center"] if len(hosts) > 0 else None + values = [_host_table_line(host) for host in hosts] + + return tabulate( + values, headers, disable_numparse=True, tablefmt="github", colalign=colalign + ) diff --git a/smartsim/_core/launcher/dragon/dragonConnector.py b/smartsim/_core/launcher/dragon/dragonConnector.py new file mode 100644 index 000000000..0cd68c24e --- /dev/null +++ b/smartsim/_core/launcher/dragon/dragonConnector.py @@ -0,0 +1,532 @@ +# BSD 2-Clause License +# +# Copyright (c) 2021-2024, Hewlett Packard Enterprise +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +from __future__ import annotations + +import atexit +import fileinput +import itertools +import json +import os +import subprocess +import sys +import typing as t +from collections import defaultdict +from pathlib import Path +from threading import RLock + +import psutil +import zmq +import zmq.auth.thread + +from ...._core.launcher.dragon import dragonSockets +from ....error.errors import SmartSimError +from ....log import get_logger +from ...config import get_config +from ...schemas import ( + DragonBootstrapRequest, + DragonBootstrapResponse, + DragonHandshakeRequest, + DragonHandshakeResponse, + DragonRequest, + DragonResponse, + DragonShutdownRequest, +) +from ...utils.network import find_free_port, get_best_interface_and_address + +logger = get_logger(__name__) + +_SchemaT = t.TypeVar("_SchemaT", bound=t.Union[DragonRequest, DragonResponse]) + +DRG_LOCK = RLock() + + +class DragonConnector: + """This class encapsulates the functionality needed + to start a Dragon server and communicate with it. 
+ """ + + def __init__(self) -> None: + self._context: zmq.Context[t.Any] = zmq.Context.instance() + self._context.setsockopt(zmq.REQ_CORRELATE, 1) + self._context.setsockopt(zmq.REQ_RELAXED, 1) + self._authenticator: t.Optional[zmq.auth.thread.ThreadAuthenticator] = None + config = get_config() + self._reset_timeout(config.dragon_server_timeout) + self._dragon_head_socket: t.Optional[zmq.Socket[t.Any]] = None + self._dragon_head_process: t.Optional[subprocess.Popen[bytes]] = None + # Returned by dragon head, useful if shutdown is to be requested + # but process was started by another connector + self._dragon_head_pid: t.Optional[int] = None + self._dragon_server_path = config.dragon_server_path + logger.debug(f"Dragon Server path was set to {self._dragon_server_path}") + self._env_vars: t.Dict[str, str] = {} + if self._dragon_server_path is None: + raise SmartSimError( + "DragonConnector could not find the dragon server path. " + "This should not happen if the Connector was started by an " + "experiment.\nIf the DragonConnector was started manually, " + "then the environment variable SMARTSIM_DRAGON_SERVER_PATH " + "should be set to an existing directory." 
+ ) + + @property + def is_connected(self) -> bool: + """Whether the Connector established a connection to the server + + :return: True if connected + """ + return self._dragon_head_socket is not None + + @property + def can_monitor(self) -> bool: + """Whether the Connector knows the PID of the dragon server head process + and can monitor its status + + :return: True if the server can be monitored""" + return self._dragon_head_pid is not None + + def _handshake(self, address: str) -> None: + self._dragon_head_socket = dragonSockets.get_secure_socket( + self._context, zmq.REQ, False + ) + self._dragon_head_socket.connect(address) + try: + dragon_handshake = _assert_schema_type( + self.send_request(DragonHandshakeRequest()), DragonHandshakeResponse + ) + self._dragon_head_pid = dragon_handshake.dragon_pid + logger.debug( + f"Successful handshake with Dragon server at address {address}" + ) + except (zmq.ZMQError, zmq.Again) as e: + logger.debug(e) + self._dragon_head_socket.close() + self._dragon_head_socket = None + + raise SmartSimError( + f"Unsuccessful handshake with Dragon server at address {address}" + ) from e + + def _reset_timeout(self, timeout: int = get_config().dragon_server_timeout) -> None: + self._context.setsockopt(zmq.SNDTIMEO, value=timeout) + self._context.setsockopt(zmq.RCVTIMEO, value=timeout) + if self._authenticator is not None and self._authenticator.thread is not None: + try: + self._authenticator.thread.authenticator.zap_socket.setsockopt( + zmq.SNDTIMEO, timeout + ) + self._authenticator.thread.authenticator.zap_socket.setsockopt( + zmq.RCVTIMEO, timeout + ) + except zmq.ZMQError: + pass + + def ensure_connected(self) -> None: + """Ensure that the Connector established a connection to the server + + If the Connector is not connected, attempt to connect and raise an error + on failure. 
+ + :raises SmartSimError: if connection cannot be established + """ + if not self.is_connected: + self.connect_to_dragon() + if not self.is_connected: + raise SmartSimError("Could not connect to Dragon server") + + def _get_new_authenticator( + self, timeout: int = get_config().dragon_server_timeout + ) -> None: + if self._authenticator is not None: + if self._authenticator.thread is not None: + try: + logger.debug("Closing ZAP socket") + self._authenticator.thread.authenticator.zap_socket.close() + except Exception as e: + logger.debug(f"Could not close ZAP socket, {e}") + try: + self._authenticator.stop() + except zmq.Again: + logger.debug("Could not stop authenticator") + try: + self._authenticator = dragonSockets.get_authenticator( + self._context, timeout + ) + return + except RuntimeError as e: + logger.error("Could not get authenticator") + raise e from None + + @staticmethod + def _get_dragon_log_level() -> str: + smartsim_to_dragon = defaultdict(lambda: "NONE") + smartsim_to_dragon["developer"] = "INFO" + return smartsim_to_dragon.get(get_config().log_level, "NONE") + + def _connect_to_existing_server(self, path: Path) -> None: + config = get_config() + dragon_config_log = path / config.dragon_log_filename + + if not dragon_config_log.is_file(): + return + + dragon_confs = self._parse_launched_dragon_server_info_from_files( + [dragon_config_log] + ) + logger.debug(dragon_confs) + + for dragon_conf in dragon_confs: + logger.debug( + "Found dragon server config file. Checking if the server" + f" is still up at address {dragon_conf['address']}." 
+ ) + try: + self._reset_timeout() + self._get_new_authenticator(-1) + self._handshake(dragon_conf["address"]) + except SmartSimError as e: + logger.error(e) + finally: + self._reset_timeout(config.dragon_server_timeout) + if self.is_connected: + logger.debug("Connected to existing Dragon server") + return + + def _start_connector_socket(self, socket_addr: str) -> zmq.Socket[t.Any]: + config = get_config() + connector_socket: t.Optional[zmq.Socket[t.Any]] = None + self._reset_timeout(config.dragon_server_startup_timeout) + self._get_new_authenticator(-1) + connector_socket = dragonSockets.get_secure_socket(self._context, zmq.REP, True) + logger.debug(f"Binding connector to {socket_addr}") + connector_socket.bind(socket_addr) + if connector_socket is None: + raise SmartSimError("Socket failed to initialize") + + return connector_socket + + def load_persisted_env(self) -> t.Dict[str, str]: + """Load key-value pairs from a .env file created during dragon installation + + :return: Key-value pairs stored in .env file""" + if self._env_vars: + # use previously loaded env vars. 
+ return self._env_vars + + config = get_config() + + if not config.dragon_dotenv.exists(): + self._env_vars = {} + return self._env_vars + + with open(config.dragon_dotenv, encoding="utf-8") as dot_env: + for kvp in dot_env.readlines(): + split = kvp.strip().split("=", maxsplit=1) + key, value = split[0], split[-1] + self._env_vars[key] = value + + return self._env_vars + + def merge_persisted_env(self, current_env: t.Dict[str, str]) -> t.Dict[str, str]: + """Combine the current environment variable set with the dragon .env by adding + Dragon-specific values and prepending any new values to existing keys + + :param current_env: Environment which has to be merged with .env variables + :return: Merged environment + """ + # ensure we start w/a complete env from current env state + merged_env: t.Dict[str, str] = {**current_env} + + # copy all the values for dragon straight into merged_env + merged_env.update( + {k: v for k, v in self._env_vars.items() if k.startswith("DRAGON")} + ) + + # prepend dragon env updates into existing env vars + for key, value in self._env_vars.items(): + if not key.startswith("DRAGON"): + if current_value := current_env.get(key, None): + # when a key is not dragon specific, don't overwrite the current + # value. instead, prepend the value dragon needs to/current env + value = f"{value}:{current_value}" + merged_env[key] = value + return merged_env + + def connect_to_dragon(self) -> None: + """Connect to Dragon server + + :raises SmartSimError: If connection cannot be established + """ + config = get_config() + with DRG_LOCK: + # TODO use manager instead + if self.is_connected: + return + if self._dragon_server_path is None: + raise SmartSimError("Path to Dragon server not set.") + + logger.info( + "Establishing connection with Dragon server or starting a new one..." 
+ ) + + path = _resolve_dragon_path(self._dragon_server_path) + + self._connect_to_existing_server(path) + if self.is_connected: + return + + path.mkdir(parents=True, exist_ok=True) + + local_address = get_best_interface_and_address().address + if local_address is None: + # TODO parse output file + raise SmartSimError( + "Could not determine SmartSim's local address, " + "the Dragon server could not be started." + ) + # find first available port >= 5995 + port = find_free_port(start=5995) + socket_addr = f"tcp://{local_address}:{port}" + connector_socket = self._start_connector_socket(socket_addr) + + cmd = [ + "dragon", + "-t", + config.dragon_transport, + "-l", + DragonConnector._get_dragon_log_level(), + sys.executable, + "-m", + "smartsim._core.entrypoints.dragon", + "+launching_address", + socket_addr, + ] + + dragon_out_file = path / "dragon_head.out" + dragon_err_file = path / "dragon_head.err" + + self.load_persisted_env() + merged_env = self.merge_persisted_env(os.environ.copy()) + merged_env.update({"PYTHONUNBUFFERED": "1"}) + + with ( + open(dragon_out_file, "w", encoding="utf-8") as dragon_out, + open(dragon_err_file, "w", encoding="utf-8") as dragon_err, + ): + logger.debug(f"Starting Dragon environment: {' '.join(cmd)}") + + # pylint: disable-next=consider-using-with + self._dragon_head_process = subprocess.Popen( + args=cmd, + bufsize=0, + stderr=dragon_err.fileno(), + stdout=dragon_out.fileno(), + cwd=path, + shell=False, + env=merged_env, + start_new_session=True, + ) + + server = dragonSockets.as_server(connector_socket) + logger.debug(f"Listening to {socket_addr}") + request = _assert_schema_type(server.recv(), DragonBootstrapRequest) + server.send( + DragonBootstrapResponse(dragon_pid=self._dragon_head_process.pid) + ) + connector_socket.close() + logger.debug(f"Connecting to {request.address}") + self._reset_timeout(config.dragon_server_timeout) + self._handshake(request.address) + + # Only the Connector which started the server is + # 
responsible of it, that's why we register the + # cleanup in this code branch. + # The cleanup function should not have references + # to this object to avoid Garbage Collector lockup + server_socket = self._dragon_head_socket + server_process_pid = self._dragon_head_process.pid + + if server_socket is not None and self._dragon_head_process is not None: + atexit.register( + _dragon_cleanup, + server_socket=server_socket, + server_process_pid=server_process_pid, + server_authenticator=self._authenticator, + ) + elif self._dragon_head_process is not None: + self._dragon_head_process.wait(1.0) + if self._dragon_head_process.stdout: + for line in iter(self._dragon_head_process.stdout.readline, b""): + logger.info(line.decode("utf-8").rstrip()) + if self._dragon_head_process.stderr: + for line in iter(self._dragon_head_process.stderr.readline, b""): + logger.warning(line.decode("utf-8").rstrip()) + logger.warning(self._dragon_head_process.returncode) + else: + logger.warning("Could not start Dragon server as subprocess") + + def cleanup(self) -> None: + """Shut down Dragon server and authenticator thread""" + if self._dragon_head_socket is not None and self._dragon_head_pid is not None: + _dragon_cleanup( + server_socket=self._dragon_head_socket, + server_process_pid=self._dragon_head_pid, + server_authenticator=self._authenticator, + ) + self._dragon_head_socket = None + self._dragon_head_pid = None + self._authenticator = None + + def send_request(self, request: DragonRequest, flags: int = 0) -> DragonResponse: + """Send a request to the Dragon server using a secure socket + + :param request: The request to send + :param flags: 0MQ flags, defaults to 0 + :raises SmartSimError: If not connected to Dragon server + :return: Response from server + """ + self.ensure_connected() + if (socket := self._dragon_head_socket) is None: + raise SmartSimError("Not connected to Dragon") + return self._send_req_with_socket(socket, request, flags) + + @staticmethod + def 
_parse_launched_dragon_server_info_from_iterable( + stream: t.Iterable[str], num_dragon_envs: t.Optional[int] = None + ) -> t.List[t.Dict[str, str]]: + lines = (line.strip() for line in stream) + lines = (line for line in lines if line) + tokenized = (line.split(maxsplit=1) for line in lines) + tokenized = (tokens for tokens in tokenized if len(tokens) > 1) + dragon_env_jsons = ( + config_dict + for first, config_dict in tokenized + if "DRAGON_SERVER_CONFIG" in first + ) + dragon_envs = (json.loads(config_dict) for config_dict in dragon_env_jsons) + + dragon_envs = ( + dragon_env for dragon_env in dragon_envs if "address" in dragon_env + ) + + if num_dragon_envs: + sliced_dragon_envs = itertools.islice(dragon_envs, num_dragon_envs) + return list(sliced_dragon_envs) + return list(dragon_envs) + + @classmethod + def _parse_launched_dragon_server_info_from_files( + cls, + file_paths: t.List[t.Union[str, "os.PathLike[str]"]], + num_dragon_envs: t.Optional[int] = None, + ) -> t.List[t.Dict[str, str]]: + with fileinput.FileInput(file_paths) as ifstream: + dragon_envs = cls._parse_launched_dragon_server_info_from_iterable( + ifstream, num_dragon_envs + ) + + return dragon_envs + + @staticmethod + def _send_req_with_socket( + socket: zmq.Socket[t.Any], + request: DragonRequest, + send_flags: int = 0, + recv_flags: int = 0, + ) -> DragonResponse: + client = dragonSockets.as_client(socket) + with DRG_LOCK: + logger.debug(f"Sending {type(request).__name__}: {request}") + client.send(request, send_flags) + response = client.recv(flags=recv_flags) + + logger.debug(f"Received {type(response).__name__}: {response}") + return response + + +def _assert_schema_type(obj: object, typ: t.Type[_SchemaT], /) -> _SchemaT: + if not isinstance(obj, typ): + raise TypeError(f"Expected schema of type `{typ}`, but got {type(obj)}") + return obj + + +def _dragon_cleanup( + server_socket: t.Optional[zmq.Socket[t.Any]] = None, + server_process_pid: t.Optional[int] = 0, + server_authenticator: 
t.Optional[zmq.auth.thread.ThreadAuthenticator] = None, +) -> None: + """Clean up resources used by the launcher. + :param server_socket: (optional) Socket used to connect to dragon environment + :param server_process_pid: (optional) Process ID of the dragon entrypoint + :param server_authenticator: (optional) Authenticator used to secure sockets + """ + try: + if server_socket is not None: + print("Sending shutdown request to dragon environment") + # pylint: disable-next=protected-access + DragonConnector._send_req_with_socket( + server_socket, DragonShutdownRequest(), recv_flags=zmq.NOBLOCK + ) + except zmq.error.ZMQError as e: + # Can't use the logger as I/O file may be closed + if not isinstance(e, zmq.Again): + print("Could not send shutdown request to dragon server") + print(f"ZMQ error: {e}", flush=True) + finally: + print("Sending shutdown request is complete") + + if server_process_pid and psutil.pid_exists(server_process_pid): + try: + _, retcode = os.waitpid(server_process_pid, 0) + print( + f"Dragon server process shutdown is complete, return code {retcode}", + flush=True, + ) + except Exception as e: + logger.debug(e) + + try: + if server_authenticator is not None and server_authenticator.is_alive(): + print("Shutting down ZMQ authenticator") + server_authenticator.stop() + except Exception: + print("Authenticator shutdown error") + else: + print("Authenticator shutdown is complete") + + +def _resolve_dragon_path(fallback: t.Union[str, "os.PathLike[str]"]) -> Path: + dragon_server_path = get_config().dragon_server_path or os.path.join( + fallback, ".smartsim", "dragon" + ) + dragon_server_paths = dragon_server_path.split(":") + if len(dragon_server_paths) > 1: + logger.warning( + "Multiple dragon servers not supported, " + "will connect to (or start) first server in list." 
+ ) + return Path(dragon_server_paths[0]) diff --git a/smartsim/_core/launcher/dragon/dragonLauncher.py b/smartsim/_core/launcher/dragon/dragonLauncher.py new file mode 100644 index 000000000..17b47e309 --- /dev/null +++ b/smartsim/_core/launcher/dragon/dragonLauncher.py @@ -0,0 +1,321 @@ +# BSD 2-Clause License +# +# Copyright (c) 2021-2024, Hewlett Packard Enterprise +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +from __future__ import annotations + +import os +import typing as t + +from ...._core.launcher.stepMapping import StepMap +from ....error import LauncherError, SmartSimError +from ....log import get_logger +from ....settings import ( + DragonRunSettings, + QsubBatchSettings, + RunSettings, + SbatchSettings, + SettingsBase, +) +from ....status import SmartSimStatus +from ...schemas import ( + DragonRunRequest, + DragonRunResponse, + DragonStopRequest, + DragonStopResponse, + DragonUpdateStatusRequest, + DragonUpdateStatusResponse, +) +from ..launcher import WLMLauncher +from ..pbs.pbsLauncher import PBSLauncher +from ..slurm.slurmLauncher import SlurmLauncher +from ..step import DragonBatchStep, DragonStep, LocalStep, Step +from ..stepInfo import StepInfo +from .dragonConnector import DragonConnector, _SchemaT + +logger = get_logger(__name__) + + +class DragonLauncher(WLMLauncher): + """This class encapsulates the functionality needed + to launch jobs on systems that use Dragon on top of a workload manager. + + All WLM launchers are capable of launching managed and unmanaged + jobs. Managed jobs are queried through interaction with with WLM, + in this case the Dragon server. Unmanaged jobs are held in the TaskManager + and are managed through references to their launching process ID + i.e. a psutil.Popen object. + Batch Jobs are routed to either Slurm or PBS and their step ID + is stored, prefixed with the name of the scheduler, to allow + the Job Manager to interact with it. 
+ """ + + def __init__(self) -> None: + super().__init__() + self._connector = DragonConnector() + """Connector used to start and interact with the Dragon server""" + self._slurm_launcher = SlurmLauncher() + """Slurm sub-launcher, used only for batch jobs""" + self._pbs_launcher = PBSLauncher() + """PBS sub-launcher, used only for batch jobs""" + + @property + def is_connected(self) -> bool: + return self._connector.is_connected + + def cleanup(self) -> None: + self._connector.cleanup() + + # RunSettings types supported by this launcher + @property + def supported_rs(self) -> t.Dict[t.Type[SettingsBase], t.Type[Step]]: + # RunSettings types supported by this launcher + return { + DragonRunSettings: DragonStep, + SbatchSettings: DragonBatchStep, + QsubBatchSettings: DragonBatchStep, + RunSettings: LocalStep, + } + + def add_step_to_mapping_table(self, name: str, step_map: StepMap) -> None: + super().add_step_to_mapping_table(name, step_map) + + if step_map.step_id is None: + return + sublauncher: t.Optional[t.Union[SlurmLauncher, PBSLauncher]] = None + if step_map.step_id.startswith("SLURM-"): + sublauncher = self._slurm_launcher + elif step_map.step_id.startswith("PBS-"): + sublauncher = self._pbs_launcher + else: + return + + sublauncher_step_map = StepMap( + step_id=DragonLauncher._unprefix_step_id(step_map.step_id), + task_id=step_map.task_id, + managed=step_map.managed, + ) + sublauncher.add_step_to_mapping_table(name, sublauncher_step_map) + + def run(self, step: Step) -> t.Optional[str]: + """Run a job step through Slurm + + :param step: a job step instance + :raises LauncherError: if launch fails + :return: job step id if job is managed + """ + + if not self.task_manager.actively_monitoring: + self.task_manager.start() + + step_id = None + task_id = None + + cmd = step.get_launch_cmd() + out, err = step.get_output_files() + + if isinstance(step, DragonBatchStep): + # wait for batch step to submit successfully + sublauncher_step_id: t.Optional[str] = None + 
return_code, out, err = self.task_manager.start_and_wait(cmd, step.cwd) + if return_code != 0: + raise LauncherError(f"Sbatch submission failed\n {out}\n {err}") + if out: + sublauncher_step_id = out.strip() + logger.debug( + f"Gleaned batch job id: {sublauncher_step_id} for {step.name}" + ) + + if sublauncher_step_id is None: + raise SmartSimError("Could not get step id for batch step") + + if isinstance(step.batch_settings, SbatchSettings): + self._slurm_launcher.step_mapping.add( + step.name, sublauncher_step_id, task_id, step.managed + ) + step_id = "SLURM-" + sublauncher_step_id + elif isinstance(step.batch_settings, QsubBatchSettings): + self._pbs_launcher.step_mapping.add( + step.name, sublauncher_step_id, task_id, step.managed + ) + step_id = "PBS-" + sublauncher_step_id + elif isinstance(step, DragonStep): + run_args = step.run_settings.run_args + req_env = step.run_settings.env_vars + self._connector.load_persisted_env() + merged_env = self._connector.merge_persisted_env(os.environ.copy()) + nodes = int(run_args.get("nodes", None) or 1) + tasks_per_node = int(run_args.get("tasks-per-node", None) or 1) + response = _assert_schema_type( + self._connector.send_request( + DragonRunRequest( + exe=cmd[0], + exe_args=cmd[1:], + path=step.cwd, + name=step.name, + nodes=nodes, + tasks_per_node=tasks_per_node, + env=req_env, + current_env=merged_env, + output_file=out, + error_file=err, + ) + ), + DragonRunResponse, + ) + step_id = str(response.step_id) + else: + # pylint: disable-next=consider-using-with + out_strm = open(out, "w+", encoding="utf-8") + # pylint: disable-next=consider-using-with + err_strm = open(err, "w+", encoding="utf-8") + task_id = self.task_manager.start_task( + cmd, step.cwd, step.env, out=out_strm.fileno(), err=err_strm.fileno() + ) + step.managed = False + + self.step_mapping.add(step.name, step_id, task_id, step.managed) + + return step_id + + def stop(self, step_name: str) -> StepInfo: + """Step a job step + + :param step_name: name of 
the job to stop + :return: update for job due to cancel + """ + + stepmap = self.step_mapping[step_name] + step_id = str(stepmap.step_id) + + if step_id.startswith("SLURM-"): + return self._slurm_launcher.stop(step_name) + + if step_id.startswith("PBS-"): + return self._pbs_launcher.stop(step_name) + + _assert_schema_type( + self._connector.send_request(DragonStopRequest(step_id=step_id)), + DragonStopResponse, + ) + + _, step_info = self.get_step_update([step_name])[0] + if not step_info: + raise LauncherError(f"Could not get step_info for job step {step_name}") + + step_info.status = ( + SmartSimStatus.STATUS_CANCELLED # set status to cancelled instead of failed + ) + step_info.launcher_status = str(SmartSimStatus.STATUS_CANCELLED) + return step_info + + @staticmethod + def _unprefix_step_id(step_id: str) -> str: + return step_id.split("-", maxsplit=1)[1] + + def _get_managed_step_update(self, step_ids: t.List[str]) -> t.List[StepInfo]: + """Get step updates for Dragon-managed jobs + + :param step_ids: list of job step ids + :return: list of updates for managed jobs + """ + + step_id_updates: dict[str, StepInfo] = {} + + dragon_step_ids: t.List[str] = [] + slurm_step_ids: t.List[str] = [] + pbs_step_ids: t.List[str] = [] + for step_id in step_ids: + if step_id.startswith("SLURM-"): + slurm_step_ids.append(step_id) + elif step_id.startswith("PBS-"): + pbs_step_ids.append(step_id) + else: + dragon_step_ids.append(step_id) + + if slurm_step_ids: + # pylint: disable-next=protected-access + slurm_updates = self._slurm_launcher._get_managed_step_update( + [ + DragonLauncher._unprefix_step_id(step_id) + for step_id in slurm_step_ids + ] + ) + step_id_updates.update(dict(zip(slurm_step_ids, slurm_updates))) + + if pbs_step_ids: + # pylint: disable-next=protected-access + pbs_updates = self._pbs_launcher._get_managed_step_update( + [DragonLauncher._unprefix_step_id(step_id) for step_id in pbs_step_ids] + ) + step_id_updates.update(dict(zip(pbs_step_ids, pbs_updates))) + + 
if dragon_step_ids: + response = _assert_schema_type( + self._connector.send_request( + DragonUpdateStatusRequest(step_ids=dragon_step_ids) + ), + DragonUpdateStatusResponse, + ) + + for step_id in step_ids: + if step_id not in response.statuses: + msg = "Missing step id update from Dragon launcher." + if response.error_message is not None: + msg += "\nDragon backend reported following error: " + msg += response.error_message + logger.error(msg) + info = StepInfo( + SmartSimStatus.STATUS_FAILED, + SmartSimStatus.STATUS_FAILED.value, + -1, + ) + else: + status, ret_codes = response.statuses[step_id] + if ret_codes: + grp_ret_code = min(ret_codes) + if any(ret_codes): + _err_msg = ( + f"One or more processes failed for job {step_id} " + f"Return codes were: {ret_codes}" + ) + logger.error(_err_msg) + else: + grp_ret_code = None + info = StepInfo(status, status.value, grp_ret_code) + + step_id_updates[step_id] = info + + # Order matters as we return an ordered list of StepInfo objects + return [step_id_updates[step_id] for step_id in step_ids] + + def __str__(self) -> str: + return "Dragon" + + +def _assert_schema_type(obj: object, typ: t.Type[_SchemaT], /) -> _SchemaT: + if not isinstance(obj, typ): + raise TypeError(f"Expected schema of type `{typ}`, but got {type(obj)}") + return obj diff --git a/smartsim/_core/launcher/dragon/dragonSockets.py b/smartsim/_core/launcher/dragon/dragonSockets.py new file mode 100644 index 000000000..80acd61a2 --- /dev/null +++ b/smartsim/_core/launcher/dragon/dragonSockets.py @@ -0,0 +1,158 @@ +# BSD 2-Clause License +# +# Copyright (c) 2021-2024, Hewlett Packard Enterprise +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# 2. 
Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import typing as t + +import zmq +import zmq.auth.thread + +from smartsim._core.config.config import get_config +from smartsim._core.schemas import dragonRequests as _dragonRequests +from smartsim._core.schemas import dragonResponses as _dragonResponses +from smartsim._core.schemas import utils as _utils +from smartsim._core.utils.security import KeyManager +from smartsim.log import get_logger + +if t.TYPE_CHECKING: + from zmq import Context + from zmq.sugar.socket import Socket + +logger = get_logger(__name__) + +AUTHENTICATOR: t.Optional["zmq.auth.thread.ThreadAuthenticator"] = None + + +def as_server( + socket: "Socket[t.Any]", +) -> _utils.SocketSchemaTranslator[ + _dragonResponses.DragonResponse, + _dragonRequests.DragonRequest, +]: + return _utils.SocketSchemaTranslator( + socket, _dragonResponses.response_registry, _dragonRequests.request_registry + ) + + +def as_client( + socket: "Socket[t.Any]", +) -> _utils.SocketSchemaTranslator[ + _dragonRequests.DragonRequest, + 
_dragonResponses.DragonResponse, +]: + return _utils.SocketSchemaTranslator( + socket, _dragonRequests.request_registry, _dragonResponses.response_registry + ) + + +def get_secure_socket( + context: "zmq.Context[t.Any]", + socket_type: int, + is_server: bool, +) -> "Socket[t.Any]": + """Create secured socket that consumes & produces encrypted messages + + :param context: ZMQ context object + :param socket_type: Type of ZMQ socket to create + :param is_server: Pass `True` to secure the socket as server. Pass `False` + to secure the socket as a client. + :returns: the secured socket prepared for sending encrypted messages + """ + config = get_config() + socket: "Socket[t.Any]" = context.socket(socket_type) + + key_manager = KeyManager(config, as_server=is_server, as_client=not is_server) + server_keys, client_keys = key_manager.get_keys() + logger.debug(f"Applying keys to socket: {server_keys}, {client_keys}") + + if is_server: + logger.debug("Configuring socket as server") + + # configure the server keys on the socket + socket.curve_secretkey = server_keys.private + socket.curve_publickey = server_keys.public + + socket.curve_server = True + else: + # configure client keys on the socket to encrypt outgoing messages + socket.curve_secretkey = client_keys.private + socket.curve_publickey = client_keys.public + + # set the server public key for decrypting incoming messages + socket.curve_serverkey = server_keys.public + return socket + + +def get_authenticator( + context: "zmq.Context[t.Any]", timeout: int = get_config().dragon_server_timeout +) -> "zmq.auth.thread.ThreadAuthenticator": + """Create an authenticator to handle encryption of ZMQ communications + + :param context: ZMQ context object + :returns: the activated `Authenticator` + """ + # pylint: disable-next=global-statement + global AUTHENTICATOR + + if AUTHENTICATOR is not None: + if AUTHENTICATOR.is_alive(): + return AUTHENTICATOR + try: + logger.debug("Stopping authenticator") + 
AUTHENTICATOR.thread.authenticator.zap_socket.close() + AUTHENTICATOR.thread.join(0.1) + AUTHENTICATOR = None + except Exception as e: + logger.debug(e) + finally: + logger.debug("Stopped authenticator") + + config = get_config() + + key_manager = KeyManager(config, as_client=True) + server_keys, client_keys = key_manager.get_keys() + logger.debug(f"Applying keys to authenticator: {server_keys}, {client_keys}") + + AUTHENTICATOR = zmq.auth.thread.ThreadAuthenticator(context, log=logger) + + ctx_sndtimeo = context.getsockopt(zmq.SNDTIMEO) + ctx_rcvtimeo = context.getsockopt(zmq.RCVTIMEO) + + AUTHENTICATOR.context.setsockopt(zmq.SNDTIMEO, timeout) + AUTHENTICATOR.context.setsockopt(zmq.RCVTIMEO, timeout) + AUTHENTICATOR.context.setsockopt(zmq.REQ_CORRELATE, 1) + AUTHENTICATOR.context.setsockopt(zmq.REQ_RELAXED, 1) + + # allow all keys in the client key directory to connect + logger.debug(f"Securing with client keys in {key_manager.client_keys_dir}") + AUTHENTICATOR.configure_curve(domain="*", location=key_manager.client_keys_dir) + + logger.debug("Starting authenticator") + AUTHENTICATOR.start() + + context.setsockopt(zmq.SNDTIMEO, ctx_sndtimeo) + context.setsockopt(zmq.RCVTIMEO, ctx_rcvtimeo) + + return AUTHENTICATOR diff --git a/smartsim/_core/launcher/launcher.py b/smartsim/_core/launcher/launcher.py index 6ae20ae62..1bf768065 100644 --- a/smartsim/_core/launcher/launcher.py +++ b/smartsim/_core/launcher/launcher.py @@ -27,6 +27,7 @@ import abc import typing as t +from ..._core.launcher.stepMapping import StepMap from ...error import AllocationError, LauncherError, SSUnsupportedError from ...settings import SettingsBase from .step import Step @@ -69,6 +70,15 @@ def run(self, step: Step) -> t.Optional[str]: def stop(self, step_name: str) -> StepInfo: raise NotImplementedError + def add_step_to_mapping_table(self, name: str, step_map: StepMap) -> None: + """Add a StepMap to the Launcher step mapping table + making it monitor the step. 
+ + :param name: name of step to be added + :param step_map: step map of added step + """ + self.step_mapping[name] = step_map + class WLMLauncher(Launcher): # cov-wlm """The base class for any Launcher that utilizes workload diff --git a/smartsim/_core/launcher/pbs/pbsLauncher.py b/smartsim/_core/launcher/pbs/pbsLauncher.py index e01cbae08..8c2099a8b 100644 --- a/smartsim/_core/launcher/pbs/pbsLauncher.py +++ b/smartsim/_core/launcher/pbs/pbsLauncher.py @@ -53,7 +53,11 @@ ) from ..stepInfo import PBSStepInfo, StepInfo from .pbsCommands import qdel, qstat -from .pbsParser import parse_qstat_jobid, parse_step_id_from_qstat +from .pbsParser import ( + parse_qstat_jobid, + parse_qstat_jobid_json, + parse_step_id_from_qstat, +) logger = get_logger(__name__) @@ -182,10 +186,21 @@ def _get_managed_step_update(self, step_ids: t.List[str]) -> t.List[StepInfo]: qstat_out, _ = qstat(step_ids) stats = [parse_qstat_jobid(qstat_out, str(step_id)) for step_id in step_ids] + + # Fallback: if all jobs result as NOTFOUND, it might be an issue + # with truncated names, we resort to json format which does not truncate + # information + if all(stat is None for stat in stats): + qstat_out_json, _ = qstat(["-f", "-F", "json"] + step_ids) + stats = [ + parse_qstat_jobid_json(qstat_out_json, str(step_id)) + for step_id in step_ids + ] + # create PBSStepInfo objects to return for stat, _ in zip(stats, step_ids): - info = PBSStepInfo(stat, None) + info = PBSStepInfo(stat or "NOTFOUND", None) # account for case where job history is not logged by PBS if info.status == SmartSimStatus.STATUS_COMPLETED: info.returncode = 0 diff --git a/smartsim/_core/launcher/pbs/pbsParser.py b/smartsim/_core/launcher/pbs/pbsParser.py index dcb5a3ef5..6f8384b11 100644 --- a/smartsim/_core/launcher/pbs/pbsParser.py +++ b/smartsim/_core/launcher/pbs/pbsParser.py @@ -57,7 +57,7 @@ def parse_qsub_error(output: str) -> str: return base_err -def parse_qstat_jobid(output: str, job_id: str) -> str: +def 
parse_qstat_jobid(output: str, job_id: str) -> t.Optional[str]: """Parse and return output of the qstat command run with options to obtain job status. @@ -65,7 +65,7 @@ def parse_qstat_jobid(output: str, job_id: str) -> str: :param job_id: allocation id or job step id :return: status """ - result = "NOTFOUND" + result = None for line in output.split("\n"): fields = line.split() if len(fields) >= 5: @@ -76,6 +76,25 @@ def parse_qstat_jobid(output: str, job_id: str) -> str: return result +def parse_qstat_jobid_json(output: str, job_id: str) -> t.Optional[str]: + """Parse and return output of the qstat command run with JSON options + to obtain job status. + + :param output: output of the qstat command in JSON format + :param job_id: allocation id or job step id + :return: status + """ + out_json = load_and_clean_json(output) + + if "Jobs" not in out_json: + return None + jobs: dict[str, t.Any] = out_json["Jobs"] + job: t.Optional[dict[str, t.Any]] = jobs.get(job_id, None) + if job is None: + return None + return str(job.get("job_state", None)) + + def parse_qstat_nodes(output: str) -> t.List[str]: """Parse and return the qstat command run with options to obtain node list. diff --git a/smartsim/_core/launcher/step/__init__.py b/smartsim/_core/launcher/step/__init__.py index 663edb682..c492f3e97 100644 --- a/smartsim/_core/launcher/step/__init__.py +++ b/smartsim/_core/launcher/step/__init__.py @@ -25,6 +25,7 @@ # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
from .alpsStep import AprunStep +from .dragonStep import DragonBatchStep, DragonStep from .localStep import LocalStep from .lsfStep import BsubBatchStep, JsrunStep from .mpiStep import MpiexecStep, MpirunStep, OrterunStep diff --git a/smartsim/_core/launcher/step/dragonStep.py b/smartsim/_core/launcher/step/dragonStep.py new file mode 100644 index 000000000..a0a3e038d --- /dev/null +++ b/smartsim/_core/launcher/step/dragonStep.py @@ -0,0 +1,248 @@ +# BSD 2-Clause License +# +# Copyright (c) 2021-2024, Hewlett Packard Enterprise +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +import json +import os +import shutil +import sys +import typing as t + +from ...._core.schemas.dragonRequests import DragonRunRequest, request_registry +from ....error.errors import SSUnsupportedError +from ....log import get_logger +from ....settings import ( + DragonRunSettings, + QsubBatchSettings, + SbatchSettings, + Singularity, +) +from .step import Step + +logger = get_logger(__name__) + + +class DragonStep(Step): + def __init__(self, name: str, cwd: str, run_settings: DragonRunSettings) -> None: + """Initialize a srun job step + + :param name: name of the entity to be launched + :param cwd: path to launch dir + :param run_settings: run settings for entity + """ + super().__init__(name, cwd, run_settings) + self.managed = True + + @property + def run_settings(self) -> DragonRunSettings: + return t.cast(DragonRunSettings, self.step_settings) + + def get_launch_cmd(self) -> t.List[str]: + """Get stringified version of request + needed to launch this step + + :return: launch command + """ + run_settings = self.run_settings + exe_cmd = [] + + if run_settings.colocated_fs_settings: + # Replace the command with the entrypoint wrapper script + bash = shutil.which("bash") + if not bash: + raise RuntimeError("Could not find bash in PATH") + launch_script_path = self.get_colocated_launch_script() + exe_cmd += [bash, launch_script_path] + + if isinstance(run_settings.container, Singularity): + # pylint: disable-next=protected-access + exe_cmd += run_settings.container._container_cmds(self.cwd) + + exe_cmd += run_settings.exe + + exe_args = self._get_exe_args_list(run_settings) + + exe_cmd_and_args = exe_cmd + exe_args + + return exe_cmd_and_args + + @staticmethod + def _get_exe_args_list(run_setting: DragonRunSettings) -> t.List[str]: + """Convenience function to encapsulate checking the + runsettings.exe_args type to always return a list + """ + exe_args = run_setting.exe_args + args: t.List[str] = exe_args if isinstance(exe_args, list) else [exe_args] + return 
args + + +class DragonBatchStep(Step): + def __init__( + self, + name: str, + cwd: str, + batch_settings: t.Union[SbatchSettings, QsubBatchSettings], + ) -> None: + """Initialize a Slurm Sbatch step + + :param name: name of the entity to launch + :param cwd: path to launch dir + :param batch_settings: batch settings for entity + """ + super().__init__(name, cwd, batch_settings) + self.steps: t.List[Step] = [] + self.managed = True + self.batch_settings = batch_settings + self._request_file_name = "requests.json" + + def get_launch_cmd(self) -> t.List[str]: + """Get the launch command for the batch + + :return: launch command for the batch + """ + if isinstance(self.batch_settings, SbatchSettings): + script = self._write_sbatch_script() + return [self.batch_settings.batch_cmd, "--parsable", script] + if isinstance(self.batch_settings, QsubBatchSettings): + script = self._write_qsub_script() + return [self.batch_settings.batch_cmd, script] + + raise SSUnsupportedError( + "DragonBatchStep only support SbatchSettings and QsubBatchSettings" + ) + + def add_to_batch(self, step: Step) -> None: + """Add a job step to this batch + + :param step: a job step instance e.g. 
DragonStep + """ + self.steps.append(step) + logger.debug(f"Added step command to batch for {step.name}") + + @staticmethod + def _dragon_entrypoint_cmd(request_file: str) -> str: + """Return command needed to run the Dragon entrypoint""" + cmd = [ + sys.executable, + "-m", + "smartsim._core.entrypoints.dragon_client", + "+submit", + request_file, + ] + return " ".join(cmd) + + def _write_request_file(self) -> str: + """Write json file with requests to submit to Dragon server""" + request_file = self.get_step_file( + ending="json", script_name=self._request_file_name + ) + requests = [] + for step in self.steps: + run_settings = t.cast(DragonRunSettings, step.step_settings) + run_args = run_settings.run_args + env = run_settings.env_vars + nodes = int(run_args.get("nodes", None) or 1) + tasks_per_node = int(run_args.get("tasks-per-node", None) or 1) + + cmd = step.get_launch_cmd() + out, err = step.get_output_files() + request = DragonRunRequest( + exe=cmd[0], + exe_args=cmd[1:], + path=step.cwd, + name=step.name, + nodes=nodes, + tasks_per_node=tasks_per_node, + env=env, + current_env=os.environ, + output_file=out, + error_file=err, + ) + requests.append(request_registry.to_string(request)) + with open(request_file, "w", encoding="utf-8") as script_file: + script_file.write(json.dumps(requests)) + + return request_file + + def _write_sbatch_script(self) -> str: + """Write the PBS batch script + + :return: batch script path after writing + """ + batch_script = self.get_step_file(ending=".sh") + output, error = self.get_output_files() + request_file = self._write_request_file() + with open(batch_script, "w", encoding="utf-8") as script_file: + script_file.write("#!/bin/bash\n\n") + script_file.write(f"#SBATCH --output={output}\n") + script_file.write(f"#SBATCH --error={error}\n") + script_file.write(f"#SBATCH --job-name={self.name}\n") + + # add additional sbatch options + for opt in self.batch_settings.format_batch_args(): + script_file.write(f"#SBATCH {opt}\n") + 
+ script_file.write( + f"#SBATCH --export=ALL,SMARTSIM_DRAGON_SERVER_PATH={self.cwd}," + "PYTHONUNBUFFERED=1\n" + ) + + for cmd in self.batch_settings.preamble: + script_file.write(f"{cmd}\n") + + script_file.write( + DragonBatchStep._dragon_entrypoint_cmd(request_file) + "\n" + ) + return batch_script + + def _write_qsub_script(self) -> str: + """Write the Slurm batch script + + :return: batch script path after writing + """ + batch_script = self.get_step_file(ending=".sh") + output, error = self.get_output_files() + request_file = self._write_request_file() + with open(batch_script, "w", encoding="utf-8") as script_file: + script_file.write("#!/bin/bash\n\n") + script_file.write(f"#PBS -o {output}\n") + script_file.write(f"#PBS -e {error}\n") + script_file.write(f"#PBS -N {self.name}\n") + script_file.write("#PBS -V \n") + + # add additional sbatch options + for opt in self.batch_settings.format_batch_args(): + script_file.write(f"#PBS {opt}\n") + + script_file.write(f"#PBS -v SMARTSIM_DRAGON_SERVER_PATH={self.cwd}\n") + + for cmd in self.batch_settings.preamble: + script_file.write(f"{cmd}\n") + + script_file.write( + DragonBatchStep._dragon_entrypoint_cmd(request_file) + "\n" + ) + + return batch_script diff --git a/smartsim/_core/launcher/step/step.py b/smartsim/_core/launcher/step/step.py index f6074c954..bdd1db984 100644 --- a/smartsim/_core/launcher/step/step.py +++ b/smartsim/_core/launcher/step/step.py @@ -153,7 +153,7 @@ def _get_launch_cmd(self: _StepT) -> t.List[str]: if self.managed: raise UnproxyableStepError( - f"Attempting to proxy managed step of type {type(self)}" + f"Attempting to proxy managed step of type {type(self)} " "through the unmanaged step proxy entry point" ) @@ -161,6 +161,8 @@ def _get_launch_cmd(self: _StepT) -> t.List[str]: entity_type = self.meta["entity_type"] status_dir = self.meta["status_dir"] + logger.debug(f"Encoding command{' '.join(original_cmd_list)}") + # encode the original cmd to avoid potential collisions and 
escaping # errors when passing it using CLI arguments to the indirect entrypoint encoded_cmd = encode_cmd(original_cmd_list) diff --git a/smartsim/_core/schemas/__init__.py b/smartsim/_core/schemas/__init__.py new file mode 100644 index 000000000..d7ee9d83d --- /dev/null +++ b/smartsim/_core/schemas/__init__.py @@ -0,0 +1,41 @@ +# BSD 2-Clause License +# +# Copyright (c) 2021-2024, Hewlett Packard Enterprise +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
# Re-export the Dragon request/response schemas so callers can import them
# directly from `smartsim._core.schemas`
from .dragonRequests import *
from .dragonResponses import *

# NOTE: bootstrap and shutdown schemas are registered in the submodules and
# must be exported here as well so wildcard importers of this package see them
__all__ = [
    "DragonRequest",
    "DragonRunRequest",
    "DragonHandshakeRequest",
    "DragonUpdateStatusRequest",
    "DragonStopRequest",
    "DragonBootstrapRequest",
    "DragonShutdownRequest",
    "DragonResponse",
    "DragonRunResponse",
    "DragonHandshakeResponse",
    "DragonUpdateStatusResponse",
    "DragonStopResponse",
    "DragonBootstrapResponse",
    "DragonShutdownResponse",
]
import typing as t

from pydantic import BaseModel, Field, PositiveInt

import smartsim._core.schemas.utils as _utils

# Black and Pylint disagree about where to put the `...`
# pylint: disable=multiple-statements

# Registry used to (de)serialize request schemas; every subclass below is
# registered under a short string key (e.g. "run") via the decorator
request_registry = _utils.SchemaRegistry["DragonRequest"]()


class DragonRequest(BaseModel): ...


class DragonRunRequestView(DragonRequest):
    # NOTE(review): field semantics are inferred from names/types where not
    # shown in this patch — confirm against the Dragon launcher server.
    exe: t.Annotated[str, Field(min_length=1)]  # executable to launch
    exe_args: t.List[t.Annotated[str, Field(min_length=1)]] = []  # argv tail
    path: t.Annotated[str, Field(min_length=1)]  # working directory
    nodes: PositiveInt = 1
    tasks: PositiveInt = 1
    tasks_per_node: PositiveInt = 1
    hostlist: t.Optional[t.Annotated[str, Field(min_length=1)]] = None
    output_file: t.Optional[t.Annotated[str, Field(min_length=1)]] = None
    error_file: t.Optional[t.Annotated[str, Field(min_length=1)]] = None
    env: t.Dict[str, t.Optional[str]] = {}
    name: t.Optional[t.Annotated[str, Field(min_length=1)]] = None
    pmi_enabled: bool = True


@request_registry.register("run")
class DragonRunRequest(DragonRunRequestView):
    current_env: t.Dict[str, t.Optional[str]] = {}

    def __str__(self) -> str:
        # exclude current_env from the printable form: it mirrors the whole
        # submitting-process environment and would flood logs
        return str(DragonRunRequestView.parse_obj(self.dict(exclude={"current_env"})))


@request_registry.register("update_status")
class DragonUpdateStatusRequest(DragonRequest):
    # ids of the steps whose statuses are being requested
    step_ids: t.List[t.Annotated[str, Field(min_length=1)]]


@request_registry.register("stop")
class DragonStopRequest(DragonRequest):
    # id of the single step to stop
    step_id: t.Annotated[str, Field(min_length=1)]


@request_registry.register("handshake")
class DragonHandshakeRequest(DragonRequest): ...


@request_registry.register("bootstrap")
class DragonBootstrapRequest(DragonRequest):
    # address the server should connect back to
    address: t.Annotated[str, Field(min_length=1)]


@request_registry.register("shutdown")
class DragonShutdownRequest(DragonRequest):
    immediate: bool = True
    """Whether the server should shut down immediately, setting this to False means
    that the server will shut down when all jobs are terminated."""
    frontend_shutdown: bool = True
    """Whether the frontend will have to shut down or wait for external termination"""
import typing as t

from pydantic import BaseModel, Field

import smartsim._core.schemas.utils as _utils
from smartsim.status import SmartSimStatus

# Black and Pylint disagree about where to put the `...`
# pylint: disable=multiple-statements

# Registry used to (de)serialize response schemas (mirror of
# `request_registry` in dragonRequests.py)
response_registry = _utils.SchemaRegistry["DragonResponse"]()


class DragonResponse(BaseModel):
    # human-readable failure description; None when the request succeeded
    error_message: t.Optional[str] = None


@response_registry.register("run")
class DragonRunResponse(DragonResponse):
    # id assigned to the newly launched step
    step_id: t.Annotated[str, Field(min_length=1)]


@response_registry.register("status_update")
class DragonUpdateStatusResponse(DragonResponse):
    # maps step_id -> (status, return codes); per the annotation the second
    # element is an optional list of per-task return codes (None when the
    # step has not produced them yet)
    statuses: t.Mapping[
        t.Annotated[str, Field(min_length=1)],
        t.Tuple[SmartSimStatus, t.Optional[t.List[int]]],
    ] = {}


@response_registry.register("stop")
class DragonStopResponse(DragonResponse): ...


@response_registry.register("handshake")
class DragonHandshakeResponse(DragonResponse):
    # PID of the Dragon server process
    dragon_pid: int


@response_registry.register("bootstrap")
class DragonBootstrapResponse(DragonResponse):
    # PID of the Dragon server process
    dragon_pid: int


@response_registry.register("shutdown")
class DragonShutdownResponse(DragonResponse): ...
diff --git a/smartsim/_core/schemas/utils.py b/smartsim/_core/schemas/utils.py new file mode 100644 index 000000000..9cb36bcf5 --- /dev/null +++ b/smartsim/_core/schemas/utils.py @@ -0,0 +1,124 @@ +# BSD 2-Clause License +# +# Copyright (c) 2021-2024, Hewlett Packard Enterprise +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
import dataclasses
import typing as t

import pydantic
import pydantic.dataclasses

if t.TYPE_CHECKING:
    from zmq.sugar.socket import Socket

_SchemaT = t.TypeVar("_SchemaT", bound=pydantic.BaseModel)
_SendT = t.TypeVar("_SendT", bound=pydantic.BaseModel)
_RecvT = t.TypeVar("_RecvT", bound=pydantic.BaseModel)

# Separates the registry key ("header") from the JSON payload on the wire
_DEFAULT_MSG_DELIM: t.Final[str] = "|"


@t.final
@pydantic.dataclasses.dataclass(frozen=True)
class _Message(t.Generic[_SchemaT]):
    """Wire representation of a schema: `"<header><delimiter><payload json>"`"""

    payload: _SchemaT
    header: str = pydantic.Field(min_length=1)
    delimiter: str = pydantic.Field(min_length=1, default=_DEFAULT_MSG_DELIM)

    def __str__(self) -> str:
        return self.delimiter.join((self.header, self.payload.json()))

    @classmethod
    def from_str(
        cls,
        str_: str,
        payload_type: t.Type[_SchemaT],
        delimiter: str = _DEFAULT_MSG_DELIM,
    ) -> "_Message[_SchemaT]":
        # split only on the FIRST delimiter: the JSON payload may contain more
        header, payload = str_.split(delimiter, 1)
        return cls(payload_type.parse_raw(payload), header, delimiter)


class SchemaRegistry(t.Generic[_SchemaT]):
    """Maps short string keys to schema classes so instances can be
    round-tripped through the `"<key>|<json>"` wire format"""

    def __init__(
        self, init_map: t.Optional[t.Mapping[str, t.Type[_SchemaT]]] = None
    ) -> None:
        self._map = dict(init_map) if init_map else {}

    def register(self, key: str) -> t.Callable[[t.Type[_SchemaT]], t.Type[_SchemaT]]:
        """Class decorator that registers the decorated schema under `key`

        :raises ValueError: if `key` contains the message delimiter
        :raises KeyError: if `key` is empty or already registered
        """
        if _DEFAULT_MSG_DELIM in key:
            _msg = f"Registry key cannot contain delimiter `{_DEFAULT_MSG_DELIM}`"
            raise ValueError(_msg)
        if not key:
            raise KeyError("Key cannot be the empty string")
        if key in self._map:
            raise KeyError(f"Key `{key}` has already been registered for this parser")

        def _register(cls: t.Type[_SchemaT]) -> t.Type[_SchemaT]:
            self._map[key] = cls
            return cls

        return _register

    def to_string(self, schema: _SchemaT) -> str:
        """Serialize a registered schema instance to its wire string"""
        return str(self._to_message(schema))

    def _to_message(self, schema: _SchemaT) -> _Message[_SchemaT]:
        # reverse lookup (class -> key) is rebuilt on each call so
        # registrations made after this point are always reflected
        reverse_map = dict((v, k) for k, v in self._map.items())
        try:
            val = reverse_map[type(schema)]
        except KeyError:
            raise TypeError(f"Unregistered schema type: {type(schema)}") from None
        return _Message(schema, val, _DEFAULT_MSG_DELIM)

    def from_string(self, str_: str) -> _SchemaT:
        """Parse a wire string back into a registered schema instance

        :raises ValueError: if the string has no delimiter or an unknown key
        """
        try:
            type_, _ = str_.split(_DEFAULT_MSG_DELIM, 1)
        except ValueError:
            _msg = f"Failed to determine schema type of the string {repr(str_)}"
            raise ValueError(_msg) from None
        try:
            cls = self._map[type_]
        except KeyError:
            raise ValueError(f"No type of value `{type_}` is registered") from None
        msg = _Message.from_str(str_, cls, _DEFAULT_MSG_DELIM)
        return self._from_message(msg)

    @staticmethod
    def _from_message(msg: _Message[_SchemaT]) -> _SchemaT:
        return msg.payload


@dataclasses.dataclass(frozen=True)
class SocketSchemaTranslator(t.Generic[_SendT, _RecvT]):
    """Wraps a ZMQ socket so callers exchange schema objects instead of
    raw wire strings"""

    socket: "Socket[t.Any]"
    _send_registry: SchemaRegistry[_SendT]
    _recv_registry: SchemaRegistry[_RecvT]

    def send(self, schema: _SendT, flags: int = 0) -> None:
        self.socket.send_string(self._send_registry.to_string(schema), flags)

    def recv(self, flags: int = 0) -> _RecvT:
        return self._recv_registry.from_string(self.socket.recv_string(flags))
def check_for_utility(util_name: str) -> str:
    """Check for existence of the provided CLI utility.

    :param util_name: CLI utility to locate
    :returns: Full path to executable if found. Otherwise, empty string"""
    try:
        return expand_exe_path(util_name)
    except FileNotFoundError:
        # utility not on PATH; signal absence with an empty string
        return ""


def execute_platform_cmd(cmd: str) -> t.Tuple[str, int]:
    """Execute a platform check command as a subprocess

    :param cmd: the command to execute
    :returns: tuple of (decoded stdout, process return code)"""
    process = subprocess.run(
        cmd.split(),
        capture_output=True,
        check=False,
    )
    return process.stdout.decode("utf-8"), process.returncode


class CrayExPlatformResult:
    """Aggregates the individual checks used to decide whether the current
    machine is a Cray EX platform"""

    locate_msg = "Unable to locate `{0}`."

    def __init__(self, ldconfig: t.Optional[str], fi_info: t.Optional[str]) -> None:
        """:param ldconfig: path to the `ldconfig` utility, or empty/None
        :param fi_info: path to the `fi_info` utility, or empty/None"""
        self.ldconfig: t.Optional[str] = ldconfig
        self.fi_info: t.Optional[str] = fi_info
        self.has_pmi: bool = False
        self.has_pmi2: bool = False
        self.has_cxi: bool = False

    @property
    def has_ldconfig(self) -> bool:
        return bool(self.ldconfig)

    @property
    def has_fi_info(self) -> bool:
        return bool(self.fi_info)

    @property
    def is_cray(self) -> bool:
        """True only when every required utility and library was found"""
        return all(
            (
                self.has_ldconfig,
                self.has_fi_info,
                self.has_pmi,
                self.has_pmi2,
                self.has_cxi,
            )
        )

    @property
    def failures(self) -> t.List[str]:
        """Return a list of messages describing all failed validations"""
        failure_messages = []

        if not self.has_ldconfig:
            failure_messages.append(self.locate_msg.format("ldconfig"))

        if not self.has_fi_info:
            failure_messages.append(self.locate_msg.format("fi_info"))

        # library checks are only meaningful when both utilities were found
        if self.has_ldconfig and self.has_fi_info:
            if not self.has_pmi:
                failure_messages.append(self.locate_msg.format("pmi.so"))
            if not self.has_pmi2:
                failure_messages.append(self.locate_msg.format("pmi2.so"))
            if not self.has_cxi:
                failure_messages.append(self.locate_msg.format("cxi.so"))

        return failure_messages


def check_platform() -> CrayExPlatformResult:
    """Collect the platform details used to determine if the machine is a
    Cray EX system on which the HSTA-aware dragon package can be installed.

    Shell equivalent of:
        ldconfig -p | grep cray | grep pmi.so &&
        ldconfig -p | grep cray | grep pmi2.so &&
        fi_info | grep cxi

    :returns: a `CrayExPlatformResult` describing the outcome of each check"""
    ldconfig = check_for_utility("ldconfig")
    fi_info = check_for_utility("fi_info")

    result = CrayExPlatformResult(ldconfig, fi_info)
    if not all((result.has_ldconfig, result.has_fi_info)):
        return result

    # run `ldconfig -p` once and reuse the cray-related lines for both pmi
    # checks (previously the identical command was executed twice)
    ldc_out, _ = execute_platform_cmd(f"{ldconfig} -p")
    candidates = [line for line in ldc_out.split("\n") if "cray" in line]
    result.has_pmi = any("pmi.so" in line for line in candidates)
    result.has_pmi2 = any("pmi2.so" in line for line in candidates)

    fi_info_out, _ = execute_platform_cmd(fi_info)
    result.has_cxi = any("cxi" in line for line in fi_info_out.split("\n"))

    return result


def is_crayex_platform() -> bool:
    """Returns True if the current platform is identified as Cray EX and
    HSTA-aware dragon package can be installed, False otherwise.

    :returns: True if current platform is Cray EX, False otherwise"""
    return check_platform().is_cray


class IFConfig(t.NamedTuple):
    # (defined in smartsim/_core/utils/network.py in this patch)
    # interface name, or None when no suitable interface was found
    interface: t.Optional[str]
    # IP address bound to `interface`, or None when unavailable
    address: t.Optional[str]
def get_best_interface_and_address() -> "IFConfig":
    """Pick the first NIC whose name matches a known high-speed prefix.

    :returns: an `IFConfig` naming the chosen interface and its IP address,
        or `IFConfig(None, None)` when no known interface is present"""
    available_ifs = psutil.net_if_addrs()
    # TODO make this a CONFIG-time parameter
    known_ifs = ["hsn", "ipogif", "ib"]
    for interface in available_ifs:
        if any(interface.startswith(if_prefix) for if_prefix in known_ifs):
            return IFConfig(interface, get_ip_from_interface(interface))
    return IFConfig(None, None)


def find_free_port(start: int = 0) -> int:
    """A 'good enough' way to find an open port to bind to

    :param start: The first port number to consider (0 lets the OS choose)
    :returns: The first open port found
    :raises OverflowError: if the search runs past the valid port range
    """
    while True:
        with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as sock:
            try:
                sock.bind(("0.0.0.0", start))
                _, port = sock.getsockname()
                return int(port)
            except OSError:
                # port is taken or unbindable; test the next one
                # (narrowed from `except Exception`, which also swallowed
                # OverflowError past port 65535 and looped forever)
                start += 1
" + err_msg += f"Return code: {returncode}, err: {err}" + logger.error(err_msg) elif out: logger.debug(out) diff --git a/smartsim/_core/utils/security.py b/smartsim/_core/utils/security.py new file mode 100644 index 000000000..e6f84c81a --- /dev/null +++ b/smartsim/_core/utils/security.py @@ -0,0 +1,302 @@ +# BSD 2-Clause License +# +# Copyright (c) 2021-2024, Hewlett Packard Enterprise +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
class _KeyPermissions(IntEnum):
    """Permissions used by KeyManager"""

    PRIVATE_KEY = stat.S_IRUSR | stat.S_IWUSR
    """Permissions only allowing an owner to read and write the file"""
    PUBLIC_KEY = stat.S_IRUSR | stat.S_IWUSR | stat.S_IROTH | stat.S_IRGRP
    """Permissions allowing an owner, others, and the group to read a file"""

    PRIVATE_DIR = (
        stat.S_IRUSR | stat.S_IWUSR | stat.S_IXUSR | stat.S_IXOTH | stat.S_IXGRP
    )
    """Permissions allowing the owner to read, write and traverse a directory,
    while group and others may only traverse it"""
    PUBLIC_DIR = (
        stat.S_IRUSR
        | stat.S_IWUSR
        | stat.S_IXUSR
        | stat.S_IROTH
        | stat.S_IXOTH
        | stat.S_IRGRP
        | stat.S_IXGRP
    )
    """Permissions allowing non-owners to read and traverse a directory"""


@dataclasses.dataclass(frozen=True)
class KeyPair:
    """A public and private key pair"""

    public: bytes = dataclasses.field(default=b"")
    """The public key"""

    private: bytes = dataclasses.field(default=b"", repr=False)
    """The private key"""

    @property
    def empty(self) -> bool:
        """Return `True` if the KeyPair has no key values set. Useful
        for faking the null object pattern"""
        return self.public == self.private and len(self.public) == 0


class _KeyLocator:
    """Determines the paths to use when persisting a `KeyPair` to disk"""

    def __init__(
        self,
        root_dir: pathlib.Path,
        filename: str,
        category: str,
    ) -> None:
        """Initialize a `KeyLocator`

        :param root_dir: root path where keys are persisted to disk
        :param filename: the stem name of the key file
        :param category: the category or use-case for the key (e.g. server)
        """

        # constants for standardized paths.
        self._public_subdir = "pub"
        """The category subdirectory to use when persisting a public key"""

        self._private_subdir = "priv"
        """The category subdirectory to use when persisting a private key"""

        self._public_extension = "key"
        """The extension found on public keys"""

        self._private_extension = "key_secret"
        """The extension found on private keys"""

        self._key_root_dir = root_dir
        """Path to the root directory containing key files"""

        self._filename = filename
        """Base name for key files"""

        self._category = category
        """Category name used to further separate key locations"""

    @property
    def public_dir(self) -> pathlib.Path:
        """Target directory for the public key"""
        return self.public.parent

    @property
    def private_dir(self) -> pathlib.Path:
        """Target directory for the private key"""
        return self.private.parent

    @property
    def public_filename(self) -> str:
        """Filename (name.extension) of the public key file"""
        return f"{self._filename}.{self._public_extension}"

    @property
    def private_filename(self) -> str:
        """Filename (name.extension) of the private key file"""
        return f"{self._filename}.{self._private_extension}"

    @property
    def public(self) -> pathlib.Path:
        """Full target path of the public key file"""
        # combine the root and key type (e.g. /foo/bar + /server)
        # then combine the pub/priv key subdir (e.g. /foo/bar/server + /pub)
        path = self._key_root_dir / self._category / self._public_subdir
        return path / self.public_filename

    @property
    def private(self) -> pathlib.Path:
        """Full target path of the private key file"""
        # combine the root and key type (e.g. /foo/bar + /server)
        # then combine the pub/priv key subdir (e.g. /foo/bar/server + /priv)
        path = self._key_root_dir / self._category / self._private_subdir
        return path / self.private_filename


class KeyManager:
    def __init__(
        self, config: "Config", as_server: bool = False, as_client: bool = False
    ) -> None:
        """Initialize a KeyManager instance.

        :param config: SmartSim configuration
        :param as_server: flag to indicate when executing in the server context;
        set to `True` to avoid loading client secret key
        :param as_client: flag to indicate when executing in the client context;
        set to `True` to avoid loading server secret key
        """

        self._as_server = as_server
        """Set to `True` to return keys appropriate for the server context"""

        self._as_client = as_client
        """Set to `True` to return keys appropriate for the client context"""

        key_dir = pathlib.Path(config.smartsim_key_path).resolve()

        # Results in key path such as <key_dir>/server/pub/smartsim.key
        self._server_locator = _KeyLocator(key_dir, "smartsim", "server")
        """The locator for producing the paths to store server key files"""

        # Results in key path such as <key_dir>/client/pub/smartsim.key
        self._client_locator = _KeyLocator(key_dir, "smartsim", "client")
        """The locator for producing the paths to store client key files"""

    def create_directories(self) -> None:
        """Create the subdirectory structure necessary to hold
        the public and private key pairs for servers & clients"""
        for locator in [self._server_locator, self._client_locator]:
            if not locator.public_dir.exists():
                permission = _KeyPermissions.PUBLIC_DIR
                logger.debug(f"Creating key dir: {locator.public_dir}, {permission}")
                locator.public_dir.mkdir(parents=True, mode=permission)

            if not locator.private_dir.exists():
                permission = _KeyPermissions.PRIVATE_DIR
                logger.debug(f"Creating key dir: {locator.private_dir}, {permission}")
                locator.private_dir.mkdir(parents=True, mode=permission)

    @classmethod
    def _load_keypair(cls, locator: _KeyLocator, in_context: bool) -> KeyPair:
        """Load a specific `KeyPair` from disk

        :param locator: a `KeyLocator` that specifies the path to an existing key
        :param in_context: Boolean flag indicating if the keypair is the active
        context; ensures the public and private keys are both loaded when `True`.
        Only the public key is loaded when `False`
        :returns: a KeyPair containing the loaded public/private key
        """
        # private keys contain public & private key parts
        key_path = locator.private if in_context else locator.public

        pub_key: bytes = b""
        priv_key: t.Optional[bytes] = b""

        if key_path.exists():
            logger.debug(f"Existing key files located at {key_path}")
            pub_key, priv_key = zmq.auth.load_certificate(key_path)
        else:
            logger.debug(f"No key files found at {key_path}")

        # avoid a `None` value in the private key when it isn't loaded
        return KeyPair(pub_key, priv_key or b"")

    def _load_keys(self) -> t.Tuple[KeyPair, KeyPair]:
        """Use ZMQ auth to load public/private key pairs for the server and client
        components from the standard key paths for the associated experiment

        :returns: 2-tuple of `KeyPair` (server_keypair, client_keypair)
        """
        try:
            server_keys = self._load_keypair(self._server_locator, self._as_server)
            client_keys = self._load_keypair(self._client_locator, self._as_client)

            return server_keys, client_keys
        except (ValueError, OSError):
            # expected if no keys could be loaded from disk
            logger.warning("Loading key pairs failed.", exc_info=True)

        return KeyPair(), KeyPair()

    @classmethod
    def _move_public_key(cls, locator: _KeyLocator) -> None:
        """The public and private key pair are created in the same directory. Move
        the public key out of the private subdir and into the public subdir

        :param locator: `KeyLocator` that determines the path to the
        key pair persisted in the same directory.
        """
        new_path = locator.private.with_suffix(locator.public.suffix)
        if new_path != locator.public:
            # fixed: the log message previously had source/destination reversed
            logger.debug(f"Moving key file from {new_path} to {locator.public}")
            new_path.rename(locator.public)

    def _create_keys(self) -> None:
        """Create and persist key files to disk"""
        for locator in [self._server_locator, self._client_locator]:
            # create keys in the private directory...
            zmq.auth.create_certificates(locator.private_dir, locator.private.stem)

            # ...but move the public key out of the private subdirectory
            self._move_public_key(locator)

            # and ensure correct r/w/x permissions on each file.
            locator.private.chmod(_KeyPermissions.PRIVATE_KEY)
            locator.public.chmod(_KeyPermissions.PUBLIC_KEY)

    def get_keys(self, create: bool = True) -> t.Tuple[KeyPair, KeyPair]:
        """Use ZMQ auth to generate a public/private key pair for the server
        and client components.

        :param create: pass `create=False` to ensure keys are not
        created and only pre-existing keys can be loaded
        :returns: 2-tuple of `KeyPair` (server_keypair, client_keypair)
        """
        logger.debug(f"Loading keys, creation {'is' if create else 'not'} allowed")
        server_keys, client_keys = self._load_keys()

        # check if we received "empty keys"
        if not server_keys.empty or not client_keys.empty:
            return server_keys, client_keys

        if not create:
            # if directed not to create new keys, return "empty keys"
            logger.debug("Returning empty key pairs")
            return KeyPair(), KeyPair()

        self.create_directories()
        self._create_keys()

        # load keys to ensure they were persisted
        return self._load_keys()

    @property
    def client_keys_dir(self) -> pathlib.Path:
        "Return the path to the client public keys directory"
        return self._client_locator.public_dir
b/smartsim/_core/utils/telemetry/manifest.py @@ -68,12 +68,14 @@ def load_entity( entity_type: str, entity_dict: t.Dict[str, t.Any], exp_dir: pathlib.Path, + raw_experiment: t.Dict[str, t.Any], ) -> t.List[JobEntity]: """Map entity data persisted in a manifest file to an object :param entity_type: type of the associated `SmartSimEntity` - :param entity_dict: raw dictionary deserialized from manifest JSON + :param entity_dict: raw dictionary deserialized from entity in manifest JSON :param exp_dir: root path to experiment outputs + :param raw_experiment: raw experiment deserialized from manifest JSON :return: list of loaded `JobEntity` instances """ entities = [] @@ -86,13 +88,17 @@ def load_entity( container = "shards" if "shards" in parent_keys else "models" child_type = "featurestore" if container == "shards" else "model" for child_entity in entity_dict[container]: - entity = JobEntity.from_manifest(child_type, child_entity, str(exp_dir)) + entity = JobEntity.from_manifest( + child_type, child_entity, str(exp_dir), raw_experiment + ) entities.append(entity) return entities # not a parent type, just create the entity w/the entity_type passed in - entity = JobEntity.from_manifest(entity_type, entity_dict, str(exp_dir)) + entity = JobEntity.from_manifest( + entity_type, entity_dict, str(exp_dir), raw_experiment + ) entities.append(entity) return entities @@ -101,12 +107,14 @@ def load_entities( entity_type: str, run: t.Dict[str, t.Any], exp_dir: pathlib.Path, + raw_experiment: t.Dict[str, t.Any], ) -> t.Dict[str, t.List[JobEntity]]: """Map a collection of entity data persisted in a manifest file to an object :param entity_type: type of the associated `SmartSimEntity` :param run: raw dictionary containing `Run` data deserialized from JSON :param exp_dir: root path to experiment outputs + :param raw_experiment: raw experiment deserialized from manifest JSON :return: list of loaded `JobEntity` instances """ persisted: t.Dict[str, t.List[JobEntity]] = { @@ -114,18 
+122,23 @@ def load_entities( "featurestore": [], } for item in run[entity_type]: - entities = Run.load_entity(entity_type, item, exp_dir) + entities = Run.load_entity(entity_type, item, exp_dir, raw_experiment) for new_entity in entities: persisted[new_entity.type].append(new_entity) return persisted @staticmethod - def load_run(raw_run: t.Dict[str, t.Any], exp_dir: pathlib.Path) -> "Run": + def load_run( + raw_run: t.Dict[str, t.Any], + exp_dir: pathlib.Path, + raw_experiment: t.Dict[str, t.Any], + ) -> "Run": """Map run data persisted in a manifest file to an object - :param runs: raw dictionary containing `Run` data deserialized from JSON + :param raw_run: raw dictionary containing `Run` data deserialized from JSON :param exp_dir: root path to experiment outputs + :param raw_experiment: raw experiment deserialized from manifest JSON :return: populated `Run` instance """ @@ -139,7 +152,7 @@ def load_run(raw_run: t.Dict[str, t.Any], exp_dir: pathlib.Path) -> "Run": # use the output mapping keys to load all the target # entities from the deserialized JSON for entity_type in run_entities: - _entities = Run.load_entities(entity_type, raw_run, exp_dir) + _entities = Run.load_entities(entity_type, raw_run, exp_dir, raw_experiment) # load_entities may return a mapping containing types different from # entity_type IF it was a parent entity. 
Iterate through the keys in @@ -218,7 +231,7 @@ def load_manifest(file_path: str) -> t.Optional["RuntimeManifest"]: raise ValueError("Manifest missing required runs") exp_dir = pathlib.Path(exp["path"]) - runs = [Run.load_run(raw_run, exp_dir) for raw_run in runs] + runs = [Run.load_run(raw_run, exp_dir, exp) for raw_run in runs] manifest = RuntimeManifest( name=exp["name"], diff --git a/smartsim/_core/utils/telemetry/telemetry.py b/smartsim/_core/utils/telemetry/telemetry.py index f00b4d435..8a9a99aed 100644 --- a/smartsim/_core/utils/telemetry/telemetry.py +++ b/smartsim/_core/utils/telemetry/telemetry.py @@ -42,6 +42,7 @@ from smartsim._core.config import CONFIG from smartsim._core.control.job import JobEntity, _JobKey from smartsim._core.control.jobmanager import JobManager +from smartsim._core.launcher.dragon.dragonLauncher import DragonLauncher from smartsim._core.launcher.launcher import Launcher from smartsim._core.launcher.local.local import LocalLauncher from smartsim._core.launcher.lsf.lsfLauncher import LSFLauncher @@ -100,6 +101,7 @@ def __init__( "pbs": PBSLauncher, "lsf": LSFLauncher, "local": LocalLauncher, + "dragon": DragonLauncher, } self._collector_mgr = CollectorManager(timeout_ms) @@ -113,7 +115,7 @@ def tracked_jobs(self) -> t.Sequence[JobEntity]: def init_launcher(self, launcher: str) -> None: """Initialize the controller with a specific type of launcher. 
- SmartSim currently supports slurm, pbs(pro), lsf, + SmartSim currently supports Slurm, PBS(Pro), LSF, Dragon and local launching :param launcher: the name of the workload manager used by the experiment @@ -138,13 +140,17 @@ def init_job_manager(self) -> None: self.job_manager.set_launcher(self._launcher) self.job_manager.start() - def set_launcher(self, launcher: str) -> None: - """Initialize all required dependencies - - :param launcher: the name of the workload manager used by the experiment + def set_launcher(self, launcher_type: str) -> None: + """Set the launcher for the experiment + :param launcher_type: the name of the workload manager used by the experiment """ - self.init_launcher(launcher) - self.init_job_manager() + self.init_launcher(launcher_type) + + if self._launcher is None: + raise SmartSimError("Launcher init failed") + + self.job_manager.set_launcher(self._launcher) + self.job_manager.start() def process_manifest(self, manifest_path: str) -> None: """Read the manifest for the experiment. 
Process the @@ -300,12 +306,24 @@ async def on_timestep(self, timestamp: int) -> None: # consider not using name to avoid collisions m_jobs = [job for job in self._tracked_jobs.values() if job.is_managed] if names := {entity.name: entity for entity in m_jobs}: - step_updates = self._launcher.get_step_update(list(names.keys())) - - for step_name, step_info in step_updates: - if step_info and step_info.status in TERMINAL_STATUSES: - completed_entity = names[step_name] - await self._to_completed(timestamp, completed_entity, step_info) + step_updates: t.List[t.Tuple[str, t.Optional[StepInfo]]] = [] + + try: + task_names = list(names.keys()) + updates = self._launcher.get_step_update(task_names) + step_updates.extend(updates) + logger.debug(f"Retrieved updates for: {task_names}") + except Exception: + logger.warning(f"Telemetry step updates failed for {names.keys()}") + + try: + for step_name, step_info in step_updates: + if step_info and step_info.status in TERMINAL_STATUSES: + completed_entity = names[step_name] + await self._to_completed(timestamp, completed_entity, step_info) + except Exception as ex: + msg = f"An error occurred getting step updates on {names}" + logger.error(msg, exc_info=ex) async def shutdown(self) -> None: """Release all resources owned by the `ManifestEventHandler`""" diff --git a/smartsim/_core/utils/telemetry/util.py b/smartsim/_core/utils/telemetry/util.py index 2189a5c78..c9856bd3b 100644 --- a/smartsim/_core/utils/telemetry/util.py +++ b/smartsim/_core/utils/telemetry/util.py @@ -68,7 +68,8 @@ def write_event( if task_id: task_id = int(task_id) except ValueError: - logger.exception(f"Unable to parse task_id: {task_id}") + if not isinstance(task_id, str): + logger.exception(f"Unable to parse task_id: {task_id}") entity_dict = { "timestamp": timestamp, diff --git a/smartsim/database/orchestrator.py b/smartsim/database/orchestrator.py index 48bf7ca15..3aeeed8df 100644 --- a/smartsim/database/orchestrator.py +++ 
b/smartsim/database/orchestrator.py @@ -23,7 +23,11 @@ # CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +# pylint: disable=too-many-lines + import itertools +import os.path as osp import sys import typing as t from os import environ, getcwd, getenv @@ -38,7 +42,12 @@ from .._core.utils.helpers import is_valid_cmd, unpack_fs_identifier from .._core.utils.network import get_ip_from_host from ..entity import FSNode, EntityList, TelemetryConfiguration -from ..error import SmartSimError, SSConfigError, SSUnsupportedError +from ..error import ( + SmartSimError, + SSConfigError, + SSDBFilesNotParseable, + SSUnsupportedError, +) from ..log import get_logger from ..servertype import CLUSTERED, STANDALONE from ..settings import ( @@ -60,6 +69,7 @@ logger = get_logger(__name__) by_launcher: t.Dict[str, t.List[str]] = { + "dragon": [""], "slurm": ["srun", "mpirun", "mpiexec"], "pbs": ["aprun", "mpirun", "mpiexec"], "pals": ["mpiexec"], @@ -71,7 +81,7 @@ def _detect_command(launcher: str) -> str: if launcher in by_launcher: for cmd in by_launcher[launcher]: - if launcher == "local": + if launcher in ["local", "dragon"]: return cmd if is_valid_cmd(cmd): return cmd @@ -105,10 +115,15 @@ def _check_run_command(launcher: str, run_command: str) -> None: raise SmartSimError(msg) -def _get_single_command(run_command: str, batch: bool, single_cmd: bool) -> bool: +def _get_single_command( + run_command: str, launcher: str, batch: bool, single_cmd: bool +) -> bool: if not single_cmd: return single_cmd + if launcher == "dragon": + return False + if run_command == "srun" and getenv("SLURM_HET_SIZE") is not None: msg = ( "srun can not launch an FeatureStore with single_cmd=True in " @@ -138,6 +153,7 @@ def _check_local_constraints(launcher: str, batch: bool) -> None: raise SmartSimError(msg) +# pylint: 
disable-next=too-many-public-methods class FeatureStore(EntityList[FSNode]): """The FeatureStore is an in-memory feature store that can be launched alongside entities in SmartSim. Data can be transferred between @@ -196,7 +212,9 @@ def __init__( self.launcher, self.run_command = _autodetect(launcher, run_command) _check_run_command(self.launcher, self.run_command) _check_local_constraints(self.launcher, batch) - single_cmd = _get_single_command(self.run_command, batch, single_cmd) + single_cmd = _get_single_command( + self.run_command, self.launcher, batch, single_cmd + ) self.ports: t.List[int] = [] self._hosts: t.List[str] = [] self._user_hostlist: t.List[str] = [] @@ -359,10 +377,11 @@ def is_active(self) -> bool: :return: True if feature store is active, False otherwise """ - if not self._hosts: + try: + hosts = self.hosts + except SSDBFilesNotParseable: return False - - return fs_is_active(self._hosts, self.ports, self.num_shards) + return fs_is_active(hosts, self.ports, self.num_shards) @property def _rai_module(self) -> t.Tuple[str, ...]: @@ -388,6 +407,14 @@ def _redis_exe(self) -> str: def _redis_conf(self) -> str: return CONFIG.database_conf + @property + def checkpoint_file(self) -> str: + """Get the path to the checkpoint file for this Feature Store + + :return: Path to the checkpoint file if it exists, otherwise a None + """ + return osp.join(self.path, "smartsim_db.dat") + def set_cpus(self, num_cpus: int) -> None: """Set the number of CPUs available to each feature store shard @@ -440,9 +467,8 @@ def set_hosts(self, host_list: t.Union[t.List[str], str]) -> None: raise TypeError("host_list argument must be list of strings") self._user_hostlist = host_list.copy() # TODO check length - if self.batch: - if hasattr(self, "batch_settings") and self.batch_settings: - self.batch_settings.set_hostlist(host_list) + if self.batch and hasattr(self, "batch_settings") and self.batch_settings: + self.batch_settings.set_hostlist(host_list) if self.launcher == "lsf": 
for fs in self.entities: @@ -844,6 +870,7 @@ def _get_start_script_args( ] if cluster: cmd.append("+cluster") # is the shard part of a cluster + return cmd def _get_fs_hosts(self) -> t.List[str]: diff --git a/smartsim/entity/dbnode.py b/smartsim/entity/dbnode.py index 6f3010ba2..2d2f4a3b0 100644 --- a/smartsim/entity/dbnode.py +++ b/smartsim/entity/dbnode.py @@ -34,7 +34,7 @@ from dataclasses import dataclass from .._core.config import CONFIG -from ..error import SmartSimError +from ..error import SSDBFilesNotParseable from ..log import get_logger from ..settings.base import RunSettings from .entity import SmartSimEntity @@ -184,7 +184,7 @@ def _parse_launched_shard_info_from_files( def get_launched_shard_info(self) -> "t.List[LaunchedShardData]": """Parse the launched feature store shard info from the output files - :raises SmartSimError: if all shard info could not be found + :raises SSDBFilesNotParseable: if all shard info could not be found :return: The found launched shard info """ ips: "t.List[LaunchedShardData]" = [] @@ -211,7 +211,7 @@ def get_launched_shard_info(self) -> "t.List[LaunchedShardData]": f"{len(ips)} out of {self.num_shards} FS shards." ) logger.error(msg) - raise SmartSimError(msg) + raise SSDBFilesNotParseable(msg) return ips def _parse_fs_hosts(self) -> t.List[str]: @@ -220,7 +220,7 @@ def _parse_fs_hosts(self) -> t.List[str]: The IP address is preferred, but if hostname is only present then a lookup to /etc/hosts is done through the socket library. 
- :raises SmartSimError: if host/ip could not be found + :raises SSDBFilesNotParseable: if host/ip could not be found :return: ip addresses | hostnames """ return list({shard.hostname for shard in self.get_launched_shard_info()}) diff --git a/smartsim/entity/dbobject.py b/smartsim/entity/dbobject.py index 9be96e671..f82aeea18 100644 --- a/smartsim/entity/dbobject.py +++ b/smartsim/entity/dbobject.py @@ -186,13 +186,13 @@ def __init__( raise ValueError("Either script or script_path must be provided") @property - def script(self) -> t.Optional[str]: + def script(self) -> t.Optional[t.Union[bytes, str]]: return self.func def __str__(self) -> str: desc_str = "Name: " + self.name + "\n" if self.func: - desc_str += "Func: " + self.func + "\n" + desc_str += "Func: " + str(self.func) + "\n" if self.file: desc_str += "File path: " + str(self.file) + "\n" devices_str = self.device + ( diff --git a/smartsim/entity/ensemble.py b/smartsim/entity/ensemble.py index ade83b491..117b0e9d6 100644 --- a/smartsim/entity/ensemble.py +++ b/smartsim/entity/ensemble.py @@ -94,6 +94,7 @@ def __init__( self._key_prefixing_enabled = True self.batch_settings = batch_settings self.run_settings = run_settings + self.replicas: str super().__init__(name, str(path), perm_strat=perm_strat, **kwargs) @@ -111,6 +112,7 @@ def _initialize_entities(self, **kwargs: t.Any) -> None: """ strategy = self._set_strategy(kwargs.pop("perm_strat")) replicas = kwargs.pop("replicas", None) + self.replicas = replicas # if a ensemble has parameters and run settings, create # the ensemble and assign run_settings to each member diff --git a/smartsim/entity/entityList.py b/smartsim/entity/entityList.py index 1045d1ad4..0da86e050 100644 --- a/smartsim/entity/entityList.py +++ b/smartsim/entity/entityList.py @@ -91,16 +91,14 @@ def fs_scripts(self) -> t.Iterable["smartsim.entity.FSScript"]: @property def batch(self) -> bool: - try: - if not hasattr(self, "batch_settings"): - return False - - if self.batch_settings: - 
return True - return False - # local feature store cannot launch with batches - except AttributeError: - return False + """Property indicating whether or not the entity sequence should be + launched as a batch job + + :return: ``True`` if entity sequence should be launched as a batch job, + ``False`` if the members will be launched individually. + """ + # pylint: disable-next=no-member + return hasattr(self, "batch_settings") and self.batch_settings @property def type(self) -> str: diff --git a/smartsim/error/__init__.py b/smartsim/error/__init__.py index 4268905e6..3a40548e7 100644 --- a/smartsim/error/__init__.py +++ b/smartsim/error/__init__.py @@ -32,6 +32,7 @@ ShellError, SmartSimError, SSConfigError, + SSDBFilesNotParseable, SSDBIDConflictError, SSInternalError, SSReservedKeywordError, diff --git a/smartsim/error/errors.py b/smartsim/error/errors.py index a67cf03f1..eaf3cb2d5 100644 --- a/smartsim/error/errors.py +++ b/smartsim/error/errors.py @@ -87,6 +87,12 @@ class SSDBIDConflictError(SmartSimError): """ +class SSDBFilesNotParseable(SmartSimError): + """Raised when the files related to the feature store cannot be parsed. + Includes the case when the files do not exist. + """ + + # Internal Exceptions @@ -149,3 +155,7 @@ class UnproxyableStepError(TelemetryError): class SmartSimCLIActionCancelled(SmartSimError): """Raised when a `smart` CLI command is terminated""" + + +class PreviewFormatError(SSUnsupportedError): + """Raised when the output format of the preview method call is not supported""" diff --git a/smartsim/experiment.py b/smartsim/experiment.py index 172bd837f..df9f7105e 100644 --- a/smartsim/experiment.py +++ b/smartsim/experiment.py @@ -24,6 +24,8 @@ # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+# pylint: disable=too-many-lines + import os import os.path as osp import typing as t @@ -35,7 +37,7 @@ from smartsim.error.errors import SSUnsupportedError from smartsim.status import SmartSimStatus -from ._core import Controller, Generator, Manifest +from ._core import Controller, Generator, Manifest, previewrenderer from .database import FeatureStore from .entity import ( Ensemble, @@ -158,16 +160,28 @@ def __init__( self.exp_path = exp_path - if launcher == "auto": - launcher = detect_launcher() - if launcher == "cobalt": + self._launcher = launcher.lower() + + if self._launcher == "auto": + self._launcher = detect_launcher() + if self._launcher == "cobalt": raise SSUnsupportedError("Cobalt launcher is no longer supported.") - self._control = Controller(launcher=launcher) - self._launcher = launcher.lower() + if launcher == "dragon": + self._set_dragon_server_path() + + self._control = Controller(launcher=self._launcher) + self.fs_identifiers: t.Set[str] = set() self._telemetry_cfg = ExperimentTelemetryConfiguration() + def _set_dragon_server_path(self) -> None: + """Set path for dragon server through environment varialbes""" + if not "SMARTSIM_DRAGON_SERVER_PATH" in environ: + environ["SMARTSIM_DRAGON_SERVER_PATH_EXP"] = osp.join( + self.exp_path, CONFIG.dragon_default_subdir + ) + @_contextualize def start( self, @@ -719,7 +733,7 @@ def create_feature_store( batch: bool = False, hosts: t.Optional[t.Union[t.List[str], str]] = None, run_command: str = "auto", - interface: str = "ipogif0", + interface: t.Union[str, t.List[str]] = "ipogif0", account: t.Optional[str] = None, time: t.Optional[str] = None, queue: t.Optional[str] = None, @@ -804,6 +818,53 @@ def reconnect_feature_store(self, checkpoint: str) -> FeatureStore: logger.error(e) raise + def preview( + self, + *args: t.Any, + verbosity_level: previewrenderer.Verbosity = previewrenderer.Verbosity.INFO, + output_format: previewrenderer.Format = previewrenderer.Format.PLAINTEXT, + output_filename: 
t.Optional[str] = None, + ) -> None: + """Preview entity information prior to launch. This method + aggregates multiple pieces of information to give users insight + into what and how entities will be launched. Any instance of + ``Model``, ``Ensemble``, or ``Feature Store`` created by the + Experiment can be passed as an argument to the preview method. + + Verbosity levels: + - info: Display user-defined fields and entities. + - debug: Display user-defined field and entities and auto-generated + fields. + - developer: Display user-defined field and entities, auto-generated + fields, and run commands. + + :param verbosity_level: verbosity level specified by user, defaults to info. + :param output_format: Set output format. The possible accepted + output formats are ``plain_text``. + Defaults to ``plain_text``. + :param output_filename: Specify name of file and extension to write + preview data to. If no output filename is set, the preview will be + output to stdout. Defaults to None. + """ + + # Retrieve any active feature store jobs + active_fsjobs = self._control.active_feature_store_jobs + + preview_manifest = Manifest(*args) + + previewrenderer.render( + self, + preview_manifest, + verbosity_level, + output_format, + output_filename, + active_fsjobs, + ) + + @property + def launcher(self) -> str: + return self._launcher + @_contextualize def summary(self, style: str = "github") -> str: """Return a summary of the ``Experiment`` diff --git a/smartsim/log.py b/smartsim/log.py index c9e0e9399..3d6c0860e 100644 --- a/smartsim/log.py +++ b/smartsim/log.py @@ -40,7 +40,8 @@ # constants DEFAULT_DATE_FORMAT: t.Final[str] = "%H:%M:%S" DEFAULT_LOG_FORMAT: t.Final[str] = ( - "%(asctime)s %(hostname)s %(name)s[%(process)d] %(levelname)s %(message)s" + "%(asctime)s %(hostname)s %(name)s[%(process)d:%(threadName)s] " + "%(levelname)s %(message)s" ) EXPERIMENT_LOG_FORMAT = DEFAULT_LOG_FORMAT.replace("s[%", "s {%(exp_path)s} [%") diff --git a/smartsim/ml/data.py 
b/smartsim/ml/data.py index 875fe90f0..36c0ae415 100644 --- a/smartsim/ml/data.py +++ b/smartsim/ml/data.py @@ -102,12 +102,13 @@ def download(self, client: Client) -> None: """ try: info_ds = client.get_dataset(self._ds_name) - except RedisReplyError: + except RedisReplyError as e: # If the info was not published, proceed with default parameters logger.warning( "Could not retrieve data for DataInfo object, the following " "values will be kept." ) + logger.error(f"Original error from Redis was {e}") logger.warning(str(self)) return self.sample_name = info_ds.get_meta_strings("sample_name")[0] @@ -284,6 +285,7 @@ def __init__( verbose: bool = False, init_samples: bool = True, max_fetch_trials: int = -1, + wait_interval: float = 10.0, ) -> None: self.address = address self.cluster = cluster @@ -310,7 +312,7 @@ def __init__( self.set_replica_parameters(replica_rank, num_replicas) if init_samples: - self.init_samples(max_fetch_trials) + self.init_samples(max_fetch_trials, wait_interval) @property def client(self) -> Client: @@ -377,7 +379,7 @@ def __iter__( self._data_generation(self._calc_indices(idx)) for idx in range(len(self)) ) - def init_samples(self, init_trials: int = -1) -> None: + def init_samples(self, init_trials: int = -1, wait_interval: float = 10.0) -> None: """Initialize samples (and targets, if needed). 
A new attempt to download samples will be made every ten seconds, @@ -391,10 +393,10 @@ def init_samples(self, init_trials: int = -1) -> None: max_trials = init_trials or -1 while not self and num_trials != max_trials: self._update_samples_and_targets() - self.log( - "DataLoader could not download samples, will try again in 10 seconds" - ) - time.sleep(10) + msg = "DataLoader could not download samples, will try again in " + msg += f"{wait_interval} seconds" + self.log(msg) + time.sleep(wait_interval) num_trials += 1 if not self: diff --git a/smartsim/settings/__init__.py b/smartsim/settings/__init__.py index d417c9ef8..6e8f0bc96 100644 --- a/smartsim/settings/__init__.py +++ b/smartsim/settings/__init__.py @@ -27,6 +27,7 @@ from .alpsSettings import AprunSettings from .base import RunSettings, SettingsBase from .containers import Container, Singularity +from .dragonRunSettings import DragonRunSettings from .lsfSettings import BsubBatchSettings, JsrunSettings from .mpiSettings import MpiexecSettings, MpirunSettings, OrterunSettings from .palsSettings import PalsMpiexecSettings @@ -46,6 +47,7 @@ "SbatchSettings", "SrunSettings", "PalsMpiexecSettings", + "DragonRunSettings", "Container", "Singularity", ] diff --git a/smartsim/settings/dragonRunSettings.py b/smartsim/settings/dragonRunSettings.py new file mode 100644 index 000000000..b8baa4708 --- /dev/null +++ b/smartsim/settings/dragonRunSettings.py @@ -0,0 +1,78 @@ +# BSD 2-Clause License +# +# Copyright (c) 2021-2024, Hewlett Packard Enterprise +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# 2. 
Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +from __future__ import annotations + +import typing as t + +from ..log import get_logger +from .base import RunSettings + +logger = get_logger(__name__) + + +class DragonRunSettings(RunSettings): + def __init__( + self, + exe: str, + exe_args: t.Optional[t.Union[str, t.List[str]]] = None, + env_vars: t.Optional[t.Dict[str, t.Optional[str]]] = None, + **kwargs: t.Any, + ) -> None: + """Initialize run parameters for a Dragon process + + ``DragonRunSettings`` should only be used on systems where Dragon + is available and installed in the current environment. + + If an allocation is specified, the instance receiving these run + parameters will launch on that allocation. 
+ + :param exe: executable to run + :param exe_args: executable arguments, defaults to None + :param env_vars: environment variables for job, defaults to None + :param alloc: allocation ID if running on existing alloc, defaults to None + """ + super().__init__( + exe, + exe_args, + run_command="", + env_vars=env_vars, + **kwargs, + ) + + def set_nodes(self, nodes: int) -> None: + """Set the number of nodes + + :param nodes: number of nodes to run with + """ + self.run_args["nodes"] = nodes + + def set_tasks_per_node(self, tasks_per_node: int) -> None: + """Set the number of tasks for this job + + :param tasks_per_node: number of tasks per node + """ + self.run_args["tasks-per-node"] = tasks_per_node diff --git a/smartsim/settings/settings.py b/smartsim/settings/settings.py index 7bc2f7b86..5f7fc3fe2 100644 --- a/smartsim/settings/settings.py +++ b/smartsim/settings/settings.py @@ -32,6 +32,7 @@ AprunSettings, BsubBatchSettings, Container, + DragonRunSettings, JsrunSettings, MpiexecSettings, MpirunSettings, @@ -76,10 +77,13 @@ def create_batch_settings( "pbs": QsubBatchSettings, "slurm": SbatchSettings, "lsf": BsubBatchSettings, + "pals": QsubBatchSettings, } - if launcher == "auto": + if launcher in ["auto", "dragon"]: launcher = detect_launcher() + if launcher == "dragon": + by_launcher["dragon"] = by_launcher[launcher] if launcher == "local": raise SmartSimError("Local launcher does not support batch workloads") @@ -144,6 +148,7 @@ def create_run_settings( # run commands supported by each launcher # in order of suspected user preference by_launcher = { + "dragon": [""], "slurm": ["srun", "mpirun", "mpiexec"], "pbs": ["aprun", "mpirun", "mpiexec"], "pals": ["mpiexec"], @@ -156,7 +161,7 @@ def create_run_settings( def _detect_command(launcher: str) -> str: if launcher in by_launcher: - if launcher == "local": + if launcher in ["local", "dragon"]: return "" for cmd in by_launcher[launcher]: @@ -178,6 +183,11 @@ def _detect_command(launcher: str) -> str: # no auto 
detection for local, revert to false run_command = _detect_command(launcher) + if launcher == "dragon": + return DragonRunSettings( + exe=exe, exe_args=exe_args, env_vars=env_vars, container=container, **kwargs + ) + # if user specified and supported or auto detection worked if run_command and run_command in supported: return supported[run_command](launcher)( diff --git a/smartsim/settings/slurmSettings.py b/smartsim/settings/slurmSettings.py index 25e21602a..19d84d7c1 100644 --- a/smartsim/settings/slurmSettings.py +++ b/smartsim/settings/slurmSettings.py @@ -327,7 +327,7 @@ def check_env_vars(self) -> None: "environment. If the job is running in an interactive " f"allocation, the value {v} will not be set. Please " "consider removing the variable from the environment " - "and re-run the experiment." + "and re-running the experiment." ) logger.warning(msg) diff --git a/smartsim/templates/templates/preview/plain_text/activeinfra.template b/smartsim/templates/templates/preview/plain_text/activeinfra.template new file mode 100644 index 000000000..3e9ed6a2e --- /dev/null +++ b/smartsim/templates/templates/preview/plain_text/activeinfra.template @@ -0,0 +1,9 @@ + + = Feature Store Identifier: {{ fs.entity.fs_identifier }} = + Shards: {{ fs.entity.num_shards }} + TCP/IP Port(s): + {%- for port in fs.entity.ports %} + {{ port }} + {%- endfor %} + Network Interface: {{ fs.entity.run_settings.exe_args | get_ifname }} + Type: {{ config.database_cli | get_fstype }} diff --git a/smartsim/templates/templates/preview/plain_text/base.template b/smartsim/templates/templates/preview/plain_text/base.template new file mode 100644 index 000000000..5686b8676 --- /dev/null +++ b/smartsim/templates/templates/preview/plain_text/base.template @@ -0,0 +1,52 @@ + +{% include "experiment.template" %} +{%- if manifest.has_deployable or active_fsjobs %} + +=== Entity Preview === + + {%- if active_fsjobs %} + + == Active Infrastructure == + {%- for name, fs in active_fsjobs.items() %} + {% 
include "activeinfra.template" %} + {%- endfor %} + {%- endif %} + {%- if manifest.fss %} + + == Feature Stores == + {%- for fs in manifest.fss %} + {%- if fs.is_active() %} + WARNING: Cannot preview {{ fs.name }}, because it is already started. + {%- else %} + {% include "orchestrator.template" %} + {%- endif %} + {%- endfor %} + {%- endif %} + {%- if manifest.models %} + + == Models == + {%- for model in manifest.models %} + + = Model Name: {{ model.name }} = + {%- include "model.template" %} + {%- if model.run_settings.colocated_fs_settings or manifest.fss %} + Client Configuration: + {%- if model.run_settings.colocated_fs_settings %} + {%- include "clientconfigcolo.template" %} + {%- endif %} + {%- if manifest.fss %} + {%- include "clientconfig.template" %} + {%- endif %} + {%- endif %} + {%- endfor %} + {%- endif %} + + {%- if manifest.ensembles %} + + == Ensembles == + {%- for ensemble in manifest.ensembles %} + {%- include "ensemble.template" %} + {%- endfor %} + {%- endif %} + +{%- endif %} diff --git a/smartsim/templates/templates/preview/plain_text/clientconfig.template b/smartsim/templates/templates/preview/plain_text/clientconfig.template new file mode 100644 index 000000000..3342918d9 --- /dev/null +++ b/smartsim/templates/templates/preview/plain_text/clientconfig.template @@ -0,0 +1,7 @@ + +{%- if verbosity_level == Verbosity.INFO %} +{%- include "clientconfig_info.template" -%} +{%- endif %} +{%- if verbosity_level == Verbosity.DEBUG or verbosity_level == Verbosity.DEVELOPER %} +{%- include "clientconfig_debug.template" -%} +{%- endif %} diff --git a/smartsim/templates/templates/preview/plain_text/clientconfig_debug.template b/smartsim/templates/templates/preview/plain_text/clientconfig_debug.template new file mode 100644 index 000000000..b77cc98da --- /dev/null +++ b/smartsim/templates/templates/preview/plain_text/clientconfig_debug.template @@ -0,0 +1,29 @@ + + {%- for fs in manifest.fss %} + {%- if fs.name %} + Feature Store Identifier: {{ fs.name 
}} + {%- endif %} + {%- if verbosity_level == Verbosity.DEBUG or verbosity_level == Verbosity.DEVELOPER %} + Feature Store Backend: {{ config.database_cli | get_fstype }} + TCP/IP Port(s): + {%- for port in fs.ports %} + {{ port }} + {%- endfor %} + Type: Standalone + {%- endif %} + {%- endfor %} + {%- if model.incoming_entities %} + {%- if verbosity_level == Verbosity.DEBUG or verbosity_level == Verbosity.DEVELOPER %} + Incoming Entities (Available Data Sources): + {%- for incoming in model.incoming_entities %} + {{ incoming.name }} + {%- endfor %} + {%- endif %} + {%- endif %} + {%- if verbosity_level == Verbosity.DEBUG or verbosity_level == Verbosity.DEVELOPER %} + Outgoing Key Collision Prevention (Key Prefixing): + Tensors: {{ model.query_key_prefixing() | as_toggle }} + Datasets: {{ model.query_key_prefixing() | as_toggle }} + ML Models/Torch Scripts: {{ False | as_toggle }} + Aggregation Lists: {{ model.query_key_prefixing() | as_toggle }} + {%- endif %} diff --git a/smartsim/templates/templates/preview/plain_text/clientconfig_info.template b/smartsim/templates/templates/preview/plain_text/clientconfig_info.template new file mode 100644 index 000000000..998b68707 --- /dev/null +++ b/smartsim/templates/templates/preview/plain_text/clientconfig_info.template @@ -0,0 +1,19 @@ + + {%- for fs in manifest.fss %} + {%- if fs.name %} + Feature Store Identifier: {{ fs.name }} + {%- endif %} + Feature Store Backend: {{ config.database_cli | get_fstype }} + TCP/IP Port(s): + {%- for port in fs.ports %} + {{ port }} + {%- endfor %} + Type: Standalone + {%- endfor %} + {%- if model.query_key_prefixing() %} + Outgoing Key Collision Prevention (Key Prefixing): + Tensors: {{ model.query_key_prefixing() | as_toggle }} + Datasets: {{ model.query_key_prefixing() | as_toggle }} + ML Models/Torch Scripts: {{ False | as_toggle }} + Aggregation Lists: {{ model.query_key_prefixing() | as_toggle }} + {%- endif %} diff --git 
a/smartsim/templates/templates/preview/plain_text/clientconfigcolo.template b/smartsim/templates/templates/preview/plain_text/clientconfigcolo.template new file mode 100644 index 000000000..c1278a19a --- /dev/null +++ b/smartsim/templates/templates/preview/plain_text/clientconfigcolo.template @@ -0,0 +1,7 @@ + +{%- if verbosity_level == Verbosity.INFO %} +{%- include "clientconfigcolo_info.template" %} +{% endif %} +{%- if verbosity_level == Verbosity.DEBUG or verbosity_level == Verbosity.DEVELOPER %} +{%- include "clientconfigcolo_debug.template" %} +{%- endif %} diff --git a/smartsim/templates/templates/preview/plain_text/clientconfigcolo_debug.template b/smartsim/templates/templates/preview/plain_text/clientconfigcolo_debug.template new file mode 100644 index 000000000..93ad8aa7b --- /dev/null +++ b/smartsim/templates/templates/preview/plain_text/clientconfigcolo_debug.template @@ -0,0 +1,37 @@ + + {%- if model.run_settings.colocated_fs_settings.fs_identifier %} + Feature Store Identifier: {{ model.run_settings.colocated_fs_settings.fs_identifier }} + {%- else %} + Feature Store Identifier: N/A + {%- endif %} + Feature Store Backend: {{ config.database_cli | get_fstype }} + {%- if model.run_settings.colocated_fs_settings %} + {%- if model.run_settings.colocated_fs_settings.port %} + Connection Type: TCP + TCP/IP Port(s): + {{ model.run_settings.colocated_fs_settings.port }} + {%- endif %} + {%- if model.run_settings.colocated_fs_settings.unix_socket %} + Connection Type: UDS + Unix Socket: {{ model.run_settings.colocated_fs_settings.unix_socket }} + {%- endif %} + {%- if model.run_settings.colocated_fs_settings.ifname %} + {%- if model.run_settings.colocated_fs_settings.ifname | is_list %} + Network Interface Name: {{ model.run_settings.colocated_fs_settings.ifname[0] }} + {%- else %} + Network Interface Name: {{ model.run_settings.colocated_fs_settings.ifname }} + {%- endif %} + {%- endif %} + Type: Colocated + {%- if model.incoming_entities %} + Incoming 
Entities (Available Data Sources): + {%- for incoming in model.incoming_entities %} + {{ incoming.name }} + {%- endfor %} + {%- endif %} + {%- endif %} + Outgoing Key Collision Prevention (Key Prefixing): + Tensors: {{ model.query_key_prefixing() | as_toggle }} + Datasets: {{ model.query_key_prefixing() | as_toggle }} + ML Models/Torch Scripts: {{ False | as_toggle }} + Aggregation Lists: {{ model.query_key_prefixing() | as_toggle }} diff --git a/smartsim/templates/templates/preview/plain_text/clientconfigcolo_info.template b/smartsim/templates/templates/preview/plain_text/clientconfigcolo_info.template new file mode 100644 index 000000000..c3f315676 --- /dev/null +++ b/smartsim/templates/templates/preview/plain_text/clientconfigcolo_info.template @@ -0,0 +1,22 @@ + + {%- if model.run_settings.colocated_fs_settings.fs_identifier %} + Feature Store Identifier: {{ model.run_settings.colocated_fs_settings.fs_identifier }} + {%- endif %} + Feature Store Backend: {{ config.database_cli | get_fstype }} + {%- if model.run_settings.colocated_fs_settings.port %} + Connection Type: TCP + TCP/IP Port(s): + {{ model.run_settings.colocated_fs_settings.port }} + {%- endif %} + {%- if model.run_settings.colocated_fs_settings.unix_socket %} + Connection Type: UDS + Unix Socket: {{ model.run_settings.colocated_fs_settings.unix_socket }} + {%- endif %} + Type: Colocated + {%- if model.query_key_prefixing() %} + Outgoing Key Collision Prevention (Key Prefixing): + Tensors: {{ model.query_key_prefixing() | as_toggle }} + Datasets: {{ model.query_key_prefixing() | as_toggle }} + ML Models/Torch Scripts: {{ False | as_toggle }} + Aggregation Lists: {{ model.query_key_prefixing() | as_toggle }} + {%- endif %} diff --git a/smartsim/templates/templates/preview/plain_text/ensemble.template b/smartsim/templates/templates/preview/plain_text/ensemble.template new file mode 100644 index 000000000..040737cc9 --- /dev/null +++ b/smartsim/templates/templates/preview/plain_text/ensemble.template @@
-0,0 +1,7 @@ + +{%- if verbosity_level == Verbosity.INFO %} +{%- include "ensemble_info.template" -%} +{%- endif %} +{%- if verbosity_level == Verbosity.DEBUG or verbosity_level == Verbosity.DEVELOPER %} +{%- include "ensemble_debug.template" -%} +{%- endif %} diff --git a/smartsim/templates/templates/preview/plain_text/ensemble_debug.template b/smartsim/templates/templates/preview/plain_text/ensemble_debug.template new file mode 100644 index 000000000..c458813ca --- /dev/null +++ b/smartsim/templates/templates/preview/plain_text/ensemble_debug.template @@ -0,0 +1,62 @@ + + {% for ensemble in manifest.ensembles %} + = Ensemble Name: {{ ensemble.name }} = + {%- if ensemble.path %} + Path: {{ ensemble.path }} + {%- endif %} + Members: {{ ensemble|length }} + {%- if ensemble.params %} + Ensemble Parameters: + {%- for key, value in ensemble.params.items() %} + {{ key }}: {{ value | join(", ") | wordwrap(150) | safe | replace('\n', '\n ') }} + {%- endfor %} + {%- endif %} + {%- if ensemble.replicas %} + Replicas: {{ ensemble.replicas }} + {%- elif ensemble.perm_strat %} + Permutation Strategy: {{ ensemble.perm_strat }} + {%- endif %} + {%- if ensemble.batch_settings %} + Batch Launch: True + Batch Command: {{ ensemble.batch_settings.batch_cmd }} + {%- endif %} + {%- if ensemble.batch_settings.batch_args %} + Batch Arguments: + {%- for key, value in ensemble.batch_settings.batch_args.items() %} + {{ key }}: {{ value }} + {%- endfor %} + {%- endif %} + + {%- if verbosity_level == Verbosity.DEBUG %} + {%- for model in ensemble.entities %} + + - Model Name: {{ model.name }} - + {%- include 'model.template' %} + {%- if model.run_settings.colocated_fs_settings or manifest.fss %} + Client Configuration: + {%- if model.run_settings.colocated_fs_settings %} + {%- include "clientconfigcolo.template" %} + {%- endif %} + {%- if manifest.fss %} + {%- include "clientconfig.template" %} + {%- endif %} + {%- endif %} + {%- endfor %} + {%- endif %} + {%- if verbosity_level == 
Verbosity.DEVELOPER %} + {%- for model in ensemble.entities %} + + - Model Name: {{ model.name }} - + {%- include 'model_debug.template' %} + {%- if model.run_settings.colocated_fs_settings or manifest.fss %} + Client Configuration: + {%- if model.run_settings.colocated_fs_settings %} + {%- include "clientconfigcolo.template" %} + {%- endif %} + {%- if manifest.fss %} + {%- include "clientconfig.template" %} + {%- endif %} + {%- endif %} +{%- endfor %} +{%- endif %} +{% endfor %} diff --git a/smartsim/templates/templates/preview/plain_text/ensemble_info.template b/smartsim/templates/templates/preview/plain_text/ensemble_info.template new file mode 100644 index 000000000..a7b9c2296 --- /dev/null +++ b/smartsim/templates/templates/preview/plain_text/ensemble_info.template @@ -0,0 +1,51 @@ + = Ensemble Name: {{ ensemble.name }} = + Members: {{ ensemble|length }} + {%- if verbosity_level == Verbosity.DEBUG or verbosity_level == Verbosity.DEVELOPER %} + {%- if ensemble.params %} + Ensemble Parameters: + {%- for key, value in ensemble.params.items() %} + {{ key }}: {{ '{:^9}'.format(value|string)|truncate(81,true,'...')}} + {%- endfor %} + {%- endif %} + {%- endif %} + {%- if ensemble.models | length > 2 %} + {% set model = ensemble.models[0] %} + - Model Name: {{ model.name }} - + {%- include 'model.template' %} + {%- if model.run_settings.colocated_fs_settings or manifest.fss %} + Client Configuration: + {%- if model.run_settings.colocated_fs_settings %} + {%- include "clientconfigcolo.template" %} + {%- endif %} + {%- if manifest.fss %} + {%- include "clientconfig.template" %} + {%- endif %} + {%- endif %} + ... 
+ {% set model = ensemble.models[(ensemble.models | length)-1] %} + - Model Name: {{ model.name }} - + {%- include 'model.template' %} + {% if model.run_settings.colocated_fs_settings or manifest.fss %} + Client Configuration: + {%- if model.run_settings.colocated_fs_settings %} + {%- include "clientconfigcolo.template" %} + {%- endif %} + {%- if manifest.fss %} + {%- include "clientconfig.template" %} + {%- endif %} + {%- endif %} + {%- else %} + {% for model in ensemble %} + - Model Name: {{ model.name }} - + {%- include 'model.template' %} + {% if model.run_settings.colocated_fs_settings or manifest.fss %} + Client Configuration: + {%- if model.run_settings.colocated_fs_settings %} + {%- include "clientconfigcolo.template" %} + {%- endif %} + {%- if manifest.fss %} + {%- include "clientconfig.template" %} + {%- endif %} + {%- endif %} + {% endfor %} + {%- endif %} diff --git a/smartsim/templates/templates/preview/plain_text/experiment.template b/smartsim/templates/templates/preview/plain_text/experiment.template new file mode 100644 index 000000000..d2ef16c05 --- /dev/null +++ b/smartsim/templates/templates/preview/plain_text/experiment.template @@ -0,0 +1,5 @@ +=== Experiment Overview === + + Experiment Name: {{ exp_entity.name }} + Experiment Path: {{ exp_entity.exp_path }} + Launcher: {{ exp_entity.launcher }} diff --git a/smartsim/templates/templates/preview/plain_text/model.template b/smartsim/templates/templates/preview/plain_text/model.template new file mode 100644 index 000000000..303beac67 --- /dev/null +++ b/smartsim/templates/templates/preview/plain_text/model.template @@ -0,0 +1,7 @@ + +{%- if verbosity_level == Verbosity.INFO %} +{%- include "model_info.template" -%} +{%- endif %} +{%- if verbosity_level == Verbosity.DEBUG or verbosity_level == Verbosity.DEVELOPER %} +{%- include "model_debug.template" -%} +{%- endif -%} diff --git a/smartsim/templates/templates/preview/plain_text/model_debug.template 
b/smartsim/templates/templates/preview/plain_text/model_debug.template new file mode 100644 index 000000000..6605d50ab --- /dev/null +++ b/smartsim/templates/templates/preview/plain_text/model_debug.template @@ -0,0 +1,114 @@ + + {%- if model is defined %} + {%- if model.path %} + Path: {{ model.path }} + {%- endif %} + Executable: {{ model.run_settings.exe[0] }} + Executable Arguments: + {%- for param in model.run_settings.exe_args %} + {{ param }} + {%- endfor %} + {%- if model.run_settings.run_command %} + Run Command: {{ model.run_settings.run_command }} + {%- endif %} + {%- if model.run_settings.run_args %} + Run Arguments: + {%- for key, value in model.run_settings.run_args.items() %} + {{ key }} {{ value }} + {%- endfor %} + {%- endif %} + {%- if model.batch_settings %} + Batch Launch: True + Batch Command: {{ model.batch_settings.batch_cmd }} + Batch Arguments: + {%- for key, value in model.batch_settings.batch_args.items() %} + {{ key }}: {{ value }} + {%- endfor %} + {%- endif %} + {%- if model.params %} + Model Parameters: + {%- for param, value in model.params.items() %} + {{ param }}: {{ value }} + {%- endfor %} + {%- endif %} + {%- if model.files %} + {%- if model.files.tagged %} + Tagged Files for Model Configuration: + {%- for tagged in model.files.tagged %} + {{ tagged }} + -> {{ model.path }} + {%- endfor %} + {%- endif %} + {%- if model.files.copy %} + Copy Files: + {%- for copy in model.files.copy %} + {{ copy }} + -> {{ model.path }} + {%- endfor %} + {%- endif %} + {%- if model.files.link %} + Symlink Files: + {%- for link in model.files.link %} + {{ link }} + -> {{ model.path }} + {%- endfor %} + {%- endif %} + {%- endif %} + {%- if model.run_settings.colocated_fs_settings %} + Colocated: + {%- if model.run_settings.colocated_fs_settings.fs_identifier %} + Feature Store Identifier: {{ model.run_settings.colocated_fs_settings.fs_identifier }} + {%- endif %} + {%- if model.run_settings.colocated_fs_settings.port %} + Connection Type: TCP + 
TCP/IP Port(s): + {{ model.run_settings.colocated_fs_settings.port }} + {%- endif %} + {%- if model.run_settings.colocated_fs_settings.unix_socket %} + Connection Type: UDS + Unix Socket: {{ model.run_settings.colocated_fs_settings.unix_socket }} + {%- endif %} + {%- if model.run_settings.colocated_fs_settings.ifname %} + {%- if model.run_settings.colocated_fs_settings.ifname | is_list %} + Network Interface Name: {{ model.run_settings.colocated_fs_settings.ifname[0] }} + {%- else %} + Network Interface Name: {{ model.run_settings.colocated_fs_settings.ifname }} + {%- endif %} + {%- endif %} + CPUs: {{ model.run_settings.colocated_fs_settings.cpus }} + Custom Pinning: {{ model.run_settings.colocated_fs_settings.custom_pinning }} + {%- endif %} + {%- if model._fs_scripts %} + Torch Scripts: + {%- for script in model._fs_scripts%} + Name: {{ script.name }} + Path: {{ script.file }} + Backend: {{ script.device }} + Devices Per Node: {{ script.devices_per_node }} + {%- endfor %} + {%- endif %} + {%- if model._fs_models %} + ML Models: + {%- for mlmodel in model._fs_models %} + Name: {{ mlmodel.name }} + Path: {{ mlmodel.file }} + Backend: {{ mlmodel.backend }} + Device: {{ mlmodel.device }} + Devices Per Node: {{ mlmodel.devices_per_node }} + {%- if mlmodel.device == "GPU" %} + First Device: {{ mlmodel.first_device }} + {%- endif %} + {%- for input in mlmodel.inputs %} + Inputs: + {{ input }} + {%- endfor %} + {%- for output in mlmodel.outputs %} + Outputs: + {{ output }} + {%- endfor %} + {%- endfor %} + {%- endif %} + {%- if model.query_key_prefixing()%} + Key Prefix: {{ model.name }} + {%- endif %} +{%- endif %} diff --git a/smartsim/templates/templates/preview/plain_text/model_info.template b/smartsim/templates/templates/preview/plain_text/model_info.template new file mode 100644 index 000000000..dc961ae95 --- /dev/null +++ b/smartsim/templates/templates/preview/plain_text/model_info.template @@ -0,0 +1,54 @@ + + + {%- if model.batch_settings %} + Batch Launch: 
True + {% endif %} + {%- if model.params %} + Model Parameters: + {%- for param, value in model.params.items() %} + {{ param }}: {{ value }} + {%- endfor %} + {%- endif %} + + {%- if model.run_settings.colocated_fs_settings %} + Colocated: + {%- if model.run_settings.colocated_fs_settings.fs_identifier %} + Feature Store Identifier: {{ model.run_settings.colocated_fs_settings.fs_identifier }} + {%- endif %} + {%- if model.run_settings.colocated_fs_settings.port %} + Connection Type: TCP + TCP/IP Port(s): + {{ model.run_settings.colocated_fs_settings.port }} + {%- endif %} + {%- if model.run_settings.colocated_fs_settings.unix_socket %} + Connection Type: UDS + Unix Socket: {{ model.run_settings.colocated_fs_settings.unix_socket }} + {%- endif %} + {%- endif %} + + {%- if model.run_settings.colocated_fs_settings['fs_scripts'] %} + Torch Scripts: + {%- for script in model.run_settings.colocated_fs_settings['fs_scripts'] %} + Name: {{ script.name }} + Path: {{ script.script_path }} + {%- endfor %} + {%- endif %} + {%- if model.run_settings.colocated_fs_settings['fs_models'] %} + ML Models: + {%- for mlmodel in model.run_settings.colocated_fs_settings['fs_models'] %} + Name: {{ mlmodel.name }} + Path: {{ mlmodel.model_file }} + Backend: {{ mlmodel.backend }} + {%- for input in mlmodel.inputs %} + Inputs: + {{ input }} + {%- endfor %} + {%- for output in mlmodel.outputs %} + Outputs: + {{ output }} + {%- endfor %} + {%- endfor %} + {%- endif %} + {%- if model.query_key_prefixing() %} + Key Prefix: {{ model.name }} + {%- endif %} diff --git a/smartsim/templates/templates/preview/plain_text/orchestrator.template b/smartsim/templates/templates/preview/plain_text/orchestrator.template new file mode 100644 index 000000000..813b062b3 --- /dev/null +++ b/smartsim/templates/templates/preview/plain_text/orchestrator.template @@ -0,0 +1,7 @@ + +{%- if verbosity_level == Verbosity.INFO %} +{%- include "orchestrator_info.template" -%} +{%- endif %} +{%- if verbosity_level == 
Verbosity.DEBUG or verbosity_level == Verbosity.DEVELOPER %} +{%- include "orchestrator_debug.template" -%} +{%- endif %} diff --git a/smartsim/templates/templates/preview/plain_text/orchestrator_debug.template b/smartsim/templates/templates/preview/plain_text/orchestrator_debug.template new file mode 100644 index 000000000..8dfa6ae9a --- /dev/null +++ b/smartsim/templates/templates/preview/plain_text/orchestrator_debug.template @@ -0,0 +1,33 @@ + + = Feature Store Identifier: {{ fs.name }} = + {%- if fs.path %} + Path: {{ fs.path }} + {%- endif %} + Shards: {{ fs.num_shards }} + TCP/IP Port(s): + {%- for port in fs.ports %} + {{ port }} + {%- endfor %} + Network Interface: {{ fs._interfaces[0] }} + Type: {{ config.database_cli | get_fstype }} + Executable: {{ config.database_exe }} + {%- if fs.run_settings %} + Run Command: {{ fs.run_settings.run_command }} + {%- if fs.run_settings.run_args %} + Run Arguments: + {%- for key, value in fs.run_settings.run_args.items() %} + {{ key }}: {{ value }} + {%- endfor %} + {%- endif %} + {%- endif %} + {%- if fs.run_command %} + Run Command: {{ fs.run_command }} + {%- endif %} + {%- if fs.batch_settings %} + Batch Launch: True + Batch Command: {{ fs.batch_settings.batch_cmd }} + Batch Arguments: + {%- for key, value in fs.batch_settings.batch_args.items() %} + {{ key }}: {{ value }} + {%- endfor %} + {%- endif %} diff --git a/smartsim/templates/templates/preview/plain_text/orchestrator_info.template b/smartsim/templates/templates/preview/plain_text/orchestrator_info.template new file mode 100644 index 000000000..7964d126e --- /dev/null +++ b/smartsim/templates/templates/preview/plain_text/orchestrator_info.template @@ -0,0 +1,11 @@ + + = Feature Store Identifier: {{ fs.name }} = + TCP/IP Port(s): + {%- for port in fs.ports %} + {{ port }} + {%- endfor %} + Network Interface: {{ fs._interfaces[0] }} + Type: {{ config.database_cli | get_fstype }} + {%- if fs.batch %} + Batch Launch: {{ fs.batch }} + {%- endif %} diff --git 
a/tests/backends/test_cli_mini_exp.py b/tests/backends/test_cli_mini_exp.py index b1c508747..1fd110721 100644 --- a/tests/backends/test_cli_mini_exp.py +++ b/tests/backends/test_cli_mini_exp.py @@ -48,6 +48,7 @@ def test_cli_mini_exp_doesnt_error_out_with_dev_build( + prepare_fs, local_fs, test_dir, monkeypatch, @@ -57,9 +58,11 @@ def test_cli_mini_exp_doesnt_error_out_with_dev_build( to ensure that it does not accidentally report false positive/negatives """ + fs = prepare_fs(local_fs).featurestore + @contextmanager def _mock_make_managed_local_feature_store(*a, **kw): - (client_addr,) = local_fs.get_address() + (client_addr,) = fs.get_address() yield smartredis.Client(False, address=client_addr) monkeypatch.setattr( @@ -68,7 +71,7 @@ def _mock_make_managed_local_feature_store(*a, **kw): _mock_make_managed_local_feature_store, ) backends = installed_redisai_backends() - (fs_port,) = local_fs.ports + (fs_port,) = fs.ports smartsim._core._cli.validate.test_install( # Shouldn't matter bc we are stubbing creation of orc diff --git a/tests/backends/test_dataloader.py b/tests/backends/test_dataloader.py index 098f6ec4e..deb8d4835 100644 --- a/tests/backends/test_dataloader.py +++ b/tests/backends/test_dataloader.py @@ -167,19 +167,16 @@ def train_tf(generator): @pytest.mark.skipif(not shouldrun_tf, reason="Test needs TensorFlow to run") -def test_tf_dataloaders(test_dir, wlmutils): - exp = Experiment( - "test_tf_dataloaders", test_dir, launcher=wlmutils.get_test_launcher() - ) - feature_store: FeatureStore = wlmutils.get_feature_store() - exp.generate(feature_store) - exp.start(feature_store) +def test_tf_dataloaders(wlm_experiment, prepare_fs, single_fs, monkeypatch): + + fs = prepare_fs(single_fs).featurestore + feature_store = wlm_experiment.reconnect_feature_store(fs.checkpoint_file) + monkeypatch.setenv("SSDB", feature_store.get_address()[0]) + monkeypatch.setenv("SSKEYIN", "test_uploader_0,test_uploader_1") try: - os.environ["SSDB"] = 
feature_store.get_address()[0] data_info = run_local_uploaders(mpi_size=2, format="tf") - os.environ["SSKEYIN"] = "test_uploader_0,test_uploader_1" for rank in range(2): tf_dynamic = TFDataGenerator( data_info_or_list_name="test_data_list", @@ -190,6 +187,7 @@ def test_tf_dataloaders(test_dir, wlmutils): batch_size=4, max_fetch_trials=5, dynamic=False, # catch wrong arg + wait_interval=0.1, ) train_tf(tf_dynamic) assert len(tf_dynamic) == 4 @@ -204,6 +202,7 @@ def test_tf_dataloaders(test_dir, wlmutils): batch_size=4, max_fetch_trials=5, dynamic=True, # catch wrong arg + wait_interval=0.1, ) train_tf(tf_static) assert len(tf_static) == 4 @@ -211,11 +210,6 @@ def test_tf_dataloaders(test_dir, wlmutils): except Exception as e: raise e - finally: - exp.stop(feature_store) - os.environ.pop("SSDB", "") - os.environ.pop("SSKEYIN", "") - os.environ.pop("SSKEYOUT", "") def create_trainer_torch(experiment: Experiment, filedir, wlmutils): @@ -234,20 +228,18 @@ def create_trainer_torch(experiment: Experiment, filedir, wlmutils): @pytest.mark.skipif(not shouldrun_torch, reason="Test needs Torch to run") -def test_torch_dataloaders(fileutils, test_dir, wlmutils): - exp = Experiment( - "test_tf_dataloaders", test_dir, launcher=wlmutils.get_test_launcher() - ) - feature_store: FeatureStore = wlmutils.get_feature_store() +def test_torch_dataloaders( + wlm_experiment, prepare_fs, single_fs, fileutils, test_dir, wlmutils, monkeypatch +): config_dir = fileutils.get_test_dir_path("ml") - exp.generate(feature_store) - exp.start(feature_store) + fs = prepare_fs(single_fs).featurestore + feature_store = wlm_experiment.reconnect_feature_store(fs.checkpoint_file) + monkeypatch.setenv("SSDB", feature_store.get_address()[0]) + monkeypatch.setenv("SSKEYIN", "test_uploader_0,test_uploader_1") try: - os.environ["SSDB"] = feature_store.get_address()[0] data_info = run_local_uploaders(mpi_size=2) - os.environ["SSKEYIN"] = "test_uploader_0,test_uploader_1" for rank in range(2): torch_dynamic = 
TorchDataGenerator( data_info_or_list_name="test_data_list", @@ -258,11 +250,12 @@ def test_torch_dataloaders(fileutils, test_dir, wlmutils): batch_size=4, max_fetch_trials=5, dynamic=False, # catch wrong arg - init_samples=True, # catch wrong arg + init_samples=True, + wait_interval=0.1, ) check_dataloader(torch_dynamic, rank, dynamic=True) - torch_dynamic.init_samples(5) + torch_dynamic.init_samples(5, 0.1) for _ in range(2): for _ in torch_dynamic: continue @@ -278,26 +271,22 @@ def test_torch_dataloaders(fileutils, test_dir, wlmutils): max_fetch_trials=5, dynamic=True, # catch wrong arg init_samples=True, # catch wrong arg + wait_interval=0.1, ) check_dataloader(torch_static, rank, dynamic=False) - torch_static.init_samples(5) + torch_static.init_samples(5, 0.1) for _ in range(2): for _ in torch_static: continue - trainer = create_trainer_torch(exp, config_dir, wlmutils) - exp.start(trainer, block=True) + trainer = create_trainer_torch(wlm_experiment, config_dir, wlmutils) + wlm_experiment.start(trainer, block=True) - assert exp.get_status(trainer)[0] == SmartSimStatus.STATUS_COMPLETED + assert wlm_experiment.get_status(trainer)[0] == SmartSimStatus.STATUS_COMPLETED except Exception as e: raise e - finally: - exp.stop(feature_store) - os.environ.pop("SSDB", "") - os.environ.pop("SSKEYIN", "") - os.environ.pop("SSKEYOUT", "") def test_data_info_repr(): @@ -331,15 +320,9 @@ def test_data_info_repr(): @pytest.mark.skipif( not (shouldrun_torch or shouldrun_tf), reason="Requires TF or PyTorch" ) -def test_wrong_dataloaders(test_dir, wlmutils): - exp = Experiment( - "test-wrong-dataloaders", - exp_path=test_dir, - launcher=wlmutils.get_test_launcher(), - ) - feature_store = wlmutils.get_feature_store() - exp.generate(feature_store) - exp.start(feature_store) +def test_wrong_dataloaders(wlm_experiment, prepare_fs, single_fs): + fs = prepare_fs(single_fs).featurestore + feature_store = wlm_experiment.reconnect_feature_store(fs.checkpoint_file) if shouldrun_tf: with 
pytest.raises(SSInternalError): @@ -365,5 +348,3 @@ def test_wrong_dataloaders(test_dir, wlmutils): cluster=False, ) torch_data_gen.init_samples(init_trials=1) - - exp.stop(feature_store) diff --git a/tests/backends/test_dbmodel.py b/tests/backends/test_dbmodel.py index 6b5831373..d3f683966 100644 --- a/tests/backends/test_dbmodel.py +++ b/tests/backends/test_dbmodel.py @@ -146,36 +146,30 @@ def save_torch_cnn(path, file_name): @pytest.mark.skipif(not should_run_tf, reason="Test needs TF to run") -def test_tf_fs_model(fileutils, test_dir, wlmutils, mlutils): - """Test TensorFlow fs Models on remote fs""" - - # Set experiment name - exp_name = "test-tf-fs-model" +def test_tf_fs_model( + wlm_experiment, prepare_fs, single_fs, fileutils, test_dir, mlutils +): + """Test TensorFlow FS Models on remote FS""" # Retrieve parameters from testing environment - test_launcher = wlmutils.get_test_launcher() - test_interface = wlmutils.get_test_interface() - test_port = wlmutils.get_test_port() test_device = mlutils.get_test_device() test_num_gpus = 1 # TF backend fails on multiple GPUs test_script = fileutils.get_test_conf_path("run_tf_dbmodel_smartredis.py") - # Create the SmartSim Experiment - exp = Experiment(exp_name, exp_path=test_dir, launcher=test_launcher) - # Create RunSettings - run_settings = exp.create_run_settings(exe=sys.executable, exe_args=test_script) + run_settings = wlm_experiment.create_run_settings( + exe=sys.executable, exe_args=test_script + ) run_settings.set_nodes(1) run_settings.set_tasks(1) # Create Model - smartsim_model = exp.create_model("smartsim_model", run_settings) + smartsim_model = wlm_experiment.create_model("smartsim_model", run_settings) # Create feature store - host = wlmutils.choose_host(run_settings) - fs = exp.create_feature_store(port=test_port, interface=test_interface, hosts=host) - exp.generate(fs) + fs = prepare_fs(single_fs).featurestore + wlm_experiment.reconnect_feature_store(fs.checkpoint_file) # Create and save ML model to 
filesystem model, inputs, outputs = create_tf_cnn() @@ -212,50 +206,41 @@ def test_tf_fs_model(fileutils, test_dir, wlmutils, mlutils): # Assert we have added both models assert len(smartsim_model._fs_models) == 2 - exp.generate(smartsim_model) + wlm_experiment.generate(smartsim_model) # Launch and check successful completion - try: - exp.start(fs, smartsim_model, block=True) - statuses = exp.get_status(smartsim_model) - assert all( - stat == SmartSimStatus.STATUS_COMPLETED for stat in statuses - ), f"Statuses: {statuses}" - finally: - exp.stop(fs) + wlm_experiment.start(smartsim_model, block=True) + statuses = wlm_experiment.get_status(smartsim_model) + assert all( + stat == SmartSimStatus.STATUS_COMPLETED for stat in statuses + ), f"Statuses: {statuses}" @pytest.mark.skipif(not should_run_pt, reason="Test needs PyTorch to run") -def test_pt_fs_model(fileutils, test_dir, wlmutils, mlutils): - """Test PyTorch fs Models on remote fs""" - - # Set experiment name - exp_name = "test-pt-fs-model" +def test_pt_fs_model( + wlm_experiment, prepare_fs, single_fs, fileutils, test_dir, mlutils +): + """Test PyTorch FS Models on remote FS""" # Retrieve parameters from testing environment - test_launcher = wlmutils.get_test_launcher() - test_interface = wlmutils.get_test_interface() - test_port = wlmutils.get_test_port() test_device = mlutils.get_test_device() test_num_gpus = mlutils.get_test_num_gpus() if pytest.test_device == "GPU" else 1 test_script = fileutils.get_test_conf_path("run_pt_dbmodel_smartredis.py") - # Create the SmartSim Experiment - exp = Experiment(exp_name, exp_path=test_dir, launcher=test_launcher) - # Create RunSettings - run_settings = exp.create_run_settings(exe=sys.executable, exe_args=test_script) + run_settings = wlm_experiment.create_run_settings( + exe=sys.executable, exe_args=test_script + ) run_settings.set_nodes(1) run_settings.set_tasks(1) # Create Model - smartsim_model = exp.create_model("smartsim_model", run_settings) + smartsim_model = 
wlm_experiment.create_model("smartsim_model", run_settings) # Create feature store - host = wlmutils.choose_host(run_settings) - fs = exp.create_feature_store(port=test_port, interface=test_interface, hosts=host) - exp.generate(fs) + fs = prepare_fs(single_fs).featurestore + wlm_experiment.reconnect_feature_store(fs.checkpoint_file) # Create and save ML model to filesystem save_torch_cnn(test_dir, "model1.pt") @@ -279,55 +264,46 @@ def test_pt_fs_model(fileutils, test_dir, wlmutils, mlutils): # Assert we have added both models assert len(smartsim_model._fs_models) == 1 - exp.generate(smartsim_model) + wlm_experiment.generate(smartsim_model) # Launch and check successful completion - try: - exp.start(fs, smartsim_model, block=True) - statuses = exp.get_status(smartsim_model) - assert all( - stat == SmartSimStatus.STATUS_COMPLETED for stat in statuses - ), f"Statuses: {statuses}" - finally: - exp.stop(fs) + wlm_experiment.start(smartsim_model, block=True) + statuses = wlm_experiment.get_status(smartsim_model) + assert all( + stat == SmartSimStatus.STATUS_COMPLETED for stat in statuses + ), f"Statuses: {statuses}" @pytest.mark.skipif(not should_run_tf, reason="Test needs TF to run") -def test_fs_model_ensemble(fileutils, test_dir, wlmutils, mlutils): - """Test FSModels on remote fs, with an ensemble""" - - # Set experiment name - exp_name = "test-fs-model-ensemble" +def test_fs_model_ensemble( + wlm_experiment, prepare_fs, single_fs, fileutils, test_dir, wlmutils, mlutils +): + """Test FSModels on remote FS, with an ensemble""" # Retrieve parameters from testing environment - test_launcher = wlmutils.get_test_launcher() - test_interface = wlmutils.get_test_interface() - test_port = wlmutils.get_test_port() test_device = mlutils.get_test_device() test_num_gpus = 1 # TF backend fails on multiple GPUs test_script = fileutils.get_test_conf_path("run_tf_dbmodel_smartredis.py") - # Create the SmartSim Experiment - exp = Experiment(exp_name, exp_path=test_dir,
launcher=test_launcher) - # Create RunSettings - run_settings = exp.create_run_settings(exe=sys.executable, exe_args=test_script) + run_settings = wlm_experiment.create_run_settings( + exe=sys.executable, exe_args=test_script + ) run_settings.set_nodes(1) run_settings.set_tasks(1) # Create ensemble - smartsim_ensemble = exp.create_ensemble( + smartsim_ensemble = wlm_experiment.create_ensemble( "smartsim_model", run_settings=run_settings, replicas=2 ) # Create Model - smartsim_model = exp.create_model("smartsim_model", run_settings) + smartsim_model = wlm_experiment.create_model("smartsim_model", run_settings) # Create feature store - host = wlmutils.choose_host(run_settings) - fs = exp.create_feature_store(port=test_port, interface=test_interface, hosts=host) - exp.generate(fs) + fs = prepare_fs(single_fs).featurestore + wlm_experiment.reconnect_feature_store(fs.checkpoint_file) # Create and save ML model to filesystem model, inputs, outputs = create_tf_cnn() @@ -380,21 +356,18 @@ def test_fs_model_ensemble(fileutils, test_dir, wlmutils, mlutils): # Assert we have added two models to each entity assert all([len(entity._fs_models) == 2 for entity in smartsim_ensemble]) - exp.generate(smartsim_ensemble) + wlm_experiment.generate(smartsim_ensemble) # Launch and check successful completion - try: - exp.start(fs, smartsim_ensemble, block=True) - statuses = exp.get_status(smartsim_ensemble) - assert all( - stat == SmartSimStatus.STATUS_COMPLETED for stat in statuses - ), f"Statuses: {statuses}" - finally: - exp.stop(fs) + wlm_experiment.start(smartsim_ensemble, block=True) + statuses = wlm_experiment.get_status(smartsim_ensemble) + assert all( + stat == SmartSimStatus.STATUS_COMPLETED for stat in statuses + ), f"Statuses: {statuses}" @pytest.mark.skipif(not should_run_tf, reason="Test needs TF to run") -def test_colocated_db_model_tf(fileutils, test_dir, wlmutils, mlutils): +def test_colocated_fs_model_tf(fileutils, test_dir, wlmutils, mlutils): """Test fs Models on 
colocated fs (TensorFlow backend)""" # Set experiment name diff --git a/tests/backends/test_dbscript.py b/tests/backends/test_dbscript.py index b567800f7..6284e3cb2 100644 --- a/tests/backends/test_dbscript.py +++ b/tests/backends/test_dbscript.py @@ -57,37 +57,29 @@ def timestwo(x): @pytest.mark.skipif(not should_run, reason="Test needs Torch to run") -def test_fs_script(fileutils, test_dir, wlmutils, mlutils): - """Test fs scripts on remote fs""" +def test_fs_script(wlm_experiment, prepare_fs, single_fs, fileutils, mlutils): + """Test FS scripts on remote FS""" - # Set experiment name - exp_name = "test-fs-script" - - # Retrieve parameters from testing environment - test_launcher = wlmutils.get_test_launcher() - test_interface = wlmutils.get_test_interface() - test_port = wlmutils.get_test_port() test_device = mlutils.get_test_device() test_num_gpus = mlutils.get_test_num_gpus() if pytest.test_device == "GPU" else 1 test_script = fileutils.get_test_conf_path("run_dbscript_smartredis.py") torch_script = fileutils.get_test_conf_path("torchscript.py") - # Create the SmartSim Experiment - exp = Experiment(exp_name, exp_path=test_dir, launcher=test_launcher) - # Create the RunSettings - run_settings = exp.create_run_settings(exe=sys.executable, exe_args=test_script) + run_settings = wlm_experiment.create_run_settings( + exe=sys.executable, exe_args=test_script + ) run_settings.set_nodes(1) run_settings.set_tasks(1) # Create the SmartSim Model - smartsim_model = exp.create_model("smartsim_model", run_settings) + smartsim_model = wlm_experiment.create_model("smartsim_model", run_settings) # Create the SmartSim feature store - host = wlmutils.choose_host(run_settings) - fs = exp.create_feature_store(port=test_port, interface=test_interface, hosts=host) - exp.generate(fs, smartsim_model) + fs = prepare_fs(single_fs).featurestore + wlm_experiment.reconnect_feature_store(fs.checkpoint_file) + wlm_experiment.generate(smartsim_model) # Define the torch script string 
torch_script_str = "def negate(x):\n\treturn torch.neg(x)\n" @@ -123,51 +115,42 @@ def test_fs_script(fileutils, test_dir, wlmutils, mlutils): assert len(smartsim_model._fs_scripts) == 3 # Launch and check successful completion - try: - exp.start(fs, smartsim_model, block=True) - statuses = exp.get_status(smartsim_model) - assert all([stat == SmartSimStatus.STATUS_COMPLETED for stat in statuses]) - finally: - exp.stop(fs) + wlm_experiment.start(smartsim_model, block=True) + statuses = wlm_experiment.get_status(smartsim_model) + assert all([stat == SmartSimStatus.STATUS_COMPLETED for stat in statuses]) @pytest.mark.skipif(not should_run, reason="Test needs Torch to run") -def test_fs_script_ensemble(fileutils, test_dir, wlmutils, mlutils): - """Test fs scripts on remote fs""" +def test_fs_script_ensemble(wlm_experiment, prepare_fs, single_fs, fileutils, mlutils): + """Test FS scripts on remote FS""" - # Set experiment name - exp_name = "test-fs-script" + # Set experiment name + wlm_experiment_name = "test-fs-script" # Retrieve parameters from testing environment - test_launcher = wlmutils.get_test_launcher() - test_interface = wlmutils.get_test_interface() - test_port = wlmutils.get_test_port() test_device = mlutils.get_test_device() test_num_gpus = mlutils.get_test_num_gpus() if pytest.test_device == "GPU" else 1 test_script = fileutils.get_test_conf_path("run_dbscript_smartredis.py") torch_script = fileutils.get_test_conf_path("torchscript.py") - # Create SmartSim Experiment - exp = Experiment(exp_name, exp_path=test_dir, launcher=test_launcher) - # Create RunSettings - run_settings = exp.create_run_settings(exe=sys.executable, exe_args=test_script) + run_settings = wlm_experiment.create_run_settings( + exe=sys.executable, exe_args=test_script + ) run_settings.set_nodes(1) run_settings.set_tasks(1) + fs = prepare_fs(single_fs).featurestore + wlm_experiment.reconnect_feature_store(fs.checkpoint_file) + # Create Ensemble with two identical models - 
ensemble = exp.create_ensemble( + ensemble = wlm_experiment.create_ensemble( "fsscript_ensemble", run_settings=run_settings, replicas=2 ) # Create SmartSim model - smartsim_model = exp.create_model("smartsim_model", run_settings) - - # Create SmartSim feature store - host = wlmutils.choose_host(run_settings) - fs = exp.create_feature_store(port=test_port, interface=test_interface, hosts=host) - exp.generate(fs) + smartsim_model = wlm_experiment.create_model("smartsim_model", run_settings) # Create the script string torch_script_str = "def negate(x):\n\treturn torch.neg(x)\n" @@ -217,14 +200,11 @@ def test_fs_script_ensemble(fileutils, test_dir, wlmutils, mlutils): # Assert we have added all three models to entities in ensemble assert all([len(entity._fs_scripts) == 3 for entity in ensemble]) - exp.generate(ensemble) + wlm_experiment.generate(ensemble) - try: - exp.start(fs, ensemble, block=True) - statuses = exp.get_status(ensemble) - assert all([stat == SmartSimStatus.STATUS_COMPLETED for stat in statuses]) - finally: - exp.stop(fs) + wlm_experiment.start(ensemble, block=True) + statuses = wlm_experiment.get_status(ensemble) + assert all([stat == SmartSimStatus.STATUS_COMPLETED for stat in statuses]) @pytest.mark.skipif(not should_run, reason="Test needs Torch to run") diff --git a/tests/backends/test_onnx.py b/tests/backends/test_onnx.py index f642d09dc..027e2e8a0 100644 --- a/tests/backends/test_onnx.py +++ b/tests/backends/test_onnx.py @@ -57,7 +57,7 @@ ) -def test_sklearn_onnx(test_dir, mlutils, wlmutils): +def test_sklearn_onnx(wlm_experiment, prepare_fs, single_fs, mlutils, wlmutils): """This test needs two free nodes, 1 for the db and 1 some sklearn models here we test the following sklearn models: @@ -74,33 +74,24 @@ def test_sklearn_onnx(test_dir, mlutils, wlmutils): You may need to put CUDNN in your LD_LIBRARY_PATH if running on GPU """ - - exp_name = "test_sklearn_onnx" - - exp = Experiment(exp_name, exp_path=test_dir, 
launcher=wlmutils.get_test_launcher()) test_device = mlutils.get_test_device() + fs = prepare_fs(single_fs).featurestore + wlm_experiment.reconnect_feature_store(fs.checkpoint_file) - db = wlmutils.get_feature_store(nodes=1) - db.set_path(test_dir) - - exp.start(db) - try: - run_settings = exp.create_run_settings( - sys.executable, f"run_sklearn_onnx.py --device={test_device}" - ) - if wlmutils.get_test_launcher() != "local": - run_settings.set_tasks(1) - model = exp.create_model("onnx_models", run_settings) + run_settings = wlm_experiment.create_run_settings( + sys.executable, f"run_sklearn_onnx.py --device={test_device}" + ) + if wlmutils.get_test_launcher() != "local": + run_settings.set_tasks(1) + model = wlm_experiment.create_model("onnx_models", run_settings) - script_dir = os.path.dirname(os.path.abspath(__file__)) - script_path = Path(script_dir, "run_sklearn_onnx.py").resolve() - model.attach_generator_files(to_copy=str(script_path)) - exp.generate(model) + script_dir = os.path.dirname(os.path.abspath(__file__)) + script_path = Path(script_dir, "run_sklearn_onnx.py").resolve() + model.attach_generator_files(to_copy=str(script_path)) + wlm_experiment.generate(model) - exp.start(model, block=True) - finally: - exp.stop(db) + wlm_experiment.start(model, block=True) # if model failed, test will fail - model_status = exp.get_status(model) + model_status = wlm_experiment.get_status(model) assert model_status[0] != SmartSimStatus.STATUS_FAILED diff --git a/tests/backends/test_tf.py b/tests/backends/test_tf.py index e16800c2a..bb8f59b51 100644 --- a/tests/backends/test_tf.py +++ b/tests/backends/test_tf.py @@ -50,7 +50,7 @@ (not tf_backend_available) or (not tf_available), reason="Requires RedisAI TF backend", ) -def test_keras_model(test_dir, mlutils, wlmutils): +def test_keras_model(wlm_experiment, prepare_fs, single_fs, mlutils, wlmutils): """This test needs two free nodes, 1 for the db and 1 for a keras model script this test can run on CPU/GPU by setting 
SMARTSIM_TEST_DEVICE=GPU @@ -60,33 +60,27 @@ def test_keras_model(test_dir, mlutils, wlmutils): You may need to put CUDNN in your LD_LIBRARY_PATH if running on GPU """ - exp_name = "test_keras_model" - - exp = Experiment(exp_name, exp_path=test_dir, launcher=wlmutils.get_test_launcher()) test_device = mlutils.get_test_device() + fs = prepare_fs(single_fs).featurestore + wlm_experiment.reconnect_feature_store(fs.checkpoint_file) - db = wlmutils.get_feature_store(nodes=1) - db.set_path(test_dir) - exp.start(db) - - run_settings = exp.create_run_settings( + run_settings = wlm_experiment.create_run_settings( "python", f"run_tf.py --device={test_device}" ) if wlmutils.get_test_launcher() != "local": run_settings.set_tasks(1) - model = exp.create_model("tf_script", run_settings) + model = wlm_experiment.create_model("tf_script", run_settings) script_dir = os.path.dirname(os.path.abspath(__file__)) script_path = Path(script_dir, "run_tf.py").resolve() model.attach_generator_files(to_copy=str(script_path)) - exp.generate(model) + wlm_experiment.generate(model) - exp.start(model, block=True) + wlm_experiment.start(model, block=True) - exp.stop(db) # if model failed, test will fail - model_status = exp.get_status(model)[0] + model_status = wlm_experiment.get_status(model)[0] assert model_status != SmartSimStatus.STATUS_FAILED diff --git a/tests/backends/test_torch.py b/tests/backends/test_torch.py index 94fc8793e..a3208fd3e 100644 --- a/tests/backends/test_torch.py +++ b/tests/backends/test_torch.py @@ -48,7 +48,9 @@ ) -def test_torch_model_and_script(test_dir, mlutils, wlmutils): +def test_torch_model_and_script( + wlm_experiment, prepare_fs, single_fs, mlutils, wlmutils +): """This test needs two free nodes, 1 for the db and 1 for a torch model script Here we test both the torchscipt API and the NN API from torch @@ -60,30 +62,24 @@ def test_torch_model_and_script(test_dir, mlutils, wlmutils): You may need to put CUDNN in your LD_LIBRARY_PATH if running on GPU """ - 
exp_name = "test_torch_model_and_script" - - exp = Experiment(exp_name, exp_path=test_dir, launcher=wlmutils.get_test_launcher()) + fs = prepare_fs(single_fs).featurestore + wlm_experiment.reconnect_feature_store(fs.checkpoint_file) test_device = mlutils.get_test_device() - db = wlmutils.get_feature_store(nodes=1) - db.set_path(test_dir) - exp.start(db) - - run_settings = exp.create_run_settings( + run_settings = wlm_experiment.create_run_settings( "python", f"run_torch.py --device={test_device}" ) if wlmutils.get_test_launcher() != "local": run_settings.set_tasks(1) - model = exp.create_model("torch_script", run_settings) + model = wlm_experiment.create_model("torch_script", run_settings) script_dir = os.path.dirname(os.path.abspath(__file__)) script_path = Path(script_dir, "run_torch.py").resolve() model.attach_generator_files(to_copy=str(script_path)) - exp.generate(model) + wlm_experiment.generate(model) - exp.start(model, block=True) + wlm_experiment.start(model, block=True) - exp.stop(db) # if model failed, test will fail - model_status = exp.get_status(model)[0] + model_status = wlm_experiment.get_status(model)[0] assert model_status != SmartSimStatus.STATUS_FAILED diff --git a/tests/full_wlm/test_generic_batch_launch.py b/tests/full_wlm/test_generic_batch_launch.py index 02316dfd1..fd8017c7c 100644 --- a/tests/full_wlm/test_generic_batch_launch.py +++ b/tests/full_wlm/test_generic_batch_launch.py @@ -45,7 +45,10 @@ def add_batch_resources(wlmutils, batch_settings): if isinstance(batch_settings, QsubBatchSettings): for key, value in wlmutils.get_batch_resources().items(): - batch_settings.set_resource(key, value) + if key == "queue": + batch_settings.set_queue(value) + else: + batch_settings.set_resource(key, value) def test_batch_model(fileutils, test_dir, wlmutils): @@ -55,7 +58,7 @@ def test_batch_model(fileutils, test_dir, wlmutils): exp = Experiment(exp_name, launcher=wlmutils.get_test_launcher(), exp_path=test_dir) script = 
fileutils.get_test_conf_path("sleep.py") - batch_settings = exp.create_batch_settings(nodes=1, time="00:01:00") + batch_settings = exp.create_batch_settings(nodes=1, time="00:05:00") batch_settings.set_account(wlmutils.get_test_account()) add_batch_resources(wlmutils, batch_settings) @@ -64,6 +67,7 @@ def test_batch_model(fileutils, test_dir, wlmutils): "model", path=test_dir, run_settings=run_settings, batch_settings=batch_settings ) + exp.generate(model) exp.start(model, block=True) statuses = exp.get_status(model) assert len(statuses) == 1 @@ -89,6 +93,7 @@ def test_batch_ensemble(fileutils, test_dir, wlmutils): ensemble.add_model(M1) ensemble.add_model(M2) + exp.generate(ensemble) exp.start(ensemble, block=True) statuses = exp.get_status(ensemble) assert all([stat == SmartSimStatus.STATUS_COMPLETED for stat in statuses]) diff --git a/tests/full_wlm/test_generic_orc_launch_batch.py b/tests/full_wlm/test_generic_orc_launch_batch.py index 20e7261c7..9ad69b56e 100644 --- a/tests/full_wlm/test_generic_orc_launch_batch.py +++ b/tests/full_wlm/test_generic_orc_launch_batch.py @@ -31,6 +31,7 @@ import pytest from smartsim import Experiment +from smartsim.settings.pbsSettings import QsubBatchSettings from smartsim.status import SmartSimStatus # retrieved from pytest fixtures @@ -43,6 +44,15 @@ ) +def add_batch_resources(wlmutils, batch_settings): + if isinstance(batch_settings, QsubBatchSettings): + for key, value in wlmutils.get_batch_resources().items(): + if key == "queue": + batch_settings.set_queue(value) + else: + batch_settings.set_resource(key, value) + + def test_launch_feature_store_auto_batch(test_dir, wlmutils): """test single node feature store""" launcher = wlmutils.get_test_launcher() @@ -60,8 +70,10 @@ def test_launch_feature_store_auto_batch(test_dir, wlmutils): ) feature_store.batch_settings.set_account(wlmutils.get_test_account()) + add_batch_resources(wlmutils, feature_store.batch_settings) - feature_store.batch_settings.set_walltime("00:02:00") + 
feature_store.batch_settings.set_walltime("00:05:00") + feature_store.set_path(test_dir) exp.start(feature_store, block=True) statuses = exp.get_status(feature_store) @@ -95,8 +107,10 @@ def test_launch_cluster_feature_store_batch_single(test_dir, wlmutils): ) feature_store.batch_settings.set_account(wlmutils.get_test_account()) + add_batch_resources(wlmutils, feature_store.batch_settings) - feature_store.batch_settings.set_walltime("00:02:00") + feature_store.batch_settings.set_walltime("00:05:00") + feature_store.set_path(test_dir) exp.start(feature_store, block=True) statuses = exp.get_status(feature_store) @@ -130,8 +144,10 @@ def test_launch_cluster_feature_store_batch_multi(test_dir, wlmutils): ) feature_store.batch_settings.set_account(wlmutils.get_test_account()) + add_batch_resources(wlmutils, feature_store.batch_settings) - feature_store.batch_settings.set_walltime("00:03:00") + feature_store.batch_settings.set_walltime("00:05:00") + feature_store.set_path(test_dir) exp.start(feature_store, block=True) statuses = exp.get_status(feature_store) @@ -162,8 +178,9 @@ def test_launch_cluster_feature_store_reconnect(test_dir, wlmutils): ) feature_store.batch_settings.set_account(wlmutils.get_test_account()) + add_batch_resources(wlmutils, feature_store.batch_settings) - feature_store.batch_settings.set_walltime("00:03:00") + feature_store.batch_settings.set_walltime("00:05:00") exp.start(feature_store, block=True) @@ -189,25 +206,25 @@ def test_launch_cluster_feature_store_reconnect(test_dir, wlmutils): statuses = exp_2.get_status(reloaded_feature_store) assert all(stat == SmartSimStatus.STATUS_RUNNING for stat in statuses) except Exception: - # Something went wrong! Let the experiment that started the DB - # clean up the DB + # Something went wrong! 
Let the experiment that started the FS + # clean up the FS exp.stop(feature_store) raise try: - # Test experiment 2 can stop the DB + # Test experiment 2 can stop the FS exp_2.stop(reloaded_feature_store) assert all( stat == SmartSimStatus.STATUS_CANCELLED for stat in exp_2.get_status(reloaded_feature_store) ) except Exception: - # Something went wrong! Let the experiment that started the DB - # clean up the DB + # Something went wrong! Let the experiment that started the FS + # clean up the FS exp.stop(feature_store) raise else: - # Ensure it is the same DB that Experiment 1 was tracking + # Ensure it is the same FS that Experiment 1 was tracking time.sleep(5) assert not any( stat == SmartSimStatus.STATUS_RUNNING for stat in exp.get_status(feature_store) diff --git a/tests/on_wlm/test_symlinking.py b/tests/full_wlm/test_symlinking.py similarity index 97% rename from tests/on_wlm/test_symlinking.py rename to tests/full_wlm/test_symlinking.py index df9647342..e3bdddb63 100644 --- a/tests/on_wlm/test_symlinking.py +++ b/tests/full_wlm/test_symlinking.py @@ -28,8 +28,13 @@ import pathlib import time +import pytest + from smartsim import Experiment +if pytest.test_launcher not in pytest.wlm_options: + pytestmark = pytest.mark.skip(reason="Not testing WLM integrations") + def test_batch_model_and_ensemble(test_dir, wlmutils): exp_name = "test-batch" diff --git a/tests/on_wlm/test_containers_wlm.py b/tests/on_wlm/test_containers_wlm.py index ede0817ef..3040393d1 100644 --- a/tests/on_wlm/test_containers_wlm.py +++ b/tests/on_wlm/test_containers_wlm.py @@ -50,10 +50,9 @@ def test_singularity_wlm_smartredis(fileutils, test_dir, wlmutils): """ launcher = wlmutils.get_test_launcher() - print(launcher) - if launcher not in ["pbs", "slurm"]: + if launcher not in ["pbs", "slurm", "dragon"]: pytest.skip( - f"Test only runs on systems with PBS or Slurm as WLM. Current launcher: {launcher}" + f"Test only runs on systems with PBS, Dragon, or Slurm as WLM. 
Current launcher: {launcher}" ) exp = Experiment( diff --git a/tests/on_wlm/test_dragon.py b/tests/on_wlm/test_dragon.py new file mode 100644 index 000000000..a05d38141 --- /dev/null +++ b/tests/on_wlm/test_dragon.py @@ -0,0 +1,94 @@ +# BSD 2-Clause License +# +# Copyright (c) 2021-2024, Hewlett Packard Enterprise +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+import pytest + +from smartsim import Experiment +from smartsim._core.launcher.dragon.dragonLauncher import DragonLauncher +from smartsim.status import SmartSimStatus + +# retrieved from pytest fixtures +if pytest.test_launcher != "dragon": + pytestmark = pytest.mark.skip(reason="Test is only for Dragon WLM systems") + + +def test_dragon_global_path(global_dragon_teardown, wlmutils, test_dir, monkeypatch): + monkeypatch.setenv("SMARTSIM_DRAGON_SERVER_PATH", test_dir) + exp: Experiment = Experiment( + "test_dragon_connection", + exp_path=test_dir, + launcher=wlmutils.get_test_launcher(), + ) + rs = exp.create_run_settings(exe="sleep", exe_args=["1"]) + model = exp.create_model("sleep", run_settings=rs) + + exp.generate(model) + exp.start(model, block=True) + + try: + assert exp.get_status(model)[0] == SmartSimStatus.STATUS_COMPLETED + finally: + launcher: DragonLauncher = exp._control._launcher + launcher.cleanup() + + +def test_dragon_exp_path(global_dragon_teardown, wlmutils, test_dir, monkeypatch): + monkeypatch.delenv("SMARTSIM_DRAGON_SERVER_PATH", raising=False) + monkeypatch.delenv("SMARTSIM_DRAGON_SERVER_PATH_EXP", raising=False) + exp: Experiment = Experiment( + "test_dragon_connection", + exp_path=test_dir, + launcher=wlmutils.get_test_launcher(), + ) + rs = exp.create_run_settings(exe="sleep", exe_args=["1"]) + model = exp.create_model("sleep", run_settings=rs) + + exp.generate(model) + exp.start(model, block=True) + try: + assert exp.get_status(model)[0] == SmartSimStatus.STATUS_COMPLETED + finally: + launcher: DragonLauncher = exp._control._launcher + launcher.cleanup() + + +def test_dragon_cannot_honor(wlmutils, test_dir): + exp: Experiment = Experiment( + "test_dragon_cannot_honor", + exp_path=test_dir, + launcher=wlmutils.get_test_launcher(), + ) + rs = exp.create_run_settings(exe="sleep", exe_args=["1"]) + rs.set_nodes(100) + model = exp.create_model("sleep", run_settings=rs) + + exp.generate(model) + exp.start(model, block=True) + + try: + assert 
exp.get_status(model)[0] == SmartSimStatus.STATUS_FAILED + finally: + launcher: DragonLauncher = exp._control._launcher + launcher.cleanup() diff --git a/tests/on_wlm/test_dragon_entrypoint.py b/tests/on_wlm/test_dragon_entrypoint.py new file mode 100644 index 000000000..025b5692f --- /dev/null +++ b/tests/on_wlm/test_dragon_entrypoint.py @@ -0,0 +1,295 @@ +# BSD 2-Clause License +# +# Copyright (c) 2021-2024, Hewlett Packard Enterprise +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+import os +import pathlib +import typing as t + +import pytest + +# retrieved from pytest fixtures +if pytest.test_launcher != "dragon": + pytestmark = pytest.mark.skip(reason="Test is only for Dragon WLM systems") + +try: + import smartsim._core.entrypoints.dragon as drg +except: + pytest.skip("Unable to import Dragon library", allow_module_level=True) + + +@pytest.fixture +def mock_argv() -> t.List[str]: + """Fixture for returning valid arguments to the entrypoint""" + return ["+launching_address", "mock-addr", "+interface", "mock-interface"] + + +def test_file_removal(test_dir: str, monkeypatch: pytest.MonkeyPatch): + """Verify that the log file is removed when expected""" + mock_file_name = "mocked_file_name.txt" + expected_path = pathlib.Path(test_dir) / mock_file_name + expected_path.touch() + + with monkeypatch.context() as ctx: + # ensure we get outputs in the test directory + ctx.setattr( + "smartsim._core.entrypoints.dragon.get_log_path", lambda: str(expected_path) + ) + + drg.remove_config_log() + assert not expected_path.exists(), "Dragon config file was not removed" + + +def test_file_removal_on_bad_path(test_dir: str, monkeypatch: pytest.MonkeyPatch): + """Verify that file removal doesn't blow up if the log file wasn't created""" + mock_file_name = "mocked_file_name.txt" + expected_path = pathlib.Path(test_dir) / mock_file_name + + with monkeypatch.context() as ctx: + # ensure we get outputs in the test directory + ctx.setattr( + "smartsim._core.entrypoints.dragon.get_log_path", lambda: str(expected_path) + ) + + # confirm the file doesn't exist... 
+ assert not expected_path.exists(), "Dragon config file was not found" + + try: + # ensure we don't blow up + drg.remove_config_log() + except: + assert False + + +def test_dragon_failure( + mock_argv: t.List[str], test_dir: str, monkeypatch: pytest.MonkeyPatch +): + """Verify that the expected cleanup actions are taken when the dragon + entrypoint exits""" + mock_file_name = "mocked_file_name.txt" + expected_path = pathlib.Path(test_dir) / mock_file_name + expected_path.touch() + + with monkeypatch.context() as ctx: + # ensure we get outputs in the test directory + ctx.setattr( + "smartsim._core.entrypoints.dragon.get_log_path", lambda: str(expected_path) + ) + + def raiser(args_) -> int: + raise Exception("Something bad...") + + # we don't need to execute the entrypoint... + ctx.setattr("smartsim._core.entrypoints.dragon.execute_entrypoint", raiser) + + return_code = drg.main(mock_argv) + + # ensure our exception error code is returned + assert return_code == -1 + + +def test_dragon_main( + mock_argv: t.List[str], test_dir: str, monkeypatch: pytest.MonkeyPatch +): + """Verify that the expected startup & cleanup actions are taken when the dragon + entrypoint exits""" + mock_file_name = "mocked_file_name.txt" + expected_path = pathlib.Path(test_dir) / mock_file_name + expected_path.touch() + + with monkeypatch.context() as ctx: + # ensure we get outputs in the test directory + ctx.setattr( + "smartsim._core.entrypoints.dragon.get_log_path", lambda: str(expected_path) + ) + # we don't need to execute the actual entrypoint... + ctx.setattr( + "smartsim._core.entrypoints.dragon.execute_entrypoint", lambda args_: 0 + ) + + return_code = drg.main(mock_argv) + + # execute_entrypoint should return 0 from our mock + assert return_code == 0 + # the cleanup should remove our config file + assert not expected_path.exists(), "Dragon config file was not removed!" 
+ # the environment should be set as expected + assert os.environ.get("PYTHONUNBUFFERED", None) == "1" + + +@pytest.mark.parametrize( + "signal_num", + [ + pytest.param(0, id="non-truthy signal"), + pytest.param(-1, id="negative signal"), + pytest.param(1, id="positive signal"), + ], +) +def test_signal_handler(signal_num: int, monkeypatch: pytest.MonkeyPatch): + """Verify that the signal handler performs expected actions""" + counter: int = 0 + + def increment_counter(*args, **kwargs): + nonlocal counter + counter += 1 + + with monkeypatch.context() as ctx: + ctx.setattr("smartsim._core.entrypoints.dragon.cleanup", increment_counter) + ctx.setattr("smartsim._core.entrypoints.dragon.logger.info", increment_counter) + + drg.handle_signal(signal_num, None) + + # show that we log informational message & do cleanup (take 2 actions) + assert counter == 2 + + +def test_log_path(monkeypatch: pytest.MonkeyPatch): + """Verify that the log path is loaded & returned as expected""" + + with monkeypatch.context() as ctx: + expected_filename = "foo.log" + ctx.setattr( + "smartsim._core.config.config.Config.dragon_log_filename", expected_filename + ) + + log_path = drg.get_log_path() + + assert expected_filename in log_path + + +def test_summary(test_dir: str, monkeypatch: pytest.MonkeyPatch): + """Verify that the summary is written to expected location w/expected information""" + + with monkeypatch.context() as ctx: + expected_ip = "127.0.0.111" + expected_interface = "mock_int0" + summary_file = pathlib.Path(test_dir) / "foo.log" + expected_hostname = "mockhostname" + + ctx.setattr( + "smartsim._core.config.config.Config.dragon_log_filename", + str(summary_file), + ) + ctx.setattr( + "smartsim._core.entrypoints.dragon.socket.gethostname", + lambda: expected_hostname, + ) + + drg.print_summary(expected_interface, expected_ip) + + summary = summary_file.read_text() + + assert expected_ip in summary + assert expected_interface in summary + assert expected_hostname in summary + + 
+def test_cleanup(monkeypatch: pytest.MonkeyPatch): + """Verify that the cleanup function attempts to remove the log file""" + counter: int = 0 + + def increment_counter(*args, **kwargs): + nonlocal counter + counter += 1 + + with monkeypatch.context() as ctx: + ctx.setattr( + "smartsim._core.entrypoints.dragon.remove_config_log", increment_counter + ) + drg.SHUTDOWN_INITIATED = False + drg.cleanup() + + # show that cleanup removes config + assert counter == 1 + # show that cleanup alters the flag to enable shutdown + assert drg.SHUTDOWN_INITIATED + + +def test_signal_handler_registration(test_dir: str, monkeypatch: pytest.MonkeyPatch): + """Verify that signal handlers are registered for all expected signals""" + sig_nums: t.List[int] = [] + + def track_args(*args, **kwargs): + nonlocal sig_nums + sig_nums.append(args[0]) + + with monkeypatch.context() as ctx: + ctx.setattr("smartsim._core.entrypoints.dragon.signal.signal", track_args) + + # ensure valid start point + assert not sig_nums + + drg.register_signal_handlers() + + # ensure all expected handlers are registered + assert set(sig_nums) == set(drg.SIGNALS) + + +def test_arg_parser__no_args(): + """Verify arg parser fails when no args are not supplied""" + args_list = [] + + with pytest.raises(SystemExit) as ex: + # ensure that parser complains about missing required arguments + drg.parse_arguments(args_list) + + +def test_arg_parser__invalid_launch_addr(): + """Verify arg parser fails with empty launch_address""" + addr_flag = "+launching_address" + addr_value = "" + + args_list = [addr_flag, addr_value] + + with pytest.raises(ValueError) as ex: + args = drg.parse_arguments(args_list) + + +def test_arg_parser__required_only(): + """Verify arg parser succeeds when optional args are omitted""" + addr_flag = "+launching_address" + addr_value = "mock-address" + + args_list = [addr_flag, addr_value] + + args = drg.parse_arguments(args_list) + + assert args.launching_address == addr_value + assert not 
args.interface + + +def test_arg_parser__with_optionals(): + """Verify arg parser succeeds when optional args are included""" + addr_flag = "+launching_address" + addr_value = "mock-address" + + interface_flag = "+interface" + interface_value = "mock-int" + + args_list = [interface_flag, interface_value, addr_flag, addr_value] + + args = drg.parse_arguments(args_list) + + assert args.launching_address == addr_value + assert args.interface == interface_value diff --git a/tests/on_wlm/test_preview_wlm.py b/tests/on_wlm/test_preview_wlm.py new file mode 100644 index 000000000..d6c4b71bd --- /dev/null +++ b/tests/on_wlm/test_preview_wlm.py @@ -0,0 +1,409 @@ +# BSD 2-Clause License +# +# Copyright (c) 2021-2023, Hewlett Packard Enterprise +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +from os import path as osp + +import numpy as np +import pytest +from jinja2.filters import FILTERS + +from smartsim import Experiment +from smartsim._core import Manifest, previewrenderer +from smartsim._core.config import CONFIG +from smartsim.database import FeatureStore +from smartsim.settings import QsubBatchSettings, RunSettings + +pytestmark = pytest.mark.slow_tests + +on_wlm = (pytest.test_launcher in pytest.wlm_options,) + + +@pytest.fixture +def choose_host(): + def _choose_host(wlmutils, index: int = 0): + hosts = wlmutils.get_test_hostlist() + if hosts: + return hosts[index] + return None + + return _choose_host + + +def add_batch_resources(wlmutils, batch_settings): + if isinstance(batch_settings, QsubBatchSettings): + for key, value in wlmutils.get_batch_resources().items(): + batch_settings.set_resource(key, value) + + +@pytest.mark.skipif( + pytest.test_launcher not in pytest.wlm_options, + reason="Not testing WLM integrations", +) +def test_preview_wlm_run_commands_cluster_feature_store_model( + test_dir, coloutils, fileutils, wlmutils +): + """ + Test preview of wlm run command and run arguments on a + feature store and model + """ + + exp_name = "test-preview-feature-store-model" + launcher = wlmutils.get_test_launcher() + test_port = wlmutils.get_test_port() + test_script = fileutils.get_test_conf_path("smartredis/multidbid.py") + exp = Experiment(exp_name, launcher=launcher, exp_path=test_dir) + + network_interface = 
wlmutils.get_test_interface() + feature_store = exp.create_featurestore( + wlmutils.get_test_port(), + fs_nodes=3, + batch=False, + interface=network_interface, + single_cmd=True, + hosts=wlmutils.get_test_hostlist(), + fs_identifier="testfs_reg", + ) + + fs_args = { + "port": test_port, + "fs_cpus": 1, + "debug": True, + "fs_identifier": "testfs_colo", + } + + # Create model with colocated feature store + smartsim_model = coloutils.setup_test_colo( + fileutils, "uds", exp, test_script, fs_args, on_wlm=on_wlm + ) + + preview_manifest = Manifest(feature_store, smartsim_model) + + # Execute preview method + output = previewrenderer.render(exp, preview_manifest, verbosity_level="debug") + + # Evaluate output + if pytest.test_launcher != "dragon": + assert "Run Command" in output + assert "ntasks" in output + assert "Run Arguments" in output + assert "nodes" in output + + +@pytest.mark.skipif( + pytest.test_launcher not in pytest.wlm_options, + reason="Not testing WLM integrations", +) +def test_preview_model_on_wlm(fileutils, test_dir, wlmutils): + """ + Test preview of wlm run command and run arguments for a model + """ + exp_name = "test-preview-model-wlm" + exp = Experiment(exp_name, launcher=wlmutils.get_test_launcher(), exp_path=test_dir) + + script = fileutils.get_test_conf_path("sleep.py") + settings1 = wlmutils.get_base_run_settings("python", f"{script} --time=5") + settings2 = wlmutils.get_base_run_settings("python", f"{script} --time=5") + M1 = exp.create_model("m1", path=test_dir, run_settings=settings1) + M2 = exp.create_model("m2", path=test_dir, run_settings=settings2) + + preview_manifest = Manifest(M1, M2) + + # Execute preview method + output = previewrenderer.render(exp, preview_manifest, verbosity_level="debug") + + if pytest.test_launcher != "dragon": + assert "Run Command" in output + assert "ntasks" in output + assert "time" in output + assert "nodes" in output + assert "Run Arguments" in output + + +@pytest.mark.skipif( + pytest.test_launcher 
not in pytest.wlm_options, + reason="Not testing WLM integrations", +) +def test_preview_batch_model(fileutils, test_dir, wlmutils): + """Test the preview of a model with batch settings""" + + exp_name = "test-batch-model" + exp = Experiment(exp_name, launcher=wlmutils.get_test_launcher(), exp_path=test_dir) + + script = fileutils.get_test_conf_path("sleep.py") + batch_settings = exp.create_batch_settings(nodes=1, time="00:01:00") + + batch_settings.set_account(wlmutils.get_test_account()) + add_batch_resources(wlmutils, batch_settings) + run_settings = wlmutils.get_run_settings("python", f"{script} --time=5") + model = exp.create_model( + "model", path=test_dir, run_settings=run_settings, batch_settings=batch_settings + ) + model.set_path(test_dir) + + preview_manifest = Manifest(model) + + # Execute preview method + output = previewrenderer.render(exp, preview_manifest, verbosity_level="debug") + + assert "Batch Launch: True" in output + assert "Batch Command" in output + assert "Batch Arguments" in output + assert "nodes" in output + assert "time" in output + + +@pytest.mark.skipif( + pytest.test_launcher not in pytest.wlm_options, + reason="Not testing WLM integrations", +) +def test_preview_batch_ensemble(fileutils, test_dir, wlmutils): + """Test preview of a batch ensemble""" + + exp_name = "test-preview-batch-ensemble" + exp = Experiment(exp_name, launcher=wlmutils.get_test_launcher(), exp_path=test_dir) + + script = fileutils.get_test_conf_path("sleep.py") + settings = wlmutils.get_run_settings("python", f"{script} --time=5") + M1 = exp.create_model("m1", path=test_dir, run_settings=settings) + M2 = exp.create_model("m2", path=test_dir, run_settings=settings) + + batch = exp.create_batch_settings(nodes=1, time="00:01:00") + add_batch_resources(wlmutils, batch) + + batch.set_account(wlmutils.get_test_account()) + ensemble = exp.create_ensemble("batch-ens", batch_settings=batch) + ensemble.add_model(M1) + ensemble.add_model(M2) + ensemble.set_path(test_dir) + 
+ preview_manifest = Manifest(ensemble) + + # Execute preview method + output = previewrenderer.render(exp, preview_manifest, verbosity_level="debug") + + assert "Batch Launch: True" in output + assert "Batch Command" in output + assert "Batch Arguments" in output + assert "nodes" in output + assert "time" in output + + +@pytest.mark.skipif( + pytest.test_launcher not in pytest.wlm_options, + reason="Not testing WLM integrations", +) +def test_preview_launch_command(test_dir, wlmutils, choose_host): + """Test preview launch command for feature store, models, and + ensembles""" + # Prepare entities + test_launcher = wlmutils.get_test_launcher() + test_interface = wlmutils.get_test_interface() + test_port = wlmutils.get_test_port() + exp_name = "test_preview_launch_command" + exp = Experiment(exp_name, exp_path=test_dir, launcher=test_launcher) + # create regular feature store + feature_store = exp.create_featurestore( + port=test_port, + interface=test_interface, + hosts=choose_host(wlmutils), + ) + + model_params = {"port": 6379, "password": "unbreakable_password"} + rs1 = RunSettings("bash", "multi_tags_template.sh") + rs2 = exp.create_run_settings("echo", ["spam", "eggs"]) + + hello_world_model = exp.create_model( + "echo-hello", run_settings=rs1, params=model_params + ) + + spam_eggs_model = exp.create_model("echo-spam", run_settings=rs2) + + # setup ensemble parameter space + learning_rate = list(np.linspace(0.01, 0.5)) + train_params = {"LR": learning_rate} + + run = exp.create_run_settings(exe="python", exe_args="./train-model.py") + + ensemble = exp.create_ensemble( + "Training-Ensemble", + params=train_params, + params_as_args=["LR"], + run_settings=run, + perm_strategy="random", + n_models=4, + ) + + preview_manifest = Manifest(feature_store, spam_eggs_model, hello_world_model, ensemble) + + # Execute preview method + output = previewrenderer.render(exp, preview_manifest, verbosity_level="debug") + + assert "feature store" in output + assert "echo-spam" in 
output + assert "echo-hello" in output + + assert "Training-Ensemble" in output + assert "me: Training-Ensemble_0" in output + assert "Training-Ensemble_1" in output + assert "Training-Ensemble_2" in output + assert "Training-Ensemble_3" in output + + +@pytest.mark.skipif( + pytest.test_launcher not in pytest.wlm_options, + reason="Not testing WLM integrations", +) +def test_preview_batch_launch_command(fileutils, test_dir, wlmutils): + """Test the preview of a model with batch settings""" + + exp_name = "test-batch-entities" + exp = Experiment(exp_name, launcher=wlmutils.get_test_launcher(), exp_path=test_dir) + + script = fileutils.get_test_conf_path("sleep.py") + batch_settings = exp.create_batch_settings(nodes=1, time="00:01:00") + + batch_settings.set_account(wlmutils.get_test_account()) + add_batch_resources(wlmutils, batch_settings) + run_settings = wlmutils.get_run_settings("python", f"{script} --time=5") + model = exp.create_model( + "model", path=test_dir, run_settings=run_settings, batch_settings=batch_settings + ) + model.set_path(test_dir) + + feature_store = FeatureStore( + wlmutils.get_test_port(), + fs_nodes=3, + batch=True, + interface="lo", + launcher="slurm", + run_command="srun", + ) + feature_store.set_batch_arg("account", "ACCOUNT") + + preview_manifest = Manifest(feature_store, model) + # Execute preview method + output = previewrenderer.render(exp, preview_manifest, verbosity_level="debug") + + # Evaluate output + assert "Batch Launch: True" in output + assert "Batch Command" in output + assert "Batch Arguments" in output + + +@pytest.mark.skipif( + pytest.test_launcher not in pytest.wlm_options, + reason="Not testing WLM integrations", +) +def test_ensemble_batch(test_dir, wlmutils): + """ + Test preview of client configuration and key prefixing in Ensemble preview + """ + # Prepare entities + test_launcher = wlmutils.get_test_launcher() + exp = Experiment( + "test-preview-ensemble-clientconfig", exp_path=test_dir, launcher=test_launcher + 
) + # Create feature store + fs = exp.create_featurestore(port=6780, interface="lo") + exp.generate(fs, overwrite=True) + rs1 = exp.create_run_settings("echo", ["hello", "world"]) + # Create ensemble + batch_settings = exp.create_batch_settings(nodes=1, time="00:01:00") + batch_settings.set_account(wlmutils.get_test_account()) + add_batch_resources(wlmutils, batch_settings) + ensemble = exp.create_ensemble( + "fd_simulation", run_settings=rs1, batch_settings=batch_settings, replicas=2 + ) + # enable key prefixing on ensemble + ensemble.enable_key_prefixing() + exp.generate(ensemble, overwrite=True) + rs2 = exp.create_run_settings("echo", ["spam", "eggs"]) + # Create model + ml_model = exp.create_model("tf_training", rs2) + + for sim in ensemble.entities: + ml_model.register_incoming_entity(sim) + + exp.generate(ml_model, overwrite=True) + + preview_manifest = Manifest(fs, ml_model, ensemble) + + # Call preview renderer for testing output + output = previewrenderer.render(exp, preview_manifest, verbosity_level="debug") + + # Evaluate output + assert "Client Configuration" in output + assert "Feature Store Identifier" in output + assert "Feature Store Backend" in output + assert "Type" in output + + +@pytest.mark.skipif( + pytest.test_launcher not in pytest.wlm_options, + reason="Not testing WLM integrations", +) +def test_preview_ensemble_fs_script(wlmutils, test_dir): + """ + Test preview of a torch script on a model in an ensemble. 
+ """ + # Initialize the Experiment and set the launcher to auto + test_launcher = wlmutils.get_test_launcher() + exp = Experiment("getting-started", launcher=test_launcher) + + feature_store = exp.create_featurestore(fs_identifier="test_fs1") + feature_store_2 = exp.create_featurestore(fs_identifier="test_fs2", fs_nodes=3) + # Initialize a RunSettings object + model_settings = exp.create_run_settings(exe="python", exe_args="params.py") + model_settings_2 = exp.create_run_settings(exe="python", exe_args="params.py") + model_settings_3 = exp.create_run_settings(exe="python", exe_args="params.py") + # Initialize a Model object + model_instance = exp.create_model("model_name", model_settings) + model_instance_2 = exp.create_model("model_name_2", model_settings_2) + batch = exp.create_batch_settings(time="24:00:00", account="test") + ensemble = exp.create_ensemble( + "ensemble", batch_settings=batch, run_settings=model_settings_3, replicas=2 + ) + ensemble.add_model(model_instance) + ensemble.add_model(model_instance_2) + + # TorchScript string + torch_script_str = "def negate(x):\n\treturn torch.neg(x)\n" + + # Attach TorchScript to Model + model_instance.add_script( + name="example_script", + script=torch_script_str, + device="GPU", + devices_per_node=2, + first_device=0, + ) + preview_manifest = Manifest(ensemble, feature_store, feature_store_2) + + # Call preview renderer for testing output + output = previewrenderer.render(exp, preview_manifest, verbosity_level="debug") + + # Evaluate output + assert "Torch Script" in output diff --git a/tests/on_wlm/test_simple_entity_launch.py b/tests/on_wlm/test_simple_entity_launch.py index d16c81487..28ddf92f7 100644 --- a/tests/on_wlm/test_simple_entity_launch.py +++ b/tests/on_wlm/test_simple_entity_launch.py @@ -24,7 +24,9 @@ # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+import os.path from copy import deepcopy +from pathlib import Path import pytest @@ -63,6 +65,37 @@ def test_models(fileutils, test_dir, wlmutils): assert all([stat == SmartSimStatus.STATUS_COMPLETED for stat in statuses]) +def test_multinode_app(mpi_app_path, test_dir, wlmutils): + + if not mpi_app_path: + pytest.skip("Test needs MPI to run") + + exp_name = "test-mpi-app" + exp = Experiment(exp_name, launcher=wlmutils.get_test_launcher(), exp_path=test_dir) + + settings = exp.create_run_settings(str(mpi_app_path), []) + settings.set_nodes(3) + + model = exp.create_model("mpi_app", run_settings=settings) + exp.generate(model) + + exp.start(model, block=True) + + p = Path(model.path) + output_files = sorted([str(path) for path in p.glob("mpi_hello*")]) + expected_files = sorted( + [os.path.join(model.path, f"mpi_hello.{idx}.log") for idx in range(3)] + ) + + assert output_files == expected_files + + for index, file in enumerate(output_files): + with open(file) as f: + assert f.readlines() == [ + f"Hello world from rank {index} out of 3 processors\n" + ] + + def test_ensemble(fileutils, test_dir, wlmutils): exp_name = "test-ensemble-launch" exp = Experiment(exp_name, launcher=wlmutils.get_test_launcher(), exp_path=test_dir) @@ -84,21 +117,21 @@ def test_summary(fileutils, test_dir, wlmutils): exp_name = "test-launch-summary" exp = Experiment(exp_name, launcher=wlmutils.get_test_launcher(), exp_path=test_dir) - sleep = fileutils.get_test_conf_path("sleep.py") + sleep_exp = fileutils.get_test_conf_path("sleep.py") bad = fileutils.get_test_conf_path("bad.py") - sleep_settings = exp.create_run_settings("python", f"{sleep} --time=3") + sleep_settings = exp.create_run_settings("python", f"{sleep_exp} --time=3") sleep_settings.set_tasks(1) bad_settings = exp.create_run_settings("python", f"{bad} --time=6") bad_settings.set_tasks(1) - sleep = exp.create_model("sleep", path=test_dir, run_settings=sleep_settings) + sleep_exp = exp.create_model("sleep", path=test_dir, 
run_settings=sleep_settings) bad = exp.create_model("bad", path=test_dir, run_settings=bad_settings) # start and poll - exp.start(sleep, bad) + exp.start(sleep_exp, bad) assert exp.get_status(bad)[0] == SmartSimStatus.STATUS_FAILED - assert exp.get_status(sleep)[0] == SmartSimStatus.STATUS_COMPLETED + assert exp.get_status(sleep_exp)[0] == SmartSimStatus.STATUS_COMPLETED summary_str = exp.summary(style="plain") print(summary_str) @@ -106,13 +139,18 @@ def test_summary(fileutils, test_dir, wlmutils): rows = [s.split() for s in summary_str.split("\n")] headers = ["Index"] + rows.pop(0) + # There is no guarantee that the order of + # the rows will be sleep, bad row = dict(zip(headers, rows[0])) - assert sleep.name == row["Name"] - assert sleep.type == row["Entity-Type"] + row_1 = dict(zip(headers, rows[1])) + if row["Name"] != sleep_exp.name: + row_1, row = row, row_1 + + assert sleep_exp.name == row["Name"] + assert sleep_exp.type == row["Entity-Type"] assert 0 == int(row["RunID"]) assert 0 == int(row["Returncode"]) - row_1 = dict(zip(headers, rows[1])) assert bad.name == row_1["Name"] assert bad.type == row_1["Entity-Type"] assert 0 == int(row_1["RunID"]) diff --git a/tests/on_wlm/test_wlm_orc_config_settings.py b/tests/on_wlm/test_wlm_orc_config_settings.py index fc661638a..50654b026 100644 --- a/tests/on_wlm/test_wlm_orc_config_settings.py +++ b/tests/on_wlm/test_wlm_orc_config_settings.py @@ -27,6 +27,9 @@ import pytest from smartsim.error import SmartSimError +from smartsim.log import get_logger + +logger = get_logger(__name__) # retrieved from pytest fixtures if pytest.test_launcher not in pytest.wlm_options: @@ -40,13 +43,15 @@ pytestmark = pytest.mark.skip(reason="SmartRedis version is < 0.3.1") -def test_config_methods_on_wlm_single(fsutils, fs): +def test_config_methods_on_wlm_single(fsutils, prepare_fs, single_fs): """Test all configuration file edit methods on single node WLM fs""" + fs = prepare_fs(single_fs).featurestore # test the happy path and ensure 
all configuration file edit methods # successfully execute when given correct key-value pairs configs = fsutils.get_fs_configs() for setting, value in configs.items(): + logger.debug(f"Setting {setting}={value}") config_set_method = fsutils.get_config_edit_method(fs, setting) config_set_method(value) @@ -67,14 +72,16 @@ def test_config_methods_on_wlm_single(fsutils, fs): fs.set_fs_conf(key, value) -def test_config_methods_on_wlm_cluster(fsutils, fs_cluster): +def test_config_methods_on_wlm_cluster(fsutils, prepare_fs, clustered_fs): """Test all configuration file edit methods on an active clustered fs""" + fs = prepare_fs(clustered_fs).featurestore # test the happy path and ensure all configuration file edit methods # successfully execute when given correct key-value pairs configs = fsutils.get_fs_configs() for setting, value in configs.items(): - config_set_method = fsutils.get_config_edit_method(fs_cluster, setting) + logger.debug(f"Setting {setting}={value}") + config_set_method = fsutils.get_config_edit_method(fs, setting) config_set_method(value) # ensure SmartSimError is raised when a clustered feature store's @@ -83,7 +90,8 @@ def test_config_methods_on_wlm_cluster(fsutils, fs_cluster): for key, value_list in ss_error_configs.items(): for value in value_list: with pytest.raises(SmartSimError): - fs_cluster.set_fs_conf(key, value) + logger.debug(f"Setting {key}={value}") + fs.set_fs_conf(key, value) # ensure TypeError is raised when a clustered feature store's # FeatureStore.set_fs_conf is given invalid CONFIG key-value pairs @@ -91,4 +99,5 @@ def test_config_methods_on_wlm_cluster(fsutils, fs_cluster): for key, value_list in type_error_configs.items(): for value in value_list: with pytest.raises(TypeError): - fs_cluster.set_fs_conf(key, value) + logger.debug(f"Setting {key}={value}") + fs.set_fs_conf(key, value) diff --git a/tests/test_collector_manager.py b/tests/test_collector_manager.py index 5590f8b9f..dd789086d 100644 --- 
a/tests/test_collector_manager.py +++ b/tests/test_collector_manager.py @@ -1,6 +1,6 @@ # BSD 2-Clause License # -# Copyright (c) 2021-2023, Hewlett Packard Enterprise +# Copyright (c) 2021-2024, Hewlett Packard Enterprise # All rights reserved. # # Redistribution and use in source and binary forms, with or without @@ -246,11 +246,13 @@ async def test_collector_manager_collect_filesink( @pytest.mark.asyncio async def test_collector_manager_collect_integration( - test_dir: str, mock_entity: MockCollectorEntityFunc, local_fs, mock_sink + test_dir: str, mock_entity: MockCollectorEntityFunc, prepare_fs, local_fs, mock_sink ) -> None: """Ensure that all collectors are executed and some metric is retrieved""" - entity1 = mock_entity(port=local_fs.ports[0], name="e1", telemetry_on=True) - entity2 = mock_entity(port=local_fs.ports[0], name="e2", telemetry_on=True) + + fs = prepare_fs(local_fs).featurestore + entity1 = mock_entity(port=fs.ports[0], name="e1", telemetry_on=True) + entity2 = mock_entity(port=fs.ports[0], name="e2", telemetry_on=True) # todo: consider a MockSink so i don't have to save the last value in the collector sinks = [mock_sink(), mock_sink(), mock_sink()] diff --git a/tests/test_collector_sink.py b/tests/test_collector_sink.py index 4506a3c0c..148a72ef7 100644 --- a/tests/test_collector_sink.py +++ b/tests/test_collector_sink.py @@ -1,6 +1,6 @@ # BSD 2-Clause License # -# Copyright (c) 2021-2023, Hewlett Packard Enterprise +# Copyright (c) 2021-2024, Hewlett Packard Enterprise # All rights reserved. # # Redistribution and use in source and binary forms, with or without diff --git a/tests/test_collectors.py b/tests/test_collectors.py index 09fac1484..fdc8f6780 100644 --- a/tests/test_collectors.py +++ b/tests/test_collectors.py @@ -1,6 +1,6 @@ # BSD 2-Clause License # -# Copyright (c) 2021-2023, Hewlett Packard Enterprise +# Copyright (c) 2021-2024, Hewlett Packard Enterprise # All rights reserved. 
# # Redistribution and use in source and binary forms, with or without @@ -42,6 +42,8 @@ # The tests in this file belong to the group_a group pytestmark = pytest.mark.group_a +PrepareFS = t.Callable[[dict], smartsim.experiment.FeatureStore] + @pytest.mark.asyncio async def test_dbmemcollector_prepare( @@ -171,12 +173,15 @@ async def test_dbmemcollector_collect( async def test_dbmemcollector_integration( mock_entity: MockCollectorEntityFunc, mock_sink: MockSink, - local_fs: smartsim.experiment.FeatureStore, + prepare_fs: PrepareFS, + local_fs: dict, monkeypatch: pytest.MonkeyPatch, ) -> None: """Integration test with a real feature store instance to ensure output data matches expectations and proper db client API uage""" - entity = mock_entity(port=local_fs.ports[0], telemetry_on=True) + + fs = prepare_fs(local_fs).featurestore + entity = mock_entity(port=fs.ports[0], telemetry_on=True) sink = mock_sink() collector = DBMemoryCollector(entity, sink) @@ -268,12 +273,15 @@ async def test_dbconn_count_collector_collect( async def test_dbconncollector_integration( mock_entity: MockCollectorEntityFunc, mock_sink: MockSink, - local_fs: smartsim.experiment.FeatureStore, + prepare_fs: PrepareFS, + local_fs: dict, monkeypatch: pytest.MonkeyPatch, ) -> None: """Integration test with a real feature store instance to ensure output data matches expectations and proper db client API uage""" - entity = mock_entity(port=local_fs.ports[0], telemetry_on=True) + + fs = prepare_fs(local_fs).featurestore + entity = mock_entity(port=fs.ports[0], telemetry_on=True) sink = mock_sink() collector = DBConnectionCollector(entity, sink) diff --git a/tests/test_config.py b/tests/test_config.py index 5cd13f2c5..00a1fcdd3 100644 --- a/tests/test_config.py +++ b/tests/test_config.py @@ -253,3 +253,31 @@ def test_telemetry_cooldown( monkeypatch.delenv("SMARTSIM_TELEMETRY_COOLDOWN", raising=False) config = Config() assert config.telemetry_cooldown == exp_result + + +def 
test_key_path_unset(monkeypatch: pytest.MonkeyPatch): + """Ensure that the default value of the key path meets expectations""" + monkeypatch.delenv("SMARTSIM_KEY_PATH", raising=False) + + config = Config() + + key_path = config.smartsim_key_path + + exp_default = Path.home() / ".smartsim" / "keys" + assert str(exp_default) == key_path, "Unexpected default key path" + + +def test_key_path_non_default(monkeypatch: pytest.MonkeyPatch): + """Ensure that the environment variable for key path overrides + the default when it is set""" + key_path1 = "/foo/bar" + key_path2 = "/foo/baz" + config = Config() + + monkeypatch.setenv("SMARTSIM_KEY_PATH", key_path1) + actual_value = config.smartsim_key_path + assert key_path1 == actual_value, "Key path 1 didn't match overridden value" + + monkeypatch.setenv("SMARTSIM_KEY_PATH", key_path2) + actual_value = config.smartsim_key_path + assert key_path2 == actual_value, "Key path 2 didn't match overridden value" diff --git a/tests/test_configs/mpi/mpi_hello.c b/tests/test_configs/mpi/mpi_hello.c new file mode 100755 index 000000000..dcf80f3ac --- /dev/null +++ b/tests/test_configs/mpi/mpi_hello.c @@ -0,0 +1,35 @@ +#include +#include +#include +#include +#include +#include + + +int main(int argc, char** argv) { + sleep(1); + // Initialize the MPI environment + MPI_Init(NULL, NULL); + + // Get the number of processes + int world_size; + MPI_Comm_size(MPI_COMM_WORLD, &world_size); + + // Get the rank of the process + int world_rank; + MPI_Comm_rank(MPI_COMM_WORLD, &world_rank); + + char filename[64]; + sprintf(filename, "mpi_hello.%d.log", world_rank); + FILE *log = fopen(filename, "w"); + + fprintf(log, "Hello world from rank %d out of %d processors\n", + world_rank, world_size); + fflush(log); + + // unlink(filename); + fclose(log); + + // Finalize the MPI environment. 
+ MPI_Finalize(); +} diff --git a/tests/test_configs/smartredis/multidbid_colo_env_vars_only.py b/tests/test_configs/smartredis/multidbid_colo_env_vars_only.py new file mode 100644 index 000000000..74a15c010 --- /dev/null +++ b/tests/test_configs/smartredis/multidbid_colo_env_vars_only.py @@ -0,0 +1,52 @@ +# BSD 2-Clause License +# +# Copyright (c) 2021-2023, Hewlett Packard Enterprise +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +import argparse +import os + +from smartredis import Client, ConfigOptions + +if __name__ == "__main__": + """For inclusion in test with two unique database identifiers with multiple + databases where one (presumably colocated) database is started before the + other, and thus only one DB ID is known at application runtime and + available via environment variable. + """ + + parser = argparse.ArgumentParser(description="SmartRedis") + parser.add_argument("--exchange", action="store_true") + parser.add_argument("--should-see-reg-db", action="store_true") + args = parser.parse_args() + + env_vars = [ + "SSDB_testdb_colo", + "SR_DB_TYPE_testdb_colo", + ] + + assert all([var in os.environ for var in env_vars]) + + opts = ConfigOptions.create_from_environment("testdb_colo") + Client(opts, logger_name="SmartSim") diff --git a/tests/test_containers.py b/tests/test_containers.py index 18651183b..48cd31dbe 100644 --- a/tests/test_containers.py +++ b/tests/test_containers.py @@ -31,8 +31,7 @@ import pytest -from smartsim import Experiment -from smartsim.database import FeatureStore +from smartsim import Experiment, status from smartsim.entity import Ensemble from smartsim.settings.containers import Singularity from smartsim.status import SmartSimStatus @@ -143,7 +142,7 @@ def test_singularity_args(fileutils, test_dir): @pytest.mark.skipif(not singularity_exists, reason="Test needs singularity to run") -def test_singularity_smartredis(test_dir, fileutils, wlmutils): +def test_singularity_smartredis(local_experiment, prepare_fs, local_fs, fileutils): """Run two processes, each process puts a tensor on the DB, then accesses the other process's tensor. Finally, the tensor is used to run a model. 
@@ -151,18 +150,13 @@ def test_singularity_smartredis(test_dir, fileutils, wlmutils): Note: This is a containerized port of test_smartredis.py """ - exp = Experiment( - "smartredis_ensemble_exchange", exp_path=test_dir, launcher="local" - ) - # create and start a feature store - feature_store = FeatureStore(port=wlmutils.get_test_port()) - exp.generate(feature_store) - exp.start(feature_store, block=False) + fs = prepare_fs(local_fs).featurestore + local_experiment.reconnect_feature_store(fs.checkpoint_file) container = Singularity(containerURI) - rs = exp.create_run_settings( + rs = local_experiment.create_run_settings( "python3", "producer.py --exchange", container=container ) params = {"mult": [1, -10]} @@ -179,18 +173,12 @@ def test_singularity_smartredis(test_dir, fileutils, wlmutils): config = fileutils.get_test_conf_path("smartredis") ensemble.attach_generator_files(to_copy=[config]) - exp.generate(ensemble) + local_experiment.generate(ensemble) # start the models - exp.start(ensemble, summary=False) + local_experiment.start(ensemble, summary=False) # get and confirm statuses - statuses = exp.get_status(ensemble) + statuses = local_experiment.get_status(ensemble) if not all([stat == SmartSimStatus.STATUS_COMPLETED for stat in statuses]): - exp.stop(feature_store) assert False # client ensemble failed - - # stop the FeatureStore - exp.stop(feature_store) - - print(exp.summary()) diff --git a/tests/test_controller_errors.py b/tests/test_controller_errors.py index c3b5df4dc..58f53c379 100644 --- a/tests/test_controller_errors.py +++ b/tests/test_controller_errors.py @@ -29,6 +29,7 @@ from smartsim._core.control import Controller, Manifest from smartsim._core.launcher.step import Step +from smartsim._core.launcher.step.dragonStep import DragonStep from smartsim.database import FeatureStore from smartsim.entity import Model from smartsim.entity.ensemble import Ensemble @@ -160,23 +161,23 @@ def test_duplicate_running_entity(test_dir, wlmutils, entity): def 
test_restarting_entity(test_dir, wlmutils, entity): """Validate restarting a completed Model/Ensemble job""" step_settings = RunSettings("echo") + test_launcher = wlmutils.get_test_launcher() step = MockStep("mock-step", test_dir, step_settings) step.meta["status_dir"] = test_dir entity.path = test_dir - test_launcher = wlmutils.get_test_launcher() controller = Controller(test_launcher) controller._jobs.add_job(entity.name, job_id="1234", entity=entity) controller._jobs.move_to_completed(controller._jobs.jobs.get(entity.name)) controller._launch_step(step, entity=entity) -def test_restarting_feature_storeh(test_dir, wlmutils): +def test_restarting_feature_store(test_dir, wlmutils): """Validate restarting a completed FeatureStore job""" step_settings = RunSettings("echo") + test_launcher = wlmutils.get_test_launcher() step = MockStep("mock-step", test_dir, step_settings) step.meta["status_dir"] = test_dir feature_store.path = test_dir - test_launcher = wlmutils.get_test_launcher() controller = Controller(test_launcher) controller._jobs.add_job(feature_store.name, job_id="1234", entity=feature_store) controller._jobs.move_to_completed(controller._jobs.fs_jobs.get(feature_store.name)) diff --git a/tests/test_dbnode.py b/tests/test_dbnode.py index 231961c33..7111f5ce5 100644 --- a/tests/test_dbnode.py +++ b/tests/test_dbnode.py @@ -49,22 +49,12 @@ def test_parse_fs_host_error(): feature_store.entities[0].host -def test_hosts(test_dir, wlmutils): - exp_name = "test_hosts" - exp = Experiment(exp_name, exp_path=test_dir) - - feature_store = FeatureStore(port=wlmutils.get_test_port(), interface="lo", launcher="local") - feature_store.set_path(test_dir) - exp.start(feature_store) - - hosts = [] - try: - hosts = feature_store.hosts - assert len(hosts) == feature_store.fs_nodes == 1 - finally: - # stop the feature store even if there is an error raised - exp.stop(feature_store) - feature_store.remove_stale_files() +def test_hosts(local_experiment, prepare_fs, local_fs): + fs 
= prepare_fs(local_fs).featurestore + feature_store = local_experiment.reconnect_feature_store(fs.checkpoint_file) + + hosts = feature_store.hosts + assert len(hosts) == feature_store.fs_nodes == 1 def _random_shard_info(): diff --git a/tests/test_dragon_backend.py b/tests/test_dragon_backend.py new file mode 100644 index 000000000..a510f660a --- /dev/null +++ b/tests/test_dragon_backend.py @@ -0,0 +1,453 @@ +# BSD 2-Clause License +# +# Copyright (c) 2021-2024, Hewlett Packard Enterprise +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+
+import collections
+import sys
+import textwrap
+import time
+from unittest.mock import MagicMock
+
+import pytest
+
+# The tests in this file belong to the group_a group
+pytestmark = pytest.mark.group_a
+
+try:
+    import dragon
+except ImportError:
+    pass
+else:
+    pytest.skip(
+        reason="Using dragon as launcher, not running Dragon unit tests",
+        allow_module_level=True,
+    )
+
+from smartsim._core.config import CONFIG
+from smartsim._core.schemas.dragonRequests import *
+from smartsim._core.schemas.dragonResponses import *
+from smartsim._core.utils.helpers import create_short_id_str
+from smartsim.status import TERMINAL_STATUSES, SmartSimStatus
+
+if t.TYPE_CHECKING:
+    from smartsim._core.launcher.dragon.dragonBackend import (
+        DragonBackend,
+        ProcessGroupInfo,
+    )
+
+
+class NodeMock(MagicMock):
+    @property
+    def hostname(self) -> str:
+        return create_short_id_str()
+
+
+class GroupStateMock(MagicMock):
+    def Running(self) -> MagicMock:
+        running = MagicMock(**{"__str__.return_value": "Running"})
+        return running
+
+    def Error(self) -> MagicMock:
+        error = MagicMock(**{"__str__.return_value": "Error"})
+        return error
+
+
+class ProcessGroupMock(MagicMock):
+    puids = [121, 122]
+
+
+def get_mock_backend(monkeypatch: pytest.MonkeyPatch) -> "DragonBackend":
+
+    process_mock = MagicMock(returncode=0)
+    process_group_mock = MagicMock(**{"Process.return_value": ProcessGroupMock()})
+    process_module_mock = MagicMock()
+    process_module_mock.Process = process_mock
+    node_mock = NodeMock()
+    system_mock = MagicMock(nodes=["node1", "node2", "node3"])
+    monkeypatch.setitem(
+        sys.modules,
+        "dragon",
+        MagicMock(
+            **{
+                "native.machine.Node.return_value": node_mock,
+                "native.machine.System.return_value": system_mock,
+                "native.group_state": GroupStateMock(),
+                "native.process_group.ProcessGroup.return_value": ProcessGroupMock(),
+            }
+        ),
+    )
+    monkeypatch.setitem(
+        sys.modules,
+        "dragon.infrastructure.connection",
+        MagicMock(),
+    )
+    monkeypatch.setitem(
sys.modules, + "dragon.infrastructure.policy", + MagicMock(**{"Policy.return_value": MagicMock()}), + ) + monkeypatch.setitem(sys.modules, "dragon.native.process", process_module_mock) + monkeypatch.setitem(sys.modules, "dragon.native.process_group", process_group_mock) + + monkeypatch.setitem(sys.modules, "dragon.native.group_state", GroupStateMock()) + monkeypatch.setitem( + sys.modules, + "dragon.native.machine", + MagicMock( + **{"System.return_value": system_mock, "Node.return_value": node_mock} + ), + ) + from smartsim._core.launcher.dragon.dragonBackend import DragonBackend + + dragon_backend = DragonBackend(pid=99999) + monkeypatch.setattr( + dragon_backend, "_free_hosts", collections.deque(dragon_backend._hosts) + ) + + return dragon_backend + + +def set_mock_group_infos( + monkeypatch: pytest.MonkeyPatch, dragon_backend: "DragonBackend" +) -> t.Dict[str, "ProcessGroupInfo"]: + dragon_mock = MagicMock() + process_mock = MagicMock() + process_mock.configure_mock(**{"returncode": 0}) + dragon_mock.configure_mock(**{"native.process.Process.return_value": process_mock}) + monkeypatch.setitem(sys.modules, "dragon", dragon_mock) + from smartsim._core.launcher.dragon.dragonBackend import ProcessGroupInfo + + running_group = MagicMock(status="Running") + error_group = MagicMock(status="Error") + hosts = dragon_backend._hosts + + group_infos = { + "abc123-1": ProcessGroupInfo( + SmartSimStatus.STATUS_RUNNING, + running_group, + [123], + [], + hosts[0:1], + MagicMock(), + ), + "del999-2": ProcessGroupInfo( + SmartSimStatus.STATUS_CANCELLED, + error_group, + [124], + [-9], + hosts[1:2], + MagicMock(), + ), + "c101vz-3": ProcessGroupInfo( + SmartSimStatus.STATUS_COMPLETED, + MagicMock(), + [125, 126], + [0], + hosts[1:3], + MagicMock(), + ), + "0ghjk1-4": ProcessGroupInfo( + SmartSimStatus.STATUS_FAILED, + error_group, + [127], + [-1], + hosts[2:3], + MagicMock(), + ), + "ljace0-5": ProcessGroupInfo( + SmartSimStatus.STATUS_NEVER_STARTED, None, [], [], [], None + ), + 
} + + monkeypatch.setattr(dragon_backend, "_group_infos", group_infos) + monkeypatch.setattr(dragon_backend, "_free_hosts", collections.deque(hosts[1:3])) + monkeypatch.setattr(dragon_backend, "_allocated_hosts", {hosts[0]: "abc123-1"}) + monkeypatch.setattr(dragon_backend, "_running_steps", ["abc123-1"]) + + return group_infos + + +def test_handshake_request(monkeypatch: pytest.MonkeyPatch) -> None: + dragon_backend = get_mock_backend(monkeypatch) + + handshake_req = DragonHandshakeRequest() + handshake_resp = dragon_backend.process_request(handshake_req) + + assert isinstance(handshake_resp, DragonHandshakeResponse) + assert handshake_resp.dragon_pid == 99999 + + +def test_run_request(monkeypatch: pytest.MonkeyPatch) -> None: + dragon_backend = get_mock_backend(monkeypatch) + run_req = DragonRunRequest( + exe="sleep", + exe_args=["5"], + path="/a/fake/path", + nodes=2, + tasks=1, + tasks_per_node=1, + env={}, + current_env={}, + pmi_enabled=False, + ) + + run_resp = dragon_backend.process_request(run_req) + assert isinstance(run_resp, DragonRunResponse) + + step_id = run_resp.step_id + assert dragon_backend._queued_steps[step_id] == run_req + + mock_process_group = MagicMock(puids=[123, 124]) + + dragon_backend._group_infos[step_id].process_group = mock_process_group + dragon_backend._group_infos[step_id].puids = [123, 124] + dragon_backend._start_steps() + + assert dragon_backend._running_steps == [step_id] + assert len(dragon_backend._queued_steps) == 0 + assert len(dragon_backend._free_hosts) == 1 + assert dragon_backend._allocated_hosts[dragon_backend.hosts[0]] == step_id + assert dragon_backend._allocated_hosts[dragon_backend.hosts[1]] == step_id + + monkeypatch.setattr( + dragon_backend._group_infos[step_id].process_group, "status", "Running" + ) + + dragon_backend._update() + + assert dragon_backend._running_steps == [step_id] + assert len(dragon_backend._queued_steps) == 0 + assert len(dragon_backend._free_hosts) == 1 + assert 
dragon_backend._allocated_hosts[dragon_backend.hosts[0]] == step_id + assert dragon_backend._allocated_hosts[dragon_backend.hosts[1]] == step_id + + dragon_backend._group_infos[step_id].status = SmartSimStatus.STATUS_CANCELLED + + dragon_backend._update() + assert not dragon_backend._running_steps + + +def test_deny_run_request(monkeypatch: pytest.MonkeyPatch) -> None: + dragon_backend = get_mock_backend(monkeypatch) + + dragon_backend._shutdown_requested = True + + run_req = DragonRunRequest( + exe="sleep", + exe_args=["5"], + path="/a/fake/path", + nodes=2, + tasks=1, + tasks_per_node=1, + env={}, + current_env={}, + pmi_enabled=False, + ) + + run_resp = dragon_backend.process_request(run_req) + assert isinstance(run_resp, DragonRunResponse) + assert run_resp.error_message == "Cannot satisfy request, server is shutting down." + step_id = run_resp.step_id + + assert dragon_backend.group_infos[step_id].status == SmartSimStatus.STATUS_FAILED + + +def test_udpate_status_request(monkeypatch: pytest.MonkeyPatch) -> None: + dragon_backend = get_mock_backend(monkeypatch) + + group_infos = set_mock_group_infos(monkeypatch, dragon_backend) + + status_update_request = DragonUpdateStatusRequest(step_ids=list(group_infos.keys())) + + status_update_response = dragon_backend.process_request(status_update_request) + + assert isinstance(status_update_response, DragonUpdateStatusResponse) + assert status_update_response.statuses == { + step_id: (grp_info.status, grp_info.return_codes) + for step_id, grp_info in group_infos.items() + } + + +def test_stop_request(monkeypatch: pytest.MonkeyPatch) -> None: + dragon_backend = get_mock_backend(monkeypatch) + group_infos = set_mock_group_infos(monkeypatch, dragon_backend) + + running_steps = [ + step_id + for step_id, group in group_infos.items() + if group.status == SmartSimStatus.STATUS_RUNNING + ] + + step_id_to_stop = running_steps[0] + + stop_request = DragonStopRequest(step_id=step_id_to_stop) + + stop_response = 
dragon_backend.process_request(stop_request) + + assert isinstance(stop_response, DragonStopResponse) + assert len(dragon_backend._stop_requests) == 1 + + dragon_backend._update() + + assert len(dragon_backend._stop_requests) == 0 + assert ( + dragon_backend._group_infos[step_id_to_stop].status + == SmartSimStatus.STATUS_CANCELLED + ) + + assert len(dragon_backend._allocated_hosts) == 0 + assert len(dragon_backend._free_hosts) == 3 + + +@pytest.mark.parametrize( + "immediate, kill_jobs, frontend_shutdown", + [ + [True, True, True], + [True, True, False], + [True, False, True], + [True, False, False], + [False, True, True], + [False, True, False], + ], +) +def test_shutdown_request( + monkeypatch: pytest.MonkeyPatch, + immediate: bool, + kill_jobs: bool, + frontend_shutdown: bool, +) -> None: + monkeypatch.setenv("SMARTSIM_FLAG_TELEMETRY", "0") + dragon_backend = get_mock_backend(monkeypatch) + monkeypatch.setattr(dragon_backend, "_cooldown_period", 1) + set_mock_group_infos(monkeypatch, dragon_backend) + + if kill_jobs: + for group_info in dragon_backend.group_infos.values(): + if not group_info.status in TERMINAL_STATUSES: + group_info.status = SmartSimStatus.STATUS_FAILED + group_info.return_codes = [-9] + group_info.process_group = None + group_info.redir_workers = None + dragon_backend._running_steps.clear() + + shutdown_req = DragonShutdownRequest( + immediate=immediate, frontend_shutdown=frontend_shutdown + ) + shutdown_resp = dragon_backend.process_request(shutdown_req) + + if not kill_jobs: + stop_request_ids = ( + stop_request.step_id for stop_request in dragon_backend._stop_requests + ) + for step_id, group_info in dragon_backend.group_infos.items(): + if not group_info.status in TERMINAL_STATUSES: + assert step_id in stop_request_ids + + assert isinstance(shutdown_resp, DragonShutdownResponse) + assert dragon_backend._shutdown_requested + assert dragon_backend.frontend_shutdown == frontend_shutdown + + dragon_backend._update() + assert not 
dragon_backend.should_shutdown + time.sleep(dragon_backend._cooldown_period + 0.1) + dragon_backend._update() + + assert dragon_backend._can_shutdown == kill_jobs + assert dragon_backend.should_shutdown == kill_jobs + assert dragon_backend._has_cooled_down == kill_jobs + + +@pytest.mark.parametrize("telemetry_flag", ["0", "1"]) +def test_cooldown_is_set(monkeypatch: pytest.MonkeyPatch, telemetry_flag: str) -> None: + monkeypatch.setenv("SMARTSIM_FLAG_TELEMETRY", telemetry_flag) + dragon_backend = get_mock_backend(monkeypatch) + + expected_cooldown = ( + 2 * CONFIG.telemetry_frequency + 5 if int(telemetry_flag) > 0 else 5 + ) + + if telemetry_flag: + assert dragon_backend.cooldown_period == expected_cooldown + else: + assert dragon_backend.cooldown_period == expected_cooldown + + +def test_heartbeat_and_time(monkeypatch: pytest.MonkeyPatch) -> None: + dragon_backend = get_mock_backend(monkeypatch) + first_heartbeat = dragon_backend.last_heartbeat + assert dragon_backend.current_time > first_heartbeat + dragon_backend._heartbeat() + assert dragon_backend.last_heartbeat > first_heartbeat + + +@pytest.mark.parametrize("num_nodes", [1, 3, 100]) +def test_can_honor(monkeypatch: pytest.MonkeyPatch, num_nodes: int) -> None: + dragon_backend = get_mock_backend(monkeypatch) + run_req = DragonRunRequest( + exe="sleep", + exe_args=["5"], + path="/a/fake/path", + nodes=num_nodes, + tasks=1, + tasks_per_node=1, + env={}, + current_env={}, + pmi_enabled=False, + ) + + assert dragon_backend._can_honor(run_req)[0] == ( + num_nodes <= len(dragon_backend._hosts) + ) + + +def test_get_id(monkeypatch: pytest.MonkeyPatch) -> None: + dragon_backend = get_mock_backend(monkeypatch) + step_id = next(dragon_backend._step_ids) + + assert step_id.endswith("0") + assert step_id != next(dragon_backend._step_ids) + + +def test_view(monkeypatch: pytest.MonkeyPatch) -> None: + dragon_backend = get_mock_backend(monkeypatch) + set_mock_group_infos(monkeypatch, dragon_backend) + hosts = 
dragon_backend.hosts + + expected_message = textwrap.dedent(f"""\ + Dragon server backend update + | Host | Status | + |---------|----------| + | {hosts[0]} | Busy | + | {hosts[1]} | Free | + | {hosts[2]} | Free | + | Step | Status | Hosts | Return codes | Num procs | + |----------|--------------|-----------------|----------------|-------------| + | abc123-1 | Running | {hosts[0]} | | 1 | + | del999-2 | Cancelled | {hosts[1]} | -9 | 1 | + | c101vz-3 | Completed | {hosts[1]},{hosts[2]} | 0 | 2 | + | 0ghjk1-4 | Failed | {hosts[2]} | -1 | 1 | + | ljace0-5 | NeverStarted | | | 0 |""") + + assert dragon_backend.status_message == expected_message diff --git a/tests/test_dragon_installer.py b/tests/test_dragon_installer.py new file mode 100644 index 000000000..b23a1a7ef --- /dev/null +++ b/tests/test_dragon_installer.py @@ -0,0 +1,471 @@ +# BSD 2-Clause License +# +# Copyright (c) 2021-2024, Hewlett Packard Enterprise +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import pathlib +import sys +import tarfile +import typing as t +from collections import namedtuple + +import pytest +from github.GitReleaseAsset import GitReleaseAsset +from github.Requester import Requester + +import smartsim +import smartsim._core.utils.helpers as helpers +from smartsim._core._cli.scripts.dragon_install import ( + cleanup, + create_dotenv, + install_dragon, + install_package, + retrieve_asset, + retrieve_asset_info, +) +from smartsim.error.errors import SmartSimCLIActionCancelled + +# The tests in this file belong to the group_a group +pytestmark = pytest.mark.group_a + + +mock_archive_name = "dragon-0.8-py3.9.4.1-CRAYEX-ac132fe95.tar.gz" +_git_attr = namedtuple("_git_attr", "value") + + +@pytest.fixture +def test_archive(test_dir: str, archive_path: pathlib.Path) -> pathlib.Path: + """Fixture for returning a simple tarfile to test on""" + num_files = 10 + with tarfile.TarFile.open(archive_path, mode="w:gz") as tar: + mock_whl = pathlib.Path(test_dir) / "mock.whl" + mock_whl.touch() + + for i in range(num_files): + content = pathlib.Path(test_dir) / f"{i:04}.txt" + content.write_text(f"i am file {i}\n") + tar.add(content) + return archive_path + + +@pytest.fixture +def archive_path(test_dir: str) -> pathlib.Path: + """Fixture for returning a dir path based on the default mock asset archive name""" + path = pathlib.Path(test_dir) / mock_archive_name + return path + + +@pytest.fixture +def extraction_dir(test_dir: str) -> 
pathlib.Path: + """Fixture for returning a dir path based on the default mock asset archive name""" + path = pathlib.Path(test_dir) / mock_archive_name.replace(".tar.gz", "") + return path + + +@pytest.fixture +def test_assets(monkeypatch: pytest.MonkeyPatch) -> t.Dict[str, GitReleaseAsset]: + requester = Requester( + auth=None, + base_url="https://github.com", + user_agent="mozilla", + per_page=10, + verify=False, + timeout=1, + retry=1, + pool_size=1, + ) + headers = {"mock-header": "mock-value"} + attributes = {"mock-attr": "mock-attr-value"} + completed = True + + assets: t.List[GitReleaseAsset] = [] + mock_archive_name_tpl = "{}-{}.4.1-{}ac132fe95.tar.gz" + + for python_version in ["py3.9", "py3.10", "py3.11"]: + for dragon_version in ["dragon-0.8", "dragon-0.9", "dragon-0.10"]: + for platform in ["", "CRAYEX-"]: + + asset = GitReleaseAsset(requester, headers, attributes, completed) + + archive_name = mock_archive_name_tpl.format( + dragon_version, python_version, platform + ) + + monkeypatch.setattr( + asset, + "_browser_download_url", + _git_attr(value=f"http://foo/{archive_name}"), + ) + monkeypatch.setattr(asset, "_name", _git_attr(value=archive_name)) + assets.append(asset) + + return assets + + +def test_cleanup_no_op(archive_path: pathlib.Path) -> None: + """Ensure that the cleanup method doesn't bomb when called with + missing archive path; simulate a failed download""" + # confirm assets do not exist + assert not archive_path.exists() + + # call cleanup. any exceptions should break test... 
+ cleanup(archive_path) + + +def test_cleanup_archive_exists(test_archive: pathlib.Path) -> None: + """Ensure that the cleanup method removes the archive""" + assert test_archive.exists() + + cleanup(test_archive) + + # verify archive is gone after cleanup + assert not test_archive.exists() + + +def test_retrieve_cached( + test_dir: str, + # archive_path: pathlib.Path, + test_archive: pathlib.Path, + monkeypatch: pytest.MonkeyPatch, +) -> None: + """Verify that a previously retrieved asset archive is re-used""" + with tarfile.TarFile.open(test_archive) as tar: + tar.extractall(test_dir) + + ts1 = test_archive.parent.stat().st_ctime + + requester = Requester( + auth=None, + base_url="https://github.com", + user_agent="mozilla", + per_page=10, + verify=False, + timeout=1, + retry=1, + pool_size=1, + ) + headers = {"mock-header": "mock-value"} + attributes = {"mock-attr": "mock-attr-value"} + completed = True + + asset = GitReleaseAsset(requester, headers, attributes, completed) + + # ensure mocked asset has values that we use... + monkeypatch.setattr(asset, "_browser_download_url", _git_attr(value="http://foo")) + monkeypatch.setattr(asset, "_name", _git_attr(value=mock_archive_name)) + + asset_path = retrieve_asset(test_archive.parent, asset) + ts2 = asset_path.stat().st_ctime + + assert ( + asset_path == test_archive.parent + ) # show that the expected path matches the output path + assert ts1 == ts2 # show that the file wasn't changed... 
+ + +@pytest.mark.parametrize( + "dragon_pin,pyv,is_found,is_crayex", + [ + pytest.param("0.8", "py3.8", False, False, id="0.8,python 3.8"), + pytest.param("0.8", "py3.9", True, False, id="0.8,python 3.9"), + pytest.param("0.8", "py3.10", True, False, id="0.8,python 3.10"), + pytest.param("0.8", "py3.11", True, False, id="0.8,python 3.11"), + pytest.param("0.8", "py3.12", False, False, id="0.8,python 3.12"), + pytest.param("0.8", "py3.8", False, True, id="0.8,python 3.8,CrayEX"), + pytest.param("0.8", "py3.9", True, True, id="0.8,python 3.9,CrayEX"), + pytest.param("0.8", "py3.10", True, True, id="0.8,python 3.10,CrayEX"), + pytest.param("0.8", "py3.11", True, True, id="0.8,python 3.11,CrayEX"), + pytest.param("0.8", "py3.12", False, True, id="0.8,python 3.12,CrayEX"), + pytest.param("0.9", "py3.8", False, False, id="0.9,python 3.8"), + pytest.param("0.9", "py3.9", True, False, id="0.9,python 3.9"), + pytest.param("0.9", "py3.10", True, False, id="0.9,python 3.10"), + pytest.param("0.9", "py3.11", True, False, id="0.9,python 3.11"), + pytest.param("0.9", "py3.12", False, False, id="0.9,python 3.12"), + pytest.param("0.9", "py3.8", False, True, id="0.9,python 3.8,CrayEX"), + pytest.param("0.9", "py3.9", True, True, id="0.9,python 3.9,CrayEX"), + pytest.param("0.9", "py3.10", True, True, id="0.9,python 3.10,CrayEX"), + pytest.param("0.9", "py3.11", True, True, id="0.9,python 3.11,CrayEX"), + pytest.param("0.9", "py3.12", False, True, id="0.9,python 3.12,CrayEX"), + # add a couple variants for a dragon version that isn't in the asset list + pytest.param("0.7", "py3.9", False, False, id="0.7,python 3.9"), + pytest.param("0.7", "py3.9", False, True, id="0.7,python 3.9,CrayEX"), + ], +) +def test_retrieve_asset_info( + test_assets: t.Collection[GitReleaseAsset], + monkeypatch: pytest.MonkeyPatch, + dragon_pin: str, + pyv: str, + is_found: bool, + is_crayex: bool, +) -> None: + """Verify that an information is retrieved correctly based on the python + version, platform 
(e.g. CrayEX, !CrayEx), and target dragon pin""" + + with monkeypatch.context() as ctx: + ctx.setattr( + smartsim._core._cli.scripts.dragon_install, + "python_version", + lambda: pyv, + ) + ctx.setattr( + smartsim._core._cli.scripts.dragon_install, + "is_crayex_platform", + lambda: is_crayex, + ) + ctx.setattr( + smartsim._core._cli.scripts.dragon_install, + "dragon_pin", + lambda: dragon_pin, + ) + # avoid hitting github API + ctx.setattr( + smartsim._core._cli.scripts.dragon_install, + "_get_release_assets", + lambda: test_assets, + ) + + if is_found: + chosen_asset = retrieve_asset_info() + + assert chosen_asset + assert pyv in chosen_asset.name + assert dragon_pin in chosen_asset.name + + if is_crayex: + assert "crayex" in chosen_asset.name.lower() + else: + assert "crayex" not in chosen_asset.name.lower() + else: + with pytest.raises(SmartSimCLIActionCancelled): + retrieve_asset_info() + + +def test_check_for_utility_missing(test_dir: str) -> None: + """Ensure that looking for a missing utility doesn't raise an exception""" + ld_config = pathlib.Path(test_dir) / "ldconfig" + + utility = helpers.check_for_utility(ld_config) + + assert not utility + + +def test_check_for_utility_exists() -> None: + """Ensure that looking for an existing utility returns a non-empty path""" + utility = helpers.check_for_utility("ls") + assert utility + + +def test_is_crayex_missing_ldconfig(monkeypatch: pytest.MonkeyPatch) -> None: + """Ensure the cray ex platform check doesn't fail when ldconfig isn't + available for use""" + + def mock_util_check(util: str) -> str: + if util == "ldconfig": + return "" + return "w00t!" 
+
+    with monkeypatch.context() as ctx:
+        # mock utility existence
+        ctx.setattr(
+            helpers,
+            "check_for_utility",
+            mock_util_check,
+        )
+
+        is_cray = helpers.is_crayex_platform()
+        assert not is_cray
+
+
+def test_is_crayex_missing_fi_info(monkeypatch: pytest.MonkeyPatch) -> None:
+    """Ensure the cray ex platform check doesn't fail when fi_info isn't
+    available for use"""
+
+    def mock_util_check(util: str) -> str:
+        if util == "fi_info":
+            return ""
+        return "w00t!"
+
+    with monkeypatch.context() as ctx:
+        # mock utility existence
+        ctx.setattr(
+            helpers,
+            "check_for_utility",
+            mock_util_check,
+        )
+
+        is_cray = helpers.is_crayex_platform()
+        assert not is_cray
+
+
+@pytest.mark.parametrize(
+    "is_cray,output,return_code",
+    [
+        pytest.param(True, "cray pmi2.so\ncxi\ncray pmi.so\npni.so", 0, id="CrayEX"),
+        pytest.param(False, "cray pmi2.so\ncxi\npni.so", 0, id="No PMI"),
+        pytest.param(False, "cxi\ncray pmi.so\npni.so", 0, id="No PMI 2"),
+        pytest.param(False, "cray pmi2.so\ncray pmi.so\npni.so", 0, id="No CXI"),
+        pytest.param(False, "pmi.so\ncray pmi2.so\ncxi", 0, id="Non Cray PMI"),
+        pytest.param(False, "cray pmi.so\npmi2.so\ncxi", 0, id="Non Cray PMI2"),
+    ],
+)
+def test_is_cray_ex(
+    monkeypatch: pytest.MonkeyPatch, is_cray: bool, output: str, return_code: int
+) -> None:
+    """Test that cray ex platform check result is returned as expected"""
+
+    def mock_util_check(util: str) -> bool:
+        # mock that we have the necessary tools
+        return True
+
+    with monkeypatch.context() as ctx:
+        # make it look like the utilities always exist
+        ctx.setattr(
+            helpers,
+            "check_for_utility",
+            mock_util_check,
+        )
+        # mock
+        ctx.setattr(
+            helpers,
+            "execute_platform_cmd",
+            lambda x: (output, return_code),
+        )
+
+        platform_result = helpers.is_crayex_platform()
+        assert is_cray == platform_result
+
+
+def test_install_package_no_wheel(extraction_dir: pathlib.Path):
+    """Verify that a missing wheel does not blow up and has a failure retcode"""
+    exp_path = 
extraction_dir + + result = install_package(exp_path) + assert result != 0 + + +def test_install_macos(monkeypatch: pytest.MonkeyPatch, extraction_dir: pathlib.Path): + """Verify that installation exits cleanly if installing on unsupported platform""" + with monkeypatch.context() as ctx: + ctx.setattr(sys, "platform", "darwin") + + result = install_dragon(extraction_dir) + assert result == 1 + + +def test_create_dotenv(monkeypatch: pytest.MonkeyPatch, test_dir: str): + """Verify that attempting to create a .env file without any existing + file or container directory works""" + test_path = pathlib.Path(test_dir) + mock_dragon_root = pathlib.Path(test_dir) / "dragon" + exp_env_path = pathlib.Path(test_dir) / "dragon" / ".env" + + with monkeypatch.context() as ctx: + ctx.setattr(smartsim._core.config.CONFIG, "conf_dir", test_path) + + # ensure no .env exists before trying to create it. + assert not exp_env_path.exists() + + create_dotenv(mock_dragon_root) + + # ensure the .env is created as side-effect of create_dotenv + assert exp_env_path.exists() + + +def test_create_dotenv_existing_dir(monkeypatch: pytest.MonkeyPatch, test_dir: str): + """Verify that attempting to create a .env file in an existing + target dir works""" + test_path = pathlib.Path(test_dir) + mock_dragon_root = pathlib.Path(test_dir) / "dragon" + exp_env_path = pathlib.Path(test_dir) / "dragon" / ".env" + + # set up the parent directory that will contain the .env + exp_env_path.parent.mkdir(parents=True) + + with monkeypatch.context() as ctx: + ctx.setattr(smartsim._core.config.CONFIG, "conf_dir", test_path) + + # ensure no .env exists before trying to create it. 
+        assert not exp_env_path.exists()
+
+        create_dotenv(mock_dragon_root)
+
+        # ensure the .env is created as side-effect of create_dotenv
+        assert exp_env_path.exists()
+
+
+def test_create_dotenv_existing_dotenv(monkeypatch: pytest.MonkeyPatch, test_dir: str):
+    """Verify that attempting to create a .env file when one exists works as expected"""
+    test_path = pathlib.Path(test_dir)
+    mock_dragon_root = pathlib.Path(test_dir) / "dragon"
+    exp_env_path = pathlib.Path(test_dir) / "dragon" / ".env"
+
+    # set up the parent directory that will contain the .env
+    exp_env_path.parent.mkdir(parents=True)
+
+    # write something into file to verify it is overwritten
+    var_name = "DRAGON_BASE_DIR"
+    exp_env_path.write_text(f"{var_name}=/foo/bar")
+
+    with monkeypatch.context() as ctx:
+        ctx.setattr(smartsim._core.config.CONFIG, "conf_dir", test_path)
+
+        # ensure .env exists so we can update it
+        assert exp_env_path.exists()
+
+        create_dotenv(mock_dragon_root)
+
+        # ensure the .env is created as side-effect of create_dotenv
+        assert exp_env_path.exists()
+
+        # ensure file was overwritten and env vars are not duplicated
+        dotenv_content = exp_env_path.read_text(encoding="utf-8")
+        split_content = dotenv_content.split(var_name)
+
+        # split to confirm env var only appears once
+        assert len(split_content) == 2
+
+
+def test_create_dotenv_format(monkeypatch: pytest.MonkeyPatch, test_dir: str):
+    """Verify that created .env files are correctly formatted"""
+    test_path = pathlib.Path(test_dir)
+    mock_dragon_root = pathlib.Path(test_dir) / "dragon"
+    exp_env_path = pathlib.Path(test_dir) / "dragon" / ".env"
+
+    with monkeypatch.context() as ctx:
+        ctx.setattr(smartsim._core.config.CONFIG, "conf_dir", test_path)
+
+        create_dotenv(mock_dragon_root)
+
+        # ensure the .env is created as side-effect of create_dotenv
+        content = exp_env_path.read_text(encoding="utf-8")
+
+        # ensure we have values written, but ignore empty lines
+        lines = [line for line in content.split("\n") if line]
+        assert 
lines + + # ensure each line is formatted as key=value + for line in lines: + line_split = line.split("=") + assert len(line_split) == 2 diff --git a/tests/test_dragon_launcher.py b/tests/test_dragon_launcher.py new file mode 100644 index 000000000..ee0fcb14b --- /dev/null +++ b/tests/test_dragon_launcher.py @@ -0,0 +1,523 @@ +# BSD 2-Clause License +# +# Copyright (c) 2021-2024, Hewlett Packard Enterprise +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +import logging +import multiprocessing as mp +import os +import pathlib +import sys +import time +import typing as t + +import pytest +import zmq + +import smartsim._core.config +from smartsim._core._cli.scripts.dragon_install import create_dotenv +from smartsim._core.config.config import get_config +from smartsim._core.launcher.dragon.dragonLauncher import DragonConnector +from smartsim._core.launcher.dragon.dragonSockets import ( + get_authenticator, + get_secure_socket, +) +from smartsim._core.schemas.dragonRequests import DragonBootstrapRequest +from smartsim._core.schemas.dragonResponses import DragonHandshakeResponse +from smartsim._core.utils.network import IFConfig, find_free_port +from smartsim._core.utils.security import KeyManager + +# The tests in this file belong to the group_a group +pytestmark = pytest.mark.group_a + + +is_mac = sys.platform == "darwin" + + +class MockPopen: + calls = [] + + def __init__(self, *args: t.Any, **kwargs: t.Any) -> None: + self.args = args + self.kwargs = kwargs + + MockPopen.calls.append((args, kwargs)) + + @property + def pid(self) -> int: + return 99999 + + @property + def returncode(self) -> int: + return 0 + + @property + def stdout(self): + return None + + @property + def stderr(self): + return None + + def wait(self, timeout: float) -> None: + time.sleep(timeout) + + +class MockSocket: + def __init__(self, *args: t.Any, **kwargs: t.Any) -> None: + self._bind_address = "" + + def __call__(self, *args: t.Any, **kwds: t.Any) -> t.Any: + return self + + def bind(self, addr: str) -> None: + self._bind_address = addr + + def recv_string(self, flags: int) -> str: + dbr = DragonBootstrapRequest(address=self._bind_address) + return f"bootstrap|{dbr.json()}" + + def close(self) -> None: ... + + def send(self, *args, **kwargs) -> None: ... + + def send_json(self, json: str) -> None: ... + + def send_string(*args, **kwargs) -> None: ... + + def connect(*args, **kwargs) -> None: ... 
+ + @property + def bind_address(self) -> str: + return self._bind_address + + +class MockAuthenticator: + def __init__(self, context: zmq.Context, log: t.Any) -> None: + self.num_starts: int = 0 + self.num_stops: int = 0 + self.num_configure_curves: int = 0 + self.context = context + self.thread = None + + def configure_curve(self, *args, **kwargs) -> None: + self.cfg_args = args + self.cfg_kwargs = kwargs + self.num_configure_curves += 1 + + def start(self) -> None: + self.num_starts += 1 + + def stop(self) -> None: + self.num_stops += 1 + + def is_alive(self) -> bool: + return self.num_starts > 0 and self.num_stops == 0 + + +def mock_dragon_env(test_dir, *args, **kwargs): + """Create a mock dragon environment that can talk to the launcher through ZMQ""" + logger = logging.getLogger(__name__) + config = get_config() + logging.basicConfig(level=logging.DEBUG) + try: + addr = "127.0.0.1" + callback_port = kwargs["port"] + head_port = find_free_port(start=callback_port + 1) + context = zmq.Context.instance() + context.setsockopt(zmq.SNDTIMEO, config.dragon_server_timeout) + context.setsockopt(zmq.RCVTIMEO, config.dragon_server_timeout) + authenticator = get_authenticator(context, -1) + + callback_socket = get_secure_socket(context, zmq.REQ, False) + dragon_head_socket = get_secure_socket(context, zmq.REP, True) + + full_addr = f"{addr}:{callback_port}" + callback_socket.connect(f"tcp://{full_addr}") + + full_head_addr = f"tcp://{addr}:{head_port}" + dragon_head_socket.bind(full_head_addr) + + req = DragonBootstrapRequest(address=full_head_addr) + + msg_sent = False + while not msg_sent: + logger.info("Sending bootstrap request to callback socket") + callback_socket.send_string("bootstrap|" + req.json()) + # hold until bootstrap response is received + logger.info("Receiving bootstrap response from callback socket") + _ = callback_socket.recv() + msg_sent = True + + hand_shaken = False + while not hand_shaken: + # other side should set up a socket and push me a 
`HandshakeRequest` + logger.info("Receiving handshake request through dragon head socket") + _ = dragon_head_socket.recv() + # acknowledge handshake success w/DragonHandshakeResponse + logger.info("Sending handshake response through dragon head socket") + handshake_ack = DragonHandshakeResponse(dragon_pid=os.getpid()) + dragon_head_socket.send_string(f"handshake|{handshake_ack.json()}") + + hand_shaken = True + + shutting_down = False + while not shutting_down: + logger.info("Waiting for shutdown request through dragon head socket") + # any incoming request at this point in test is my shutdown... + try: + message = dragon_head_socket.recv() + logger.info(f"Received final message {message}") + finally: + shutting_down = True + try: + logger.info("Handshake complete. Shutting down mock dragon env.") + authenticator.stop() + finally: + logger.info("Dragon mock env exiting...") + + except Exception as ex: + logger.info(f"exception occurred while configuring mock handshaker: {ex}") + raise ex from None + + +def test_dragon_connect_attributes(monkeypatch: pytest.MonkeyPatch, test_dir: str): + """Test the connection to a dragon environment dynamically selects an open port + in the range supplied and passes the correct environment""" + test_path = pathlib.Path(test_dir) + + with monkeypatch.context() as ctx: + # make sure we don't touch "real keys" during a test + ctx.setenv("SMARTSIM_KEY_PATH", test_dir) + + mock_socket = MockSocket() + + # look at test_dir for dragon config + ctx.setenv("SMARTSIM_DRAGON_SERVER_PATH", test_dir) + # avoid finding real interface + ctx.setattr( + "smartsim._core.launcher.dragon.dragonConnector.get_best_interface_and_address", + lambda: IFConfig(interface="faux_interface", address="127.0.0.1"), + ) + # we need to set the socket value or is_connected returns False + ctx.setattr( + "smartsim._core.launcher.dragon.dragonLauncher.DragonConnector._handshake", + lambda self, address: ..., + ) + # avoid starting a real authenticator thread + 
ctx.setattr("zmq.auth.thread.ThreadAuthenticator", MockAuthenticator) + # avoid starting a real zmq socket + ctx.setattr("zmq.Context.socket", mock_socket) + # avoid starting a real process for dragon entrypoint + ctx.setattr( + "subprocess.Popen", lambda *args, **kwargs: MockPopen(*args, **kwargs) + ) + + # avoid reading "real" config in test... + ctx.setattr(smartsim._core.config.CONFIG, "conf_dir", test_path) + dotenv_path = smartsim._core.config.CONFIG.dragon_dotenv + dotenv_path.parent.mkdir(parents=True) + dotenv_path.write_text("FOO=BAR\nBAZ=BOO") + + dragon_connector = DragonConnector() + dragon_connector.connect_to_dragon() + + chosen_port = int(mock_socket.bind_address.split(":")[-1]) + assert chosen_port >= 5995 + + # grab the kwargs env=xxx from the mocked popen to check what was passed + env = MockPopen.calls[0][1].get("env", None) + + # confirm the environment values were passed from .env file to dragon process + assert "PYTHONUNBUFFERED" in env + assert "FOO" in env + assert "BAZ" in env + + dragon_connector._authenticator.stop() + + +@pytest.mark.parametrize( + "socket_type, is_server", + [ + pytest.param(zmq.REQ, True, id="as-server"), + pytest.param(zmq.REP, False, id="as-client"), + ], +) +def test_secure_socket_authenticator_setup( + test_dir: str, monkeypatch: pytest.MonkeyPatch, socket_type: int, is_server: bool +): + """Ensure the authenticator created by the secure socket factory method + is fully configured and started when returned to a client""" + + with monkeypatch.context() as ctx: + # look at test dir for dragon config + ctx.setenv("SMARTSIM_KEY_PATH", test_dir) + # avoid starting a real authenticator thread + ctx.setattr("zmq.auth.thread.ThreadAuthenticator", MockAuthenticator) + + authenticator = get_authenticator(zmq.Context.instance()) + + km = KeyManager(get_config(), as_server=is_server) + + assert isinstance(authenticator, MockAuthenticator) + + # ensure authenticator was configured + assert authenticator.num_configure_curves > 
0 + # ensure authenticator was started + assert authenticator.num_starts > 0 + assert authenticator.context == zmq.Context.instance() + # ensure authenticator will accept any secured connection + assert authenticator.cfg_kwargs.get("domain", "") == "*" + # ensure authenticator is using the expected set of keys + assert authenticator.cfg_kwargs.get("location", "") == km.client_keys_dir + + authenticator.stop() + + +@pytest.mark.parametrize( + "as_server", + [ + pytest.param(True, id="server-socket"), + pytest.param(False, id="client-socket"), + ], +) +def test_secure_socket_setup( + test_dir: str, monkeypatch: pytest.MonkeyPatch, as_server: bool +): + """Ensure the authenticator created by the secure socket factory method + is fully configured and started when returned to a client""" + + with monkeypatch.context() as ctx: + # look at test dir for dragon config + ctx.setenv("SMARTSIM_KEY_PATH", test_dir) + # avoid starting a real authenticator thread + ctx.setattr("zmq.auth.thread.ThreadAuthenticator", MockAuthenticator) + + context = zmq.Context.instance() + + socket = get_secure_socket(context, zmq.REP, as_server) + + # verify the socket is correctly configured to use curve authentication + assert bool(socket.CURVE_SERVER) == as_server + assert not socket.closed + + socket.close() + + +def test_secure_socket(test_dir: str, monkeypatch: pytest.MonkeyPatch): + """Ensure the authenticator created by the secure socket factory method + is fully configured and started when returned to a client""" + logger = logging.getLogger(__name__) + logging.basicConfig(level=logging.DEBUG) + with monkeypatch.context() as ctx: + # make sure we don't touch "real keys" during a test + ctx.setenv("SMARTSIM_KEY_PATH", test_dir) + context = zmq.Context.instance() + authenticator = get_authenticator(context) + server = get_secure_socket(context, zmq.REP, True) + + ip, port = "127.0.0.1", find_free_port(start=9999) + + try: + server.bind(f"tcp://*:{port}") + + client = 
get_secure_socket(context, zmq.REQ, False) + + client.connect(f"tcp://{ip}:{port}") + + to_send = "you get a foo! you get a foo! everybody gets a foo!" + client.send_string(to_send, flags=zmq.NOBLOCK) + + received_msg = server.recv_string() + assert received_msg == to_send + logger.debug(f"server received: {received_msg}") + finally: + if authenticator: + authenticator.stop() + if client: + client.close() + if server: + server.close() + + +@pytest.mark.skipif(is_mac, reason="unsupported on MacOSX") +def test_dragon_launcher_handshake(monkeypatch: pytest.MonkeyPatch, test_dir: str): + """Test that a real handshake between a launcher & dragon environment + completes successfully using secure sockets""" + addr = "127.0.0.1" + bootstrap_port = find_free_port(start=5995) + + with monkeypatch.context() as ctx: + # make sure we don't touch "real keys" during a test + ctx.setenv("SMARTSIM_KEY_PATH", test_dir) + + # look at test dir for dragon config + ctx.setenv("SMARTSIM_DRAGON_SERVER_PATH", test_dir) + # avoid finding real interface since we may not be on a super + ctx.setattr( + "smartsim._core.launcher.dragon.dragonConnector.get_best_interface_and_address", + lambda: IFConfig("faux_interface", addr), + ) + + ctx.setattr( + "smartsim._core.launcher.dragon.dragonConnector._dragon_cleanup", + lambda server_socket, server_process_pid, server_authenticator: server_authenticator.stop(), + ) + + # start up a faux dragon env that knows how to do the handshake process + # but uses secure sockets for all communication. 
+ mock_dragon = mp.Process( + target=mock_dragon_env, + daemon=True, + kwargs={"port": bootstrap_port, "test_dir": test_dir}, + ) + + def fn(*args, **kwargs): + mock_dragon.start() + return mock_dragon + + ctx.setattr("subprocess.Popen", fn) + + connector = DragonConnector() + + try: + # connect executes the complete handshake and raises an exception if comms fails + connector.connect_to_dragon() + finally: + connector.cleanup() + + +def test_load_env_no_file(monkeypatch: pytest.MonkeyPatch, test_dir: str): + """Ensure an empty dragon .env file doesn't break the launcher""" + test_path = pathlib.Path(test_dir) + # mock_dragon_root = pathlib.Path(test_dir) / "dragon" + # exp_env_path = pathlib.Path(test_dir) / "dragon" / ".env" + + with monkeypatch.context() as ctx: + ctx.setattr(smartsim._core.config.CONFIG, "conf_dir", test_path) + + dragon_conf = smartsim._core.config.CONFIG.dragon_dotenv + # verify config doesn't exist + assert not dragon_conf.exists() + + connector = DragonConnector() + + loaded_env = connector.load_persisted_env() + assert not loaded_env + + +def test_load_env_env_file_created(monkeypatch: pytest.MonkeyPatch, test_dir: str): + """Ensure a populated dragon .env file is loaded correctly by the launcher""" + test_path = pathlib.Path(test_dir) + mock_dragon_root = pathlib.Path(test_dir) / "dragon" + + with monkeypatch.context() as ctx: + ctx.setattr(smartsim._core.config.CONFIG, "conf_dir", test_path) + create_dotenv(mock_dragon_root) + dragon_conf = smartsim._core.config.CONFIG.dragon_dotenv + + # verify config does exist + assert dragon_conf.exists() + + # load config w/launcher + connector = DragonConnector() + + loaded_env = connector.load_persisted_env() + assert loaded_env + + # confirm .env was parsed as expected by inspecting a key + assert "DRAGON_ROOT_DIR" in loaded_env + + +def test_load_env_cached_env(monkeypatch: pytest.MonkeyPatch, test_dir: str): + """Ensure repeated attempts to use dragon env don't hit file system""" + test_path = 
pathlib.Path(test_dir) + mock_dragon_root = pathlib.Path(test_dir) / "dragon" + + with monkeypatch.context() as ctx: + ctx.setattr(smartsim._core.config.CONFIG, "conf_dir", test_path) + create_dotenv(mock_dragon_root) + + # load config w/launcher + connector = DragonConnector() + + loaded_env = connector.load_persisted_env() + assert loaded_env + + # ensure attempting to reload would bomb + ctx.setattr(smartsim._core.config.CONFIG, "conf_dir", None) + + # attempt to load and if it doesn't blow up, it used the cached copy + + loaded_env = connector.load_persisted_env() + assert loaded_env + + +def test_merge_env(monkeypatch: pytest.MonkeyPatch, test_dir: str): + """Ensure that merging dragon .env file into current env has correct precedences""" + test_path = pathlib.Path(test_dir) + mock_dragon_root = pathlib.Path(test_dir) / "dragon" + + with monkeypatch.context() as ctx: + ctx.setattr(smartsim._core.config.CONFIG, "conf_dir", test_path) + create_dotenv(mock_dragon_root) + + # load config w/launcher + connector = DragonConnector() + loaded_env = {**connector.load_persisted_env()} + assert loaded_env + + curr_base_dir = "/foo" + curr_path = "/foo:/bar" + curr_only = "some-value" + + loaded_path = loaded_env.get("PATH", "") + + # ensure some non-dragon value exists in env; we want + # to see that it is in merged output without empty prepending + non_dragon_key = "NON_DRAGON_KEY" + non_dragon_value = "non_dragon_value" + connector._env_vars[non_dragon_key] = non_dragon_value + + curr_env = { + "DRAGON_BASE_DIR": curr_base_dir, # expect overwrite + "PATH": curr_path, # expect prepend + "ONLY_IN_CURRENT": curr_only, # expect pass-through + } + + merged_env = connector.merge_persisted_env(curr_env) + + # any dragon env vars should be overwritten + assert merged_env["DRAGON_BASE_DIR"] != curr_base_dir + + # any non-dragon collisions should result in prepending + assert merged_env["PATH"] == f"{loaded_path}:{curr_path}" + # ensure we actually see a change + assert 
merged_env["PATH"] != loaded_env["PATH"] + + # any keys that were in curr env should still exist, unchanged + assert merged_env["ONLY_IN_CURRENT"] == curr_only + + # any non-dragon keys that didn't exist avoid unnecessary prepending + assert merged_env[non_dragon_key] == non_dragon_value diff --git a/tests/test_experiment.py b/tests/test_experiment.py index bd609b530..dc67fc3cd 100644 --- a/tests/test_experiment.py +++ b/tests/test_experiment.py @@ -189,6 +189,8 @@ def test_launcher_detection( pytest.skip(reason="Launcher detection cannot currently detect pbs vs pals") if wlmutils.get_test_launcher() == "local": monkeypatch.setenv("PATH", "") # Remove all WLMs from PATH + if wlmutils.get_test_launcher() == "dragon": + pytest.skip(reason="Launcher detection cannot currently detect dragon") exp = Experiment("test-launcher-detection", launcher="auto") diff --git a/tests/test_fixtures.py b/tests/test_fixtures.py new file mode 100644 index 000000000..f3296ed99 --- /dev/null +++ b/tests/test_fixtures.py @@ -0,0 +1,56 @@ +# BSD 2-Clause License +# +# Copyright (c) 2021-2024, Hewlett Packard Enterprise +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +import os + +import psutil +import pytest + +from smartsim import Experiment +from smartsim.database import FeatureStore +from smartsim.error import SmartSimError +from smartsim.error.errors import SSUnsupportedError + +# The tests in this file belong to the group_a group +pytestmark = pytest.mark.group_a + + +def test_fs_fixtures(local_experiment, local_fs, prepare_fs): + fs = prepare_fs(local_fs).featurestore + local_experiment.reconnect_feature_store(fs.checkpoint_file) + assert fs.is_active() + local_experiment.stop(fs) + + +def test_create_new_fs_fixture_if_stopped(local_experiment, local_fs, prepare_fs): + # Run this twice to make sure that there is a stopped feature store + output = prepare_fs(local_fs) + local_experiment.reconnect_feature_store(output.featurestore.checkpoint_file) + local_experiment.stop(output.featurestore) + + output = prepare_fs(local_fs) + assert output.new_fs + local_experiment.reconnect_feature_store(output.featurestore.checkpoint_file) + assert output.featurestore.is_active() diff --git a/tests/test_multidb.py b/tests/test_multidb.py index 959abb294..78ff7cfbc 100644 --- a/tests/test_multidb.py +++ b/tests/test_multidb.py @@ -227,7 +227,7 @@ def test_fs_identifier_standard_twice_not_unique(wlmutils, test_dir): assert feature_store2.name == "my_fs" # CREATE feature store with fs_identifier - with make_entity_context(exp, feature_store), make_entity_context(exp, feature_store2): + with make_entity_context(exp, 
feature_store2), make_entity_context(exp, feature_store): exp.start(feature_store) with pytest.raises(SSDBIDConflictError) as ex: exp.start(feature_store) @@ -403,7 +403,9 @@ def test_multifs_colo_then_standard(fileutils, test_dir, wlmutils, coloutils, fs # Retrieve parameters from testing environment test_port = wlmutils.get_test_port() - test_script = fileutils.get_test_conf_path("smartredis/multidbid.py") + test_script = fileutils.get_test_conf_path( + "smartredis/multidbid_colo_env_vars_only.py" + ) test_interface = wlmutils.get_test_interface() test_launcher = wlmutils.get_test_launcher() @@ -433,8 +435,9 @@ def test_multifs_colo_then_standard(fileutils, test_dir, wlmutils, coloutils, fs ) with make_entity_context(exp, fs), make_entity_context(exp, smartsim_model): + exp.start(smartsim_model, block=False) exp.start(fs) - exp.start(smartsim_model, block=True) + exp.poll(smartsim_model) check_not_failed(exp, fs, smartsim_model) @@ -444,13 +447,13 @@ def test_multifs_colo_then_standard(fileutils, test_dir, wlmutils, coloutils, fs reason="Not testing WLM integrations", ) @pytest.mark.parametrize("fs_type", supported_fss) -def test_launch_cluster_feature_store_single_dbid( +def test_launch_cluster_feature_store_single_fsid( test_dir, coloutils, fileutils, wlmutils, fs_type ): """test clustered 3-node FeatureStore with single command with a feature store identifier""" # TODO detect number of nodes in allocation and skip if not sufficent - exp_name = "test_launch_cluster_feature_store_single_dbid" + exp_name = "test_launch_cluster_feature_store_single_fsid" launcher = wlmutils.get_test_launcher() test_port = wlmutils.get_test_port() test_script = fileutils.get_test_conf_path("smartredis/multidbid.py") diff --git a/tests/test_orc_config_settings.py b/tests/test_orc_config_settings.py index dc49f9d6a..2c3e6db22 100644 --- a/tests/test_orc_config_settings.py +++ b/tests/test_orc_config_settings.py @@ -27,6 +27,7 @@ import pytest +from smartsim.database import 
FeatureStore from smartsim.error import SmartSimError try: @@ -40,14 +41,15 @@ pytestmark = pytest.mark.group_b -def test_config_methods(fsutils, local_fs): - """Test all configuration file edit methods on an active fs""" +def test_config_methods(fsutils, prepare_fs, local_fs): + """Test all configuration file edit methods on an active feature store""" + fs = prepare_fs(local_fs).featurestore # test the happy path and ensure all configuration file edit methods # successfully execute when given correct key-value pairs configs = fsutils.get_fs_configs() for setting, value in configs.items(): - config_set_method = fsutils.get_config_edit_method(local_fs, setting) + config_set_method = fsutils.get_config_edit_method(fs, setting) config_set_method(value) # ensure SmartSimError is raised when FeatureStore.set_fs_conf @@ -56,7 +58,7 @@ def test_config_methods(fsutils, local_fs): for key, value_list in ss_error_configs.items(): for value in value_list: with pytest.raises(SmartSimError): - local_fs.set_fs_conf(key, value) + fs.set_fs_conf(key, value) # ensure TypeError is raised when FeatureStore.set_fs_conf # is given either a key or a value that is not a string @@ -64,14 +66,14 @@ def test_config_methods(fsutils, local_fs): for key, value_list in type_error_configs.items(): for value in value_list: with pytest.raises(TypeError): - local_fs.set_fs_conf(key, value) + fs.set_fs_conf(key, value) -def test_config_methods_inactive(wlmutils, fsutils): +def test_config_methods_inactive(fsutils): """Ensure a SmartSimError is raised when trying to set configurations on an inactive feature store """ - fs = wlmutils.get_feature_store() + fs = FeatureStore() configs = fsutils.get_fs_configs() for setting, value in configs.items(): config_set_method = fsutils.get_config_edit_method(fs, setting) diff --git a/tests/test_orchestrator.py b/tests/test_orchestrator.py index 18d1e6cc1..c69ab723a 100644 --- a/tests/test_orchestrator.py +++ b/tests/test_orchestrator.py @@ -74,34 +74,22 @@ def 
test_inactive_feature_store_get_address() -> None: fs.get_address() -def test_feature_store_active_functions(test_dir: str, wlmutils: "conftest.WLMUtils") -> None: - exp_name = "test_feature_store_active_functions" - exp = Experiment(exp_name, launcher="local", exp_path=test_dir) - - fs = FeatureStore(port=wlmutils.get_test_port()) - fs.set_path(test_dir) - - exp.start(fs) - - # check if the FeatureStore is active +def test_feature_store_is_active_functions( + local_experiment, + prepare_fs, + local_fs, +) -> None: + fs = prepare_fs(local_fs).featurestore + fs = local_experiment.reconnect_feature_store(fs.checkpoint_file) assert fs.is_active() - # check if the FeatureStore can get the address - correct_address = fs.get_address() == ["127.0.0.1:" + str(wlmutils.get_test_port())] - if not correct_address: - exp.stop(fs) - assert False - - exp.stop(fs) + # check if the featurestore can get the address + assert fs.get_address() == [f"127.0.0.1:{fs.ports[0]}"] - assert not fs.is_active() - # check if FeatureStore.get_address() raises an exception - with pytest.raises(SmartSimError): - fs.get_address() - - -def test_multiple_interfaces(test_dir: str, wlmutils: "conftest.WLMUtils") -> None: +def test_multiple_interfaces( + test_dir: str, wlmutils: t.Type["conftest.WLMUtils"] +) -> None: exp_name = "test_multiple_interfaces" exp = Experiment(exp_name, launcher="local", exp_path=test_dir) @@ -112,7 +100,8 @@ def test_multiple_interfaces(test_dir: str, wlmutils: "conftest.WLMUtils") -> No net_if_addrs = ["lo", net_if_addrs[0]] - fs = FeatureStore(port=wlmutils.get_test_port(), interface=net_if_addrs) + port = wlmutils.get_test_port() + fs = FeatureStore(port=port, interface=net_if_addrs) fs.set_path(test_dir) exp.start(fs) @@ -120,9 +109,10 @@ def test_multiple_interfaces(test_dir: str, wlmutils: "conftest.WLMUtils") -> No # check if the FeatureStore is active assert fs.is_active() - # check if the FeatureStore can get the address - correct_address = fs.get_address() == 
["127.0.0.1:" + str(wlmutils.get_test_port())] - if not correct_address: + # check if the feature store can get the address + correct_address = [f"127.0.0.1:{port}"] + + if not correct_address == fs.get_address(): exp.stop(fs) assert False @@ -146,7 +136,7 @@ def test_catch_local_feature_store_errors() -> None: ##### PBS ###### -def test_pbs_set_run_arg(wlmutils: "conftest.WLMUtils") -> None: +def test_pbs_set_run_arg(wlmutils: t.Type["conftest.WLMUtils"]) -> None: feature_store = FeatureStore( wlmutils.get_test_port(), fs_nodes=3, @@ -165,7 +155,7 @@ def test_pbs_set_run_arg(wlmutils: "conftest.WLMUtils") -> None: ) -def test_pbs_set_batch_arg(wlmutils: "conftest.WLMUtils") -> None: +def test_pbs_set_batch_arg(wlmutils: t.Type["conftest.WLMUtils"]) -> None: feature_store = FeatureStore( wlmutils.get_test_port(), fs_nodes=3, @@ -194,7 +184,7 @@ def test_pbs_set_batch_arg(wlmutils: "conftest.WLMUtils") -> None: ##### Slurm ###### -def test_slurm_set_run_arg(wlmutils: "conftest.WLMUtils") -> None: +def test_slurm_set_run_arg(wlmutils: t.Type["conftest.WLMUtils"]) -> None: feature_store = FeatureStore( wlmutils.get_test_port(), fs_nodes=3, @@ -209,7 +199,7 @@ def test_slurm_set_run_arg(wlmutils: "conftest.WLMUtils") -> None: ) -def test_slurm_set_batch_arg(wlmutils: "conftest.WLMUtils") -> None: +def test_slurm_set_batch_arg(wlmutils: t.Type["conftest.WLMUtils"]) -> None: feature_store = FeatureStore( wlmutils.get_test_port(), fs_nodes=3, @@ -265,7 +255,7 @@ def test_feature_store_results_in_correct_number_of_shards(single_cmd: bool) -> ###### LSF ###### -def test_catch_feature_store_errors_lsf(wlmutils: "conftest.WLMUtils") -> None: +def test_catch_feature_store_errors_lsf(wlmutils: t.Type["conftest.WLMUtils"]) -> None: with pytest.raises(SSUnsupportedError): feature_store = FeatureStore( wlmutils.get_test_port(), @@ -288,7 +278,7 @@ def test_catch_feature_store_errors_lsf(wlmutils: "conftest.WLMUtils") -> None: feature_store.set_batch_arg("P", "MYPROJECT") -def 
test_lsf_set_run_args(wlmutils: "conftest.WLMUtils") -> None: +def test_lsf_set_run_args(wlmutils: t.Type["conftest.WLMUtils"]) -> None: feature_store = FeatureStore( wlmutils.get_test_port(), fs_nodes=3, @@ -301,7 +291,7 @@ def test_lsf_set_run_args(wlmutils: "conftest.WLMUtils") -> None: assert all(["l" not in fs.run_settings.run_args for fs in feature_store.entities]) -def test_lsf_set_batch_args(wlmutils: "conftest.WLMUtils") -> None: +def test_lsf_set_batch_args(wlmutils: t.Type["conftest.WLMUtils"]) -> None: feature_store = FeatureStore( wlmutils.get_test_port(), fs_nodes=3, @@ -316,8 +306,8 @@ def test_lsf_set_batch_args(wlmutils: "conftest.WLMUtils") -> None: assert feature_store.batch_settings.batch_args["D"] == "102400000" -def test_feature_store_telemetry(test_dir: str, wlmutils: "conftest.WLMUtils") -> None: - """Ensure the default behavior for an FeatureStore is to disable telemetry""" +def test_orc_telemetry(test_dir: str, wlmutils: t.Type["conftest.WLMUtils"]) -> None: + """Ensure the default behavior for an feature store is to disable telemetry""" fs = FeatureStore(port=wlmutils.get_test_port()) fs.set_path(test_dir) diff --git a/tests/test_pbs_parser.py b/tests/test_pbs_parser.py index f77eb7c93..ae01ffb19 100644 --- a/tests/test_pbs_parser.py +++ b/tests/test_pbs_parser.py @@ -72,3 +72,23 @@ def test_parse_qstat_status(): status = "R" parsed_status = pbsParser.parse_qstat_jobid(output, "1289903.sdb") assert status == parsed_status + + +def test_parse_qstat_status_not_found(): + output = ( + "Job id Name User Time Use S Queue\n" + "---------------- ---------------- ---------------- -------- - -----\n" + "1289903.sdb jobname username 00:00:00 R queue\n" + ) + parsed_status = pbsParser.parse_qstat_jobid(output, "9999999.sdb") + + assert parsed_status is None + + +def test_parse_qstat_status_json(fileutils): + """Parse nodes from qsub called with -f -F json""" + file_path = fileutils.get_test_conf_path("qstat.json") + output = 
Path(file_path).read_text() + status = "R" + parsed_status = pbsParser.parse_qstat_jobid_json(output, "16705.sdb") + assert status == parsed_status diff --git a/tests/test_preview.py b/tests/test_preview.py new file mode 100644 index 000000000..7b731bb62 --- /dev/null +++ b/tests/test_preview.py @@ -0,0 +1,1330 @@ +# BSD 2-Clause License +# +# Copyright (c) 2021-2023, Hewlett Packard Enterprise +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +import pathlib +import sys +import typing as t +from os import path as osp + +import jinja2 +import numpy as np +import pytest + +import smartsim +import smartsim._core._cli.utils as _utils +from smartsim import Experiment +from smartsim._core import Manifest, previewrenderer +from smartsim._core.config import CONFIG +from smartsim._core.control.controller import Controller +from smartsim._core.control.job import Job +from smartsim.database import FeatureStore +from smartsim.entity.entity import SmartSimEntity +from smartsim.error.errors import PreviewFormatError +from smartsim.settings import QsubBatchSettings, RunSettings + +pytestmark = pytest.mark.group_b + + +@pytest.fixture +def choose_host(): + def _choose_host(wlmutils, index: int = 0): + hosts = wlmutils.get_test_hostlist() + if hosts: + return hosts[index] + return None + + return _choose_host + + +@pytest.fixture +def preview_object(test_dir) -> t.Dict[str, Job]: + """ + Bare bones orch + """ + rs = RunSettings(exe="echo", exe_args="ifname=lo") + s = SmartSimEntity(name="faux-name", path=test_dir, run_settings=rs) + o = FeatureStore() + o.entity = s + s.fs_identifier = "test_fs_id" + s.ports = [1235] + s.num_shards = 1 + job = Job("faux-name", "faux-step-id", s, "slurm", True) + active_fsjobs: t.Dict[str, Job] = {"mock_job": job} + return active_fsjobs + + +@pytest.fixture +def preview_object_multifs(test_dir) -> t.Dict[str, Job]: + """ + Bare bones orch + """ + rs = RunSettings(exe="echo", exe_args="ifname=lo") + s = SmartSimEntity(name="faux-name", path=test_dir, run_settings=rs) + o = FeatureStore() + o.entity = s + s.fs_identifier = "testfs_reg" + s.ports = [8750] + s.num_shards = 1 + job = Job("faux-name", "faux-step-id", s, "slurm", True) + + rs2 = RunSettings(exe="echo", exe_args="ifname=lo") + s2 = SmartSimEntity(name="faux-name_2", path=test_dir, run_settings=rs) + o2 = FeatureStore() + o2.entity = s2 + s2.fs_identifier = "testfs_reg2" + s2.ports = [8752] + s2.num_shards = 1 + job2 = 
Job("faux-name_2", "faux-step-id_2", s2, "slurm", True) + + active_fsjobs: t.Dict[str, Job] = {"mock_job": job, "mock_job2": job2} + return active_fsjobs + + +def add_batch_resources(wlmutils, batch_settings): + if isinstance(batch_settings, QsubBatchSettings): + for key, value in wlmutils.get_batch_resources().items(): + batch_settings.set_resource(key, value) + + +def test_get_ifname_filter(): + """Test get_ifname filter""" + + # Test input and expected output + value_dict = ( + (["+ifname=ib0"], "ib0"), + ("", ""), + ("+ifnameib0", ""), + ("=ib0", ""), + (["_ifname=bad_if_key"], "bad_if_key"), + (["ifname=mock_if_name"], "mock_if_name"), + ("IFname=case_sensitive_key", ""), + ("xfname=not_splittable", ""), + (None, ""), + ) + + template_str = "{{ value | get_ifname }}" + template_dict = {"ts": template_str} + + loader = jinja2.DictLoader(template_dict) + env = jinja2.Environment(loader=loader, autoescape=True) + env.filters["get_ifname"] = previewrenderer.get_ifname + + t = env.get_template("ts") + + for input, expected_output in value_dict: + output = t.render(value=input) + # assert that that filter output matches expected output + assert output == expected_output + + +def test_get_fstype_filter(): + """Test get_fstype filter to extract feature store backend from config""" + + template_str = "{{ config | get_fstype }}" + template_dict = {"ts": template_str} + loader = jinja2.DictLoader(template_dict) + env = jinja2.Environment(loader=loader, autoescape=True) + env.filters["get_fstype"] = previewrenderer.get_fstype + + t = env.get_template("ts") + output = t.render(config=CONFIG.database_cli) + + assert output in CONFIG.database_cli + # Test empty input + test_string = "" + output = t.render(config=test_string) + assert output == "" + # Test empty path + test_string = "SmartSim/smartsim/_core/bin/" + output = t.render(config=test_string) + assert output == "" + # Test no hyphen + test_string = "SmartSim/smartsim/_core/bin/rediscli" + output = 
t.render(config=test_string) + assert output == "" + # Test no LHS + test_string = "SmartSim/smartsim/_core/bin/redis-" + output = t.render(config=test_string) + assert output == "" + # Test no RHS + test_string = "SmartSim/smartsim/_core/bin/-cli" + output = t.render(config=test_string) + assert output == "" + + +def test_experiment_preview(test_dir, wlmutils): + """Test correct preview output fields for Experiment preview""" + # Prepare entities + test_launcher = wlmutils.get_test_launcher() + exp_name = "test_experiment_preview" + exp = Experiment(exp_name, exp_path=test_dir, launcher=test_launcher) + + # Execute method for template rendering + output = previewrenderer.render(exp, verbosity_level="debug") + + # Evaluate output + summary_lines = output.split("\n") + summary_lines = [item.replace("\t", "").strip() for item in summary_lines[-3:]] + assert 3 == len(summary_lines) + summary_dict = dict(row.split(": ") for row in summary_lines) + assert set(["Experiment Name", "Experiment Path", "Launcher"]).issubset( + summary_dict + ) + + +def test_experiment_preview_properties(test_dir, wlmutils): + """Test correct preview output properties for Experiment preview""" + # Prepare entities + test_launcher = wlmutils.get_test_launcher() + exp_name = "test_experiment_preview_properties" + exp = Experiment(exp_name, exp_path=test_dir, launcher=test_launcher) + + # Execute method for template rendering + output = previewrenderer.render(exp, verbosity_level="debug") + + # Evaluate output + summary_lines = output.split("\n") + summary_lines = [item.replace("\t", "").strip() for item in summary_lines[-3:]] + assert 3 == len(summary_lines) + summary_dict = dict(row.split(": ") for row in summary_lines) + assert exp.name == summary_dict["Experiment Name"] + assert exp.exp_path == summary_dict["Experiment Path"] + assert exp.launcher == summary_dict["Launcher"] + + +def test_feature_store_preview_render(test_dir, wlmutils, choose_host): + """Test correct preview output 
properties for FeatureStore preview""" + # Prepare entities + test_launcher = wlmutils.get_test_launcher() + test_interface = wlmutils.get_test_interface() + test_port = wlmutils.get_test_port() + exp_name = "test_feature_store_preview_properties" + exp = Experiment(exp_name, exp_path=test_dir, launcher=test_launcher) + # create regular feature store + orc = exp.create_feature_store( + port=test_port, + interface=test_interface, + hosts=choose_host(wlmutils), + ) + preview_manifest = Manifest(orc) + + # Execute method for template rendering + output = previewrenderer.render(exp, preview_manifest, verbosity_level="debug") + + # Evaluate output + assert "Feature Store Identifier" in output + assert "Shards" in output + assert "TCP/IP Port(s)" in output + assert "Network Interface" in output + assert "Type" in output + assert "Executable" in output + + fs_path = _utils.get_fs_path() + if fs_path: + fs_type, _ = fs_path.name.split("-", 1) + + assert orc.fs_identifier in output + assert str(orc.num_shards) in output + assert orc._interfaces[0] in output + assert fs_type in output + assert CONFIG.database_exe in output + assert orc.run_command in output + assert str(orc.fs_nodes) in output + + +def test_preview_to_file(test_dir, wlmutils): + """ + Test that if an output_filename is given, a file + is rendered for Experiment preview" + """ + # Prepare entities + test_launcher = wlmutils.get_test_launcher() + exp_name = "test_preview_output_filename" + exp = Experiment(exp_name, exp_path=test_dir, launcher=test_launcher) + filename = "test_preview_output_filename.txt" + path = pathlib.Path(test_dir) / filename + # Execute preview method + exp.preview( + output_format=previewrenderer.Format.PLAINTEXT, + output_filename=str(path), + verbosity_level="debug", + ) + + # Evaluate output + assert path.exists() + assert path.is_file() + + +def test_model_preview(test_dir, wlmutils): + """ + Test correct preview output fields for Model preview + """ + # Prepare entities + exp_name 
= "test_model_preview" + test_launcher = wlmutils.get_test_launcher() + exp = Experiment(exp_name, exp_path=test_dir, launcher=test_launcher) + model_params = {"port": 6379, "password": "unbreakable_password"} + rs1 = RunSettings("bash", "multi_tags_template.sh") + rs2 = exp.create_run_settings("echo", ["spam", "eggs"]) + + hello_world_model = exp.create_model( + "echo-hello", run_settings=rs1, params=model_params + ) + + spam_eggs_model = exp.create_model("echo-spam", run_settings=rs2) + + preview_manifest = Manifest(hello_world_model, spam_eggs_model) + + # Execute preview method + rendered_preview = previewrenderer.render( + exp, preview_manifest, verbosity_level="debug" + ) + + # Evaluate output + assert "Model Name" in rendered_preview + assert "Executable" in rendered_preview + assert "Executable Arguments" in rendered_preview + assert "Model Parameters" in rendered_preview + + +def test_model_preview_properties(test_dir, wlmutils): + """ + Test correct preview output properties for Model preview + """ + # Prepare entities + exp_name = "test_model_preview_parameters" + test_launcher = wlmutils.get_test_launcher() + exp = Experiment(exp_name, exp_path=test_dir, launcher=test_launcher) + + hw_name = "echo-hello" + hw_port = 6379 + hw_password = "unbreakable_password" + hw_rs = "multi_tags_template.sh" + model_params = {"port": hw_port, "password": hw_password} + hw_param1 = "bash" + rs1 = RunSettings(hw_param1, hw_rs) + + se_name = "echo-spam" + se_param1 = "echo" + se_param2 = "spam" + se_param3 = "eggs" + rs2 = exp.create_run_settings(se_param1, [se_param2, se_param3]) + + hello_world_model = exp.create_model(hw_name, run_settings=rs1, params=model_params) + spam_eggs_model = exp.create_model(se_name, run_settings=rs2) + + preview_manifest = Manifest(hello_world_model, spam_eggs_model) + + # Execute preview method + rendered_preview = previewrenderer.render( + exp, preview_manifest, verbosity_level="debug" + ) + + # Evaluate output for hello world model + 
assert hw_name in rendered_preview + assert hw_param1 in rendered_preview + assert hw_rs in rendered_preview + assert "port" in rendered_preview + assert "password" in rendered_preview + assert str(hw_port) in rendered_preview + assert hw_password in rendered_preview + + assert hw_name == hello_world_model.name + assert hw_param1 in hello_world_model.run_settings.exe[0] + assert hw_rs == hello_world_model.run_settings.exe_args[0] + assert None == hello_world_model.batch_settings + assert "port" in list(hello_world_model.params.items())[0] + assert hw_port in list(hello_world_model.params.items())[0] + assert "password" in list(hello_world_model.params.items())[1] + assert hw_password in list(hello_world_model.params.items())[1] + + # Evaluate outputfor spam eggs model + assert se_name in rendered_preview + assert se_param1 in rendered_preview + assert se_param2 in rendered_preview + assert se_param3 in rendered_preview + + assert se_name == spam_eggs_model.name + assert se_param1 in spam_eggs_model.run_settings.exe[0] + assert se_param2 == spam_eggs_model.run_settings.exe_args[0] + assert se_param3 == spam_eggs_model.run_settings.exe_args[1] + + +def test_preview_model_tagged_files(fileutils, test_dir, wlmutils): + """ + Test model with tagged files in preview. 
+ """ + # Prepare entities + exp_name = "test_model_preview_parameters" + test_launcher = wlmutils.get_test_launcher() + exp = Experiment(exp_name, exp_path=test_dir, launcher=test_launcher) + + model_params = {"port": 6379, "password": "unbreakable_password"} + model_settings = RunSettings("bash", "multi_tags_template.sh") + + hello_world_model = exp.create_model( + "echo-hello", run_settings=model_settings, params=model_params + ) + + config = fileutils.get_test_conf_path( + osp.join("generator_files", "multi_tags_template.sh") + ) + hello_world_model.attach_generator_files(to_configure=[config]) + exp.generate(hello_world_model, overwrite=True) + + preview_manifest = Manifest(hello_world_model) + + # Execute preview method + rendered_preview = previewrenderer.render( + exp, preview_manifest, verbosity_level="debug" + ) + + # Evaluate output + assert "Tagged Files for Model Configuration" in rendered_preview + assert "generator_files/multi_tags_template.sh" in rendered_preview + assert "generator_files/multi_tags_template.sh" in hello_world_model.files.tagged[0] + + +def test_model_key_prefixing(test_dir, wlmutils): + """ + Test preview for enabling key prefixing for a Model + """ + # Prepare entities + exp_name = "test_model_key_prefixing" + test_launcher = wlmutils.get_test_launcher() + exp = Experiment(exp_name, exp_path=test_dir, launcher=test_launcher) + + fs = exp.create_feature_store(port=6780, interface="lo") + exp.generate(fs, overwrite=True) + rs1 = exp.create_run_settings("echo", ["hello", "world"]) + model = exp.create_model("model_test", run_settings=rs1) + + # enable key prefixing on model + model.enable_key_prefixing() + exp.generate(model, overwrite=True) + + preview_manifest = Manifest(fs, model) + + # Execute preview method + output = previewrenderer.render(exp, preview_manifest, verbosity_level="debug") + + # Evaluate output + assert "Key Prefix" in output + assert "model_test" in output + assert "Outgoing Key Collision Prevention (Key 
Prefixing)" in output + assert "Tensors: On" in output + assert "Datasets: On" in output + assert "ML Models/Torch Scripts: Off" in output + assert "Aggregation Lists: On" in output + + +def test_ensembles_preview(test_dir, wlmutils): + """ + Test ensemble preview fields are correct in template render + """ + test_launcher = wlmutils.get_test_launcher() + exp = Experiment( + "test-ensembles-preview", exp_path=test_dir, launcher=test_launcher + ) + + # setup ensemble parameter space + learning_rate = list(np.linspace(0.01, 0.5)) + train_params = {"LR": learning_rate} + + # define how each member should run + run = exp.create_run_settings(exe="python", exe_args="./train-model.py") + + ensemble = exp.create_ensemble( + "Training-Ensemble", + params=train_params, + params_as_args=["LR"], + run_settings=run, + perm_strategy="random", + n_models=4, + ) + + preview_manifest = Manifest(ensemble) + output = previewrenderer.render(exp, preview_manifest, verbosity_level="debug") + + # Evaluate output + assert "Ensemble Name" in output + assert "Members" in output + assert "Ensemble Parameters" in output + + +def test_preview_models_and_ensembles(test_dir, wlmutils): + """ + Test preview of separate model entity and ensemble entity + """ + exp_name = "test-preview-model-and-ensemble" + test_dir = pathlib.Path(test_dir) / exp_name + test_dir.mkdir(parents=True) + test_launcher = wlmutils.get_test_launcher() + exp = Experiment(exp_name, exp_path=str(test_dir), launcher=test_launcher) + + rs1 = exp.create_run_settings("echo", ["hello", "world"]) + rs2 = exp.create_run_settings("echo", ["spam", "eggs"]) + + hw_name = "echo-hello" + se_name = "echo-spam" + ens_name = "echo-ensemble" + hello_world_model = exp.create_model(hw_name, run_settings=rs1) + spam_eggs_model = exp.create_model(se_name, run_settings=rs2) + hello_ensemble = exp.create_ensemble(ens_name, run_settings=rs1, replicas=3) + + exp.generate(hello_world_model, spam_eggs_model, hello_ensemble) + + preview_manifest = 
Manifest(hello_world_model, spam_eggs_model, hello_ensemble) + output = previewrenderer.render(exp, preview_manifest, verbosity_level="debug") + + # Evaluate output + assert "Models" in output + assert hw_name in output + assert se_name in output + + assert "Ensembles" in output + assert ens_name + "_1" in output + assert ens_name + "_2" in output + + +def test_ensemble_preview_client_configuration(test_dir, wlmutils): + """ + Test preview of client configuration and key prefixing in Ensemble preview + """ + # Prepare entities + test_launcher = wlmutils.get_test_launcher() + exp = Experiment( + "test-preview-ensemble-clientconfig", exp_path=test_dir, launcher=test_launcher + ) + # Create Feature Store + fs = exp.create_feature_store(port=6780, interface="lo") + exp.generate(fs, overwrite=True) + rs1 = exp.create_run_settings("echo", ["hello", "world"]) + # Create ensemble + ensemble = exp.create_ensemble("fd_simulation", run_settings=rs1, replicas=2) + # enable key prefixing on ensemble + ensemble.enable_key_prefixing() + exp.generate(ensemble, overwrite=True) + rs2 = exp.create_run_settings("echo", ["spam", "eggs"]) + # Create model + ml_model = exp.create_model("tf_training", rs2) + + for sim in ensemble.entities: + ml_model.register_incoming_entity(sim) + + exp.generate(ml_model, overwrite=True) + preview_manifest = Manifest(fs, ml_model, ensemble) + + # Call preview renderer for testing output + output = previewrenderer.render(exp, preview_manifest, verbosity_level="debug") + + # Evaluate output + assert "Client Configuration" in output + assert "Feature Store Identifier" in output + assert "Feature Store Backend" in output + assert "Type" in output + + +def test_ensemble_preview_client_configuration_multifs(test_dir, wlmutils): + """ + Test preview of client configuration and key prefixing in Ensemble preview + with multiple feature stores + """ + # Prepare entities + test_launcher = wlmutils.get_test_launcher() + exp = Experiment( + 
"test-preview-multifs-client-config", exp_path=test_dir, launcher=test_launcher
run_settings=RunSettings("python", exe_args="sleep.py"), + ) + gen_dir = fileutils.get_test_conf_path(osp.join("generator_files", "test_dir")) + symlink_dir = fileutils.get_test_conf_path( + osp.join("generator_files", "to_symlink_dir") + ) + copy_dir = fileutils.get_test_conf_path(osp.join("generator_files", "to_copy_dir")) + + ensemble.attach_generator_files() + ensemble.attach_generator_files( + to_configure=[gen_dir, copy_dir], to_copy=copy_dir, to_symlink=symlink_dir + ) + preview_manifest = Manifest(ensemble) + + # Call preview renderer for testing output + output = previewrenderer.render(exp, preview_manifest, verbosity_level="debug") + + # Evaluate output + assert "Tagged Files for Model Configuration" in output + assert "Copy Files" in output + assert "Symlink" in output + assert "Ensemble Parameters" in output + assert "Model Parameters" in output + + assert "generator_files/test_dir" in output + assert "generator_files/to_copy_dir" in output + assert "generator_files/to_symlink_dir" in output + + for model in ensemble: + assert "generator_files/test_dir" in model.files.tagged[0] + for copy in model.files.copy: + assert "generator_files/to_copy_dir" in copy + for link in model.files.link: + assert "generator_files/to_symlink_dir" in link + + +def test_preview_colocated_fs_model_ensemble(fileutils, test_dir, wlmutils, mlutils): + """ + Test preview of FSModel on colocated ensembles + """ + + exp_name = "test-preview-colocated-fs-model-ensemble" + test_launcher = wlmutils.get_test_launcher() + test_interface = wlmutils.get_test_interface() + test_port = wlmutils.get_test_port() + test_device = mlutils.get_test_device() + test_num_gpus = 1 + + test_script = fileutils.get_test_conf_path("run_tf_dbmodel_smartredis.py") + + exp = Experiment(exp_name, launcher=test_launcher, exp_path=test_dir) + colo_settings = exp.create_run_settings(exe=sys.executable, exe_args=test_script) + colo_settings.set_nodes(1) + colo_settings.set_tasks(1) + + # Create the ensemble of 
two identical SmartSim Model + colo_ensemble = exp.create_ensemble( + "colocated_ens", run_settings=colo_settings, replicas=2 + ) + + # Create colocated SmartSim Model + colo_model = exp.create_model("colocated_model", colo_settings) + + # Create and save ML model to filesystem + content = "empty test" + model_path = pathlib.Path(test_dir) / "model1.pt" + model_path.write_text(content) + + # Test adding a model from ensemble + colo_ensemble.add_ml_model( + "cnn", + "TF", + model_path=model_path, + device=test_device, + devices_per_node=test_num_gpus, + first_device=0, + inputs="args_0", + outputs="Identity", + ) + + # Colocate a feature store with the first ensemble members + for i, entity in enumerate(colo_ensemble): + entity.colocate_fs_tcp( + port=test_port + i, fs_cpus=1, debug=True, ifname=test_interface + ) + # Add ML models to each ensemble member to make sure they + # do not conflict with other ML models + entity.add_ml_model( + "cnn2", + "TF", + model_path=model_path, + device=test_device, + devices_per_node=test_num_gpus, + first_device=0, + inputs="args_0", + outputs="Identity", + ) + entity.disable_key_prefixing() + + # Add another ensemble member + colo_ensemble.add_model(colo_model) + + # Colocate a feature store with the new ensemble member + colo_model.colocate_fs_tcp( + port=test_port + len(colo_ensemble) - 1, + fs_cpus=1, + debug=True, + ifname=test_interface, + ) + # Add a ML model to the new ensemble member + model_inputs = "args_0" + model_outputs = "Identity" + model_name = "cnn2" + model_backend = "TF" + colo_model.add_ml_model( + model_name, + model_backend, + model_path=model_path, + device=test_device, + devices_per_node=test_num_gpus, + first_device=0, + inputs=model_inputs, + outputs=model_outputs, + ) + + exp.generate(colo_ensemble) + + preview_manifest = Manifest(colo_ensemble) + + # Execute preview method + output = previewrenderer.render(exp, preview_manifest, verbosity_level="debug") + + # Evaluate output + assert "Models" in output 
+ assert "Name" in output + assert "Backend" in output + assert "Path" in output + assert "Device" in output + assert "Devices Per Node" in output + assert "Inputs" in output + assert "Outputs" in output + + assert model_name in output + assert model_backend in output + assert "Path" in output + assert "/model1.pt" in output + assert "CPU" in output + assert model_inputs in output + assert model_outputs in output + + +def test_preview_colocated_fs_script_ensemble(fileutils, test_dir, wlmutils, mlutils): + """ + Test preview of FS Scripts on colocated FS from ensemble + """ + + exp_name = "test-preview-colocated-fs-script" + + test_launcher = wlmutils.get_test_launcher() + test_interface = wlmutils.get_test_interface() + test_port = wlmutils.get_test_port() + test_device = mlutils.get_test_device() + test_num_gpus = mlutils.get_test_num_gpus() if pytest.test_device == "GPU" else 1 + + expected_torch_script = "torchscript.py" + test_script = fileutils.get_test_conf_path("run_fsscript_smartredis.py") + torch_script = fileutils.get_test_conf_path(expected_torch_script) + + # Create SmartSim Experiment + exp = Experiment(exp_name, launcher=test_launcher, exp_path=test_dir) + + colo_settings = exp.create_run_settings(exe=sys.executable, exe_args=test_script) + colo_settings.set_nodes(1) + colo_settings.set_tasks(1) + + # Create SmartSim Ensemble with two identical models + colo_ensemble = exp.create_ensemble( + "colocated_ensemble", run_settings=colo_settings, replicas=2 + ) + + # Create a SmartSim model + colo_model = exp.create_model("colocated_model", colo_settings) + + # Colocate a fs with each ensemble entity and add a script + # to each entity via file + for i, entity in enumerate(colo_ensemble): + entity.disable_key_prefixing() + entity.colocate_fs_tcp( + port=test_port + i, + fs_cpus=1, + debug=True, + ifname=test_interface, + ) + + entity.add_script( + "test_script1", + script_path=torch_script, + device=test_device, + devices_per_node=test_num_gpus, + 
first_device=0, + ) + + # Colocate a fs with the non-ensemble Model + colo_model.colocate_fs_tcp( + port=test_port + len(colo_ensemble), + fs_cpus=1, + debug=True, + ifname=test_interface, + ) + + # Add a script to the non-ensemble model + torch_script_str = "def negate(x):\n\treturn torch.neg(x)\n" + cm_name2 = "test_script2" + colo_ensemble.add_script( + cm_name2, + script=torch_script_str, + device=test_device, + devices_per_node=test_num_gpus, + first_device=0, + ) + + # Add the third SmartSim model to the ensemble + colo_ensemble.add_model(colo_model) + + # Add another script via file to the entire ensemble + cm_name1 = "test_script1" + colo_model.add_script( + cm_name1, + script_path=torch_script, + device=test_device, + devices_per_node=test_num_gpus, + first_device=0, + ) + + # Assert we have added one model to the ensemble + assert len(colo_ensemble._fs_scripts) == 1 + # Assert we have added both models to each entity + assert all([len(entity._fs_scripts) == 2 for entity in colo_ensemble]) + + exp.generate(colo_ensemble) + + preview_manifest = Manifest(colo_ensemble) + + # Execute preview method + output = previewrenderer.render(exp, preview_manifest, verbosity_level="debug") + + # Evaluate output + assert "Torch Scripts" in output + assert "Name" in output + assert "Path" in output + assert "Devices Per Node" in output + + assert cm_name2 in output + assert expected_torch_script in output + assert test_device in output + assert cm_name1 in output + + +def test_preview_active_infrastructure(wlmutils, test_dir, preview_object): + """Test active infrastructure without other feature stores""" + + # Prepare entities + test_launcher = wlmutils.get_test_launcher() + exp_name = "test_active_infrastructure_preview" + exp = Experiment(exp_name, exp_path=test_dir, launcher=test_launcher) + + # Execute method for template rendering + output = previewrenderer.render( + exp, active_fsjobs=preview_object, verbosity_level="debug" + ) + + assert "Active Infrastructure" in 
output + assert "Feature Store Identifier" in output + assert "Shards" in output + assert "Network Interface" in output + assert "Type" in output + assert "TCP/IP" in output + + +def test_preview_orch_active_infrastructure( + wlmutils, test_dir, choose_host, preview_object +): + """ + Test correct preview output properties for active infrastructure preview + with other feature stores + """ + # Prepare entities + test_launcher = wlmutils.get_test_launcher() + test_interface = wlmutils.get_test_interface() + test_port = wlmutils.get_test_port() + exp_name = "test_feature_store_active_infrastructure_preview" + exp = Experiment(exp_name, exp_path=test_dir, launcher=test_launcher) + + orc2 = exp.create_feature_store( + port=test_port, + interface=test_interface, + hosts=choose_host(wlmutils), + fs_identifier="orc_2", + ) + + orc3 = exp.create_feature_store( + port=test_port, + interface=test_interface, + hosts=choose_host(wlmutils), + fs_identifier="orc_3", + ) + + preview_manifest = Manifest(orc2, orc3) + + # Execute method for template rendering + output = previewrenderer.render( + exp, preview_manifest, active_fsjobs=preview_object, verbosity_level="debug" + ) + + assert "Active Infrastructure" in output + assert "Feature Store Identifier" in output + assert "Shards" in output + assert "Network Interface" in output + assert "Type" in output + assert "TCP/IP" in output + + +def test_preview_multifs_active_infrastructure( + wlmutils, test_dir, choose_host, preview_object_multifs +): + """multiple started feature stores active infrastructure""" + + # Retrieve parameters from testing environment + test_launcher = wlmutils.get_test_launcher() + test_interface = wlmutils.get_test_interface() + test_port = wlmutils.get_test_port() + + # start a new Experiment for this section + exp = Experiment( + "test_preview_multifs_active_infrastructure", + exp_path=test_dir, + launcher=test_launcher, + ) + + # Execute method for template rendering + output = previewrenderer.render( + 
exp, active_fsjobs=preview_object_multifs, verbosity_level="debug" + ) + + assert "Active Infrastructure" in output + assert "Feature Store Identifier" in output + assert "Shards" in output + assert "Network Interface" in output + assert "Type" in output + assert "TCP/IP" in output + + assert "testfs_reg" in output + assert "testfs_reg2" in output + assert "Ochestrators" not in output + + +def test_preview_active_infrastructure_feature_store_error( + wlmutils, test_dir, choose_host, monkeypatch: pytest.MonkeyPatch +): + """Demo error when trying to preview a started feature store""" + # Prepare entities + test_launcher = wlmutils.get_test_launcher() + test_interface = wlmutils.get_test_interface() + test_port = wlmutils.get_test_port() + exp_name = "test_active_infrastructure_preview_orch_error" + exp = Experiment(exp_name, exp_path=test_dir, launcher=test_launcher) + + monkeypatch.setattr( + smartsim.database.orchestrator.FeatureStore, "is_active", lambda x: True + ) + + orc = exp.create_feature_store( + port=test_port, + interface=test_interface, + hosts=choose_host(wlmutils), + fs_identifier="orc_1", + ) + + # Retrieve any active jobs + active_fsjobs = exp._control.active_feature_store_jobs + + preview_manifest = Manifest(orc) + + # Execute method for template rendering + output = previewrenderer.render( + exp, preview_manifest, active_fsjobs=active_fsjobs, verbosity_level="debug" + ) + + assert "WARNING: Cannot preview orc_1, because it is already started" in output + + +def test_active_feature_stpre_jobs_property( + wlmutils, + test_dir, + preview_object, +): + """Ensure fs_jobs remaines unchanged after deletion + of active_feature_store_jobs property stays intact when retrieving fs_jobs""" + + # Retrieve parameters from testing environment + test_launcher = wlmutils.get_test_launcher() + + # start a new Experiment for this section + exp = Experiment( + "test-active_feature_store_jobs-property", + exp_path=test_dir, + launcher=test_launcher, + ) + + controller 
= Controller() + controller._jobs.fs_jobs = preview_object + + # Modify the returned job collection + active_feature_store_jobs = exp._control.active_feature_store_jobs + active_feature_store_jobs["test"] = "test_value" + + # Verify original collection is not also modified + assert not exp._control.active_feature_store_jobs.get("test", None) + + +def test_verbosity_info_ensemble(test_dir, wlmutils): + """ + Test preview of separate model entity and ensemble entity + with verbosity level set to info + """ + exp_name = "test-model-and-ensemble" + test_dir = pathlib.Path(test_dir) / exp_name + test_dir.mkdir(parents=True) + test_launcher = wlmutils.get_test_launcher() + exp = Experiment(exp_name, exp_path=str(test_dir), launcher=test_launcher) + + rs1 = exp.create_run_settings("echo", ["hello", "world"]) + rs2 = exp.create_run_settings("echo", ["spam", "eggs"]) + + hw_name = "echo-hello" + se_name = "echo-spam" + ens_name = "echo-ensemble" + hello_world_model = exp.create_model(hw_name, run_settings=rs1) + spam_eggs_model = exp.create_model(se_name, run_settings=rs2) + hello_ensemble = exp.create_ensemble(ens_name, run_settings=rs1, replicas=3) + + exp.generate(hello_world_model, spam_eggs_model, hello_ensemble) + + preview_manifest = Manifest(hello_world_model, spam_eggs_model, hello_ensemble) + output = previewrenderer.render(exp, preview_manifest, verbosity_level="info") + + assert "Executable" not in output + assert "Executable Arguments" not in output + + assert "echo_ensemble_1" not in output + + +def test_verbosity_info_colocated_fs_model_ensemble( + fileutils, test_dir, wlmutils, mlutils +): + """Test preview of FSModel on colocated ensembles, first adding the FSModel to the + ensemble, then colocating FS. 
+ """ + + exp_name = "test-colocated-fs-model-ensemble-reordered" + test_launcher = wlmutils.get_test_launcher() + test_interface = wlmutils.get_test_interface() + test_port = wlmutils.get_test_port() + test_device = mlutils.get_test_device() + test_num_gpus = 1 + + test_script = fileutils.get_test_conf_path("run_tf_dbmodel_smartredis.py") + + exp = Experiment(exp_name, launcher=test_launcher, exp_path=test_dir) + colo_settings = exp.create_run_settings(exe=sys.executable, exe_args=test_script) + colo_settings.set_nodes(1) + colo_settings.set_tasks(1) + + # Create the ensemble of two identical SmartSim Model + colo_ensemble = exp.create_ensemble( + "colocated_ens", run_settings=colo_settings, replicas=2 + ) + + # Create colocated SmartSim Model + colo_model = exp.create_model("colocated_model", colo_settings) + + # Create and save ML model to filesystem + content = "empty test" + model_path = pathlib.Path(test_dir) / "model1.pt" + model_path.write_text(content) + + # Test adding a model from ensemble + colo_ensemble.add_ml_model( + "cnn", + "TF", + model_path=model_path, + device=test_device, + devices_per_node=test_num_gpus, + first_device=0, + inputs="args_0", + outputs="Identity", + ) + + # Colocate a feature store with the first ensemble members + for i, entity in enumerate(colo_ensemble): + entity.colocate_fs_tcp( + port=test_port + i, fs_cpus=1, debug=True, ifname=test_interface + ) + # Add ML models to each ensemble member to make sure they + # do not conflict with other ML models + entity.add_ml_model( + "cnn2", + "TF", + model_path=model_path, + device=test_device, + devices_per_node=test_num_gpus, + first_device=0, + inputs="args_0", + outputs="Identity", + ) + entity.disable_key_prefixing() + + # Add another ensemble member + colo_ensemble.add_model(colo_model) + + # Colocate a feature store with the new ensemble member + colo_model.colocate_fs_tcp( + port=test_port + len(colo_ensemble) - 1, + fs_cpus=1, + debug=True, + ifname=test_interface, + ) + # Add 
a ML model to the new ensemble member + model_inputs = "args_0" + model_outputs = "Identity" + model_name = "cnn2" + model_backend = "TF" + colo_model.add_ml_model( + model_name, + model_backend, + model_path=model_path, + device=test_device, + devices_per_node=test_num_gpus, + first_device=0, + inputs=model_inputs, + outputs=model_outputs, + ) + + exp.generate(colo_ensemble) + + preview_manifest = Manifest(colo_ensemble) + + # Execute preview method + output = previewrenderer.render(exp, preview_manifest, verbosity_level="info") + + assert "Outgoing Key Collision Prevention (Key Prefixing)" not in output + assert "Devices Per Node" not in output + + +def test_verbosity_info_feature_store(test_dir, wlmutils, choose_host): + """Test correct preview output properties for feature store preview""" + # Prepare entities + test_launcher = wlmutils.get_test_launcher() + test_interface = wlmutils.get_test_interface() + test_port = wlmutils.get_test_port() + exp_name = "test_feature_store_preview_properties" + exp = Experiment(exp_name, exp_path=test_dir, launcher=test_launcher) + # create regular feature store + orc = exp.create_feature_store( + port=test_port, + interface=test_interface, + hosts=choose_host(wlmutils), + ) + preview_manifest = Manifest(orc) + + # Execute method for template rendering + output = previewrenderer.render(exp, preview_manifest, verbosity_level="info") + + # Evaluate output + assert "Executable" not in output + assert "Run Command" not in output + + +def test_verbosity_info_ensemble(test_dir, wlmutils): + """ + Test client configuration and key prefixing in Ensemble preview + """ + # Prepare entities + test_launcher = wlmutils.get_test_launcher() + exp = Experiment("key_prefix_test", exp_path=test_dir, launcher=test_launcher) + # Create feature store + fs = exp.create_feature_store(port=6780, interface="lo") + exp.generate(fs, overwrite=True) + rs1 = exp.create_run_settings("echo", ["hello", "world"]) + # Create ensemble + ensemble = 
exp.create_ensemble("fd_simulation", run_settings=rs1, replicas=2) + # enable key prefixing on ensemble + ensemble.enable_key_prefixing() + exp.generate(ensemble, overwrite=True) + rs2 = exp.create_run_settings("echo", ["spam", "eggs"]) + # Create model + ml_model = exp.create_model("tf_training", rs2) + + for sim in ensemble.entities: + ml_model.register_incoming_entity(sim) + + exp.generate(ml_model, overwrite=True) + preview_manifest = Manifest(fs, ml_model, ensemble) + + # Call preview renderer for testing output + output = previewrenderer.render(exp, preview_manifest, verbosity_level="info") + + # Evaluate output + assert "Outgoing Key Collision Prevention (Key Prefixing)" in output + + +def test_check_output_format_error(): + """ + Test error when invalid ouput format is given. + """ + # Prepare entities + exp_name = "test_output_format" + exp = Experiment(exp_name) + + # Execute preview method + with pytest.raises(PreviewFormatError) as ex: + exp.preview(output_format="hello") + assert ( + "The only valid output format currently available is plain_text" + in ex.value.args[0] + ) + + +def test_check_verbosity_level_error(): + """ + Testing that an error does occur when a string verbosity is passed + """ + # Prepare entities + exp_name = "test_verbosity_level_error" + exp = Experiment(exp_name) + + # Execute preview method + with pytest.raises(ValueError) as ex: + exp.preview(verbosity_level="hello") + + +def test_check_verbosity_level(): + """ + Testing that an error doesnt occur when a string verbosity is passed + """ + # Prepare entities + exp_name = "test_verbosity_level" + exp = Experiment(exp_name) + + # Execute preview method + exp.preview(verbosity_level="info") + + +def test_preview_colocated_fs_singular_model(wlmutils, test_dir): + """Test preview behavior when a colocated fs is only added to + one model. 
The expected behviour is that both models are colocated + """ + + test_launcher = wlmutils.get_test_launcher() + + exp = Experiment("colocated test", exp_path=test_dir, launcher=test_launcher) + + rs = exp.create_run_settings("sleep", ["100"]) + + model_1 = exp.create_model("model_1", run_settings=rs) + model_2 = exp.create_model("model_2", run_settings=rs) + + model_1.colocate_fs() + + exp.generate(model_1, model_2, overwrite=True) + + preview_manifest = Manifest(model_1, model_2) + + # Call preview renderer for testing output + output = previewrenderer.render(exp, preview_manifest, verbosity_level="debug") + + assert "model_1" in output + assert "model_2" in output + assert "Client Configuration" in output + + +def test_preview_fs_script(wlmutils, test_dir): + """ + Test preview of model instance with a torch script. + """ + test_launcher = wlmutils.get_test_launcher() + # Initialize the Experiment and set the launcher to auto + + exp = Experiment("getting-started", launcher=test_launcher) + + # Initialize a RunSettings object + model_settings = exp.create_run_settings(exe="python", exe_args="params.py") + + # Initialize a Model object + model_instance = exp.create_model("model_name", model_settings) + model_instance.colocate_fs_tcp() + + # TorchScript string + torch_script_str = "def negate(x):\n\treturn torch.neg(x)\n" + + # Attach TorchScript to Model + model_instance.add_script( + name="example_script", + script=torch_script_str, + device="GPU", + devices_per_node=2, + first_device=0, + ) + preview_manifest = Manifest(model_instance) + + # Call preview renderer for testing output + output = previewrenderer.render(exp, preview_manifest, verbosity_level="debug") + + # Evaluate output + assert "Torch Script" in output diff --git a/tests/test_schema_utils.py b/tests/test_schema_utils.py new file mode 100644 index 000000000..78789f8ef --- /dev/null +++ b/tests/test_schema_utils.py @@ -0,0 +1,217 @@ +# BSD 2-Clause License +# +# Copyright (c) 2021-2024, Hewlett 
Packard Enterprise +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +import collections +import json + +import pydantic +import pytest + +from smartsim._core.schemas.utils import ( + _DEFAULT_MSG_DELIM, + SchemaRegistry, + SocketSchemaTranslator, + _Message, +) + +# The tests in this file belong to the group_b group +pytestmark = pytest.mark.group_b + + +class Person(pydantic.BaseModel): + name: str + age: int + + +class Dog(pydantic.BaseModel): + name: str + age: int + + +class Book(pydantic.BaseModel): + title: str + num_pages: int + + +def test_equivalent_messages_are_equivalent(): + book = Book(title="A Story", num_pages=250) + msg_1 = _Message(book, "header") + msg_2 = _Message(book, "header") + + assert msg_1 is not msg_2 + assert msg_1 == msg_2 + assert str(msg_1) == str(msg_2) + assert msg_1 == _Message.from_str(str(msg_1), Book) + + +def test_schema_registrartion(): + registry = SchemaRegistry() + assert registry._map == {} + + registry.register("person")(Person) + assert registry._map == {"person": Person} + + registry.register("book")(Book) + assert registry._map == {"person": Person, "book": Book} + + +def test_cannot_register_a_schema_under_an_empty_str(): + registry = SchemaRegistry() + with pytest.raises(KeyError, match="Key cannot be the empty string"): + registry.register("") + + +def test_schema_to_string(): + registry = SchemaRegistry() + registry.register("person")(Person) + registry.register("book")(Book) + person = Person(name="Bob", age=36) + book = Book(title="The Greatest Story of All Time", num_pages=10_000) + assert registry.to_string(person) == str(_Message(person, "person")) + assert registry.to_string(book) == str(_Message(book, "book")) + + +def test_schemas_with_same_shape_are_mapped_correctly(): + registry = SchemaRegistry() + registry.register("person")(Person) + registry.register("dog")(Dog) + + person = Person(name="Mark", age=34) + dog = Dog(name="Fido", age=5) + + parsed_person = registry.from_string(registry.to_string(person)) + parsed_dog = registry.from_string(registry.to_string(dog)) + + 
assert isinstance(parsed_person, Person) + assert isinstance(parsed_dog, Dog) + + assert parsed_person == person + assert parsed_dog == dog + + +def test_registry_errors_if_types_overloaded(): + registry = SchemaRegistry() + registry.register("schema")(Person) + + with pytest.raises(KeyError): + registry.register("schema")(Book) + + +def test_registry_errors_if_msg_type_registered_with_delim_present(): + registry = SchemaRegistry() + with pytest.raises(ValueError, match="cannot contain delimiter"): + registry.register(f"some_key_with_the_{_DEFAULT_MSG_DELIM}_as_a_substring") + + +def test_registry_errors_on_unknown_schema(): + registry = SchemaRegistry() + registry.register("person")(Person) + + with pytest.raises(TypeError): + registry.to_string(Book(title="The Shortest Story of All Time", num_pages=1)) + + +def test_registry_correctly_maps_to_expected_type(): + registry = SchemaRegistry() + registry.register("person")(Person) + registry.register("book")(Book) + person = Person(name="Bob", age=36) + book = Book(title="The Most Average Story of All Time", num_pages=500) + assert registry.from_string(str(_Message(person, "person"))) == person + assert registry.from_string(str(_Message(book, "book"))) == book + + +def test_registery_errors_if_type_key_not_recognized(): + registry = SchemaRegistry() + registry.register("person")(Person) + + with pytest.raises(ValueError, match="^No type of value .* registered$"): + registry.from_string(str(_Message(Person(name="Grunk", age=5_000), "alien"))) + + +def test_registry_errors_if_type_key_is_missing(): + registry = SchemaRegistry() + registry.register("person")(Person) + + with pytest.raises(ValueError, match="Failed to determine schema type"): + registry.from_string("This string does not contain a delimiter") + + +class MockSocket: + def __init__(self, send_queue, recv_queue): + self.send_queue = send_queue + self.recv_queue = recv_queue + + def send_string(self, str_, *_args, **_kwargs): + assert isinstance(str_, str) + 
self.send_queue.append(str_) + + def recv_string(self, *_args, **_kwargs): + str_ = self.recv_queue.popleft() + assert isinstance(str_, str) + return str_ + + +class Request(pydantic.BaseModel): ... + + +class Response(pydantic.BaseModel): ... + + +def test_socket_schema_translator_uses_schema_registries(): + server_to_client = collections.deque() + client_to_server = collections.deque() + + server_socket = MockSocket(server_to_client, client_to_server) + client_socket = MockSocket(client_to_server, server_to_client) + + req_reg = SchemaRegistry() + res_reg = SchemaRegistry() + + req_reg.register("message")(Request) + res_reg.register("message")(Response) + + server = SocketSchemaTranslator(server_socket, res_reg, req_reg) + client = SocketSchemaTranslator(client_socket, req_reg, res_reg) + + # Check sockets are able to communicate seamlessly with schemas only + client.send(Request()) + assert len(client_to_server) == 1 + req = server.recv() + assert len(client_to_server) == 0 + assert isinstance(req, Request) + + server.send(Response()) + assert len(server_to_client) == 1 + res = client.recv() + assert len(server_to_client) == 0 + assert isinstance(res, Response) + + # Ensure users cannot send unexpected schemas + with pytest.raises(TypeError, match="Unregistered schema"): + client.send(Response()) + with pytest.raises(TypeError, match="Unregistered schema"): + server.send(Request()) diff --git a/tests/test_smartredis.py b/tests/test_smartredis.py index 00ea341b4..82de615cf 100644 --- a/tests/test_smartredis.py +++ b/tests/test_smartredis.py @@ -60,22 +60,17 @@ ) -def test_exchange(fileutils, test_dir, wlmutils): +def test_exchange(local_experiment, local_fs, prepare_fs, fileutils): """Run two processes, each process puts a tensor on - the DB, then accesses the other process's tensor. + the FS, then accesses the other process's tensor. Finally, the tensor is used to run a model. 
""" - exp = Experiment( - "smartredis_ensemble_exchange", exp_path=test_dir, launcher="local" - ) - + fs = prepare_fs(local_fs).featurestore # create and start a feature store - feature_store = FeatureStore(port=wlmutils.get_test_port()) - exp.generate(feature_store) - exp.start(feature_store, block=False) + local_experiment.reconnect_feature_store(fs.checkpoint_file) - rs = exp.create_run_settings("python", "producer.py --exchange") + rs = local_experiment.create_run_settings("python", "producer.py --exchange") params = {"mult": [1, -10]} ensemble = Ensemble( name="producer", @@ -90,39 +85,29 @@ def test_exchange(fileutils, test_dir, wlmutils): config = fileutils.get_test_conf_path("smartredis") ensemble.attach_generator_files(to_copy=[config]) - exp.generate(ensemble) + local_experiment.generate(ensemble) # start the models - exp.start(ensemble, summary=False) + local_experiment.start(ensemble, summary=False) # get and confirm statuses - statuses = exp.get_status(ensemble) - try: - assert all([stat == SmartSimStatus.STATUS_COMPLETED for stat in statuses]) - finally: - # stop the FeatureStore - exp.stop(feature_store) + statuses = local_experiment.get_status(ensemble) + assert all([stat == SmartSimStatus.STATUS_COMPLETED for stat in statuses]) -def test_consumer(fileutils, test_dir, wlmutils): +def test_consumer(local_experiment, local_fs, prepare_fs, fileutils): """Run three processes, each one of the first two processes - puts a tensor on the DB; the third process accesses the + puts a tensor on the FS; the third process accesses the tensors put by the two producers. Finally, the tensor is used to run a model by each producer and the consumer accesses the two results. 
""" - exp = Experiment( - "smartredis_ensemble_consumer", exp_path=test_dir, launcher="local" - ) - - # create and start a feature store - feature_store = FeatureStore(port=wlmutils.get_test_port()) - exp.generate(feature_store) - exp.start(feature_store, block=False) + fs = prepare_fs(local_fs).featurestore + local_experiment.reconnect_feature_store(fs.checkpoint_file) - rs_prod = exp.create_run_settings("python", "producer.py") - rs_consumer = exp.create_run_settings("python", "consumer.py") + rs_prod = local_experiment.create_run_settings("python", "producer.py") + rs_consumer = local_experiment.create_run_settings("python", "consumer.py") params = {"mult": [1, -10]} ensemble = Ensemble( name="producer", params=params, run_settings=rs_prod, perm_strat="step" @@ -139,15 +124,11 @@ def test_consumer(fileutils, test_dir, wlmutils): config = fileutils.get_test_conf_path("smartredis") ensemble.attach_generator_files(to_copy=[config]) - exp.generate(ensemble) + local_experiment.generate(ensemble) # start the models - exp.start(ensemble, summary=False) + local_experiment.start(ensemble, summary=False) # get and confirm statuses - statuses = exp.get_status(ensemble) - try: - assert all([stat == SmartSimStatus.STATUS_COMPLETED for stat in statuses]) - finally: - # stop the FeatureStore - exp.stop(feature_store) + statuses = local_experiment.get_status(ensemble) + assert all([stat == SmartSimStatus.STATUS_COMPLETED for stat in statuses]) diff --git a/tests/test_telemetry_monitor.py b/tests/test_telemetry_monitor.py index 6120d6486..8546afdfb 100644 --- a/tests/test_telemetry_monitor.py +++ b/tests/test_telemetry_monitor.py @@ -72,8 +72,7 @@ pytest.test_launcher == "local", reason="Test requires WLM" ) - -logger = logging.getLogger() +logger = logging.getLogger(__name__) # The tests in this file belong to the slow_tests group pytestmark = pytest.mark.slow_tests @@ -85,7 +84,7 @@ def turn_on_tm(monkeypatch): yield -def write_stop_file(entity: JobEntity, test_dir: str, 
duration: int): +def write_stop_file(entity: JobEntity, test_dir: pathlib.Path, duration: int): time.sleep(duration) write_event( get_ts_ms(), @@ -403,7 +402,8 @@ def test_persistable_computed_properties( "step_id": step_id, }, } - persistables = Run.load_entity(etype, stored, exp_dir) + faux_experiment = {"launcher": "local"} + persistables = Run.load_entity(etype, stored, exp_dir, faux_experiment) persistable = persistables[0] if persistables else None assert persistable.is_managed == exp_ismanaged @@ -583,7 +583,8 @@ def is_alive(self) -> bool: entity.status_dir = test_dir p = mp.Process( - target=write_stop_file, args=(entity, test_dir, (task_duration_ms / 1000)) + target=write_stop_file, + args=(entity, pathlib.Path(test_dir), (task_duration_ms / 1000)), ) frequency = 1000 @@ -870,7 +871,7 @@ def test_telemetry_fs_and_model(fileutils, test_dir, wlmutils, monkeypatch, conf ctx.setattr(cfg.Config, "telemetry_frequency", 1) # Set experiment name - exp_name = "telemetry_db_and_model" + exp_name = "telemetry_fs_and_model" # Retrieve parameters from testing environment test_launcher = wlmutils.get_test_launcher() @@ -1137,7 +1138,7 @@ def test_unmanaged_steps_are_proxyed_through_indirect( @for_all_wlm_launchers -def test_unmanaged_steps_are_not_proxied_if_the_telemetry_monitor_is_disabled( +def test_unmanaged_steps_are_not_proxyed_if_the_telemetry_monitor_is_disabled( wlm_launcher, mock_step_meta_dict, test_dir, monkeypatch ): monkeypatch.setattr(cfg.Config, CFG_TM_ENABLED_ATTR, False) diff --git a/tests/utils/test_network.py b/tests/utils/test_network.py new file mode 100644 index 000000000..cdc3168ef --- /dev/null +++ b/tests/utils/test_network.py @@ -0,0 +1,30 @@ +import pytest + +from smartsim._core.utils.network import find_free_port + +# The tests in this file belong to the group_a group +pytestmark = pytest.mark.group_a + + +def test_find_free_port_no_start(): + """Test that a free port is identified and returned when no + starting port number is specified""" 
+ port = find_free_port() + assert port > 0 + + +@pytest.mark.parametrize( + "start_at", + [ + pytest.param(1000, id="start at 1000"), + pytest.param(2000, id="start at 2000"), + pytest.param(5000, id="start at 5000"), + pytest.param(10000, id="start at 10000"), + pytest.param(16000, id="start at 16000"), + ], +) +def test_find_free_port_range_specified(start_at): + """Test that a free port greater than or equal to the specified + starting port number is identified and returned""" + port = find_free_port(start_at) + assert port >= start_at diff --git a/tests/utils/test_security.py b/tests/utils/test_security.py new file mode 100644 index 000000000..1a7a9586b --- /dev/null +++ b/tests/utils/test_security.py @@ -0,0 +1,234 @@ +# BSD 2-Clause License +# +# Copyright (c) 2021-2024, Hewlett Packard Enterprise +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + + +import pathlib +import stat + +import pytest +from sympy import public + +from smartsim._core.config.config import get_config +from smartsim._core.utils.security import KeyManager, _KeyLocator, _KeyPermissions + +# The tests in this file belong to the group_a group +pytestmark = pytest.mark.group_a + + +def test_keylocator_filename_resolution(test_dir: str) -> None: + """Ensure the key locator resolves filenames as expected.""" + key_path = pathlib.Path(test_dir) + key_category = "mycategory" + key_file = "mykey" + locator = _KeyLocator(key_path, key_file, key_category) + + assert locator.public_filename == f"{key_file}.key", "public mismatch" + assert locator.private_filename == f"{key_file}.key_secret", "private mismatch" + + +def test_keylocator_dir_resolution(test_dir: str) -> None: + """Ensure the key locator resolves paths as expected.""" + key_path = pathlib.Path(test_dir) + key_name = "test" + key_category = "mycategory" + + locator = _KeyLocator(key_path, key_name, key_category) + + # we expect a category and pub/priv subdirectory + exp_pub = pathlib.Path(f"{test_dir}/{key_category}/pub").resolve() + assert str(locator.public_dir) == str(exp_pub) + + exp_priv = pathlib.Path(f"{test_dir}/{key_category}/priv").resolve() + assert str(locator.private_dir) == str(exp_priv) + + # and to be explicit... 
prove pub & priv are not same directory + assert str(locator.private_dir) != str(locator.public_dir) + + +def test_key_manager_dir_preparation( + test_dir: str, monkeypatch: pytest.MonkeyPatch +) -> None: + """Ensure the KeyManager creates the appropriate directory + structure required for public/private key pairs.""" + with monkeypatch.context() as ctx: + ctx.setenv("SMARTSIM_KEY_PATH", test_dir) + + cfg = get_config() + km = KeyManager(cfg) + + km.create_directories() + + # verify the expected paths are created + server_locator = _KeyLocator(pathlib.Path(test_dir), "curve", "server") + client_locator = _KeyLocator(pathlib.Path(test_dir), "curve", "client") + + locators = [server_locator, client_locator] + + for locator in locators: + assert locator.public_dir.exists() + assert locator.private_dir.exists() + + +def test_key_manager_get_existing_keys_only_no_keys_found( + test_dir: str, monkeypatch: pytest.MonkeyPatch +) -> None: + """Ensure the key manager cannot load keys when + directed not to create missing keys.""" + with monkeypatch.context() as ctx: + ctx.setenv("SMARTSIM_KEY_PATH", test_dir) + + cfg = get_config() + km = KeyManager(cfg) + + # use create=False to only load pre-existing keys + server_keys, client_keys = km.get_keys(create=False) + + assert server_keys.empty + assert client_keys.empty + + +def test_key_manager_get_existing_keys_only_existing( + test_dir: str, monkeypatch: pytest.MonkeyPatch +) -> None: + """Ensure the key manager can load keys when + they exist from a previous call.""" + with monkeypatch.context() as ctx: + ctx.setenv("SMARTSIM_KEY_PATH", test_dir) + + cfg = get_config() + + # use a KeyManager to create some keys + km = KeyManager(cfg, as_server=True, as_client=True) + old_server_keys, old_client_keys = km.get_keys(create=True) + + # create a new KM to verify keys reload + km = KeyManager(cfg, as_server=True, as_client=True) + + # use create=True to manifest any bugs missing existing keys + server_keys, client_keys = 
km.get_keys(create=True) + + # ensure we loaded something + assert not server_keys.empty + assert not client_keys.empty + + # and show the old keys were reloaded from disk + assert server_keys == old_server_keys + assert client_keys == old_client_keys + + +def test_key_manager_get_or_create_keys_default( + test_dir: str, monkeypatch: pytest.MonkeyPatch +) -> None: + """Ensure the key manager creates keys when none can be loaded""" + with monkeypatch.context() as ctx: + ctx.setenv("SMARTSIM_KEY_PATH", test_dir) + + cfg = get_config() + km = KeyManager(cfg) + + key_set = km.get_keys() + + # public keys are returned by default + assert key_set[0].public != b"" + assert key_set[1].public != b"" + + # default behavior will only return public keys + assert not key_set[0].private + assert not key_set[1].private + + +@pytest.mark.parametrize( + "as_server, as_client", + [ + pytest.param(False, True, id="as-client"), + pytest.param(True, False, id="as-server"), + pytest.param(True, True, id="as-both"), + pytest.param(False, False, id="public-only"), + ], +) +def test_key_manager_as_context( + as_server: bool, + as_client: bool, + test_dir: str, + monkeypatch: pytest.MonkeyPatch, +) -> None: + """Ensure the key manager loads the correct keys + when passed `as_server=True` and `as_client=True`""" + with monkeypatch.context() as ctx: + ctx.setenv("SMARTSIM_KEY_PATH", test_dir) + + cfg = get_config() + km = KeyManager(cfg, as_server=as_server, as_client=as_client) + + server_keyset, client_keyset = km.get_keys() + + assert bool(server_keyset.public) == True + assert bool(server_keyset.private) == as_server + + assert bool(client_keyset.public) == True + assert bool(client_keyset.private) == as_client + + +def test_key_manager_applied_permissions( + test_dir: str, monkeypatch: pytest.MonkeyPatch +) -> None: + """Ensure the key manager applies the appropriate file-system + permissions to the keys and directories""" + with monkeypatch.context() as ctx: + 
ctx.setenv("SMARTSIM_KEY_PATH", test_dir) + + cfg = get_config() + km = KeyManager(cfg, as_client=True, as_server=True) + + server_keys, client_keys = km.get_keys() + + # ensure public dirs are open for reading by others + s_pub_stat = km._server_locator.public_dir.stat() + c_pub_stat = km._client_locator.public_dir.stat() + + assert stat.S_IMODE(s_pub_stat.st_mode) == _KeyPermissions.PUBLIC_DIR + assert stat.S_IMODE(c_pub_stat.st_mode) == _KeyPermissions.PUBLIC_DIR + + # ensure private dirs are open only to owner + s_priv_stat = km._server_locator.private_dir.stat() + c_priv_stat = km._client_locator.private_dir.stat() + + assert stat.S_IMODE(s_priv_stat.st_mode) == _KeyPermissions.PRIVATE_DIR + assert stat.S_IMODE(c_priv_stat.st_mode) == _KeyPermissions.PRIVATE_DIR + + # ensure public files are open for reading by others + s_pub_stat = km._server_locator.public.stat() + c_pub_stat = km._client_locator.public.stat() + + assert stat.S_IMODE(s_pub_stat.st_mode) == _KeyPermissions.PUBLIC_KEY + assert stat.S_IMODE(c_pub_stat.st_mode) == _KeyPermissions.PUBLIC_KEY + + # ensure private files are read-only for owner + s_priv_stat = km._server_locator.private.stat() + c_priv_stat = km._client_locator.private.stat() + + assert stat.S_IMODE(s_priv_stat.st_mode) == _KeyPermissions.PRIVATE_KEY + assert stat.S_IMODE(c_priv_stat.st_mode) == _KeyPermissions.PRIVATE_KEY From 27c6da9853e14fef0d00aa74e8ca534aaeb1003f Mon Sep 17 00:00:00 2001 From: Julia Putko Date: Wed, 22 May 2024 14:52:55 -0500 Subject: [PATCH 04/11] more renaming --- smartsim/_core/control/controller.py | 2 +- smartsim/_core/launcher/step/alpsStep.py | 4 ++-- smartsim/_core/launcher/step/localStep.py | 4 ++-- smartsim/_core/launcher/step/lsfStep.py | 6 +++--- smartsim/_core/launcher/step/mpiStep.py | 10 +++++----- smartsim/_core/launcher/step/pbsStep.py | 4 ++-- smartsim/_core/launcher/step/slurmStep.py | 8 ++++---- smartsim/_core/launcher/step/step.py | 4 ++-- 8 files changed, 21 insertions(+), 21 deletions(-) 
diff --git a/smartsim/_core/control/controller.py b/smartsim/_core/control/controller.py index 713da43fe..4d49de11c 100644 --- a/smartsim/_core/control/controller.py +++ b/smartsim/_core/control/controller.py @@ -152,7 +152,7 @@ def start( @property def active_feature_store_jobs(self) -> t.Dict[str, Job]: - """Return active orchestrator jobs.""" + """Return active feature store jobs.""" return {**self._jobs.fs_jobs} @property diff --git a/smartsim/_core/launcher/step/alpsStep.py b/smartsim/_core/launcher/step/alpsStep.py index 3d769058e..d0ac3f33d 100644 --- a/smartsim/_core/launcher/step/alpsStep.py +++ b/smartsim/_core/launcher/step/alpsStep.py @@ -33,13 +33,13 @@ from ....log import get_logger from ....settings import AprunSettings, RunSettings, Singularity from .step import Step, proxyable_launch_cmd -from ....entity import Model, DBNode +from ....entity import Model, FSNode logger = get_logger(__name__) class AprunStep(Step): - def __init__(self, entity: t.Union[Model, DBNode], run_settings: AprunSettings) -> None: + def __init__(self, entity: t.Union[Model, FSNode], run_settings: AprunSettings) -> None: """Initialize a ALPS aprun job step :param name: name of the entity to be launched diff --git a/smartsim/_core/launcher/step/localStep.py b/smartsim/_core/launcher/step/localStep.py index 6301c4804..785577e27 100644 --- a/smartsim/_core/launcher/step/localStep.py +++ b/smartsim/_core/launcher/step/localStep.py @@ -30,12 +30,12 @@ from ....settings import Singularity from ....settings.base import RunSettings -from ....entity import Model, DBNode +from ....entity import Model, FSNode from .step import Step, proxyable_launch_cmd class LocalStep(Step): - def __init__(self, entity: t.Union[Model, DBNode], run_settings: RunSettings): + def __init__(self, entity: t.Union[Model, FSNode], run_settings: RunSettings): super().__init__(entity, run_settings) self.run_settings = entity.run_settings self._env = self._set_env() diff --git 
a/smartsim/_core/launcher/step/lsfStep.py b/smartsim/_core/launcher/step/lsfStep.py index 2c7d87348..aaf616ed4 100644 --- a/smartsim/_core/launcher/step/lsfStep.py +++ b/smartsim/_core/launcher/step/lsfStep.py @@ -33,13 +33,13 @@ from ....settings import BsubBatchSettings, JsrunSettings from ....settings.base import RunSettings from .step import Step -from ....entity import Model, DBNode +from ....entity import Model, FSNode logger = get_logger(__name__) class BsubBatchStep(Step): - def __init__(self, entity: t.Union[Model, DBNode], batch_settings: BsubBatchSettings) -> None: + def __init__(self, entity: t.Union[Model, FSNode], batch_settings: BsubBatchSettings) -> None: """Initialize a LSF bsub step :param name: name of the entity to launch @@ -104,7 +104,7 @@ def _write_script(self) -> str: class JsrunStep(Step): - def __init__(self, entity: t.Union[Model, DBNode], run_settings: RunSettings): + def __init__(self, entity: t.Union[Model, FSNode], run_settings: RunSettings): """Initialize a LSF jsrun job step :param name: name of the entity to be launched diff --git a/smartsim/_core/launcher/step/mpiStep.py b/smartsim/_core/launcher/step/mpiStep.py index 3bebda448..e144e75c1 100644 --- a/smartsim/_core/launcher/step/mpiStep.py +++ b/smartsim/_core/launcher/step/mpiStep.py @@ -34,13 +34,13 @@ from ....settings import MpiexecSettings, MpirunSettings, OrterunSettings from ....settings.base import RunSettings from .step import Step, proxyable_launch_cmd -from ....entity import Model, DBNode +from ....entity import Model, FSNode logger = get_logger(__name__) class _BaseMPIStep(Step): - def __init__(self, entity: t.Union[Model, DBNode], run_settings: RunSettings) -> None: + def __init__(self, entity: t.Union[Model, FSNode], run_settings: RunSettings) -> None: """Initialize a job step conforming to the MPI standard :param name: name of the entity to be launched @@ -153,7 +153,7 @@ def _make_mpmd(self) -> t.List[str]: class MpiexecStep(_BaseMPIStep): - def __init__(self, 
entity: t.Union[Model, DBNode], run_settings: MpiexecSettings) -> None: + def __init__(self, entity: t.Union[Model, FSNode], run_settings: MpiexecSettings) -> None: """Initialize an mpiexec job step :param name: name of the entity to be launched @@ -167,7 +167,7 @@ def __init__(self, entity: t.Union[Model, DBNode], run_settings: MpiexecSettings class MpirunStep(_BaseMPIStep): - def __init__(self, entity: t.Union[Model, DBNode], run_settings: MpirunSettings) -> None: + def __init__(self, entity: t.Union[Model, FSNode], run_settings: MpirunSettings) -> None: """Initialize an mpirun job step :param name: name of the entity to be launched @@ -181,7 +181,7 @@ def __init__(self, entity: t.Union[Model, DBNode], run_settings: MpirunSettings) class OrterunStep(_BaseMPIStep): - def __init__(self, entity: t.Union[Model, DBNode], run_settings: OrterunSettings) -> None: + def __init__(self, entity: t.Union[Model, FSNode], run_settings: OrterunSettings) -> None: """Initialize an orterun job step :param name: name of the entity to be launched diff --git a/smartsim/_core/launcher/step/pbsStep.py b/smartsim/_core/launcher/step/pbsStep.py index f5c5a746c..24928e9da 100644 --- a/smartsim/_core/launcher/step/pbsStep.py +++ b/smartsim/_core/launcher/step/pbsStep.py @@ -29,14 +29,14 @@ from ....log import get_logger from ....settings import QsubBatchSettings from .step import Step -from ....entity import Model, DBNode +from ....entity import Model, FSNode logger = get_logger(__name__) class QsubBatchStep(Step): - def __init__(self, entity: t.Union[Model, DBNode], batch_settings: QsubBatchSettings) -> None: + def __init__(self, entity: t.Union[Model, FSNode], batch_settings: QsubBatchSettings) -> None: """Initialize a PBSpro qsub step :param name: name of the entity to launch diff --git a/smartsim/_core/launcher/step/slurmStep.py b/smartsim/_core/launcher/step/slurmStep.py index 038326ba0..9c33209e2 100644 --- a/smartsim/_core/launcher/step/slurmStep.py +++ 
b/smartsim/_core/launcher/step/slurmStep.py @@ -33,13 +33,13 @@ from ....log import get_logger from ....settings import RunSettings, SbatchSettings, Singularity, SrunSettings from .step import Step -from ....entity import Model, Ensemble, DBNode +from ....entity import Model, Ensemble, FSNode logger = get_logger(__name__) class SbatchStep(Step): - def __init__(self, entity: t.Union[Model, DBNode], batch_settings: SbatchSettings) -> None: + def __init__(self, entity: t.Union[Model, FSNode], batch_settings: SbatchSettings) -> None: """Initialize a Slurm Sbatch step :param name: name of the entity to launch @@ -99,7 +99,7 @@ def _write_script(self) -> str: class SrunStep(Step): - def __init__(self, entity: t.Union[Model, DBNode], run_settings: SrunSettings) -> None: + def __init__(self, entity: t.Union[Model, FSNode], run_settings: SrunSettings) -> None: """Initialize a srun job step :param name: name of the entity to be launched @@ -186,7 +186,7 @@ def _get_mpmd(self) -> t.List[RunSettings]: return self.run_settings.mpmd @staticmethod - def _get_exe_args_list(entity: t.Union[Model, DBNode]) -> t.List[str]: + def _get_exe_args_list(entity: t.Union[Model, FSNode]) -> t.List[str]: """Convenience function to encapsulate checking the runsettings.exe_args type to always return a list """ diff --git a/smartsim/_core/launcher/step/step.py b/smartsim/_core/launcher/step/step.py index 81ad98afa..6015117c4 100644 --- a/smartsim/_core/launcher/step/step.py +++ b/smartsim/_core/launcher/step/step.py @@ -36,7 +36,7 @@ from smartsim._core.config import CONFIG from smartsim.error.errors import SmartSimError, UnproxyableStepError -from ....entity import Model, Ensemble, DBNode +from ....entity import Model, Ensemble, FSNode from ....log import get_logger from ....settings.base import RunSettings, SettingsBase from ...utils.helpers import encode_cmd, get_base_36_repr @@ -46,7 +46,7 @@ class Step: - def __init__(self, entity: t.Union[Model, DBNode], step_settings: SettingsBase) -> 
None: + def __init__(self, entity: t.Union[Model, FSNode], step_settings: SettingsBase) -> None: self.name = self._create_unique_name(entity.name) self.entity = entity self.entity_name = entity.name From 98c7029a2dd75f64fe930223b0067a5d950f6f92 Mon Sep 17 00:00:00 2001 From: Julia Putko Date: Tue, 28 May 2024 17:26:37 -0500 Subject: [PATCH 05/11] ran isort --- conftest.py | 11 +++++------ smartsim/_core/control/jobmanager.py | 2 +- smartsim/_core/control/manifest.py | 2 +- smartsim/_core/launcher/step/slurmStep.py | 2 +- smartsim/_core/launcher/step/step.py | 2 +- smartsim/_core/utils/serialize.py | 2 +- smartsim/database/orchestrator.py | 2 +- 7 files changed, 11 insertions(+), 12 deletions(-) diff --git a/conftest.py b/conftest.py index 97fb4b6d6..024b0044b 100644 --- a/conftest.py +++ b/conftest.py @@ -27,33 +27,32 @@ from __future__ import annotations import asyncio -from collections import defaultdict -from dataclasses import dataclass import json import os import pathlib import shutil -import subprocess import signal import socket +import subprocess import sys import tempfile import time import typing as t import uuid import warnings +from collections import defaultdict +from dataclasses import dataclass from subprocess import run -import time import psutil import pytest import smartsim from smartsim import Experiment -from smartsim._core.launcher.dragon.dragonConnector import DragonConnector -from smartsim._core.launcher.dragon.dragonLauncher import DragonLauncher from smartsim._core.config import CONFIG from smartsim._core.config.config import Config +from smartsim._core.launcher.dragon.dragonConnector import DragonConnector +from smartsim._core.launcher.dragon.dragonLauncher import DragonLauncher from smartsim._core.utils.telemetry.telemetry import JobEntity from smartsim.database import FeatureStore from smartsim.entity import Model diff --git a/smartsim/_core/control/jobmanager.py b/smartsim/_core/control/jobmanager.py index 1370ee820..37f379024 100644 
--- a/smartsim/_core/control/jobmanager.py +++ b/smartsim/_core/control/jobmanager.py @@ -34,7 +34,7 @@ from ..._core.launcher.step import Step from ...database import FeatureStore -from ...entity import FSNode, EntitySequence, SmartSimEntity +from ...entity import EntitySequence, FSNode, SmartSimEntity from ...log import ContextThread, get_logger from ...status import TERMINAL_STATUSES, SmartSimStatus from ..config import CONFIG diff --git a/smartsim/_core/control/manifest.py b/smartsim/_core/control/manifest.py index 7508ed4f2..1ef849e97 100644 --- a/smartsim/_core/control/manifest.py +++ b/smartsim/_core/control/manifest.py @@ -30,7 +30,7 @@ from dataclasses import dataclass, field from ...database import FeatureStore -from ...entity import FSNode, Ensemble, EntitySequence, Model, SmartSimEntity +from ...entity import Ensemble, EntitySequence, FSNode, Model, SmartSimEntity from ...error import SmartSimError from ..config import CONFIG from ..utils import helpers as _helpers diff --git a/smartsim/_core/launcher/step/slurmStep.py b/smartsim/_core/launcher/step/slurmStep.py index 8c786437f..86bd0936c 100644 --- a/smartsim/_core/launcher/step/slurmStep.py +++ b/smartsim/_core/launcher/step/slurmStep.py @@ -29,7 +29,7 @@ import typing as t from shlex import split as sh_split -from ....entity import FSNode, Ensemble, Model +from ....entity import Ensemble, FSNode, Model from ....error import AllocationError from ....log import get_logger from ....settings import RunSettings, SbatchSettings, Singularity, SrunSettings diff --git a/smartsim/_core/launcher/step/step.py b/smartsim/_core/launcher/step/step.py index ffbd4598a..0b12d032d 100644 --- a/smartsim/_core/launcher/step/step.py +++ b/smartsim/_core/launcher/step/step.py @@ -37,7 +37,7 @@ from smartsim._core.config import CONFIG from smartsim.error.errors import SmartSimError, UnproxyableStepError -from ....entity import FSNode, Ensemble, Model +from ....entity import Ensemble, FSNode, Model from ....log import 
get_logger from ....settings.base import RunSettings, SettingsBase from ...utils.helpers import encode_cmd, get_base_36_repr diff --git a/smartsim/_core/utils/serialize.py b/smartsim/_core/utils/serialize.py index 803888a7d..b29e8f7fe 100644 --- a/smartsim/_core/utils/serialize.py +++ b/smartsim/_core/utils/serialize.py @@ -37,7 +37,7 @@ if t.TYPE_CHECKING: from smartsim._core.control.manifest import LaunchedManifest as _Manifest from smartsim.database.orchestrator import FeatureStore - from smartsim.entity import FSNode, Ensemble, Model + from smartsim.entity import Ensemble, FSNode, Model from smartsim.entity.dbobject import FSModel, FSScript from smartsim.settings.base import BatchSettings, RunSettings diff --git a/smartsim/database/orchestrator.py b/smartsim/database/orchestrator.py index eea4e47fa..75b4bca95 100644 --- a/smartsim/database/orchestrator.py +++ b/smartsim/database/orchestrator.py @@ -41,7 +41,7 @@ from .._core.utils import fs_is_active from .._core.utils.helpers import is_valid_cmd, unpack_fs_identifier from .._core.utils.network import get_ip_from_host -from ..entity import FSNode, EntityList, TelemetryConfiguration +from ..entity import EntityList, FSNode, TelemetryConfiguration from ..error import ( SmartSimError, SSConfigError, From 18450bdffe8eaba8da119ae0638c1b1bf8eebe21 Mon Sep 17 00:00:00 2001 From: Julia Putko Date: Tue, 28 May 2024 17:35:27 -0500 Subject: [PATCH 06/11] ran black --- conftest.py | 55 ++++++++++--------- smartsim/_core/control/controller.py | 18 ++++-- smartsim/_core/control/manifest.py | 4 +- smartsim/_core/entrypoints/colocated.py | 8 ++- smartsim/_core/generation/generator.py | 4 +- smartsim/entity/model.py | 4 +- .../full_wlm/test_generic_orc_launch_batch.py | 3 +- tests/_legacy/on_wlm/test_preview_wlm.py | 4 +- tests/_legacy/test_cli.py | 4 +- tests/_legacy/test_controller.py | 4 +- tests/_legacy/test_controller_errors.py | 4 +- tests/_legacy/test_launch_errors.py | 8 ++- tests/_legacy/test_manifest.py | 4 +- 
tests/_legacy/test_multidb.py | 20 +++++-- tests/_legacy/test_orchestrator.py | 19 +++++-- tests/_legacy/test_output_files.py | 4 +- tests/_legacy/test_serialize.py | 4 +- tests/_legacy/test_symlinking.py | 4 +- tests/_legacy/test_telemetry_monitor.py | 34 +++++++++--- 19 files changed, 148 insertions(+), 61 deletions(-) diff --git a/conftest.py b/conftest.py index 024b0044b..91a394e64 100644 --- a/conftest.py +++ b/conftest.py @@ -92,6 +92,7 @@ test_hostlist = None has_aprun = shutil.which("aprun") is not None + def get_account() -> str: return test_account @@ -140,7 +141,7 @@ def pytest_sessionstart( time.sleep(0.1) if CONFIG.dragon_server_path is None: - dragon_server_path = os.path.join(test_output_root, "dragon_server") + dragon_server_path = os.path.join(test_output_root, "dragon_server") os.makedirs(dragon_server_path) os.environ["SMARTSIM_DRAGON_SERVER_PATH"] = dragon_server_path @@ -182,7 +183,7 @@ def build_mpi_app() -> t.Optional[pathlib.Path]: if cc is None: return None - path_to_src = pathlib.Path(FileUtils().get_test_conf_path("mpi")) + path_to_src = pathlib.Path(FileUtils().get_test_conf_path("mpi")) path_to_out = pathlib.Path(test_output_root) / "apps" / "mpi_app" os.makedirs(path_to_out.parent, exist_ok=True) cmd = [cc, str(path_to_src / "mpi_hello.c"), "-o", str(path_to_out)] @@ -193,11 +194,12 @@ def build_mpi_app() -> t.Optional[pathlib.Path]: else: return None + @pytest.fixture(scope="session") def mpi_app_path() -> t.Optional[pathlib.Path]: """Return path to MPI app if it was built - return None if it could not or will not be built + return None if it could not or will not be built """ if not CONFIG.test_mpi: return None @@ -226,7 +228,6 @@ def kill_all_test_spawned_processes() -> None: print("Not all processes were killed after test") - def get_hostlist() -> t.Optional[t.List[str]]: global test_hostlist if not test_hostlist: @@ -707,7 +708,9 @@ def global_dragon_teardown() -> None: """ if test_launcher != "dragon" or CONFIG.dragon_server_path 
is None: return - logger.debug(f"Tearing down Dragon infrastructure, server path: {CONFIG.dragon_server_path}") + logger.debug( + f"Tearing down Dragon infrastructure, server path: {CONFIG.dragon_server_path}" + ) dragon_connector = DragonConnector() dragon_connector.ensure_connected() dragon_connector.cleanup() @@ -874,9 +877,13 @@ def num_calls(self) -> int: def details(self) -> t.List[t.Tuple[t.Tuple[t.Any, ...], t.Dict[str, t.Any]]]: return self._details + ## Reuse feature store across tests -feature_store_registry: t.DefaultDict[str, t.Optional[FeatureStore]] = defaultdict(lambda: None) +feature_store_registry: t.DefaultDict[str, t.Optional[FeatureStore]] = defaultdict( + lambda: None +) + @pytest.fixture(scope="function") def local_experiment(test_dir: str) -> smartsim.Experiment: @@ -884,16 +891,16 @@ def local_experiment(test_dir: str) -> smartsim.Experiment: name = pathlib.Path(test_dir).stem return smartsim.Experiment(name, exp_path=test_dir, launcher="local") + @pytest.fixture(scope="function") def wlm_experiment(test_dir: str, wlmutils: WLMUtils) -> smartsim.Experiment: """Create a default experiment that uses the requested launcher""" name = pathlib.Path(test_dir).stem return smartsim.Experiment( - name, - exp_path=test_dir, - launcher=wlmutils.get_test_launcher() + name, exp_path=test_dir, launcher=wlmutils.get_test_launcher() ) + def _cleanup_fs(name: str) -> None: global feature_store_registry fs = feature_store_registry[name] @@ -905,19 +912,22 @@ def _cleanup_fs(name: str) -> None: except: pass + @dataclass class DBConfiguration: name: str launcher: str num_nodes: int - interface: t.Union[str,t.List[str]] + interface: t.Union[str, t.List[str]] hostlist: t.Optional[t.List[str]] port: int + @dataclass class PrepareFeatureStoreOutput: - featurestore: t.Optional[FeatureStore] # The actual feature store object - new_fs: bool # True if a new feature store was created when calling prepare_fs + featurestore: t.Optional[FeatureStore] # The actual feature 
store object + new_fs: bool # True if a new feature store was created when calling prepare_fs + # Reuse feature stores @pytest.fixture(scope="session") @@ -934,6 +944,7 @@ def local_fs() -> t.Generator[DBConfiguration, None, None]: yield config _cleanup_fs(name) + @pytest.fixture(scope="session") def single_fs(wlmutils: WLMUtils) -> t.Generator[DBConfiguration, None, None]: hostlist = wlmutils.get_test_hostlist() @@ -945,7 +956,7 @@ def single_fs(wlmutils: WLMUtils) -> t.Generator[DBConfiguration, None, None]: 1, wlmutils.get_test_interface(), hostlist, - _find_free_port(tuple(reversed(test_ports))) + _find_free_port(tuple(reversed(test_ports))), ) yield config _cleanup_fs(name) @@ -970,9 +981,7 @@ def clustered_fs(wlmutils: WLMUtils) -> t.Generator[DBConfiguration, None, None] @pytest.fixture def register_new_fs() -> t.Callable[[DBConfiguration], FeatureStore]: - def _register_new_fs( - config: DBConfiguration - ) -> FeatureStore: + def _register_new_fs(config: DBConfiguration) -> FeatureStore: exp_path = pathlib.Path(test_output_root, config.name) exp_path.mkdir(exist_ok=True) exp = Experiment( @@ -985,26 +994,21 @@ def _register_new_fs( batch=False, interface=config.interface, hosts=config.hostlist, - fs_nodes=config.num_nodes + fs_nodes=config.num_nodes, ) exp.generate(feature_store, overwrite=True) exp.start(feature_store) global feature_store_registry feature_store_registry[config.name] = feature_store return feature_store + return _register_new_fs @pytest.fixture(scope="function") def prepare_fs( - register_new_fs: t.Callable[ - [DBConfiguration], - FeatureStore - ] -) -> t.Callable[ - [DBConfiguration], - PrepareFeatureStoreOutput -]: + register_new_fs: t.Callable[[DBConfiguration], FeatureStore] +) -> t.Callable[[DBConfiguration], PrepareFeatureStoreOutput]: def _prepare_fs(fs_config: DBConfiguration) -> PrepareFeatureStoreOutput: global feature_store_registry fs = feature_store_registry[fs_config.name] @@ -1020,4 +1024,5 @@ def _prepare_fs(fs_config: 
DBConfiguration) -> PrepareFeatureStoreOutput: new_fs = True return PrepareFeatureStoreOutput(fs, new_fs) + return _prepare_fs diff --git a/smartsim/_core/control/controller.py b/smartsim/_core/control/controller.py index 7237c9273..3cf63f59c 100644 --- a/smartsim/_core/control/controller.py +++ b/smartsim/_core/control/controller.py @@ -505,7 +505,9 @@ def _launch_feature_store( names and `Step`s of the launched featurestore """ featurestore.remove_stale_files() - feature_store_telem_dir = manifest_builder.run_telemetry_subdirectory / "database" + feature_store_telem_dir = ( + manifest_builder.run_telemetry_subdirectory / "database" + ) # if the featurestore was launched as a batch workload if featurestore.batch: @@ -513,7 +515,8 @@ def _launch_feature_store( featurestore, feature_store_telem_dir ) manifest_builder.add_feature_store( - featurestore, [(feature_store_batch_step.name, step) for step in substeps] + featurestore, + [(feature_store_batch_step.name, step) for step in substeps], ) self._launch_step(feature_store_batch_step, featurestore) @@ -526,7 +529,12 @@ def _launch_feature_store( # if featurestore was run on existing allocation, locally, or in allocation else: fs_steps = [ - (self._create_job_step(fs, feature_store_telem_dir / featurestore.name), fs) + ( + self._create_job_step( + fs, feature_store_telem_dir / featurestore.name + ), + fs, + ) for fs in featurestore.entities ] manifest_builder.add_feature_store( @@ -552,7 +560,9 @@ def _launch_feature_store( create_cluster(featurestore.hosts, featurestore.ports) check_cluster_status(featurestore.hosts, featurestore.ports) num_shards = featurestore.num_shards - logger.info(f"Feature store cluster created with {num_shards} shards") + logger.info( + f"Feature store cluster created with {num_shards} shards" + ) cluster_created = True except SSInternalError: if num_trials > 0: diff --git a/smartsim/_core/control/manifest.py b/smartsim/_core/control/manifest.py index 1ef849e97..72497ef3c 100644 --- 
a/smartsim/_core/control/manifest.py +++ b/smartsim/_core/control/manifest.py @@ -222,7 +222,9 @@ class LaunchedManifest(t.Generic[_T]): metadata: _LaunchedManifestMetadata models: t.Tuple[t.Tuple[Model, _T], ...] ensembles: t.Tuple[t.Tuple[Ensemble, t.Tuple[t.Tuple[Model, _T], ...]], ...] - featurestores: t.Tuple[t.Tuple[FeatureStore, t.Tuple[t.Tuple[FSNode, _T], ...]], ...] + featurestores: t.Tuple[ + t.Tuple[FeatureStore, t.Tuple[t.Tuple[FSNode, _T], ...]], ... + ] def map(self, func: t.Callable[[_T], _U]) -> "LaunchedManifest[_U]": def _map_entity_data( diff --git a/smartsim/_core/entrypoints/colocated.py b/smartsim/_core/entrypoints/colocated.py index 28fb1edca..44429adaf 100644 --- a/smartsim/_core/entrypoints/colocated.py +++ b/smartsim/_core/entrypoints/colocated.py @@ -269,7 +269,9 @@ def cleanup() -> None: logger.warning("Couldn't find feature store process to kill.") except OSError as e: - logger.warning(f"Failed to clean up colocated feature store gracefully: {str(e)}") + logger.warning( + f"Failed to clean up colocated feature store gracefully: {str(e)}" + ) finally: if LOCK.is_locked: LOCK.release() @@ -325,7 +327,9 @@ def register_signal_handlers() -> None: LOCK = filelock.FileLock(tmp_lockfile) LOCK.acquire(timeout=0.1) - logger.debug(f"Starting colocated feature store on host: {socket.gethostname()}") + logger.debug( + f"Starting colocated feature store on host: {socket.gethostname()}" + ) # make sure to register the cleanup before we start # the proecss so our signaller will be able to stop diff --git a/smartsim/_core/generation/generator.py b/smartsim/_core/generation/generator.py index e0c0e7e1b..7f601566e 100644 --- a/smartsim/_core/generation/generator.py +++ b/smartsim/_core/generation/generator.py @@ -168,7 +168,9 @@ def _gen_feature_store_dir(self, feature_store_list: t.List[FeatureStore]) -> No # Always remove featurestore files if present. 
if path.isdir(feature_store_path): shutil.rmtree(feature_store_path, ignore_errors=True) - pathlib.Path(feature_store_path).mkdir(exist_ok=self.overwrite, parents=True) + pathlib.Path(feature_store_path).mkdir( + exist_ok=self.overwrite, parents=True + ) def _gen_entity_list_dir(self, entity_lists: t.List[Ensemble]) -> None: """Generate directories for Ensemble instances diff --git a/smartsim/entity/model.py b/smartsim/entity/model.py index dee438788..576a38be2 100644 --- a/smartsim/entity/model.py +++ b/smartsim/entity/model.py @@ -669,7 +669,9 @@ def __str__(self) -> str: # pragma: no cover def add_ml_model_object(self, fs_model: FSModel) -> None: if not fs_model.is_file and self.colocated: - err_msg = "ML model can not be set from memory for colocated feature stores.\n" + err_msg = ( + "ML model can not be set from memory for colocated feature stores.\n" + ) err_msg += ( f"Please store the ML model named {fs_model.name} in binary format " ) diff --git a/tests/_legacy/full_wlm/test_generic_orc_launch_batch.py b/tests/_legacy/full_wlm/test_generic_orc_launch_batch.py index 9ad69b56e..b437303b5 100644 --- a/tests/_legacy/full_wlm/test_generic_orc_launch_batch.py +++ b/tests/_legacy/full_wlm/test_generic_orc_launch_batch.py @@ -227,5 +227,6 @@ def test_launch_cluster_feature_store_reconnect(test_dir, wlmutils): # Ensure it is the same FS that Experiment 1 was tracking time.sleep(5) assert not any( - stat == SmartSimStatus.STATUS_RUNNING for stat in exp.get_status(feature_store) + stat == SmartSimStatus.STATUS_RUNNING + for stat in exp.get_status(feature_store) ) diff --git a/tests/_legacy/on_wlm/test_preview_wlm.py b/tests/_legacy/on_wlm/test_preview_wlm.py index b7b3f3f32..7e467e4fe 100644 --- a/tests/_legacy/on_wlm/test_preview_wlm.py +++ b/tests/_legacy/on_wlm/test_preview_wlm.py @@ -256,7 +256,9 @@ def test_preview_launch_command(test_dir, wlmutils, choose_host): n_models=4, ) - preview_manifest = Manifest(feature_store, spam_eggs_model, hello_world_model, 
ensemble) + preview_manifest = Manifest( + feature_store, spam_eggs_model, hello_world_model, ensemble + ) # Execute preview method output = previewrenderer.render(exp, preview_manifest, verbosity_level="debug") diff --git a/tests/_legacy/test_cli.py b/tests/_legacy/test_cli.py index bcec732e2..397f1196c 100644 --- a/tests/_legacy/test_cli.py +++ b/tests/_legacy/test_cli.py @@ -733,7 +733,9 @@ def mock_operation(*args, **kwargs) -> int: # mock out the internal get_fs_path method so we don't actually do file system ops monkeypatch.setattr(smartsim._core._cli.build, "tabulate", mock_operation) - monkeypatch.setattr(smartsim._core._cli.build, "build_feature_store", mock_operation) + monkeypatch.setattr( + smartsim._core._cli.build, "build_feature_store", mock_operation + ) monkeypatch.setattr(smartsim._core._cli.build, "build_redis_ai", mock_operation) monkeypatch.setattr( smartsim._core._cli.build, "check_py_torch_version", mock_operation diff --git a/tests/_legacy/test_controller.py b/tests/_legacy/test_controller.py index 34e20aabc..19325c933 100644 --- a/tests/_legacy/test_controller.py +++ b/tests/_legacy/test_controller.py @@ -40,7 +40,9 @@ bs = SbatchSettings() ens = Ensemble("ens", params={}, run_settings=rs, batch_settings=bs, replicas=3) -feature_store = FeatureStore(fs_nodes=3, batch=True, launcher="slurm", run_command="srun") +feature_store = FeatureStore( + fs_nodes=3, batch=True, launcher="slurm", run_command="srun" +) class MockStep(Step): diff --git a/tests/_legacy/test_controller_errors.py b/tests/_legacy/test_controller_errors.py index 58f53c379..5301d547c 100644 --- a/tests/_legacy/test_controller_errors.py +++ b/tests/_legacy/test_controller_errors.py @@ -48,7 +48,9 @@ ens = Ensemble("ensemble_name", params={}, run_settings=entity_settings, replicas=2) # Ensemble entity slightly different but with same name ens_2 = Ensemble("ensemble_name", params={}, run_settings=entity_settings, replicas=3) -feature_store = FeatureStore(fs_nodes=3, batch=True, 
launcher="slurm", run_command="srun") +feature_store = FeatureStore( + fs_nodes=3, batch=True, launcher="slurm", run_command="srun" +) def test_finished_entity_feature_store_error(): diff --git a/tests/_legacy/test_launch_errors.py b/tests/_legacy/test_launch_errors.py index 1676b8029..2c3021254 100644 --- a/tests/_legacy/test_launch_errors.py +++ b/tests/_legacy/test_launch_errors.py @@ -66,9 +66,13 @@ def test_feature_store_relaunch(test_dir, wlmutils): exp_name = "test-feature-store-on-relaunch" exp = Experiment(exp_name, launcher="local", exp_path=test_dir) - feature_store = FeatureStore(port=wlmutils.get_test_port(), fs_identifier="feature_store_1") + feature_store = FeatureStore( + port=wlmutils.get_test_port(), fs_identifier="feature_store_1" + ) feature_store.set_path(test_dir) - feature_store_1 = FeatureStore(port=wlmutils.get_test_port() + 1, fs_identifier="feature_store_2") + feature_store_1 = FeatureStore( + port=wlmutils.get_test_port() + 1, fs_identifier="feature_store_2" + ) feature_store_1.set_path(test_dir) try: exp.start(feature_store) diff --git a/tests/_legacy/test_manifest.py b/tests/_legacy/test_manifest.py index 9475ea42f..01b11ffb2 100644 --- a/tests/_legacy/test_manifest.py +++ b/tests/_legacy/test_manifest.py @@ -158,7 +158,9 @@ def test_launched_manifest_builder_correctly_maps_data(): lmb.add_model(model, 1) lmb.add_model(model_2, 1) lmb.add_ensemble(ensemble, [i for i in range(len(ensemble.entities))]) - lmb.add_feature_store(feature_store, [i for i in range(len(feature_store.entities))]) + lmb.add_feature_store( + feature_store, [i for i in range(len(feature_store.entities))] + ) manifest = lmb.finalize() assert len(manifest.models) == 2 diff --git a/tests/_legacy/test_multidb.py b/tests/_legacy/test_multidb.py index 78ff7cfbc..110866e31 100644 --- a/tests/_legacy/test_multidb.py +++ b/tests/_legacy/test_multidb.py @@ -114,7 +114,10 @@ def test_fs_identifier_standard_then_colo_error( == "testdb_colo" ) - with make_entity_context(exp, 
feature_store), make_entity_context(exp, smartsim_model): + with ( + make_entity_context(exp, feature_store), + make_entity_context(exp, smartsim_model), + ): exp.start(feature_store) with pytest.raises(SSDBIDConflictError) as ex: exp.start(smartsim_model) @@ -185,7 +188,10 @@ def test_fs_identifier_colo_then_standard( assert feature_store.name == "testdb_colo" - with make_entity_context(exp, feature_store), make_entity_context(exp, smartsim_model): + with ( + make_entity_context(exp, feature_store), + make_entity_context(exp, smartsim_model), + ): exp.start(smartsim_model, block=True) exp.start(feature_store) @@ -227,7 +233,10 @@ def test_fs_identifier_standard_twice_not_unique(wlmutils, test_dir): assert feature_store2.name == "my_fs" # CREATE feature store with fs_identifier - with make_entity_context(exp, feature_store2), make_entity_context(exp, feature_store): + with ( + make_entity_context(exp, feature_store2), + make_entity_context(exp, feature_store), + ): exp.start(feature_store) with pytest.raises(SSDBIDConflictError) as ex: exp.start(feature_store) @@ -483,7 +492,10 @@ def test_launch_cluster_feature_store_single_fsid( fileutils, fs_type, exp, test_script, fs_args, on_wlm=on_wlm ) - with make_entity_context(exp, feature_store), make_entity_context(exp, smartsim_model): + with ( + make_entity_context(exp, feature_store), + make_entity_context(exp, smartsim_model), + ): exp.start(feature_store, block=True) exp.start(smartsim_model, block=True) job_dict = exp._control._jobs.get_fs_host_addresses() diff --git a/tests/_legacy/test_orchestrator.py b/tests/_legacy/test_orchestrator.py index b4fb6c46d..5febb8d1b 100644 --- a/tests/_legacy/test_orchestrator.py +++ b/tests/_legacy/test_orchestrator.py @@ -147,11 +147,17 @@ def test_pbs_set_run_arg(wlmutils: t.Type["conftest.WLMUtils"]) -> None: ) feature_store.set_run_arg("account", "ACCOUNT") assert all( - [fs.run_settings.run_args["account"] == "ACCOUNT" for fs in feature_store.entities] + [ + 
fs.run_settings.run_args["account"] == "ACCOUNT" + for fs in feature_store.entities + ] ) feature_store.set_run_arg("pes-per-numa-node", "5") assert all( - ["pes-per-numa-node" not in fs.run_settings.run_args for fs in feature_store.entities] + [ + "pes-per-numa-node" not in fs.run_settings.run_args + for fs in feature_store.entities + ] ) @@ -195,7 +201,10 @@ def test_slurm_set_run_arg(wlmutils: t.Type["conftest.WLMUtils"]) -> None: ) feature_store.set_run_arg("account", "ACCOUNT") assert all( - [fs.run_settings.run_args["account"] == "ACCOUNT" for fs in feature_store.entities] + [ + fs.run_settings.run_args["account"] == "ACCOUNT" + for fs in feature_store.entities + ] ) @@ -248,7 +257,9 @@ def test_feature_store_results_in_correct_number_of_shards(single_cmd: bool) -> assert len(feature_store.entities) == num_shards assert all(node.run_settings.mpmd == [] for node in feature_store.entities) assert ( - feature_store.num_shards == feature_store.fs_nodes == sum(node.num_shards for node in feature_store.entities) + feature_store.num_shards + == feature_store.fs_nodes + == sum(node.num_shards for node in feature_store.entities) ) diff --git a/tests/_legacy/test_output_files.py b/tests/_legacy/test_output_files.py index 4491ace39..d9e677ff4 100644 --- a/tests/_legacy/test_output_files.py +++ b/tests/_legacy/test_output_files.py @@ -50,7 +50,9 @@ batch_rs = SrunSettings("echo", ["spam", "eggs"]) ens = Ensemble("ens", params={}, run_settings=rs, batch_settings=bs, replicas=3) -feature_store = FeatureStore(fs_nodes=3, batch=True, launcher="slurm", run_command="srun") +feature_store = FeatureStore( + fs_nodes=3, batch=True, launcher="slurm", run_command="srun" +) model = Model("test_model", params={}, path="", run_settings=rs) batch_model = Model( "batch_test_model", params={}, path="", run_settings=batch_rs, batch_settings=bs diff --git a/tests/_legacy/test_serialize.py b/tests/_legacy/test_serialize.py index f3447d5e5..05448cf91 100644 --- 
a/tests/_legacy/test_serialize.py +++ b/tests/_legacy/test_serialize.py @@ -144,7 +144,9 @@ def test_started_entities_are_serialized(test_dir, manifest_json): exp.stop(hello_world_model, spam_eggs_model, hello_ensemble) -def test_serialzed_feature_store_does_not_break_if_using_a_non_standard_install(monkeypatch): +def test_serialzed_feature_store_does_not_break_if_using_a_non_standard_install( + monkeypatch, +): monkeypatch.setattr(utils, "get_fs_path", lambda: None) fs = FeatureStore() dict_ = serialize._dictify_fs(fs, []) diff --git a/tests/_legacy/test_symlinking.py b/tests/_legacy/test_symlinking.py index a91263654..7b5a0152f 100644 --- a/tests/_legacy/test_symlinking.py +++ b/tests/_legacy/test_symlinking.py @@ -49,7 +49,9 @@ batch_rs = SrunSettings("echo", ["spam", "eggs"]) ens = Ensemble("ens", params={}, run_settings=rs, batch_settings=bs, replicas=3) -feature_store = FeatureStore(fs_nodes=3, batch=True, launcher="slurm", run_command="srun") +feature_store = FeatureStore( + fs_nodes=3, batch=True, launcher="slurm", run_command="srun" +) model = Model("test_model", params={}, path="", run_settings=rs) batch_model = Model( "batch_test_model", params={}, path="", run_settings=batch_rs, batch_settings=bs diff --git a/tests/_legacy/test_telemetry_monitor.py b/tests/_legacy/test_telemetry_monitor.py index 8546afdfb..36c5ccb9e 100644 --- a/tests/_legacy/test_telemetry_monitor.py +++ b/tests/_legacy/test_telemetry_monitor.py @@ -381,14 +381,26 @@ def test_load_manifest_fs_and_models_1run(fileutils: FileUtils): @pytest.mark.parametrize( ["task_id", "step_id", "etype", "exp_isfeature_store", "exp_ismanaged"], [ - pytest.param("123", "", "model", False, False, id="unmanaged, non-feature_store"), - pytest.param("456", "123", "ensemble", False, True, id="managed, non-feature_store"), - pytest.param("789", "987", "featurestore", True, True, id="managed, feature_store"), - pytest.param("987", "", "featurestore", True, False, id="unmanaged, feature_store"), + pytest.param( 
+ "123", "", "model", False, False, id="unmanaged, non-feature_store" + ), + pytest.param( + "456", "123", "ensemble", False, True, id="managed, non-feature_store" + ), + pytest.param( + "789", "987", "featurestore", True, True, id="managed, feature_store" + ), + pytest.param( + "987", "", "featurestore", True, False, id="unmanaged, feature_store" + ), ], ) def test_persistable_computed_properties( - task_id: str, step_id: str, etype: str, exp_isfeature_store: bool, exp_ismanaged: bool + task_id: str, + step_id: str, + etype: str, + exp_isfeature_store: bool, + exp_ismanaged: bool, ): name = f"test-{etype}-{uuid.uuid4()}" timestamp = get_ts_ms() @@ -795,7 +807,9 @@ def test_telemetry_fs_only_with_generate(test_dir, wlmutils, monkeypatch, config exp = Experiment(exp_name, launcher=test_launcher, exp_path=test_dir) # create regular feature store - feature_store = exp.create_feature_store(port=test_port, interface=test_interface) + feature_store = exp.create_feature_store( + port=test_port, interface=test_interface + ) exp.generate(feature_store) telemetry_output_path = pathlib.Path(test_dir) / config.telemetry_subdir @@ -839,7 +853,9 @@ def test_telemetry_fs_only_without_generate(test_dir, wlmutils, monkeypatch, con exp = Experiment(exp_name, launcher=test_launcher, exp_path=test_dir) # create regular feature store - feature_store = exp.create_feature_store(port=test_port, interface=test_interface) + feature_store = exp.create_feature_store( + port=test_port, interface=test_interface + ) telemetry_output_path = pathlib.Path(test_dir) / config.telemetry_subdir try: @@ -883,7 +899,9 @@ def test_telemetry_fs_and_model(fileutils, test_dir, wlmutils, monkeypatch, conf exp = Experiment(exp_name, launcher=test_launcher, exp_path=test_dir) # create regular feature store - feature_store = exp.create_feature_store(port=test_port, interface=test_interface) + feature_store = exp.create_feature_store( + port=test_port, interface=test_interface + ) exp.generate(feature_store) try: 
exp.start(feature_store) From e0340e7d046d7a3e04bac5760bf109123c0760f9 Mon Sep 17 00:00:00 2001 From: Julia Putko Date: Wed, 29 May 2024 15:14:04 -0500 Subject: [PATCH 07/11] ignore build docs on smartsim-refactor branch --- .github/workflows/build_docs.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.github/workflows/build_docs.yml b/.github/workflows/build_docs.yml index 65aadc03c..9a7375557 100644 --- a/.github/workflows/build_docs.yml +++ b/.github/workflows/build_docs.yml @@ -32,6 +32,8 @@ on: push: branches: - develop + branches-ignore: + - smartsim-refactor jobs: build_docs: From ffc88bd23531f930c7dea028ab6703520a0aee0c Mon Sep 17 00:00:00 2001 From: Julia Putko Date: Wed, 29 May 2024 17:46:05 -0500 Subject: [PATCH 08/11] Empty-Commit From 543f0923b4d1dd8428c27716e22742d3e08b1680 Mon Sep 17 00:00:00 2001 From: Julia Putko Date: Thu, 30 May 2024 17:10:12 -0500 Subject: [PATCH 09/11] isort fixes --- smartsim/_core/control/manifest.py | 2 +- smartsim/_core/launcher/step/slurmStep.py | 2 +- smartsim/_core/launcher/step/step.py | 2 +- smartsim/_core/utils/serialize.py | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/smartsim/_core/control/manifest.py b/smartsim/_core/control/manifest.py index 7ba7ac05e..f066f7848 100644 --- a/smartsim/_core/control/manifest.py +++ b/smartsim/_core/control/manifest.py @@ -30,7 +30,7 @@ from dataclasses import dataclass, field from ...database import FeatureStore -from ...entity import Application, FSNode, Ensemble, EntitySequence, SmartSimEntity +from ...entity import Application, Ensemble, EntitySequence, FSNode, SmartSimEntity from ...error import SmartSimError from ..config import CONFIG from ..utils import helpers as _helpers diff --git a/smartsim/_core/launcher/step/slurmStep.py b/smartsim/_core/launcher/step/slurmStep.py index 001587e35..3f178d974 100644 --- a/smartsim/_core/launcher/step/slurmStep.py +++ b/smartsim/_core/launcher/step/slurmStep.py @@ -29,7 +29,7 @@ import typing as t from shlex 
import split as sh_split -from ....entity import Application, FSNode, Ensemble +from ....entity import Application, Ensemble, FSNode from ....error import AllocationError from ....log import get_logger from ....settings import RunSettings, SbatchSettings, Singularity, SrunSettings diff --git a/smartsim/_core/launcher/step/step.py b/smartsim/_core/launcher/step/step.py index fbdf8dc7f..c2aa444c0 100644 --- a/smartsim/_core/launcher/step/step.py +++ b/smartsim/_core/launcher/step/step.py @@ -37,7 +37,7 @@ from smartsim._core.config import CONFIG from smartsim.error.errors import SmartSimError, UnproxyableStepError -from ....entity import Application, FSNode, Ensemble +from ....entity import Application, Ensemble, FSNode from ....log import get_logger from ....settings.base import RunSettings, SettingsBase from ...utils.helpers import encode_cmd, get_base_36_repr diff --git a/smartsim/_core/utils/serialize.py b/smartsim/_core/utils/serialize.py index ec419cfa5..aad38c778 100644 --- a/smartsim/_core/utils/serialize.py +++ b/smartsim/_core/utils/serialize.py @@ -37,7 +37,7 @@ if t.TYPE_CHECKING: from smartsim._core.control.manifest import LaunchedManifest as _Manifest from smartsim.database.orchestrator import FeatureStore - from smartsim.entity import Application, FSNode, Ensemble + from smartsim.entity import Application, Ensemble, FSNode from smartsim.entity.dbobject import FSModel, FSScript from smartsim.settings.base import BatchSettings, RunSettings From 13c9382310b2ae86d1eefc69a5937bbebcea2585 Mon Sep 17 00:00:00 2001 From: Julia Putko Date: Thu, 30 May 2024 17:17:39 -0500 Subject: [PATCH 10/11] black fix --- smartsim/_core/control/manifest.py | 4 +++- tests/_legacy/test_output_files.py | 4 +++- tests/_legacy/test_symlinking.py | 4 +++- tests/_legacy/test_telemetry_monitor.py | 16 ++++++++++++---- 4 files changed, 21 insertions(+), 7 deletions(-) diff --git a/smartsim/_core/control/manifest.py b/smartsim/_core/control/manifest.py index f066f7848..36b030504 100644 
--- a/smartsim/_core/control/manifest.py +++ b/smartsim/_core/control/manifest.py @@ -222,7 +222,9 @@ class LaunchedManifest(t.Generic[_T]): metadata: _LaunchedManifestMetadata applications: t.Tuple[t.Tuple[Application, _T], ...] ensembles: t.Tuple[t.Tuple[Ensemble, t.Tuple[t.Tuple[Application, _T], ...]], ...] - featurestores: t.Tuple[t.Tuple[FeatureStore, t.Tuple[t.Tuple[FSNode, _T], ...]], ...] + featurestores: t.Tuple[ + t.Tuple[FeatureStore, t.Tuple[t.Tuple[FSNode, _T], ...]], ... + ] def map(self, func: t.Callable[[_T], _U]) -> "LaunchedManifest[_U]": def _map_entity_data( diff --git a/tests/_legacy/test_output_files.py b/tests/_legacy/test_output_files.py index 3bb6360c4..3b786548f 100644 --- a/tests/_legacy/test_output_files.py +++ b/tests/_legacy/test_output_files.py @@ -50,7 +50,9 @@ batch_rs = SrunSettings("echo", ["spam", "eggs"]) ens = Ensemble("ens", params={}, run_settings=rs, batch_settings=bs, replicas=3) -feature_store = FeatureStore(fs_nodes=3, batch=True, launcher="slurm", run_command="srun") +feature_store = FeatureStore( + fs_nodes=3, batch=True, launcher="slurm", run_command="srun" +) application = Application("test_application", params={}, path="", run_settings=rs) batch_application = Application( "batch_test_application", diff --git a/tests/_legacy/test_symlinking.py b/tests/_legacy/test_symlinking.py index 473289100..11219a81b 100644 --- a/tests/_legacy/test_symlinking.py +++ b/tests/_legacy/test_symlinking.py @@ -49,7 +49,9 @@ batch_rs = SrunSettings("echo", ["spam", "eggs"]) ens = Ensemble("ens", params={}, run_settings=rs, batch_settings=bs, replicas=3) -feature_store = FeatureStore(fs_nodes=3, batch=True, launcher="slurm", run_command="srun") +feature_store = FeatureStore( + fs_nodes=3, batch=True, launcher="slurm", run_command="srun" +) application = Application("test_application", params={}, path="", run_settings=rs) batch_application = Application( "batch_test_application", diff --git a/tests/_legacy/test_telemetry_monitor.py 
b/tests/_legacy/test_telemetry_monitor.py index 48007d9cf..02a89d3e0 100644 --- a/tests/_legacy/test_telemetry_monitor.py +++ b/tests/_legacy/test_telemetry_monitor.py @@ -381,10 +381,18 @@ def test_load_manifest_fs_and_applications_1run(fileutils: FileUtils): @pytest.mark.parametrize( ["task_id", "step_id", "etype", "exp_isfeature_store", "exp_ismanaged"], [ - pytest.param("123", "", "application", False, False, id="unmanaged, non-feature_store"), - pytest.param("456", "123", "ensemble", False, True, id="managed, non-feature_store"), - pytest.param("789", "987", "featurestore", True, True, id="managed, feature_store"), - pytest.param("987", "", "featurestore", True, False, id="unmanaged, feature_store"), + pytest.param( + "123", "", "application", False, False, id="unmanaged, non-feature_store" + ), + pytest.param( + "456", "123", "ensemble", False, True, id="managed, non-feature_store" + ), + pytest.param( + "789", "987", "featurestore", True, True, id="managed, feature_store" + ), + pytest.param( + "987", "", "featurestore", True, False, id="unmanaged, feature_store" + ), ], ) def test_persistable_computed_properties( From 5e8d9492b8a81b92ba1ba1e05910dfd51f9cb53e Mon Sep 17 00:00:00 2001 From: Julia Putko Date: Mon, 3 Jun 2024 18:49:30 -0500 Subject: [PATCH 11/11] merge conflict update --- smartsim/experiment.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/smartsim/experiment.py b/smartsim/experiment.py index b1801cb17..0caad3bbf 100644 --- a/smartsim/experiment.py +++ b/smartsim/experiment.py @@ -428,8 +428,8 @@ def get_status( raise @_contextualize - def reconnect_orchestrator(self, checkpoint: str) -> Orchestrator: - """Reconnect to a running ``Orchestrator`` + def reconnect_feature_store(self, checkpoint: str) -> FeatureStore: + """Reconnect to a running ``FeatureStore`` This method can be used to connect to a ``FeatureStore`` deployment that was launched by a previous ``Experiment``. This can be