From c48db1405576935a0464fa7f291acee90c34f189 Mon Sep 17 00:00:00 2001 From: Andrew Shao Date: Mon, 2 Oct 2023 18:34:07 -0500 Subject: [PATCH 01/64] Update tests to pass on_wlm --- conftest.py | 18 ++++++++++++++++-- tests/backends/test_dbmodel.py | 16 ++++++++-------- tests/backends/test_dbscript.py | 16 ++++++++-------- tests/on_wlm/test_colocated_model.py | 28 ++++++++++++++++++++-------- tests/test_colo_model_local.py | 15 ++++++++++----- 5 files changed, 62 insertions(+), 31 deletions(-) diff --git a/conftest.py b/conftest.py index 9c59aaaa2..2d5142f9c 100644 --- a/conftest.py +++ b/conftest.py @@ -49,6 +49,8 @@ from subprocess import run import sys import typing as t +import warnings +import contextlib # pylint: disable=redefined-outer-name,invalid-name,global-statement @@ -677,10 +679,12 @@ def setup_test_colo( exp: Experiment, db_args: t.Dict[str, t.Any], colo_settings: t.Optional[t.Dict[str, t.Any]] = None, + on_wlm: t.Optional[bool] = False ) -> Model: """Setup things needed for setting up the colo pinning tests""" # get test setup - test_dir = fileutils.make_test_dir(level=2) + level = 3 if on_wlm else 2 + test_dir = fileutils.make_test_dir(level=level) sr_test_script = fileutils.get_test_conf_path("send_data_local_smartredis.py") # Create an app with a colo_db which uses 1 db_cpu @@ -688,6 +692,9 @@ def setup_test_colo( colo_settings = exp.create_run_settings( exe=sys.executable, exe_args=[sr_test_script] ) + if on_wlm: + colo_settings.set_tasks(1) + colo_settings.set_nodes(1) colo_model = exp.create_model("colocated_model", colo_settings) colo_model.set_path(test_dir) @@ -700,7 +707,14 @@ def setup_test_colo( "deprecated": colo_model.colocate_db, "uds": colo_model.colocate_db_uds, } - colocate_fun[db_type](**db_args) + with warnings.catch_warnings(): + if db_type == "deprecated": + warnings.filterwarnings( + "ignore", + message="`colocate_db` has been deprecated" + ) + colocate_fun[db_type](**db_args) + exp.generate(colo_model, overwrite=True) # assert model will launch with colocated db assert colo_model.colocated # Check to make sure that limit_db_cpus made it into the colo settings diff --git a/tests/backends/test_dbmodel.py b/tests/backends/test_dbmodel.py index ff5854864..05972fda1 100644 --- a/tests/backends/test_dbmodel.py +++ b/tests/backends/test_dbmodel.py @@ -165,7 +165,7 @@ def test_tf_db_model(fileutils, wlmutils, mlutils): # Create RunSettings run_settings = exp.create_run_settings(exe=sys.executable, exe_args=test_script) run_settings.set_nodes(1) - run_settings.set_tasks_per_node(1) + run_settings.set_tasks(1) # Create Model smartsim_model = exp.create_model("smartsim_model", run_settings) @@ -239,7 +239,7 @@ def test_pt_db_model(fileutils, wlmutils, mlutils): # Create RunSettings run_settings = exp.create_run_settings(exe=sys.executable, exe_args=test_script) run_settings.set_nodes(1) - run_settings.set_tasks_per_node(1) + run_settings.set_tasks(1) # Create Model smartsim_model = exp.create_model("smartsim_model", run_settings) @@ -302,7 +302,7 @@ def test_db_model_ensemble(fileutils, wlmutils, mlutils): # Create RunSettings run_settings = exp.create_run_settings(exe=sys.executable, exe_args=test_script) run_settings.set_nodes(1) - run_settings.set_tasks_per_node(1) + run_settings.set_tasks(1) # Create ensemble smartsim_ensemble = exp.create_ensemble( @@ -397,7 +397,7 @@ def test_colocated_db_model_tf(fileutils, wlmutils, mlutils): # Create RunSettings colo_settings = exp.create_run_settings(exe=sys.executable, exe_args=test_script) colo_settings.set_nodes(1) 
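# ---------------------------------------------------------------------------
# [Editor's note: illustrative sketch, not part of the patch.] The conftest.py
# hunk above wraps the deprecated `colocate_db` call in warnings.catch_warnings()
# so only the expected deprecation message is silenced during test setup. The
# same standard-library pattern in isolation (`call_quietly` is a hypothetical
# helper name, not from the patch):
import warnings

def call_quietly(fn, *args, **kwargs):
    """Invoke fn while suppressing only the matching deprecation message."""
    with warnings.catch_warnings():
        # `message` is a regex matched against the start of the warning text,
        # so unrelated warnings are still reported as usual.
        warnings.filterwarnings(
            "ignore", message="`colocate_db` has been deprecated"
        )
        return fn(*args, **kwargs)
# ---------------------------------------------------------------------------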
- colo_settings.set_tasks_per_node(1) + colo_settings.set_tasks(1) # Create colocated Model colo_model = exp.create_model("colocated_model", colo_settings) @@ -466,7 +466,7 @@ def test_colocated_db_model_pytorch(fileutils, wlmutils, mlutils): # Create colocated RunSettings colo_settings = exp.create_run_settings(exe=sys.executable, exe_args=test_script) colo_settings.set_nodes(1) - colo_settings.set_tasks_per_node(1) + colo_settings.set_tasks(1) # Create colocated SmartSim Model colo_model = exp.create_model("colocated_model", colo_settings) @@ -525,7 +525,7 @@ def test_colocated_db_model_ensemble(fileutils, wlmutils, mlutils): # Create RunSettings for colocated model colo_settings = exp.create_run_settings(exe=sys.executable, exe_args=test_script) colo_settings.set_nodes(1) - colo_settings.set_tasks_per_node(1) + colo_settings.set_tasks(1) # Create ensemble of two identical models colo_ensemble: Ensemble = exp.create_ensemble( @@ -626,7 +626,7 @@ def test_colocated_db_model_ensemble_reordered(fileutils, wlmutils, mlutils): # Create colocated RunSettings colo_settings = exp.create_run_settings(exe=sys.executable, exe_args=test_script) colo_settings.set_nodes(1) - colo_settings.set_tasks_per_node(1) + colo_settings.set_tasks(1) # Create the ensemble of two identical SmartSim Models colo_ensemble = exp.create_ensemble( @@ -725,7 +725,7 @@ def test_colocated_db_model_errors(fileutils, wlmutils, mlutils): # Create colocated RunSettings colo_settings = exp.create_run_settings(exe=sys.executable, exe_args=test_script) colo_settings.set_nodes(1) - colo_settings.set_tasks_per_node(1) + colo_settings.set_tasks(1) # Create colocated SmartSim Model colo_model = exp.create_model("colocated_model", colo_settings) diff --git a/tests/backends/test_dbscript.py b/tests/backends/test_dbscript.py index 06492f60f..7577bec31 100644 --- a/tests/backends/test_dbscript.py +++ b/tests/backends/test_dbscript.py @@ -74,7 +74,7 @@ def test_db_script(fileutils, wlmutils, mlutils): # Create the RunSettings run_settings = exp.create_run_settings(exe=sys.executable, exe_args=test_script) run_settings.set_nodes(1) - run_settings.set_tasks_per_node(1) + run_settings.set_tasks(1) # Create the SmartSim Model smartsim_model = exp.create_model("smartsim_model", run_settings) @@ -146,7 +146,7 @@ def test_db_script_ensemble(fileutils, wlmutils, mlutils): # Create RunSettings run_settings = exp.create_run_settings(exe=sys.executable, exe_args=test_script) run_settings.set_nodes(1) - run_settings.set_tasks_per_node(1) + run_settings.set_tasks(1) # Create Ensemble with two identical models ensemble = exp.create_ensemble( @@ -237,7 +237,7 @@ def test_colocated_db_script(fileutils, wlmutils, mlutils): # Create RunSettings colo_settings = exp.create_run_settings(exe=sys.executable, exe_args=test_script) colo_settings.set_nodes(1) - colo_settings.set_tasks_per_node(1) + colo_settings.set_tasks(1) # Create model with colocated database colo_model = exp.create_model("colocated_model", colo_settings) @@ -306,7 +306,7 @@ def test_colocated_db_script_ensemble(fileutils, wlmutils, mlutils): # Create RunSettings colo_settings = exp.create_run_settings(exe=sys.executable, exe_args=test_script) colo_settings.set_nodes(1) - colo_settings.set_tasks_per_node(1) + colo_settings.set_tasks(1) # Create SmartSim Ensemble with two identical models colo_ensemble = exp.create_ensemble( @@ -402,7 +402,7 @@ def test_colocated_db_script_ensemble_reordered(fileutils, wlmutils, mlutils): # Create RunSettings colo_settings =
exp.create_run_settings(exe=sys.executable, exe_args=test_script) colo_settings.set_nodes(1) - colo_settings.set_tasks_per_node(1) + colo_settings.set_tasks(1) # Create Ensemble with two identical SmartSim Models colo_ensemble = exp.create_ensemble( @@ -496,7 +496,7 @@ def test_db_script_errors(fileutils, wlmutils, mlutils): # Create RunSettings colo_settings = exp.create_run_settings(exe=sys.executable, exe_args=test_script) colo_settings.set_nodes(1) - colo_settings.set_tasks_per_node(1) + colo_settings.set_tasks(1) # Create a SmartSim model with a colocated database colo_model = exp.create_model("colocated_model", colo_settings) @@ -576,7 +576,7 @@ def test_db_script_errors(fileutils, wlmutils, mlutils): # an in-memory script with pytest.raises(SSUnsupportedError): colo_ensemble.add_model(colo_model) - + def test_inconsistent_params_db_script(fileutils): """Test error when devices_per_node>1 and when devices is set to CPU in DBScript constructor""" @@ -589,6 +589,6 @@ def test_inconsistent_params_db_script(fileutils): devices_per_node=2, ) assert ( - ex.value.args[0] + ex.value.args[0] == "Cannot set devices_per_node>1 if CPU is specified under devices" ) \ No newline at end of file diff --git a/tests/on_wlm/test_colocated_model.py b/tests/on_wlm/test_colocated_model.py index 6a3945115..ed6de7228 100644 --- a/tests/on_wlm/test_colocated_model.py +++ b/tests/on_wlm/test_colocated_model.py @@ -36,6 +36,9 @@ else: supported_dbs = ["uds", "tcp", "deprecated"] +# Set to true if DB logs should be generated for debugging +DEBUG_DB = False + # retrieved from pytest fixtures launcher = pytest.test_launcher if launcher not in pytest.wlm_options: @@ -45,14 +48,15 @@ def test_launch_colocated_model_defaults(fileutils, coloutils, db_type): """Test the launch of a model with a colocated database and a WLM launcher""" - db_args = { } + db_args = { "debug":DEBUG_DB } - exp = Experiment("colocated_model_defaults", launcher=launcher) + exp = Experiment(f"colocated_model_defaults_{db_type}", launcher=launcher) colo_model = coloutils.setup_test_colo( fileutils, db_type, exp, db_args, + on_wlm=True ) assert colo_model.run_settings.colocated_db_settings["custom_pinning"] == "0" exp.start(colo_model, block=True) @@ -68,10 +72,11 @@ def test_launch_colocated_model_defaults(fileutils, coloutils, db_type): @pytest.mark.parametrize("db_type", supported_dbs) def test_colocated_model_disable_pinning(fileutils, coloutils, db_type): - exp = Experiment("colocated_model_pinning_auto_1cpu", launcher=launcher) + exp = Experiment(f"colocated_model_pinning_auto_1cpu_{db_type}", launcher=launcher) db_args = { "db_cpus": 1, "custom_pinning": [], + "debug":DEBUG_DB, } # Check to make sure that the CPU mask was correctly generated colo_model = coloutils.setup_test_colo( fileutils, db_type, exp, db_args, + on_wlm=True ) assert colo_model.run_settings.colocated_db_settings["custom_pinning"] is None exp.start(colo_model, block=True) statuses = exp.get_status(colo_model) @pytest.mark.parametrize("db_type", supported_dbs) def test_colocated_model_pinning_auto_2cpu(fileutils, coloutils, db_type): - exp = Experiment("colocated_model_pinning_auto_2cpu", launcher=launcher) + exp = Experiment(f"colocated_model_pinning_auto_2cpu_{db_type}", launcher=launcher) db_args = { "db_cpus": 2, + "debug":DEBUG_DB } # Check to make sure that the CPU mask was correctly generated colo_model = coloutils.setup_test_colo( fileutils, db_type, exp,
db_args, + on_wlm=True ) assert colo_model.run_settings.colocated_db_settings["custom_pinning"] == "0,1" exp.start(colo_model, block=True) @@ -112,11 +120,12 @@ def test_colocated_model_pinning_range(fileutils, coloutils, db_type): # Check to make sure that the CPU mask was correctly generated # Assume that there are at least 4 cpus on the node - exp = Experiment("colocated_model_pinning_manual", launcher=launcher) + exp = Experiment(f"colocated_model_pinning_manual_{db_type}", launcher=launcher) db_args = { "db_cpus": 4, - "custom_pinning": range(4) + "custom_pinning": range(4), + "debug":DEBUG_DB } colo_model = coloutils.setup_test_colo( fileutils, db_type, exp, db_args, + on_wlm=True ) assert colo_model.run_settings.colocated_db_settings["custom_pinning"] == "0,1,2,3" exp.start(colo_model, block=True) @@ -135,7 +145,7 @@ def test_colocated_model_pinning_list(fileutils, coloutils, db_type): # Check to make sure that the CPU mask was correctly generated # note we presume that this has more than 2 CPUs on the supercomputer node - exp = Experiment("colocated_model_pinning_manual", launcher=launcher) + exp = Experiment(f"colocated_model_pinning_manual_{db_type}", launcher=launcher) db_args = { "db_cpus": 2, @@ -147,6 +157,7 @@ def test_colocated_model_pinning_list(fileutils, coloutils, db_type): db_type, exp, db_args, + on_wlm=True ) assert colo_model.run_settings.colocated_db_settings["custom_pinning"] == "0,2" exp.start(colo_model, block=True) @@ -158,7 +169,7 @@ def test_colocated_model_pinning_mixed(fileutils, coloutils, db_type): # Check to make sure that the CPU mask was correctly generated # note we presume that this has at least 4 CPUs on the supercomputer node - exp = Experiment("colocated_model_pinning_manual", launcher=launcher) + exp = Experiment(f"colocated_model_pinning_manual_{db_type}", launcher=launcher) db_args = { "db_cpus": 2, @@ -170,6 +181,7 @@ def test_colocated_model_pinning_mixed(fileutils, coloutils, db_type): db_type, exp, db_args, + on_wlm=True ) assert colo_model.run_settings.colocated_db_settings["custom_pinning"] == "0,1,3" exp.start(colo_model, block=True) diff --git a/tests/test_colo_model_local.py b/tests/test_colo_model_local.py index 376c71f26..87f1d737b 100644 --- a/tests/test_colo_model_local.py +++ b/tests/test_colo_model_local.py @@ -102,7 +102,8 @@ def test_launch_colocated_model_defaults(fileutils, coloutils, db_type, launcher db_args = { } - exp = Experiment("colocated_model_defaults", launcher=launcher) + test_dir = fileutils.make_test_dir() + exp = Experiment(f"colocated_model_defaults_{db_type}", test_dir, launcher=launcher) colo_model = coloutils.setup_test_colo( fileutils, db_type, @@ -127,7 +128,8 @@ def test_launch_colocated_model_defaults(fileutils, coloutils, db_type, launcher @pytest.mark.parametrize("db_type", supported_dbs) def test_colocated_model_disable_pinning(fileutils, coloutils, db_type, launcher="local"): - exp = Experiment("colocated_model_pinning_auto_1cpu", launcher=launcher) + test_dir = fileutils.make_test_dir() + exp = Experiment(f"colocated_model_pinning_auto_1cpu_{db_type}", test_dir, launcher=launcher) db_args = { "db_cpus": 1, "custom_pinning": [], @@ -147,7 +149,8 @@ def test_colocated_model_disable_pinning(fileutils, coloutils, db_type, launcher @pytest.mark.parametrize("db_type", supported_dbs) def test_colocated_model_pinning_auto_2cpu(fileutils, coloutils, db_type, launcher="local"): - exp = Experiment("colocated_model_pinning_auto_2cpu",
launcher=launcher) + test_dir = fileutils.make_test_dir() + exp = Experiment(f"colocated_model_pinning_auto_2cpu_{db_type}", test_dir, launcher=launcher) db_args = { "db_cpus": 2, @@ -174,7 +177,8 @@ def test_colocated_model_pinning_auto_2cpu(fileutils, coloutils, db_type, launch def test_colocated_model_pinning_range(fileutils, coloutils, db_type, launcher="local"): # Check to make sure that the CPU mask was correctly generated - exp = Experiment("colocated_model_pinning_manual", launcher=launcher) + test_dir = fileutils.make_test_dir() + exp = Experiment(f"colocated_model_pinning_manual_{db_type}", test_dir, launcher=launcher) db_args = { "db_cpus": 2, @@ -197,7 +201,8 @@ def test_colocated_model_pinning_range(fileutils, coloutils, db_type, launcher=" def test_colocated_model_pinning_list(fileutils, coloutils, db_type, launcher="local"): # Check to make sure that the CPU mask was correctly generated - exp = Experiment("colocated_model_pinning_manual", launcher=launcher) + test_dir = fileutils.make_test_dir() + exp = Experiment(f"colocated_model_pinning_manual_{db_type}", test_dir, launcher=launcher) db_args = { "db_cpus": 1, From 7e02516c59c074367eb1cc0d05ee11014afa345c Mon Sep 17 00:00:00 2001 From: Al Rigazzi Date: Thu, 12 Oct 2023 19:14:35 +0200 Subject: [PATCH 02/64] Define make_test_dir and get_test_dir fixtures --- conftest.py | 139 ++++-------------- tests/backends/test_cli_mini_exp.py | 4 +- tests/backends/test_dataloader.py | 16 +- tests/backends/test_dbmodel.py | 32 ++-- tests/backends/test_dbscript.py | 24 +-- tests/backends/test_onnx.py | 4 +- tests/backends/test_tf.py | 8 +- tests/backends/test_torch.py | 4 +- tests/full_wlm/test_generic_batch_launch.py | 12 +- .../full_wlm/test_generic_orc_launch_batch.py | 22 +-- tests/full_wlm/test_mpmd.py | 4 +- tests/on_wlm/test_base_settings_on_wlm.py | 8 +- tests/on_wlm/test_colocated_model.py | 31 ++-- tests/on_wlm/test_containers_wlm.py | 4 +- tests/on_wlm/test_generic_orc_launch.py | 12 +- tests/on_wlm/test_launch_errors.py | 8 +- tests/on_wlm/test_launch_ompi_lsf.py | 4 +- tests/on_wlm/test_local_step.py | 12 +- tests/on_wlm/test_restart.py | 4 +- .../test_simple_base_settings_on_wlm.py | 8 +- tests/on_wlm/test_simple_entity_launch.py | 12 +- tests/on_wlm/test_stop.py | 8 +- tests/test_colo_model_local.py | 26 ++-- tests/test_config.py | 52 ++++--- tests/test_containers.py | 12 +- tests/test_dbnode.py | 4 +- tests/test_experiment.py | 16 +- tests/test_generator.py | 36 ++--- tests/test_interrupt.py | 8 +- tests/test_launch_errors.py | 8 +- tests/test_local_launch.py | 8 +- tests/test_local_multi_run.py | 4 +- tests/test_local_restart.py | 8 +- tests/test_modelwriter.py | 16 +- tests/test_multidb.py | 72 ++++----- tests/test_orchestrator.py | 8 +- tests/test_pals_settings.py | 8 +- tests/test_reconnect_orchestrator.py | 4 +- tests/test_smartredis.py | 8 +- 39 files changed, 304 insertions(+), 374 deletions(-) diff --git a/conftest.py b/conftest.py index 69f712d6a..af15afabd 100644 --- a/conftest.py +++ b/conftest.py @@ -375,15 +375,13 @@ def get_orchestrator(nodes: int = 1, batch: bool = False) -> Orchestrator: @pytest.fixture def local_db( - fileutils: FileUtils, request: t.Any, wlmutils: t.Type[WLMUtils] + request: t.Any, wlmutils: t.Type[WLMUtils], make_test_dir: t.Any ) -> t.Generator[Orchestrator, None, None]: """Yield fixture for startup and teardown of a local orchestrator""" exp_name = request.function.__name__ exp = Experiment(exp_name, launcher="local") - test_dir = fileutils.make_test_dir( - caller_function=exp_name,
caller_fspath=request.fspath - ) + test_dir = make_test_dir db = Orchestrator(port=wlmutils.get_test_port(), interface="lo") db.set_path(test_dir) exp.start(db) @@ -418,7 +416,7 @@ def db( @pytest.fixture def db_cluster( - fileutils: t.Type[FileUtils], wlmutils: t.Type[WLMUtils], request: t.Any + make_test_dir: t.Any, wlmutils: t.Type[WLMUtils], request: t.Any ) -> t.Generator[Orchestrator, None, None]: """ Yield fixture for startup and teardown of a clustered orchestrator. exp_name = request.function.__name__ exp = Experiment(exp_name, launcher=launcher) - test_dir = fileutils.make_test_dir( - caller_function=exp_name, caller_fspath=request.fspath - ) + test_dir = make_test_dir db = wlmutils.get_orchestrator(nodes=3) db.set_path(test_dir) exp.start(db) @@ -530,12 +526,39 @@ def get_config_edit_method( return config_edit_methods.get(config_setting, None) +@pytest.fixture +def get_test_dir(request: t.Optional[pytest.FixtureRequest]): + caller_function_list = request.node.name.split("[", maxsplit=1) + if len(caller_function_list) > 1: + caller_function_list[1] = ''.join(filter(str.isalnum, caller_function_list[1])) + caller_function = ".".join(caller_function_list) + dir_path = FileUtils._test_dir_path(caller_function, request.node.fspath) + + if not os.path.exists(os.path.dirname(dir_path)): + os.makedirs(os.path.dirname(dir_path)) + + return dir_path + + +@pytest.fixture +def make_test_dir(request: t.Optional[pytest.FixtureRequest]): + + caller_function = request.node.name.replace("[", ".").replace("]", "") + dir_path = FileUtils._test_dir_path(caller_function, request.node.fspath) + + try: + os.makedirs(dir_path) + except Exception: + return dir_path + return dir_path + + @pytest.fixture def fileutils() -> t.Type[FileUtils]: return FileUtils - class FileUtils: + @staticmethod def _test_dir_path(caller_function: str, caller_fspath: str) -> str: caller_file_to_dir = os.path.splitext(str(caller_fspath))[0] @@ -543,82 +566,6 @@ def _test_dir_path(caller_function: str, caller_fspath: str) -> str: dir_path = os.path.join(test_dir, rel_path, caller_function) return dir_path - @staticmethod - def get_test_dir( - caller_function: t.Optional[str] = None, - caller_fspath: t.Optional[str] = None, - level: int = 1, - ) -> str: - """Get path to test output. - - This function should be called without arguments from within - a test: the returned directory will be - `test_output/<path to test file>/<test file name>/<test function>`. - When called from other functions (e.g. from functions in this file), - the caller function and the caller file path should be provided. - The directory will not be created, but the parent (and all the needed - tree) will. This is to allow tests to create the directory.
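# ---------------------------------------------------------------------------
# [Editor's note: illustrative sketch, not part of the patch.] With the two new
# fixtures defined above, tests stop calling FileUtils.make_test_dir()/
# get_test_dir() and instead declare the fixture as an argument; pytest injects
# a per-test path derived from the test's node name. `test_example` is a
# hypothetical test name; the fixture names come from the patch.
def test_example(make_test_dir):
    test_dir = make_test_dir  # directory already created, unique per test
    # ... write test artifacts under test_dir ...

# The name mangling used by get_test_dir turns a parametrized node id such as
# "test_pinning[uds]" into "test_pinning.uds" (parameters reduced to
# alphanumerics), which then becomes the leaf directory name:
def mangle(node_name: str) -> str:
    parts = node_name.split("[", maxsplit=1)
    if len(parts) > 1:
        parts[1] = "".join(filter(str.isalnum, parts[1]))
    return ".".join(parts)

assert mangle("test_pinning[uds]") == "test_pinning.uds"
# ---------------------------------------------------------------------------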
- - :param caller_function: caller function name, defaults to None - :type caller_function: str, optional - :param caller_fspath: absolute path to file containing caller, defaults to None - :type caller_fspath: str or Path, optional - :return: String path to test output directory - :rtype: str - """ - if not caller_function or not caller_fspath: - caller_frame = inspect.stack()[level] - caller_fspath = caller_frame.filename - caller_function = caller_frame.function - - dir_path = FileUtils._test_dir_path(caller_function, caller_fspath) - if not os.path.exists(os.path.dirname(dir_path)): - os.makedirs(os.path.dirname(dir_path)) - # dir_path = os.path.join(test_dir, dir_name) - return dir_path - - @staticmethod - def make_test_dir( - caller_function: t.Optional[str] = None, - caller_fspath: t.Optional[str] = None, - level: int = 1, - sub_dir: t.Optional[str] = None, - ) -> str: - """Create test output directory and return path to it. - - This function should be called without arguments from within - a test: the directory will be created as - `test_output/<path to test file>/<test file name>/<test function>`. - When called from other functions (e.g. from functions in this file), - the caller function and the caller file path should be provided. - - :param caller_function: caller function name, defaults to None - :type caller_function: str, optional - :param caller_fspath: absolute path to file containing caller, defaults to None - :type caller_fspath: str or Path, optional - :param level: indicate depth in the call stack relative to test method. - :type level: int, optional - :param sub_dir: a relative path to create in the test directory - :type sub_dir: str or Path, optional - - :return: String path to test output directory - :rtype: str - """ - if not caller_function or not caller_fspath: - caller_frame = inspect.stack()[level] - caller_fspath = caller_frame.filename - caller_function = caller_frame.function - - dir_path = FileUtils._test_dir_path(caller_function, caller_fspath) - if sub_dir: - dir_path = os.path.join(dir_path, sub_dir) - - try: - os.makedirs(dir_path) - except Exception: - return dir_path - return dir_path - - @staticmethod - def get_test_conf_path(filename: str) -> str: file_path = os.path.join(test_path, "tests", "test_configs", filename) return file_path @staticmethod def get_test_dir_path(dirname: str) -> str: dir_path = os.path.join(test_path, "tests", "test_configs", dirname) return dir_path - @staticmethod - def make_test_file(file_name: str, file_dir: t.Optional[str] = None) -> str: - """Create a dummy file in the test output directory. - - :param file_name: name of file to create, e.g. "file.txt" - :type file_name: str - :param file_dir: path relative to test output directory, e.g.
"deps/libs" - :type file_dir: str - :return: String path to test output file - :rtype: str - """ - test_dir = FileUtils.make_test_dir(level=2, sub_dir=file_dir) - file_path = os.path.join(test_dir, file_name) - - with open(file_path, "w+", encoding="utf-8") as dummy_file: - dummy_file.write("dummy\n") - - return file_path - @pytest.fixture def mlutils() -> t.Type[MLUtils]: @@ -682,8 +610,6 @@ def setup_test_colo( port: t.Optional[int] = test_port ) -> Model: """Setup things needed for setting up the colo pinning tests""" - # get test setup - test_dir = fileutils.make_test_dir(level=2) sr_test_script = fileutils.get_test_conf_path(application_file) @@ -694,7 +620,6 @@ def setup_test_colo( ) colo_name = colo_model_name if colo_model_name else "colocated_model" colo_model = exp.create_model(colo_name, colo_settings) - colo_model.set_path(test_dir) if db_type in ["tcp", "deprecated"]: db_args["port"] = port diff --git a/tests/backends/test_cli_mini_exp.py b/tests/backends/test_cli_mini_exp.py index 77500e768..ea9abe066 100644 --- a/tests/backends/test_cli_mini_exp.py +++ b/tests/backends/test_cli_mini_exp.py @@ -47,7 +47,7 @@ def test_cli_mini_exp_doesnt_error_out_with_dev_build( local_db, - fileutils, + get_test_dir, monkeypatch, ): """Presumably devs running the test suite have built SS correctly. @@ -71,7 +71,7 @@ def _mock_make_managed_local_orc(*a, **kw): smartsim._core._cli.validate.test_install( # Shouldn't matter bc we are stubbing creation of orc # but best to give it "correct" vals for safety - location=fileutils.get_test_dir(), + location=get_test_dir, port=db_port, # Always test on CPU, heads don't always have GPU device="CPU", diff --git a/tests/backends/test_dataloader.py b/tests/backends/test_dataloader.py index 5dd1fe4ed..ea4859ad4 100644 --- a/tests/backends/test_dataloader.py +++ b/tests/backends/test_dataloader.py @@ -155,8 +155,8 @@ def train_tf(generator): @pytest.mark.skipif(not shouldrun_tf, reason="Test needs TensorFlow to run") -def test_tf_dataloaders(fileutils, wlmutils): - test_dir = fileutils.make_test_dir() +def test_tf_dataloaders(make_test_dir, wlmutils): + test_dir = make_test_dir exp = Experiment("test_tf_dataloaders", test_dir, launcher=wlmutils.get_test_launcher()) orc: Orchestrator = wlmutils.get_orchestrator() exp.generate(orc) @@ -221,8 +221,8 @@ def create_trainer_torch(experiment: Experiment, filedir, wlmutils): @pytest.mark.skipif(not shouldrun_torch, reason="Test needs Torch to run") -def test_torch_dataloaders(fileutils, wlmutils): - test_dir = fileutils.make_test_dir() +def test_torch_dataloaders(fileutils, make_test_dir, wlmutils): + test_dir = make_test_dir exp = Experiment("test_tf_dataloaders", test_dir, launcher=wlmutils.get_test_launcher()) orc: Orchestrator = wlmutils.get_orchestrator() config_dir = fileutils.get_test_dir_path("ml") @@ -271,10 +271,10 @@ def test_torch_dataloaders(fileutils, wlmutils): for _ in range(2): for _ in torch_static: continue - + trainer = create_trainer_torch(exp, config_dir, wlmutils) exp.start(trainer, block=True) - + assert exp.get_status(trainer)[0] == STATUS_COMPLETED except Exception as e: @@ -317,8 +317,8 @@ def test_data_info_repr(): @pytest.mark.skipif( not (shouldrun_torch or shouldrun_tf), reason="Requires TF or PyTorch" ) -def test_wrong_dataloaders(fileutils, wlmutils): - test_dir = fileutils.make_test_dir() +def test_wrong_dataloaders(make_test_dir, wlmutils): + test_dir = make_test_dir exp = Experiment("test-wrong-dataloaders", exp_path=test_dir, launcher=wlmutils.get_test_launcher()) orc = 
wlmutils.get_orchestrator() exp.generate(orc) diff --git a/tests/backends/test_dbmodel.py b/tests/backends/test_dbmodel.py index 07a0515e6..652f11d66 100644 --- a/tests/backends/test_dbmodel.py +++ b/tests/backends/test_dbmodel.py @@ -144,7 +144,7 @@ def save_torch_cnn(path, file_name): @pytest.mark.skipif(not should_run_tf, reason="Test needs TF to run") -def test_tf_db_model(fileutils, wlmutils, mlutils): +def test_tf_db_model(fileutils, make_test_dir, wlmutils, mlutils): """Test TensorFlow DB Models on remote DB""" # Set experiment name @@ -156,7 +156,7 @@ def test_tf_db_model(fileutils, wlmutils, mlutils): test_port = wlmutils.get_test_port() test_device = mlutils.get_test_device() test_num_gpus = mlutils.get_test_num_gpus() - test_dir = fileutils.make_test_dir() + test_dir = make_test_dir test_script = fileutils.get_test_conf_path("run_tf_dbmodel_smartredis.py") # Create the SmartSim Experiment @@ -218,7 +218,7 @@ def test_tf_db_model(fileutils, wlmutils, mlutils): @pytest.mark.skipif(not should_run_pt, reason="Test needs PyTorch to run") -def test_pt_db_model(fileutils, wlmutils, mlutils): +def test_pt_db_model(fileutils, make_test_dir, wlmutils, mlutils): """Test PyTorch DB Models on remote DB""" # Set experiment name @@ -230,7 +230,7 @@ def test_pt_db_model(fileutils, wlmutils, mlutils): test_port = wlmutils.get_test_port() test_device = mlutils.get_test_device() test_num_gpus = mlutils.get_test_num_gpus() - test_dir = fileutils.make_test_dir() + test_dir = make_test_dir test_script = fileutils.get_test_conf_path("run_pt_dbmodel_smartredis.py") # Create the SmartSim Experiment @@ -281,7 +281,7 @@ def test_pt_db_model(fileutils, wlmutils, mlutils): @pytest.mark.skipif(not should_run_tf, reason="Test needs TF to run") -def test_db_model_ensemble(fileutils, wlmutils, mlutils): +def test_db_model_ensemble(fileutils, make_test_dir, wlmutils, mlutils): """Test DBModels on remote DB, with an ensemble""" # Set experiment name @@ -293,7 +293,7 @@ def test_db_model_ensemble(fileutils, wlmutils, mlutils): test_port = wlmutils.get_test_port() test_device = mlutils.get_test_device() test_num_gpus = mlutils.get_test_num_gpus() - test_dir = fileutils.make_test_dir() + test_dir = make_test_dir test_script = fileutils.get_test_conf_path("run_tf_dbmodel_smartredis.py") # Create the SmartSim Experiment @@ -376,7 +376,7 @@ def test_db_model_ensemble(fileutils, wlmutils, mlutils): @pytest.mark.skipif(not should_run_tf, reason="Test needs TF to run") -def test_colocated_db_model_tf(fileutils, wlmutils, mlutils): +def test_colocated_db_model_tf(fileutils, make_test_dir, wlmutils, mlutils): """Test DB Models on colocated DB (TensorFlow backend)""" # Set experiment name @@ -388,7 +388,7 @@ def test_colocated_db_model_tf(fileutils, wlmutils, mlutils): test_port = wlmutils.get_test_port() test_device = mlutils.get_test_device() test_num_gpus = mlutils.get_test_num_gpus() - test_dir = fileutils.make_test_dir() + test_dir = make_test_dir test_script = fileutils.get_test_conf_path("run_tf_dbmodel_smartredis.py") # Create SmartSim Experiment @@ -445,7 +445,7 @@ def test_colocated_db_model_tf(fileutils, wlmutils, mlutils): exp.stop(colo_model) @pytest.mark.skipif(not should_run_pt, reason="Test needs PyTorch to run") -def test_colocated_db_model_pytorch(fileutils, wlmutils, mlutils): +def test_colocated_db_model_pytorch(fileutils,
wlmutils, mlutils): test_port = wlmutils.get_test_port() test_device = mlutils.get_test_device() test_num_gpus = mlutils.get_test_num_gpus() - test_dir = fileutils.make_test_dir() + test_dir = make_test_dir test_script = fileutils.get_test_conf_path("run_pt_dbmodel_smartredis.py") # Create the SmartSim Experiment @@ -502,7 +502,7 @@ def test_colocated_db_model_pytorch(fileutils, wlmutils, mlutils): @pytest.mark.skipif(not should_run_tf, reason="Test needs TF to run") -def test_colocated_db_model_ensemble(fileutils, wlmutils, mlutils): +def test_colocated_db_model_ensemble(fileutils, make_test_dir, wlmutils, mlutils): """Test DBModel on colocated ensembles, first colocating DB, then adding DBModel. """ @@ -516,7 +516,7 @@ def test_colocated_db_model_ensemble(fileutils, wlmutils, mlutils): test_port = wlmutils.get_test_port() test_device = mlutils.get_test_device() test_num_gpus = mlutils.get_test_num_gpus() - test_dir = fileutils.make_test_dir() + test_dir = make_test_dir test_script = fileutils.get_test_conf_path("run_tf_dbmodel_smartredis.py") # Create the SmartSim Experiment @@ -603,7 +603,7 @@ def test_colocated_db_model_ensemble(fileutils, wlmutils, mlutils): @pytest.mark.skipif(not should_run_tf, reason="Test needs TF to run") -def test_colocated_db_model_ensemble_reordered(fileutils, wlmutils, mlutils): +def test_colocated_db_model_ensemble_reordered(fileutils, make_test_dir, wlmutils, mlutils): """Test DBModel on colocated ensembles, first adding the DBModel to the ensemble, then colocating DB. """ @@ -617,7 +617,7 @@ def test_colocated_db_model_ensemble_reordered(fileutils, wlmutils, mlutils): test_port = wlmutils.get_test_port() test_device = mlutils.get_test_device() test_num_gpus = mlutils.get_test_num_gpus() - test_dir = fileutils.make_test_dir() + test_dir = make_test_dir test_script = fileutils.get_test_conf_path("run_tf_dbmodel_smartredis.py") # Create the SmartSim Experiment @@ -704,7 +704,7 @@ def test_colocated_db_model_ensemble_reordered(fileutils, wlmutils, mlutils): @pytest.mark.skipif(not should_run_tf, reason="Test needs TF to run") -def test_colocated_db_model_errors(fileutils, wlmutils, mlutils): +def test_colocated_db_model_errors(fileutils, make_test_dir, wlmutils, mlutils): """Test error when colocated db model has no file.""" # Set experiment name @@ -716,7 +716,7 @@ def test_colocated_db_model_errors(fileutils, wlmutils, mlutils): test_port = wlmutils.get_test_port() test_device = mlutils.get_test_device() test_num_gpus = mlutils.get_test_num_gpus() - test_dir = fileutils.make_test_dir() + test_dir = make_test_dir test_script = fileutils.get_test_conf_path("run_tf_dbmodel_smartredis.py") # Create SmartSim Experiment diff --git a/tests/backends/test_dbscript.py b/tests/backends/test_dbscript.py index b418d88c3..41f0d0b2f 100644 --- a/tests/backends/test_dbscript.py +++ b/tests/backends/test_dbscript.py @@ -58,7 +58,7 @@ def timestwo(x): @pytest.mark.skipif(not should_run, reason="Test needs Torch to run") -def test_db_script(fileutils, wlmutils, mlutils): +def test_db_script(fileutils, make_test_dir, wlmutils, mlutils): """Test DB scripts on remote DB""" # Set experiment name @@ -70,7 +70,7 @@ def test_db_script(fileutils, wlmutils, mlutils): test_port = wlmutils.get_test_port() test_device = mlutils.get_test_device() test_num_gpus = mlutils.get_test_num_gpus() - test_dir = fileutils.make_test_dir() + test_dir = make_test_dir test_script = fileutils.get_test_conf_path("run_dbscript_smartredis.py") torch_script = fileutils.get_test_conf_path("torchscript.py") @@ 
-130,7 +130,7 @@ def test_db_script(fileutils, wlmutils, mlutils): @pytest.mark.skipif(not should_run, reason="Test needs Torch to run") -def test_db_script_ensemble(fileutils, wlmutils, mlutils): +def test_db_script_ensemble(fileutils, make_test_dir, wlmutils, mlutils): """Test DB scripts on remote DB""" # Set experiment name @@ -142,7 +142,7 @@ def test_db_script_ensemble(fileutils, wlmutils, mlutils): test_port = wlmutils.get_test_port() test_device = mlutils.get_test_device() test_num_gpus = mlutils.get_test_num_gpus() - test_dir = fileutils.make_test_dir() + test_dir = make_test_dir test_script = fileutils.get_test_conf_path("run_dbscript_smartredis.py") torch_script = fileutils.get_test_conf_path("torchscript.py") @@ -221,7 +221,7 @@ def test_db_script_ensemble(fileutils, wlmutils, mlutils): @pytest.mark.skipif(not should_run, reason="Test needs Torch to run") -def test_colocated_db_script(fileutils, wlmutils, mlutils): +def test_colocated_db_script(fileutils, make_test_dir, wlmutils, mlutils): """Test DB Scripts on colocated DB""" # Set the experiment name @@ -233,7 +233,7 @@ def test_colocated_db_script(fileutils, wlmutils, mlutils): test_port = wlmutils.get_test_port() test_device = mlutils.get_test_device() test_num_gpus = mlutils.get_test_num_gpus() - test_dir = fileutils.make_test_dir() + test_dir = make_test_dir test_script = fileutils.get_test_conf_path("run_dbscript_smartredis.py") torch_script = fileutils.get_test_conf_path("torchscript.py") @@ -285,7 +285,7 @@ def test_colocated_db_script(fileutils, wlmutils, mlutils): @pytest.mark.skipif(not should_run, reason="Test needs Torch to run") -def test_colocated_db_script_ensemble(fileutils, wlmutils, mlutils): +def test_colocated_db_script_ensemble(fileutils, make_test_dir, wlmutils, mlutils): """Test DB Scripts on colocated DB from ensemble, first colocating DB, then adding script. 
""" @@ -299,7 +299,7 @@ def test_colocated_db_script_ensemble(fileutils, wlmutils, mlutils): test_port = wlmutils.get_test_port() test_device = mlutils.get_test_device() test_num_gpus = mlutils.get_test_num_gpus() - test_dir = fileutils.make_test_dir() + test_dir = make_test_dir test_script = fileutils.get_test_conf_path("run_dbscript_smartredis.py") torch_script = fileutils.get_test_conf_path("torchscript.py") @@ -382,7 +382,7 @@ def test_colocated_db_script_ensemble(fileutils, wlmutils, mlutils): @pytest.mark.skipif(not should_run, reason="Test needs Torch to run") -def test_colocated_db_script_ensemble_reordered(fileutils, wlmutils, mlutils): +def test_colocated_db_script_ensemble_reordered(fileutils, make_test_dir, wlmutils, mlutils): """Test DB Scripts on colocated DB from ensemble, first adding the script to the ensemble, then colocating the DB""" @@ -395,7 +395,7 @@ def test_colocated_db_script_ensemble_reordered(fileutils, wlmutils, mlutils): test_port = wlmutils.get_test_port() test_device = mlutils.get_test_device() test_num_gpus = mlutils.get_test_num_gpus() - test_dir = fileutils.make_test_dir() + test_dir = make_test_dir test_script = fileutils.get_test_conf_path("run_dbscript_smartredis.py") torch_script = fileutils.get_test_conf_path("torchscript.py") @@ -477,7 +477,7 @@ def test_colocated_db_script_ensemble_reordered(fileutils, wlmutils, mlutils): @pytest.mark.skipif(not should_run, reason="Test needs Torch to run") -def test_db_script_errors(fileutils, wlmutils, mlutils): +def test_db_script_errors(fileutils, make_test_dir, wlmutils, mlutils): """Test DB Scripts error when setting a serialized function on colocated DB""" # Set Experiment name @@ -489,7 +489,7 @@ def test_db_script_errors(fileutils, wlmutils, mlutils): test_port = wlmutils.get_test_port() test_device = mlutils.get_test_device() test_num_gpus = mlutils.get_test_num_gpus() - test_dir = fileutils.make_test_dir() + test_dir = make_test_dir test_script = fileutils.get_test_conf_path("run_dbscript_smartredis.py") torch_script = fileutils.get_test_conf_path("torchscript.py") diff --git a/tests/backends/test_onnx.py b/tests/backends/test_onnx.py index 312d56953..d0873af89 100644 --- a/tests/backends/test_onnx.py +++ b/tests/backends/test_onnx.py @@ -56,7 +56,7 @@ ) -def test_sklearn_onnx(fileutils, mlutils, wlmutils): +def test_sklearn_onnx(make_test_dir, mlutils, wlmutils): """This test needs two free nodes, 1 for the db and 1 some sklearn models here we test the following sklearn models: @@ -75,7 +75,7 @@ def test_sklearn_onnx(fileutils, mlutils, wlmutils): """ exp_name = "test_sklearn_onnx" - test_dir = fileutils.make_test_dir() + test_dir = make_test_dir exp = Experiment(exp_name, exp_path=test_dir, launcher=wlmutils.get_test_launcher()) test_device = mlutils.get_test_device() diff --git a/tests/backends/test_tf.py b/tests/backends/test_tf.py index e30ad4f24..d010933f2 100644 --- a/tests/backends/test_tf.py +++ b/tests/backends/test_tf.py @@ -50,7 +50,7 @@ (not tf_backend_available) or (not tf_available), reason="Requires RedisAI TF backend", ) -def test_keras_model(fileutils, mlutils, wlmutils): +def test_keras_model(make_test_dir, mlutils, wlmutils): """This test needs two free nodes, 1 for the db and 1 for a keras model script this test can run on CPU/GPU by setting SMARTSIM_TEST_DEVICE=GPU @@ -61,7 +61,7 @@ def test_keras_model(fileutils, mlutils, wlmutils): """ exp_name = "test_keras_model" - test_dir = fileutils.make_test_dir() + test_dir = make_test_dir exp = Experiment(exp_name, exp_path=test_dir, 
launcher=wlmutils.get_test_launcher()) test_device = mlutils.get_test_device() @@ -110,8 +110,8 @@ def create_tf_model(): @pytest.mark.skipif(not tf_available, reason="Requires Tensorflow and Keras") -def test_freeze_model(fileutils): - test_dir = fileutils.make_test_dir() +def test_freeze_model(make_test_dir): + test_dir = make_test_dir model = create_tf_model() model_path, inputs, outputs = freeze_model(model, test_dir, "mnist.pb") diff --git a/tests/backends/test_torch.py b/tests/backends/test_torch.py index ba663a433..2c104bce5 100644 --- a/tests/backends/test_torch.py +++ b/tests/backends/test_torch.py @@ -48,7 +48,7 @@ ) -def test_torch_model_and_script(fileutils, mlutils, wlmutils): +def test_torch_model_and_script(make_test_dir, mlutils, wlmutils): """This test needs two free nodes, 1 for the db and 1 for a torch model script Here we test both the torchscript API and the NN API from torch @@ -61,7 +61,7 @@ def test_torch_model_and_script(fileutils, mlutils, wlmutils): """ exp_name = "test_torch_model_and_script" - test_dir = fileutils.make_test_dir() + test_dir = make_test_dir exp = Experiment(exp_name, exp_path=test_dir, launcher=wlmutils.get_test_launcher()) test_device = mlutils.get_test_device() diff --git a/tests/full_wlm/test_generic_batch_launch.py b/tests/full_wlm/test_generic_batch_launch.py index 4beccd41b..8a90bd8f0 100644 --- a/tests/full_wlm/test_generic_batch_launch.py +++ b/tests/full_wlm/test_generic_batch_launch.py @@ -35,12 +35,12 @@ pytestmark = pytest.mark.skip(reason="Not testing WLM integrations") -def test_batch_model(fileutils, wlmutils): +def test_batch_model(fileutils, make_test_dir, wlmutils): """Test the launch of a manually constructed batch model""" exp_name = "test-batch-model" exp = Experiment(exp_name, launcher=wlmutils.get_test_launcher()) - test_dir = fileutils.make_test_dir() + test_dir = make_test_dir script = fileutils.get_test_conf_path("sleep.py") batch_settings = exp.create_batch_settings(nodes=1, time="00:01:00") @@ -60,12 +60,12 @@ def test_batch_model(fileutils, wlmutils): assert statuses[0] == status.STATUS_COMPLETED -def test_batch_ensemble(fileutils, wlmutils): +def test_batch_ensemble(fileutils, make_test_dir, wlmutils): """Test the launch of a manually constructed batch ensemble""" exp_name = "test-batch-ensemble" exp = Experiment(exp_name, launcher=wlmutils.get_test_launcher()) - test_dir = fileutils.make_test_dir() + test_dir = make_test_dir script = fileutils.get_test_conf_path("sleep.py") settings = wlmutils.get_run_settings("python", f"{script} --time=5") @@ -87,10 +87,10 @@ def test_batch_ensemble(fileutils, wlmutils): assert all([stat == status.STATUS_COMPLETED for stat in statuses]) -def test_batch_ensemble_replicas(fileutils, wlmutils): +def test_batch_ensemble_replicas(fileutils, make_test_dir, wlmutils): exp_name = "test-batch-ensemble-replicas" exp = Experiment(exp_name, launcher=wlmutils.get_test_launcher()) - test_dir = fileutils.make_test_dir() + test_dir = make_test_dir script = fileutils.get_test_conf_path("sleep.py") settings = wlmutils.get_run_settings("python", f"{script} --time=5") diff --git a/tests/full_wlm/test_generic_orc_launch_batch.py b/tests/full_wlm/test_generic_orc_launch_batch.py index 7e5591a30..79c428f56 100644 --- a/tests/full_wlm/test_generic_orc_launch_batch.py +++ b/tests/full_wlm/test_generic_orc_launch_batch.py @@ -36,13 +36,13 @@ pytestmark = pytest.mark.skip(reason="Not testing WLM integrations") -def test_launch_orc_auto_batch(fileutils, wlmutils): +def
test_launch_orc_auto_batch(make_test_dir, wlmutils): """test single node orchestrator""" launcher = wlmutils.get_test_launcher() exp_name = "test-launch-auto-orc-batch" exp = Experiment(exp_name, launcher=launcher) - test_dir = fileutils.make_test_dir() + test_dir = make_test_dir # batch = False to launch on existing allocation network_interface = wlmutils.get_test_interface() @@ -55,7 +55,7 @@ def test_launch_orc_auto_batch(fileutils, wlmutils): orc.batch_settings.set_walltime("00:02:00") if wlmutils.get_test_launcher() == "cobalt": orc.batch_settings.set_queue("debug-flat-quad") - + orc.set_path(test_dir) exp.start(orc, block=True) @@ -71,14 +71,14 @@ def test_launch_orc_auto_batch(fileutils, wlmutils): assert all([stat == status.STATUS_CANCELLED for stat in statuses]) -def test_launch_cluster_orc_batch_single(fileutils, wlmutils): +def test_launch_cluster_orc_batch_single(make_test_dir, wlmutils): """test clustered 3-node orchestrator with single command""" # TODO detect number of nodes in allocation and skip if not sufficient launcher = wlmutils.get_test_launcher() exp_name = "test-launch-auto-cluster-orc-batch-single" exp = Experiment(exp_name, launcher=launcher) - test_dir = fileutils.make_test_dir() + test_dir = make_test_dir # batch = False to launch on existing allocation network_interface = wlmutils.get_test_interface() @@ -110,23 +110,23 @@ def test_launch_cluster_orc_batch_single(fileutils, wlmutils): assert all([stat == status.STATUS_CANCELLED for stat in statuses]) -def test_launch_cluster_orc_batch_multi(fileutils, wlmutils): +def test_launch_cluster_orc_batch_multi(make_test_dir, wlmutils): """test clustered 3-node orchestrator""" # TODO detect number of nodes in allocation and skip if not sufficient launcher = wlmutils.get_test_launcher() exp_name = "test-launch-auto-cluster-orc-batch-multi" exp = Experiment(exp_name, launcher=launcher) - test_dir = fileutils.make_test_dir() + test_dir = make_test_dir # batch = False to launch on existing allocation network_interface = wlmutils.get_test_interface() orc = exp.create_database( wlmutils.get_test_port(), db_nodes=3, batch=True, interface=network_interface, single_cmd=False ) - + orc.batch_settings.set_account(wlmutils.get_test_account()) - + orc.batch_settings.set_walltime("00:03:00") if wlmutils.get_test_launcher() == "cobalt": # As Cobalt won't allow us to run two @@ -149,12 +149,12 @@ def test_launch_cluster_orc_batch_multi(fileutils, wlmutils): assert all([stat == status.STATUS_CANCELLED for stat in statuses]) -def test_launch_cluster_orc_reconnect(fileutils, wlmutils): +def test_launch_cluster_orc_reconnect(make_test_dir, wlmutils): """test reconnecting to clustered 3-node orchestrator""" launcher = wlmutils.get_test_launcher() exp_name = "test-launch-cluster-orc-batch-reconect" exp = Experiment(exp_name, launcher=launcher) - test_dir = fileutils.make_test_dir() + test_dir = make_test_dir # batch = False to launch on existing allocation network_interface = wlmutils.get_test_interface() diff --git a/tests/full_wlm/test_mpmd.py b/tests/full_wlm/test_mpmd.py index 19f4660c2..c8f92cae8 100644 --- a/tests/full_wlm/test_mpmd.py +++ b/tests/full_wlm/test_mpmd.py @@ -36,7 +36,7 @@ pytestmark = pytest.mark.skip(reason="Not testing WLM integrations") -def test_mpmd(fileutils, wlmutils): +def test_mpmd(fileutils, make_test_dir, wlmutils): """Run an MPMD model twice and check that it always gets executed the same way.
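# ---------------------------------------------------------------------------
# [Editor's note: illustrative sketch, not part of the patch.] The orchestrator
# batch tests above all follow one launch pattern; it is condensed here using
# the same Experiment/Orchestrator calls the tests make. The launcher string,
# port, interface, account, and paths are placeholders that the tests normally
# obtain from wlmutils.
from smartsim import Experiment

exp = Experiment("orc-batch-sketch", launcher="slurm")  # launcher assumed
orc = exp.create_database(
    6780,                 # port; the tests use wlmutils.get_test_port()
    db_nodes=3,           # 3-node cluster, as in the *cluster_orc* tests
    batch=True,
    interface="ipogif0",  # placeholder network interface
    single_cmd=False,
)
orc.batch_settings.set_account("my-account")  # placeholder account
orc.batch_settings.set_walltime("00:03:00")
orc.set_path("/path/to/test_dir")             # placeholder output directory
exp.start(orc, block=True)
# ... exercise the database ...
exp.stop(orc)
# ---------------------------------------------------------------------------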
@@ -77,7 +77,7 @@ def prune_commands(launcher): f"MPMD on {launcher} only supported for run commands {by_launcher[launcher]}" ) - test_dir = fileutils.make_test_dir() + test_dir = make_test_dir for run_command in run_commands: script = fileutils.get_test_conf_path("sleep.py") settings = exp.create_run_settings( diff --git a/tests/on_wlm/test_base_settings_on_wlm.py b/tests/on_wlm/test_base_settings_on_wlm.py index 3aa77983f..7022884e7 100644 --- a/tests/on_wlm/test_base_settings_on_wlm.py +++ b/tests/on_wlm/test_base_settings_on_wlm.py @@ -40,10 +40,10 @@ pytestmark = pytest.mark.skip(reason="Not testing WLM integrations") -def test_model_on_wlm(fileutils, wlmutils): +def test_model_on_wlm(fileutils, make_test_dir, wlmutils): exp_name = "test-base-settings-model-launch" exp = Experiment(exp_name, launcher=wlmutils.get_test_launcher()) - test_dir = fileutils.make_test_dir() + test_dir = make_test_dir script = fileutils.get_test_conf_path("sleep.py") settings1 = wlmutils.get_base_run_settings("python", f"{script} --time=5") @@ -58,10 +58,10 @@ def test_model_on_wlm(fileutils, wlmutils): assert all([stat == status.STATUS_COMPLETED for stat in statuses]) -def test_model_stop_on_wlm(fileutils, wlmutils): +def test_model_stop_on_wlm(fileutils, make_test_dir, wlmutils): exp_name = "test-base-settings-model-stop" exp = Experiment(exp_name, launcher=wlmutils.get_test_launcher()) - test_dir = fileutils.make_test_dir() + test_dir = make_test_dir script = fileutils.get_test_conf_path("sleep.py") settings1 = wlmutils.get_base_run_settings("python", f"{script} --time=5") diff --git a/tests/on_wlm/test_colocated_model.py b/tests/on_wlm/test_colocated_model.py index a38fabd06..b9b66c904 100644 --- a/tests/on_wlm/test_colocated_model.py +++ b/tests/on_wlm/test_colocated_model.py @@ -42,12 +42,12 @@ pytestmark = pytest.mark.skip(reason="Not testing WLM integrations") @pytest.mark.parametrize("db_type", supported_dbs) -def test_launch_colocated_model_defaults(fileutils, coloutils, db_type): +def test_launch_colocated_model_defaults(fileutils, make_test_dir, coloutils, db_type): """Test the launch of a model with a colocated database and a WLM launcher""" db_args = { } - exp = Experiment("colocated_model_defaults", launcher=launcher) + exp = Experiment("colocated_model_defaults", launcher=launcher, exp_path=make_test_dir) colo_model = coloutils.setup_test_colo( fileutils, db_type, exp, "send_data_local_smartredis.py", db_args, ) - + exp.generate(colo_model) assert colo_model.run_settings.colocated_db_settings["custom_pinning"] == "0" exp.start(colo_model, block=True) statuses = exp.get_status(colo_model) @@ -67,9 +67,9 @@ def test_launch_colocated_model_defaults(fileutils, coloutils, db_type): assert all([stat == status.STATUS_COMPLETED for stat in statuses]) @pytest.mark.parametrize("db_type", supported_dbs) -def test_colocated_model_disable_pinning(fileutils, coloutils, db_type): +def test_colocated_model_disable_pinning(fileutils, make_test_dir, coloutils, db_type): - exp = Experiment("colocated_model_pinning_auto_1cpu", launcher=launcher) + exp = Experiment("colocated_model_pinning_auto_1cpu", launcher=launcher, exp_path=make_test_dir) db_args = { "db_cpus": 1, "custom_pinning": [], @@ -84,14 +84,15 @@ def test_colocated_model_disable_pinning(fileutils, coloutils, db_type): db_args, ) assert colo_model.run_settings.colocated_db_settings["custom_pinning"] is None + exp.generate(colo_model) exp.start(colo_model,
block=True) statuses = exp.get_status(colo_model) assert all([stat == status.STATUS_COMPLETED for stat in statuses]) @pytest.mark.parametrize("db_type", supported_dbs) -def test_colocated_model_pinning_auto_2cpu(fileutils, coloutils, db_type): +def test_colocated_model_pinning_auto_2cpu(fileutils, make_test_dir, coloutils, db_type): - exp = Experiment("colocated_model_pinning_auto_2cpu", launcher=launcher) + exp = Experiment("colocated_model_pinning_auto_2cpu", launcher=launcher, exp_path=make_test_dir) db_args = { "db_cpus": 2, @@ -106,16 +107,17 @@ def test_colocated_model_pinning_auto_2cpu(fileutils, coloutils, db_type): db_args, ) assert colo_model.run_settings.colocated_db_settings["custom_pinning"] == "0,1" + exp.generate(colo_model) exp.start(colo_model, block=True) statuses = exp.get_status(colo_model) assert all([stat == status.STATUS_COMPLETED for stat in statuses]) @pytest.mark.parametrize("db_type", supported_dbs) -def test_colocated_model_pinning_range(fileutils, coloutils, db_type): +def test_colocated_model_pinning_range(fileutils, make_test_dir, coloutils, db_type): # Check to make sure that the CPU mask was correctly generated # Assume that there are at least 4 cpus on the node - exp = Experiment("colocated_model_pinning_manual", launcher=launcher) + exp = Experiment("colocated_model_pinning_manual", launcher=launcher, exp_path=make_test_dir) db_args = { "db_cpus": 4, @@ -130,16 +132,17 @@ def test_colocated_model_pinning_range(fileutils, coloutils, db_type): db_args, ) assert colo_model.run_settings.colocated_db_settings["custom_pinning"] == "0,1,2,3" + exp.generate(colo_model) exp.start(colo_model, block=True) statuses = exp.get_status(colo_model) assert all([stat == status.STATUS_COMPLETED for stat in statuses]) @pytest.mark.parametrize("db_type", supported_dbs) -def test_colocated_model_pinning_list(fileutils, coloutils, db_type): +def test_colocated_model_pinning_list(fileutils, make_test_dir, coloutils, db_type): # Check to make sure that the CPU mask was correctly generated # note we presume that this has more than 2 CPUs on the supercomputer node - exp = Experiment("colocated_model_pinning_manual", launcher=launcher) + exp = Experiment("colocated_model_pinning_manual", launcher=launcher, exp_path=make_test_dir) db_args = { "db_cpus": 2, @@ -154,16 +157,17 @@ def test_colocated_model_pinning_list(fileutils, coloutils, db_type): db_args, ) assert colo_model.run_settings.colocated_db_settings["custom_pinning"] == "0,2" + exp.generate(colo_model) exp.start(colo_model, block=True) statuses = exp.get_status(colo_model) assert all([stat == status.STATUS_COMPLETED for stat in statuses]) @pytest.mark.parametrize("db_type", supported_dbs) -def test_colocated_model_pinning_mixed(fileutils, coloutils, db_type): +def test_colocated_model_pinning_mixed(fileutils, make_test_dir, coloutils, db_type): # Check to make sure that the CPU mask was correctly generated # note we presume that this has at least 4 CPUs on the supercomputer node - exp = Experiment("colocated_model_pinning_manual", launcher=launcher) + exp = Experiment("colocated_model_pinning_manual", launcher=launcher, exp_path=make_test_dir) db_args = { "db_cpus": 2, @@ -178,6 +182,7 @@ def test_colocated_model_pinning_mixed(fileutils, coloutils, db_type): db_args, ) assert colo_model.run_settings.colocated_db_settings["custom_pinning"] == "0,1,3" + exp.generate(colo_model) exp.start(colo_model, block=True) statuses = exp.get_status(colo_model) assert all([stat == status.STATUS_COMPLETED for stat in statuses]) diff --git
a/tests/on_wlm/test_containers_wlm.py b/tests/on_wlm/test_containers_wlm.py index 414303df4..46eb6d771 100644 --- a/tests/on_wlm/test_containers_wlm.py +++ b/tests/on_wlm/test_containers_wlm.py @@ -40,7 +40,7 @@ @pytest.mark.skipif(not singularity_exists, reason="Test needs singularity to run") -def test_singularity_wlm_smartredis(fileutils, wlmutils): +def test_singularity_wlm_smartredis(fileutils, make_test_dir, wlmutils): """Run two processes, each process puts a tensor on the DB, then accesses the other process's tensor. Finally, the tensor is used to run a model. @@ -55,7 +55,7 @@ def test_singularity_wlm_smartredis(fileutils, wlmutils): f"Test only runs on systems with PBS or Slurm as WLM. Current launcher: {launcher}" ) - test_dir = fileutils.make_test_dir() + test_dir = make_test_dir exp = Experiment( "smartredis_ensemble_exchange", exp_path=test_dir, launcher=launcher ) diff --git a/tests/on_wlm/test_generic_orc_launch.py b/tests/on_wlm/test_generic_orc_launch.py index 919317c73..9533d5b4c 100644 --- a/tests/on_wlm/test_generic_orc_launch.py +++ b/tests/on_wlm/test_generic_orc_launch.py @@ -33,13 +33,13 @@ pytestmark = pytest.mark.skip(reason="Not testing WLM integrations") -def test_launch_orc_auto(fileutils, wlmutils): +def test_launch_orc_auto(make_test_dir, wlmutils): """test single node orchestrator""" launcher = wlmutils.get_test_launcher() exp_name = "test-launch-auto-orc" exp = Experiment(exp_name, launcher=launcher) - test_dir = fileutils.make_test_dir() + test_dir = make_test_dir # batch = False to launch on existing allocation network_interface = wlmutils.get_test_interface() @@ -65,14 +65,14 @@ def test_launch_orc_auto(fileutils, wlmutils): assert all([stat == status.STATUS_CANCELLED for stat in statuses]) -def test_launch_cluster_orc_single(fileutils, wlmutils): +def test_launch_cluster_orc_single(make_test_dir, wlmutils): """test clustered 3-node orchestrator with single command""" # TODO detect number of nodes in allocation and skip if not sufficient launcher = wlmutils.get_test_launcher() exp_name = "test-launch-auto-cluster-orc-single" exp = Experiment(exp_name, launcher=launcher) - test_dir = fileutils.make_test_dir() + test_dir = make_test_dir # batch = False to launch on existing allocation network_interface = wlmutils.get_test_interface() @@ -99,14 +99,14 @@ def test_launch_cluster_orc_single(fileutils, wlmutils): assert all([stat == status.STATUS_CANCELLED for stat in statuses]) -def test_launch_cluster_orc_multi(fileutils, wlmutils): +def test_launch_cluster_orc_multi(make_test_dir, wlmutils): """test clustered 3-node orchestrator with multiple commands""" # TODO detect number of nodes in allocation and skip if not sufficient launcher = wlmutils.get_test_launcher() exp_name = "test-launch-auto-cluster-orc-multi" exp = Experiment(exp_name, launcher=launcher) - test_dir = fileutils.make_test_dir() + test_dir = make_test_dir # batch = False to launch on existing allocation network_interface = wlmutils.get_test_interface() diff --git a/tests/on_wlm/test_launch_errors.py b/tests/on_wlm/test_launch_errors.py index 77ba8a69a..4fd4886f6 100644 --- a/tests/on_wlm/test_launch_errors.py +++ b/tests/on_wlm/test_launch_errors.py @@ -36,12 +36,12 @@ pytestmark = pytest.mark.skip(reason="Not testing WLM integrations") -def test_failed_status(fileutils, wlmutils): +def test_failed_status(fileutils, make_test_dir, wlmutils): """Test when a failure occurs deep into model execution""" exp_name = "test-report-failure" exp = Experiment(exp_name,
launcher=wlmutils.get_test_launcher()) - test_dir = fileutils.make_test_dir() + test_dir = make_test_dir script = fileutils.get_test_conf_path("bad.py") settings = exp.create_run_settings( @@ -58,7 +58,7 @@ def test_failed_status(fileutils, wlmutils): assert stat[0] == status.STATUS_FAILED -def test_bad_run_command_args(fileutils, wlmutils): +def test_bad_run_command_args(fileutils, make_test_dir, wlmutils): """Should fail because of incorrect arguments given to the run command @@ -70,7 +70,7 @@ def test_bad_run_command_args(fileutils, wlmutils): exp_name = "test-bad-run-command-args" exp = Experiment(exp_name, launcher=launcher) - test_dir = fileutils.make_test_dir() + test_dir = make_test_dir script = fileutils.get_test_conf_path("bad.py") diff --git a/tests/on_wlm/test_launch_ompi_lsf.py b/tests/on_wlm/test_launch_ompi_lsf.py index e3327514a..05fc9cd5f 100644 --- a/tests/on_wlm/test_launch_ompi_lsf.py +++ b/tests/on_wlm/test_launch_ompi_lsf.py @@ -34,13 +34,13 @@ @pytest.mark.skip("OpenMPI currently not working on LSF systems") -def test_launch_openmpi_lsf(wlmutils, fileutils): +def test_launch_openmpi_lsf(fileutils, make_test_dir, wlmutils): launcher = wlmutils.get_test_launcher() if launcher != "lsf": pytest.skip("Test only runs on systems with LSF as WLM") exp_name = "test-launch-openmpi-lsf" exp = Experiment(exp_name, launcher=launcher) - test_dir = fileutils.make_test_dir() + test_dir = make_test_dir script = fileutils.get_test_conf_path("sleep.py") settings = exp.create_run_settings("python", script, "mpirun") diff --git a/tests/on_wlm/test_local_step.py b/tests/on_wlm/test_local_step.py index 0613f41c2..dd838b80e 100644 --- a/tests/on_wlm/test_local_step.py +++ b/tests/on_wlm/test_local_step.py @@ -40,13 +40,13 @@ """ -def test_local_env_pass_implicit(fileutils) -> None: +def test_local_env_pass_implicit(fileutils, make_test_dir) -> None: """Ensure implicitly exported env is available to running task""" exp_value = str(uuid.uuid4()) env_key = "test_local_env_pass_implicit" os.environ[env_key] = exp_value - test_dir = fileutils.make_test_dir() + test_dir = make_test_dir exp_dir = f"{test_dir}/exp" os.makedirs(exp_dir) script = fileutils.get_test_conf_path("check_env.py") @@ -72,19 +72,19 @@ def test_local_env_pass_implicit(fileutils) -> None: with open(f"{exp_dir}/{app_name}/{app_name}.out") as app_outfile: app_output = app_outfile.read() - + # verify application was able to access the env var assert f"{env_key}=={exp_value}" in app_output -def test_local_env_pass_explicit(fileutils) -> None: +def test_local_env_pass_explicit(fileutils, make_test_dir) -> None: """Ensure explicitly exported env is available to running task""" exp_value = str(uuid.uuid4()) env_key = "test_local_env_pass_explicit" assert env_key not in os.environ - test_dir = fileutils.make_test_dir() + test_dir = make_test_dir script = fileutils.get_test_conf_path("check_env.py") exp_dir = f"{test_dir}/exp" @@ -111,6 +111,6 @@ def test_local_env_pass_explicit(fileutils) -> None: with open(f"{exp_dir}/{app_name}/{app_name}.out") as app_outfile: app_output = app_outfile.read() - + # verify application was able to access the env var assert f"{env_key}=={exp_value}" in app_output diff --git a/tests/on_wlm/test_restart.py b/tests/on_wlm/test_restart.py index 86d883358..4ef779d40 100644 --- a/tests/on_wlm/test_restart.py +++ b/tests/on_wlm/test_restart.py @@ -35,11 +35,11 @@ pytestmark = pytest.mark.skip(reason="Not testing WLM integrations") -def test_restart(fileutils, wlmutils): +def test_restart(fileutils, 
make_test_dir, wlmutils): exp_name = "test-restart" exp = Experiment(exp_name, launcher=wlmutils.get_test_launcher()) - test_dir = fileutils.make_test_dir() + test_dir = make_test_dir script = fileutils.get_test_conf_path("sleep.py") settings = exp.create_run_settings("python", f"{script} --time=5") diff --git a/tests/on_wlm/test_simple_base_settings_on_wlm.py b/tests/on_wlm/test_simple_base_settings_on_wlm.py index d46a46aae..7f28b2080 100644 --- a/tests/on_wlm/test_simple_base_settings_on_wlm.py +++ b/tests/on_wlm/test_simple_base_settings_on_wlm.py @@ -48,7 +48,7 @@ pytestmark = pytest.mark.skip(reason="Not testing WLM integrations") -def test_simple_model_on_wlm(fileutils, wlmutils): +def test_simple_model_on_wlm(fileutils, make_test_dir, wlmutils): launcher = wlmutils.get_test_launcher() if launcher not in ["pbs", "slurm", "cobalt", "lsf"]: pytest.skip( @@ -57,7 +57,7 @@ def test_simple_model_on_wlm(fileutils, wlmutils): exp_name = "test-simplebase-settings-model-launch" exp = Experiment(exp_name, launcher=wlmutils.get_test_launcher()) - test_dir = fileutils.make_test_dir() + test_dir = make_test_dir script = fileutils.get_test_conf_path("sleep.py") settings = RunSettings("python", exe_args=f"{script} --time=5") @@ -69,7 +69,7 @@ def test_simple_model_on_wlm(fileutils, wlmutils): assert exp.get_status(M)[0] == status.STATUS_COMPLETED -def test_simple_model_stop_on_wlm(fileutils, wlmutils): +def test_simple_model_stop_on_wlm(fileutils, make_test_dir, wlmutils): launcher = wlmutils.get_test_launcher() if launcher not in ["pbs", "slurm", "cobalt", "lsf"]: pytest.skip( @@ -78,7 +78,7 @@ def test_simple_model_stop_on_wlm(fileutils, wlmutils): exp_name = "test-simplebase-settings-model-stop" exp = Experiment(exp_name, launcher=wlmutils.get_test_launcher()) - test_dir = fileutils.make_test_dir() + test_dir = make_test_dir script = fileutils.get_test_conf_path("sleep.py") settings = RunSettings("python", exe_args=f"{script} --time=5") diff --git a/tests/on_wlm/test_simple_entity_launch.py b/tests/on_wlm/test_simple_entity_launch.py index 14fd2e4ae..f66971996 100644 --- a/tests/on_wlm/test_simple_entity_launch.py +++ b/tests/on_wlm/test_simple_entity_launch.py @@ -46,10 +46,10 @@ pytestmark = pytest.mark.skip(reason="Not testing WLM integrations") -def test_models(fileutils, wlmutils): +def test_models(fileutils, make_test_dir, wlmutils): exp_name = "test-models-launch" exp = Experiment(exp_name, launcher=wlmutils.get_test_launcher()) - test_dir = fileutils.make_test_dir() + test_dir = make_test_dir script = fileutils.get_test_conf_path("sleep.py") settings = exp.create_run_settings("python", f"{script} --time=5") @@ -63,10 +63,10 @@ def test_models(fileutils, wlmutils): assert all([stat == status.STATUS_COMPLETED for stat in statuses]) -def test_ensemble(fileutils, wlmutils): +def test_ensemble(fileutils, make_test_dir, wlmutils): exp_name = "test-ensemble-launch" exp = Experiment(exp_name, launcher=wlmutils.get_test_launcher()) - test_dir = fileutils.make_test_dir() + test_dir = make_test_dir script = fileutils.get_test_conf_path("sleep.py") settings = exp.create_run_settings("python", f"{script} --time=5") @@ -80,12 +80,12 @@ def test_ensemble(fileutils, wlmutils): assert all([stat == status.STATUS_COMPLETED for stat in statuses]) -def test_summary(fileutils, wlmutils): +def test_summary(fileutils, make_test_dir, wlmutils): """Fairly rudimentary test of the summary dataframe""" exp_name = "test-launch-summary" exp = Experiment(exp_name, launcher=wlmutils.get_test_launcher()) - test_dir = 
fileutils.make_test_dir() + test_dir = make_test_dir sleep = fileutils.get_test_conf_path("sleep.py") bad = fileutils.get_test_conf_path("bad.py") diff --git a/tests/on_wlm/test_stop.py b/tests/on_wlm/test_stop.py index a786ce1a4..13e6b54af 100644 --- a/tests/on_wlm/test_stop.py +++ b/tests/on_wlm/test_stop.py @@ -42,10 +42,10 @@ pytestmark = pytest.mark.skip(reason="Not testing WLM integrations") -def test_stop_entity(fileutils, wlmutils): +def test_stop_entity(fileutils, make_test_dir, wlmutils): exp_name = "test-launch-stop-model" exp = Experiment(exp_name, launcher=wlmutils.get_test_launcher()) - test_dir = fileutils.make_test_dir() + test_dir = make_test_dir script = fileutils.get_test_conf_path("sleep.py") settings = exp.create_run_settings("python", f"{script} --time=10") @@ -59,11 +59,11 @@ def test_stop_entity(fileutils, wlmutils): assert exp.get_status(M1)[0] == status.STATUS_CANCELLED -def test_stop_entity_list(fileutils, wlmutils): +def test_stop_entity_list(fileutils, make_test_dir, wlmutils): exp_name = "test-launch-stop-ensemble" exp = Experiment(exp_name, launcher=wlmutils.get_test_launcher()) - test_dir = fileutils.make_test_dir() + test_dir = make_test_dir script = fileutils.get_test_conf_path("sleep.py") settings = exp.create_run_settings("python", f"{script} --time=10") diff --git a/tests/test_colo_model_local.py b/tests/test_colo_model_local.py index e0999299e..bfeba3760 100644 --- a/tests/test_colo_model_local.py +++ b/tests/test_colo_model_local.py @@ -144,14 +144,14 @@ def test_launch_colocated_model_defaults( @pytest.mark.parametrize("db_type", supported_dbs) def test_launch_multiple_colocated_models( - fileutils, coloutils, wlmutils, db_type, launcher="local" + fileutils, make_test_dir, coloutils, wlmutils, db_type, launcher="local" ): """Test the concurrent launch of two models with a colocated database and local launcher """ db_args = {} - exp = Experiment("multi_colo_models", launcher=launcher) + exp = Experiment("multi_colo_models", launcher=launcher, exp_path=make_test_dir) colo_models = [ coloutils.setup_test_colo( fileutils, @@ -172,7 +172,7 @@ def test_launch_multiple_colocated_models( port=wlmutils.get_test_port() + 1, ), ] - + exp.generate(*colo_models) exp.start(*colo_models, block=True) statuses = exp.get_status(*colo_models) assert all([stat == status.STATUS_COMPLETED for stat in statuses]) @@ -185,9 +185,9 @@ def test_launch_multiple_colocated_models( @pytest.mark.parametrize("db_type", supported_dbs) def test_colocated_model_disable_pinning( - fileutils, coloutils, db_type, launcher="local" + fileutils, make_test_dir, coloutils, db_type, launcher="local" ): - exp = Experiment("colocated_model_pinning_auto_1cpu", launcher=launcher) + exp = Experiment("colocated_model_pinning_auto_1cpu", launcher=launcher, exp_path=make_test_dir) db_args = { "db_cpus": 1, "custom_pinning": [], @@ -201,6 +201,7 @@ def test_colocated_model_disable_pinning( db_args, ) assert colo_model.run_settings.colocated_db_settings["custom_pinning"] is None + exp.generate(colo_model) exp.start(colo_model, block=True) statuses = exp.get_status(colo_model) assert all([stat == status.STATUS_COMPLETED for stat in statuses]) @@ -208,9 +209,9 @@ def test_colocated_model_disable_pinning( @pytest.mark.parametrize("db_type", supported_dbs) def test_colocated_model_pinning_auto_2cpu( - fileutils, coloutils, db_type, launcher="local" + fileutils, make_test_dir, coloutils, db_type, launcher="local" ): - exp = Experiment("colocated_model_pinning_auto_2cpu", launcher=launcher) + exp = 
Experiment("colocated_model_pinning_auto_2cpu", launcher=launcher, exp_path=make_test_dir) db_args = { "db_cpus": 2, @@ -231,6 +232,7 @@ def test_colocated_model_pinning_auto_2cpu( assert ( colo_model.run_settings.colocated_db_settings["custom_pinning"] == true_pinning ) + exp.generate(colo_model) exp.start(colo_model, block=True) statuses = exp.get_status(colo_model) assert all([stat == status.STATUS_COMPLETED for stat in statuses]) @@ -238,10 +240,10 @@ def test_colocated_model_pinning_auto_2cpu( @pytest.mark.skipif(is_mac, reason="unsupported on MacOSX") @pytest.mark.parametrize("db_type", supported_dbs) -def test_colocated_model_pinning_range(fileutils, coloutils, db_type, launcher="local"): +def test_colocated_model_pinning_range(fileutils, make_test_dir, coloutils, db_type, launcher="local"): # Check to make sure that the CPU mask was correctly generated - exp = Experiment("colocated_model_pinning_manual", launcher=launcher) + exp = Experiment("colocated_model_pinning_manual", launcher=launcher, exp_path=make_test_dir) db_args = {"db_cpus": 2, "custom_pinning": range(2)} @@ -253,6 +255,7 @@ def test_colocated_model_pinning_range(fileutils, coloutils, db_type, launcher=" db_args, ) assert colo_model.run_settings.colocated_db_settings["custom_pinning"] == "0,1" + exp.generate(colo_model) exp.start(colo_model, block=True) statuses = exp.get_status(colo_model) assert all([stat == status.STATUS_COMPLETED for stat in statuses]) @@ -260,10 +263,10 @@ def test_colocated_model_pinning_range(fileutils, coloutils, db_type, launcher=" @pytest.mark.skipif(is_mac, reason="unsupported on MacOSX") @pytest.mark.parametrize("db_type", supported_dbs) -def test_colocated_model_pinning_list(fileutils, coloutils, db_type, launcher="local"): +def test_colocated_model_pinning_list(fileutils, make_test_dir, coloutils, db_type, launcher="local"): # Check to make sure that the CPU mask was correctly generated - exp = Experiment("colocated_model_pinning_manual", launcher=launcher) + exp = Experiment("colocated_model_pinning_manual", launcher=launcher, exp_path=make_test_dir) db_args = {"db_cpus": 1, "custom_pinning": [1]} @@ -275,6 +278,7 @@ def test_colocated_model_pinning_list(fileutils, coloutils, db_type, launcher="l db_args, ) assert colo_model.run_settings.colocated_db_settings["custom_pinning"] == "1" + exp.generate(colo_model) exp.start(colo_model, block=True) statuses = exp.get_status(colo_model) assert all([stat == status.STATUS_COMPLETED for stat in statuses]) diff --git a/tests/test_config.py b/tests/test_config.py index 5e28202b1..2af76dcd7 100644 --- a/tests/test_config.py +++ b/tests/test_config.py @@ -55,7 +55,7 @@ def test_all_config_defaults(): def get_redisai_env(rai_path: t.Optional[str], lib_path: t.Optional[str]) -> t.Dict[str, str]: - """Convenience method to create a set of environment variables + """Convenience method to create a set of environment variables that include RedisAI-specific variables :param rai_path: The path to the RedisAI library :type: str (optional) @@ -68,21 +68,25 @@ def get_redisai_env(rai_path: t.Optional[str], lib_path: t.Optional[str]) -> t.D env["RAI_PATH"] = rai_path else: env.pop("RAI_PATH", None) - + if lib_path is not None: env["SMARTSIM_DEP_INSTALL_PATH"] = lib_path else: env.pop("SMARTSIM_DEP_INSTALL_PATH", None) - + return env -def test_redisai_invalid_rai_path(fileutils, monkeypatch): +def make_file(filepath: str) -> None: + os.makedirs(os.path.dirname(filepath)) + with open(filepath, "w+", encoding="utf-8") as dummy_file: + dummy_file.write("dummy\n") + 
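+# make_file creates the parent directory itself, so each test below can
+# fabricate a nested library path such as <test_dir>/lib/redisai.so inside a
+# fresh per-test directory; a valid RAI_PATH takes precedence over
+# SMARTSIM_DEP_INSTALL_PATH when both are set.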
+def test_redisai_invalid_rai_path(get_test_dir, monkeypatch): """An invalid RAI_PATH and valid SMARTSIM_DEP_INSTALL_PATH should fail""" - test_dir = fileutils.make_test_dir() + test_dir = get_test_dir rai_file_path = os.path.join(test_dir, "lib", "mock-redisai.so") - _ = fileutils.make_test_file("redisai.so", "lib") - + make_file(os.path.join(test_dir, "lib", "redisai.so")) env = get_redisai_env(rai_file_path, test_dir) monkeypatch.setattr(os, "environ", env) @@ -95,11 +99,11 @@ def test_redisai_invalid_rai_path(fileutils, monkeypatch): assert 'RedisAI dependency not found' in ex.value.args[0] -def test_redisai_valid_rai_path(fileutils, monkeypatch): +def test_redisai_valid_rai_path(get_test_dir, monkeypatch): """A valid RAI_PATH should override valid SMARTSIM_DEP_INSTALL_PATH and succeed""" - test_dir = fileutils.make_test_dir() - rai_file_path = fileutils.make_test_file("mock-redisai.so", "lib") - _ = fileutils.make_test_file("redisai.so", "deps") + test_dir = get_test_dir + rai_file_path = os.path.join(test_dir, "lib", "mock-redisai.so") + make_file(rai_file_path) env = get_redisai_env(rai_file_path, test_dir) monkeypatch.setattr(os, "environ", env) @@ -110,14 +114,14 @@ def test_redisai_valid_rai_path(fileutils, monkeypatch): assert config.redisai == rai_file_path -def test_redisai_invalid_lib_path(fileutils, monkeypatch): +def test_redisai_invalid_lib_path(make_test_dir, monkeypatch): """Invalid RAI_PATH and invalid SMARTSIM_DEP_INSTALL_PATH should fail""" - test_dir = fileutils.make_test_dir() + test_dir = make_test_dir rai_file_path = f"{test_dir}/railib/redisai.so" env = get_redisai_env(rai_file_path, test_dir) monkeypatch.setattr(os, "environ", env) - + config = Config() # Fail when no files exist @ either location with pytest.raises(SSConfigError) as ex: @@ -126,29 +130,29 @@ def test_redisai_invalid_lib_path(fileutils, monkeypatch): assert 'RedisAI dependency not found' in ex.value.args[0] -def test_redisai_valid_lib_path(fileutils, monkeypatch): +def test_redisai_valid_lib_path(get_test_dir, monkeypatch): """Valid RAI_PATH and invalid SMARTSIM_DEP_INSTALL_PATH should succeed""" - test_dir = fileutils.make_test_dir() - rai_file_path = fileutils.make_test_file("mock-redisai.so", "lib") - + test_dir = get_test_dir + rai_file_path = os.path.join(test_dir, "lib", "mock-redisai.so") + make_file(rai_file_path) env = get_redisai_env(rai_file_path, test_dir) monkeypatch.setattr(os, "environ", env) - + config = Config() assert config.redisai assert Path(config.redisai).is_file() assert config.redisai == rai_file_path -def test_redisai_valid_lib_path_null_rai(fileutils, monkeypatch): +def test_redisai_valid_lib_path_null_rai(get_test_dir, monkeypatch): """Missing RAI_PATH and valid SMARTSIM_DEP_INSTALL_PATH should succeed""" - test_dir = fileutils.make_test_dir() + test_dir = get_test_dir rai_file_path: t.Optional[str] = None - lib_file_path = fileutils.make_test_file("redisai.so", "lib") - + lib_file_path = os.path.join(test_dir, "lib", "redisai.so") + make_file(lib_file_path) env = get_redisai_env(rai_file_path, test_dir) monkeypatch.setattr(os, "environ", env) - + config = Config() assert config.redisai assert Path(config.redisai).is_file() diff --git a/tests/test_containers.py b/tests/test_containers.py index 0c6db8d49..f848f3663 100644 --- a/tests/test_containers.py +++ b/tests/test_containers.py @@ -87,9 +87,9 @@ def test_singularity_commands(fileutils): @pytest.mark.skipif(not singularity_exists, reason="Test needs singularity to run") -def test_singularity_basic(fileutils): +def 
test_singularity_basic(fileutils, make_test_dir): """Basic argument-less Singularity test""" - test_dir = fileutils.make_test_dir() + test_dir = make_test_dir container = Singularity(containerURI) @@ -113,9 +113,9 @@ def test_singularity_basic(fileutils): @pytest.mark.skipif(not singularity_exists, reason="Test needs singularity to run") -def test_singularity_args(fileutils): +def test_singularity_args(fileutils, make_test_dir): """Test combinations of args and mount arguments for Singularity""" - test_dir = fileutils.make_test_dir() + test_dir = make_test_dir hometest_dir = os.path.join(str(Path.home()), "test") # $HOME/test mount_paths = {test_dir + "/singularity_args": hometest_dir} container = Singularity(containerURI, args="--contain", mount=mount_paths) @@ -140,7 +140,7 @@ def test_singularity_args(fileutils): @pytest.mark.skipif(not singularity_exists, reason="Test needs singularity to run") -def test_singularity_smartredis(fileutils, wlmutils): +def test_singularity_smartredis(make_test_dir, fileutils, wlmutils): """Run two processes, each process puts a tensor on the DB, then accesses the other process's tensor. Finally, the tensor is used to run a model. @@ -148,7 +148,7 @@ def test_singularity_smartredis(fileutils, wlmutils): Note: This is a containerized port of test_smartredis.py """ - test_dir = fileutils.make_test_dir() + test_dir = make_test_dir exp = Experiment( "smartredis_ensemble_exchange", exp_path=test_dir, launcher="local" ) diff --git a/tests/test_dbnode.py b/tests/test_dbnode.py index 5b849d76e..434362a57 100644 --- a/tests/test_dbnode.py +++ b/tests/test_dbnode.py @@ -46,10 +46,10 @@ def test_parse_db_host_error(): orc.entities[0].host -def test_hosts(fileutils, wlmutils): +def test_hosts(make_test_dir, wlmutils): exp_name = "test_hosts" exp = Experiment(exp_name) - test_dir = fileutils.make_test_dir() + test_dir = make_test_dir orc = Orchestrator(port=wlmutils.get_test_port(), interface="lo", launcher="local") orc.set_path(test_dir) diff --git a/tests/test_experiment.py b/tests/test_experiment.py index dbaa51bdc..64d526a42 100644 --- a/tests/test_experiment.py +++ b/tests/test_experiment.py @@ -34,10 +34,10 @@ from smartsim.settings import RunSettings -def test_model_prefix(fileutils): +def test_model_prefix(make_test_dir): exp_name = "test_prefix" exp = Experiment(exp_name) - test_dir = fileutils.make_test_dir() + test_dir = make_test_dir model = exp.create_model( "model", path=test_dir, @@ -108,10 +108,10 @@ def test_bad_ensemble_init_no_rs_bs(): exp.create_ensemble("name") -def test_stop_entity(fileutils): +def test_stop_entity(make_test_dir): exp_name = "test_stop_entity" exp = Experiment(exp_name) - test_dir = fileutils.make_test_dir() + test_dir = make_test_dir m = exp.create_model("model", path=test_dir, run_settings=RunSettings("sleep", "5")) exp.start(m, block=False) assert exp.finished(m) == False @@ -119,11 +119,11 @@ def test_stop_entity(fileutils): assert exp.finished(m) == True -def test_poll(fileutils): +def test_poll(make_test_dir): # Ensure that a SmartSimError is not raised exp_name = "test_exp_poll" exp = Experiment(exp_name) - test_dir = fileutils.make_test_dir() + test_dir = make_test_dir model = exp.create_model( "model", path=test_dir, run_settings=RunSettings("sleep", "5") ) @@ -132,10 +132,10 @@ def test_poll(fileutils): exp.stop(model) -def test_summary(fileutils): +def test_summary(make_test_dir): exp_name = "test_exp_summary" exp = Experiment(exp_name) - test_dir = fileutils.make_test_dir() + test_dir = make_test_dir m = 
exp.create_model( "model", path=test_dir, run_settings=RunSettings("echo", "Hello") ) diff --git a/tests/test_generator.py b/tests/test_generator.py index b1cd93973..0719cd308 100644 --- a/tests/test_generator.py +++ b/tests/test_generator.py @@ -53,9 +53,9 @@ def get_gen_file(fileutils, filename): return fileutils.get_test_conf_path(osp.join("generator_files", filename)) -def test_ensemble(fileutils): +def test_ensemble(fileutils, get_test_dir): exp = Experiment("gen-test", launcher="local") - test_dir = fileutils.get_test_dir() + test_dir = get_test_dir gen = Generator(test_dir) params = {"THERMO": [10, 20, 30], "STEPS": [10, 20, 30]} ensemble = exp.create_ensemble("test", params=params, run_settings=rs) @@ -70,9 +70,9 @@ def test_ensemble(fileutils): assert osp.isdir(osp.join(test_dir, "test/test_" + str(i))) -def test_ensemble_overwrite(fileutils): +def test_ensemble_overwrite(fileutils, get_test_dir): exp = Experiment("gen-test-overwrite", launcher="local") - test_dir = fileutils.get_test_dir() + test_dir = get_test_dir gen = Generator(test_dir, overwrite=True) params = {"THERMO": [10, 20, 30], "STEPS": [10, 20, 30]} @@ -93,9 +93,9 @@ def test_ensemble_overwrite(fileutils): assert osp.isdir(osp.join(test_dir, "test/test_" + str(i))) -def test_ensemble_overwrite_error(fileutils): +def test_ensemble_overwrite_error(fileutils, get_test_dir): exp = Experiment("gen-test-overwrite-error", launcher="local") - test_dir = fileutils.get_test_dir() + test_dir = get_test_dir gen = Generator(test_dir) params = {"THERMO": [10, 20, 30], "STEPS": [10, 20, 30]} @@ -112,8 +112,8 @@ def test_ensemble_overwrite_error(fileutils): gen.generate_experiment(ensemble) -def test_full_exp(fileutils, wlmutils): - test_dir = fileutils.make_test_dir() +def test_full_exp(fileutils, make_test_dir, wlmutils): + test_dir = make_test_dir exp = Experiment("gen-test", test_dir, launcher="local") model = exp.create_model("model", run_settings=rs) @@ -141,12 +141,12 @@ def test_full_exp(fileutils, wlmutils): assert osp.isfile(osp.join(test_dir, "model/sleep.py")) -def test_dir_files(fileutils): +def test_dir_files(fileutils, make_test_dir): """test the generate of models with files that are directories with subdirectories and files """ - test_dir = fileutils.make_test_dir() + test_dir = make_test_dir exp = Experiment("gen-test", test_dir, launcher="local") params = {"THERMO": [10, 20, 30], "STEPS": [10, 20, 30]} @@ -164,10 +164,10 @@ def test_dir_files(fileutils): assert osp.isfile(osp.join(model_path, "test.py")) -def test_print_files(fileutils, capsys): +def test_print_files(fileutils, make_test_dir, capsys): """Test the stdout print of files attached to an ensemble""" - test_dir = fileutils.make_test_dir() + test_dir = make_test_dir exp = Experiment("print-attached-files-test", test_dir, launcher="local") ensemble = exp.create_ensemble("dir_test", replicas=1, run_settings=rs) @@ -245,9 +245,9 @@ def test_print_files(fileutils, capsys): assert captured.out == expected_out_multi -def test_multiple_tags(fileutils): +def test_multiple_tags(fileutils, make_test_dir): """Test substitution of multiple tagged parameters on same line""" - test_dir = fileutils.make_test_dir() + test_dir = make_test_dir exp = Experiment("test-multiple-tags", test_dir) model_params = {"port": 6379, "password": "unbreakable_password"} @@ -267,10 +267,10 @@ def test_multiple_tags(fileutils): ) -def test_generation_log(fileutils): +def test_generation_log(fileutils, make_test_dir): """Test that an error is issued when a tag is unused and make_fatal is 
True""" - test_dir = fileutils.make_test_dir() + test_dir = make_test_dir exp = Experiment("gen-log-test", test_dir, launcher="local") params = {"THERMO": [10, 20], "STEPS": [10, 20]} @@ -302,12 +302,12 @@ def not_header(line): ), ) -def test_config_dir(fileutils): +def test_config_dir(fileutils, make_test_dir): """Test the generation and configuration of models with tagged files that are directories with subdirectories and files """ exp = Experiment("config-dir", launcher="local") - test_dir = fileutils.make_test_dir() + test_dir = make_test_dir gen = Generator(test_dir) params = {"PARAM0": [0, 1], "PARAM1": [2, 3]} diff --git a/tests/test_interrupt.py b/tests/test_interrupt.py index f51d7fc4d..529dc966f 100644 --- a/tests/test_interrupt.py +++ b/tests/test_interrupt.py @@ -40,13 +40,13 @@ def keyboard_interrupt(pid): os.kill(pid, signal.SIGINT) -def test_interrupt_blocked_jobs(fileutils): +def test_interrupt_blocked_jobs(make_test_dir): """ Launches and polls a model and an ensemble with two more models. Once polling starts, the SIGINT signal is sent to the main thread, and consequently, all running jobs are killed. """ - test_dir = fileutils.make_test_dir() + test_dir = make_test_dir exp_name = "test_interrupt_blocked_jobs" exp = Experiment(exp_name, exp_path=test_dir) model = exp.create_model( @@ -77,7 +77,7 @@ def test_interrupt_blocked_jobs(fileutils): assert len(completed_jobs) == num_jobs -def test_interrupt_multi_experiment_unblocked_jobs(fileutils): +def test_interrupt_multi_experiment_unblocked_jobs(make_test_dir): """ Starts two Experiments, each having one model and an ensemble with two more models. Since @@ -85,7 +85,7 @@ def test_interrupt_multi_experiment_unblocked_jobs(fileutils): the SIGINT signal is sent, resulting in both Experiment's running jobs to be killed. 
""" - test_dir = fileutils.make_test_dir() + test_dir = make_test_dir exp_names = ["test_interrupt_jobs_0", "test_interrupt_jobs_1"] experiments = [Experiment(exp_names[i], exp_path=test_dir) for i in range(2)] jobs_per_experiment = [0] * len(experiments) diff --git a/tests/test_launch_errors.py b/tests/test_launch_errors.py index 2669fd882..2b311350d 100644 --- a/tests/test_launch_errors.py +++ b/tests/test_launch_errors.py @@ -43,10 +43,10 @@ def test_unsupported_run_settings(): exp.start(model) -def test_model_failure(fileutils): +def test_model_failure(fileutils, make_test_dir): exp_name = "test-model-failure" exp = Experiment(exp_name, launcher="local") - test_dir = fileutils.make_test_dir() + test_dir = make_test_dir script = fileutils.get_test_conf_path("bad.py") settings = RunSettings("python", f"{script} --time=3") @@ -58,11 +58,11 @@ def test_model_failure(fileutils): assert all([stat == status.STATUS_FAILED for stat in statuses]) -def test_orchestrator_relaunch(fileutils, wlmutils): +def test_orchestrator_relaunch(make_test_dir, wlmutils): """Test when users try to launch second orchestrator""" exp_name = "test-orc-on-relaunch" exp = Experiment(exp_name, launcher="local") - test_dir = fileutils.make_test_dir() + test_dir = make_test_dir orc = Orchestrator(port=wlmutils.get_test_port()) orc.set_path(test_dir) diff --git a/tests/test_local_launch.py b/tests/test_local_launch.py index 857855205..71642b739 100644 --- a/tests/test_local_launch.py +++ b/tests/test_local_launch.py @@ -32,10 +32,10 @@ """ -def test_models(fileutils): +def test_models(fileutils, make_test_dir): exp_name = "test-models-local-launch" exp = Experiment(exp_name, launcher="local") - test_dir = fileutils.make_test_dir() + test_dir = make_test_dir script = fileutils.get_test_conf_path("sleep.py") settings = exp.create_run_settings("python", f"{script} --time=3") @@ -48,10 +48,10 @@ def test_models(fileutils): assert all([stat == status.STATUS_COMPLETED for stat in statuses]) -def test_ensemble(fileutils): +def test_ensemble(fileutils, make_test_dir): exp_name = "test-ensemble-launch" exp = Experiment(exp_name, launcher="local") - test_dir = fileutils.make_test_dir() + test_dir = make_test_dir script = fileutils.get_test_conf_path("sleep.py") settings = exp.create_run_settings("python", f"{script} --time=3") diff --git a/tests/test_local_multi_run.py b/tests/test_local_multi_run.py index 76bfc898c..f9c4e0ad4 100644 --- a/tests/test_local_multi_run.py +++ b/tests/test_local_multi_run.py @@ -32,10 +32,10 @@ """ -def test_models(fileutils): +def test_models(fileutils, make_test_dir): exp_name = "test-models-local-launch" exp = Experiment(exp_name, launcher="local") - test_dir = fileutils.make_test_dir() + test_dir = make_test_dir script = fileutils.get_test_conf_path("sleep.py") settings = exp.create_run_settings("python", f"{script} --time=5") diff --git a/tests/test_local_restart.py b/tests/test_local_restart.py index 99c6afd5e..625cecb5a 100644 --- a/tests/test_local_restart.py +++ b/tests/test_local_restart.py @@ -32,11 +32,11 @@ """ -def test_restart(fileutils): +def test_restart(fileutils, make_test_dir): exp_name = "test-models-local-restart" exp = Experiment(exp_name, launcher="local") - test_dir = fileutils.make_test_dir() + test_dir = make_test_dir script = fileutils.get_test_conf_path("sleep.py") settings = exp.create_run_settings("python", f"{script} --time=3") @@ -53,10 +53,10 @@ def test_restart(fileutils): assert all([stat == status.STATUS_COMPLETED for stat in statuses]) -def 
test_ensemble(fileutils): +def test_ensemble(fileutils, make_test_dir): exp_name = "test-ensemble-restart" exp = Experiment(exp_name, launcher="local") - test_dir = fileutils.make_test_dir() + test_dir = make_test_dir script = fileutils.get_test_conf_path("sleep.py") settings = exp.create_run_settings("python", f"{script} --time=3") diff --git a/tests/test_modelwriter.py b/tests/test_modelwriter.py index 11d887e2d..71dbb1d6f 100644 --- a/tests/test_modelwriter.py +++ b/tests/test_modelwriter.py @@ -40,8 +40,8 @@ def get_gen_file(fileutils, filename): return fileutils.get_test_conf_path(path.join("generator_files", filename)) -def test_write_easy_configs(fileutils): - test_dir = fileutils.make_test_dir() +def test_write_easy_configs(fileutils, make_test_dir): + test_dir = make_test_dir param_dict = { "5": 10, # MOM_input @@ -69,8 +69,8 @@ def test_write_easy_configs(fileutils): assert filecmp.cmp(written, correct) -def test_write_med_configs(fileutils): - test_dir = fileutils.make_test_dir() +def test_write_med_configs(fileutils, make_test_dir): + test_dir = make_test_dir param_dict = { "1 0 0 0": "3 0 0 0", # in.ellipse.gayberne @@ -101,10 +101,10 @@ def test_write_med_configs(fileutils): assert filecmp.cmp(written, correct) -def test_write_new_tag_configs(fileutils): +def test_write_new_tag_configs(fileutils, make_test_dir): """sets the tag to the dollar sign""" - test_dir = fileutils.make_test_dir() + test_dir = make_test_dir param_dict = { "1 0 0 0": "3 0 0 0", # in.ellipse.gayberne @@ -146,8 +146,8 @@ def test_mw_error_2(): writer._write_changes("[not/a/path]") -def test_write_mw_error_3(fileutils): - test_dir = fileutils.make_test_dir() +def test_write_mw_error_3(fileutils, make_test_dir): + test_dir = make_test_dir param_dict = { "5": 10, # MOM_input diff --git a/tests/test_multidb.py b/tests/test_multidb.py index 61c99247c..1b75219b3 100644 --- a/tests/test_multidb.py +++ b/tests/test_multidb.py @@ -23,15 +23,9 @@ # CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
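+# The tests below exercise db_identifier handling for standard and colocated
+# databases; reusing an identifier within one experiment should raise
+# SSDBIDConflictError.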
- -import sys - -import os - import pytest from smartsim import Experiment, status -from smartsim._core.utils import installed_redisai_backends from smartsim.error.errors import SSDBIDConflictError from smartsim.log import get_logger @@ -47,7 +41,7 @@ @pytest.mark.parametrize("db_type", supported_dbs) -def test_db_identifier_standard_then_colo(fileutils, wlmutils, coloutils, db_type): +def test_db_identifier_standard_then_colo(fileutils, wlmutils, coloutils, db_type, make_test_dir): """Test that it is possible to create_database then colocate_db_uds/colocate_db_tcp with unique db_identifiers""" @@ -58,19 +52,16 @@ def test_db_identifier_standard_then_colo(fileutils, wlmutils, coloutils, db_typ test_launcher = wlmutils.get_test_launcher() test_interface = wlmutils.get_test_interface() test_port = wlmutils.get_test_port() - test_dir = fileutils.make_test_dir() + test_dir = make_test_dir test_script = fileutils.get_test_conf_path("smartredis/db_id_err.py") # Create SmartSim Experiment - exp = Experiment(exp_name, launcher=test_launcher) + exp = Experiment(exp_name, launcher=test_launcher, exp_path=test_dir) # create regular database orc = exp.create_database( port=test_port, interface=test_interface, db_identifier="my_db" ) - - exp.generate(orc) - assert orc.name == "my_db" # create run settings @@ -80,7 +71,6 @@ def test_db_identifier_standard_then_colo(fileutils, wlmutils, coloutils, db_typ # # Create the SmartSim Model smartsim_model = exp.create_model("colocated_model", colo_settings) - smartsim_model.set_path(test_dir) db_args = { "port": test_port + 1, @@ -99,6 +89,8 @@ def test_db_identifier_standard_then_colo(fileutils, wlmutils, coloutils, db_typ assert smartsim_model.run_settings.colocated_db_settings["db_identifier"] == "my_db" + exp.generate(orc, smartsim_model) + try: exp.start(orc) with pytest.raises(SSDBIDConflictError) as ex: @@ -116,9 +108,9 @@ def test_db_identifier_standard_then_colo(fileutils, wlmutils, coloutils, db_typ @pytest.mark.parametrize("db_type", supported_dbs) -def test_db_identifier_colo_then_standard(fileutils, wlmutils, coloutils, db_type): +def test_db_identifier_colo_then_standard(fileutils, wlmutils, coloutils, db_type, make_test_dir): """Test colocate_db_uds/colocate_db_tcp then create_database with database - identifiers. + identifiers. 
""" # Set experiment name @@ -128,11 +120,11 @@ def test_db_identifier_colo_then_standard(fileutils, wlmutils, coloutils, db_typ test_launcher = wlmutils.get_test_launcher() test_interface = wlmutils.get_test_interface() test_port = wlmutils.get_test_port() - test_dir = fileutils.make_test_dir() + test_dir = make_test_dir test_script = fileutils.get_test_conf_path("smartredis/db_id_err.py") # Create SmartSim Experiment - exp = Experiment(exp_name, launcher=test_launcher) + exp = Experiment(exp_name, launcher=test_launcher, exp_path=test_dir) # Create run settings colo_settings = exp.create_run_settings("python", test_script) @@ -141,7 +133,6 @@ def test_db_identifier_colo_then_standard(fileutils, wlmutils, coloutils, db_typ # Create the SmartSim Model smartsim_model = exp.create_model("colocated_model", colo_settings) - smartsim_model.set_path(test_dir) db_args = { "port": test_port, @@ -165,7 +156,7 @@ def test_db_identifier_colo_then_standard(fileutils, wlmutils, coloutils, db_typ port=test_port + 1, interface=test_interface, db_identifier="my_db" ) - exp.generate(orc) + exp.generate(orc, smartsim_model) assert orc.name == "my_db" exp.start(smartsim_model, block=True) @@ -175,7 +166,7 @@ def test_db_identifier_colo_then_standard(fileutils, wlmutils, coloutils, db_typ exp.stop(orc) -def test_db_identifier_standard_twice_not_unique(wlmutils): +def test_db_identifier_standard_twice_not_unique(wlmutils, make_test_dir): """Test uniqueness of db_identifier several calls to create_database, with non unique names, checking error is raised before exp start is called""" @@ -186,9 +177,10 @@ def test_db_identifier_standard_twice_not_unique(wlmutils): test_launcher = wlmutils.get_test_launcher() test_interface = wlmutils.get_test_interface() test_port = wlmutils.get_test_port() + test_dir = make_test_dir # Create SmartSim Experiment - exp = Experiment(exp_name, launcher=test_launcher) + exp = Experiment(exp_name, launcher=test_launcher, exp_path=test_dir) # CREATE DATABASE with db_identifier orc = exp.create_database( @@ -222,7 +214,7 @@ def test_db_identifier_standard_twice_not_unique(wlmutils): # exp.stop(orc2) -def test_db_identifier_create_standard_once(fileutils, wlmutils): +def test_db_identifier_create_standard_once(make_test_dir, wlmutils): """One call to create database with a database identifier""" # Set experiment name @@ -232,7 +224,7 @@ def test_db_identifier_create_standard_once(fileutils, wlmutils): test_launcher = wlmutils.get_test_launcher() test_interface = wlmutils.get_test_interface() test_port = wlmutils.get_test_port() - test_dir = fileutils.make_test_dir() + test_dir = make_test_dir # Create the SmartSim Experiment exp = Experiment(exp_name, exp_path=test_dir, launcher=test_launcher) @@ -252,14 +244,14 @@ def test_db_identifier_create_standard_once(fileutils, wlmutils): print(exp.summary()) -def test_multidb_create_standard_twice(fileutils, wlmutils): +def test_multidb_create_standard_twice(make_test_dir, wlmutils): """Multiple calls to create database with unique db_identifiers""" # Retrieve parameters from testing environment test_launcher = wlmutils.get_test_launcher() test_interface = wlmutils.get_test_interface() test_port = wlmutils.get_test_port() - test_dir = fileutils.make_test_dir() + test_dir = make_test_dir # start a new Experiment for this section exp = Experiment( @@ -290,17 +282,17 @@ def test_multidb_create_standard_twice(fileutils, wlmutils): @pytest.mark.parametrize("db_type", supported_dbs) -def test_multidb_colo_once(fileutils, wlmutils, coloutils, 
db_type): +def test_multidb_colo_once(fileutils, make_test_dir, wlmutils, coloutils, db_type): """create one model with colocated database with db_identifier""" # Retrieve parameters from testing environment test_launcher = wlmutils.get_test_launcher() test_port = wlmutils.get_test_port() - test_dir = fileutils.make_test_dir() + test_dir = make_test_dir test_script = fileutils.get_test_conf_path("smartredis/dbid.py") # start a new Experiment for this section - exp = Experiment("test_multidb_colo_once", launcher=test_launcher) + exp = Experiment("test_multidb_colo_once", launcher=test_launcher, exp_path=test_dir) # create run settings run_settings = exp.create_run_settings("python", test_script) @@ -327,6 +319,7 @@ def test_multidb_colo_once(fileutils, wlmutils, coloutils, db_type): db_args, ) + exp.generate(smartsim_model) exp.start(smartsim_model) exp.stop(smartsim_model) @@ -334,12 +327,12 @@ def test_multidb_colo_once(fileutils, wlmutils, coloutils, db_type): @pytest.mark.parametrize("db_type", supported_dbs) -def test_multidb_standard_then_colo(fileutils, wlmutils, coloutils, db_type): +def test_multidb_standard_then_colo(fileutils, make_test_dir, wlmutils, coloutils, db_type): """Create regular database then colocate_db_tcp/uds with unique db_identifiers""" # Retrieve parameters from testing environment test_port = wlmutils.get_test_port() - test_dir = fileutils.make_test_dir() + test_dir = make_test_dir test_script = fileutils.get_test_conf_path("smartredis/multidbid.py") test_interface = wlmutils.get_test_interface() test_launcher = wlmutils.get_test_launcher() @@ -354,11 +347,10 @@ def test_multidb_standard_then_colo(fileutils, wlmutils, coloutils, db_type): run_settings.set_nodes(1) run_settings.set_tasks_per_node(1) - # create and start an instance of the Orchestrator database + # create and generate an instance of the Orchestrator database db = exp.create_database( port=test_port, interface=test_interface, db_identifier="testdb_reg" ) - exp.generate(db) # Create the SmartSim Model smartsim_model = exp.create_model("smartsim_model", run_settings) @@ -379,6 +371,7 @@ def test_multidb_standard_then_colo(fileutils, wlmutils, coloutils, db_type): db_args, ) + exp.generate(db, smartsim_model) exp.start(db) exp.start(smartsim_model, block=True) @@ -395,12 +388,12 @@ def test_multidb_standard_then_colo(fileutils, wlmutils, coloutils, db_type): @pytest.mark.parametrize("db_type", supported_dbs) -def test_multidb_colo_then_standard(fileutils, wlmutils, coloutils, db_type): +def test_multidb_colo_then_standard(fileutils, make_test_dir, wlmutils, coloutils, db_type): """create regular database then colocate_db_tcp/uds with unique db_identifiers""" # Retrieve parameters from testing environment test_port = wlmutils.get_test_port() - test_dir = fileutils.make_test_dir() + test_dir = make_test_dir test_script = fileutils.get_test_conf_path("smartredis/multidbid.py") test_interface = wlmutils.get_test_interface() test_launcher = wlmutils.get_test_launcher() @@ -438,7 +431,7 @@ def test_multidb_colo_then_standard(fileutils, wlmutils, coloutils, db_type): db = exp.create_database( port=test_port + 1, interface=test_interface, db_identifier="testdb_reg" ) - exp.generate(db) + exp.generate(db, smartsim_model) exp.start(db) exp.start(smartsim_model) @@ -460,14 +453,14 @@ def test_multidb_colo_then_standard(fileutils, wlmutils, coloutils, db_type): pytest.test_launcher not in pytest.wlm_options, reason="Not testing WLM integrations", ) -def test_launch_cluster_orc_single_dbid(fileutils, wlmutils): 
+def test_launch_cluster_orc_single_dbid(make_test_dir, wlmutils):
     """test clustered 3-node orchestrator with single command with a database identifier"""
     # TODO detect number of nodes in allocation and skip if not sufficient
     exp_name = "test_launch_cluster_orc_single_dbid"
     launcher = wlmutils.get_test_launcher()
-    exp = Experiment(exp_name, launcher=launcher)
-    test_dir = fileutils.make_test_dir()
+    test_dir = make_test_dir
+    exp = Experiment(exp_name, launcher=launcher, exp_path=test_dir)

     # batch = False to launch on existing allocation
     network_interface = wlmutils.get_test_interface()
@@ -480,8 +473,7 @@ def test_launch_cluster_orc_single_dbid(fileutils, wlmutils):
         hosts=wlmutils.get_test_hostlist(),
         db_identifier="testdb_reg",
     )
-    orc.set_path(test_dir)
-
+    exp.generate(orc)
     exp.start(orc, block=True)

     statuses = exp.get_status(orc)
diff --git a/tests/test_orchestrator.py b/tests/test_orchestrator.py
index e8156a4ee..7b3b54be6 100644
--- a/tests/test_orchestrator.py
+++ b/tests/test_orchestrator.py
@@ -65,10 +65,10 @@ def test_inactive_orc_get_address():
         db.get_address()


-def test_orc_active_functions(fileutils, wlmutils):
+def test_orc_active_functions(make_test_dir, wlmutils):
     exp_name = "test_orc_active_functions"
     exp = Experiment(exp_name, launcher="local")
-    test_dir = fileutils.make_test_dir()
+    test_dir = make_test_dir

     db = Orchestrator(port=wlmutils.get_test_port())
     db.set_path(test_dir)
@@ -93,10 +93,10 @@ def test_orc_active_functions(fileutils, wlmutils):
         db.get_address()


-def test_multiple_interfaces(fileutils, wlmutils):
+def test_multiple_interfaces(make_test_dir, wlmutils):
     exp_name = "test_multiple_interfaces"
     exp = Experiment(exp_name, launcher="local")
-    test_dir = fileutils.make_test_dir()
+    test_dir = make_test_dir

     net_if_addrs = psutil.net_if_addrs()
     net_if_addrs = [
diff --git a/tests/test_pals_settings.py b/tests/test_pals_settings.py
index 7bc3a6520..e70ec5d8a 100644
--- a/tests/test_pals_settings.py
+++ b/tests/test_pals_settings.py
@@ -115,7 +115,7 @@ def set_env_var_to_inherit(rs):
     ],
 )
 def test_pbs_can_make_step_from_pals_settings_fmt_cmd(
-    monkeypatch, mock_mpiexec, fileutils, rs_mutation, run_args
+    monkeypatch, mock_mpiexec, make_test_dir, rs_mutation, run_args
 ):
     # Setup run settings
     exe_args = ["-c", """'print("Hello")'"""]
@@ -126,7 +126,7 @@ def test_pbs_can_make_step_from_pals_settings_fmt_cmd(
     launcher = PBSLauncher()
     monkeypatch.setenv(f"PBS_JOBID", "mock-job")

-    wdir = fileutils.make_test_dir()
+    wdir = make_test_dir
     step = launcher.create_step("my_step", wdir, rs)
     assert isinstance(step, MpiexecStep)
     assert step.get_launch_cmd() == [
@@ -139,7 +139,7 @@ def test_pbs_can_make_step_from_pals_settings_fmt_cmd(
     ]


-def test_pals_settings_can_be_correctly_made_mpmd(monkeypatch, fileutils, mock_mpiexec):
+def test_pals_settings_can_be_correctly_made_mpmd(monkeypatch, make_test_dir, mock_mpiexec):
     # Setup run settings
     def make_rs(exe, exe_args):
         return PalsMpiexecSettings(exe, exe_args), [exe] + exe_args
@@ -166,7 +166,7 @@ def set_tasks(rs, num):
     launcher = PBSLauncher()
     monkeypatch.setenv(f"PBS_JOBID", "mock-job")

-    wdir = fileutils.make_test_dir()
+    wdir = make_test_dir
     step = launcher.create_step("my_step", wdir, rs_1)
     assert isinstance(step, MpiexecStep)
     assert step.get_launch_cmd() == [
diff --git a/tests/test_reconnect_orchestrator.py b/tests/test_reconnect_orchestrator.py
index 609ba48e7..1c5502c6c 100644
--- a/tests/test_reconnect_orchestrator.py
+++ b/tests/test_reconnect_orchestrator.py
@@ -37,12 +37,12 @@

 # use https://stackoverflow.com/questions/22627659/run-code-before-and-after-each-test-in-py-test

-def test_local_orchestrator(fileutils, wlmutils):
+def test_local_orchestrator(make_test_dir, wlmutils):
     """Test launching orchestrator locally"""
     global first_dir
     exp_name = "test-orc-launch-local"
     exp = Experiment(exp_name, launcher="local")
-    test_dir = fileutils.make_test_dir()
+    test_dir = make_test_dir
     first_dir = test_dir

     orc = Orchestrator(port=wlmutils.get_test_port())
diff --git a/tests/test_smartredis.py b/tests/test_smartredis.py
index f27ac5de4..2c52590dc 100644
--- a/tests/test_smartredis.py
+++ b/tests/test_smartredis.py
@@ -55,13 +55,13 @@
 )


-def test_exchange(fileutils, wlmutils):
+def test_exchange(fileutils, make_test_dir, wlmutils):
     """Run two processes, each process puts a tensor on
     the DB, then accesses the other process's tensor.
     Finally, the tensor is used to run a model.
     """
-    test_dir = fileutils.make_test_dir()
+    test_dir = make_test_dir
     exp = Experiment(
         "smartredis_ensemble_exchange", exp_path=test_dir, launcher="local"
     )
@@ -100,14 +100,14 @@ def test_exchange(fileutils, wlmutils):
     exp.stop(orc)


-def test_consumer(fileutils, wlmutils):
+def test_consumer(fileutils, make_test_dir, wlmutils):
     """Run three processes, each one of the first two processes
     puts a tensor on the DB; the third process accesses the
     tensors put by the two producers.
     Finally, the tensor is used to run a model by each producer
     and the consumer accesses the two results.
     """
-    test_dir = fileutils.make_test_dir()
+    test_dir = make_test_dir
     exp = Experiment(
         "smartredis_ensemble_consumer", exp_path=test_dir, launcher="local"
     )

From 2f9882b1636884896b7809aaf30b8e54f5791e4e Mon Sep 17 00:00:00 2001
From: Al Rigazzi
Date: Thu, 12 Oct 2023 20:04:53 +0200
Subject: [PATCH 03/64] More permissive naming for caller_function

---
 conftest.py | 18 ++++++++++++++----
 1 file changed, 14 insertions(+), 4 deletions(-)

diff --git a/conftest.py b/conftest.py
index af15afabd..dca807724 100644
--- a/conftest.py
+++ b/conftest.py
@@ -526,12 +526,22 @@ def get_config_edit_method(
     return config_edit_methods.get(config_setting, None)


+def _sanitize_caller_function(caller_function: str) -> str:
+    # Parametrized test functions end with a list of all
+    # parameter values. The list is enclosed in square brackets.
+    # We split at the opening bracket, sanitize the string
+    # to its right and then merge the function name and
+    # the sanitized list with a dot.
+    caller_function_list = caller_function.split("[", maxsplit=1)
+    def is_accepted_char(char: str):
+        return char.isalnum() or char in "-."
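+    # e.g. "test_db_model[uds-20]" becomes "test_db_model.uds-20"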
+    if len(caller_function_list) > 1:
+        caller_function_list[1] = ''.join(filter(is_accepted_char, caller_function_list[1]))
+    return ".".join(caller_function_list)
+
 @pytest.fixture
 def get_test_dir(request: t.Optional[pytest.FixtureRequest]):
-    caller_function_list = request.node.name.split("[", maxsplit=1)
-    if len(caller_function_list) > 1:
-        caller_function_list[1] = ''.join(filter(str.isalnum, caller_function_list[1]))
-    caller_function = ".".join(caller_function_list)
+    caller_function = _sanitize_caller_function(request.node.name)
     dir_path = FileUtils._test_dir_path(caller_function, request.node.fspath)

     if not os.path.exists(os.path.dirname(dir_path)):

From 978e177b8293e8b7395ea61a8e01e6bda8cafebd Mon Sep 17 00:00:00 2001
From: Al Rigazzi
Date: Thu, 12 Oct 2023 20:06:38 +0200
Subject: [PATCH 04/64] Style

---
 conftest.py | 18 ++++++++++++------
 1 file changed, 12 insertions(+), 6 deletions(-)

diff --git a/conftest.py b/conftest.py
index dca807724..cdd63c777 100644
--- a/conftest.py
+++ b/conftest.py
@@ -84,7 +84,9 @@ def print_test_configuration() -> None:
     print("TEST_ALLOC_SPEC_SHEET_PATH:", test_alloc_specs_path)
     print("TEST_DIR:", test_dir)
     print("Test output will be located in TEST_DIR if there is a failure")
-    print("TEST_PORTS", ", ".join(str(port) for port in range(test_port, test_port+3)))
+    print(
+        "TEST_PORTS", ", ".join(str(port) for port in range(test_port, test_port + 3))
+    )


 def pytest_configure() -> None:
@@ -145,7 +147,7 @@ def get_hostlist() -> t.Optional[t.List[str]]:
             return _parse_hostlist_file(os.environ["COBALT_NODEFILE"])
         except FileNotFoundError:
             return None
-    elif "PBS_NODEFILE" in os.environ and test_launcher=="pals":
+    elif "PBS_NODEFILE" in os.environ and test_launcher == "pals":
         # with PALS, we need a hostfile even if `aprun` is available
         try:
             return _parse_hostlist_file(os.environ["PBS_NODEFILE"])
@@ -533,12 +535,17 @@ def _sanitize_caller_function(caller_function: str) -> str:
     # to its right and then merge the function name and
     # the sanitized list with a dot.
     caller_function_list = caller_function.split("[", maxsplit=1)
+
     def is_accepted_char(char: str):
         return char.isalnum() or char in "-."
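     # e.g. "test_db_model[uds-20]" becomes "test_db_model.uds-20"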
+ if len(caller_function_list) > 1: - caller_function_list[1] = ''.join(filter(is_accepted_char, caller_function_list[1])) + caller_function_list[1] = "".join( + filter(is_accepted_char, caller_function_list[1]) + ) return ".".join(caller_function_list) + @pytest.fixture def get_test_dir(request: t.Optional[pytest.FixtureRequest]): caller_function = _sanitize_caller_function(caller_function) @@ -552,7 +559,6 @@ def get_test_dir(request: t.Optional[pytest.FixtureRequest]): @pytest.fixture def make_test_dir(request: t.Optional[pytest.FixtureRequest]): - caller_function = request.node.name.replace("[", ".").replace("]", "") dir_path = FileUtils._test_dir_path(caller_function, request.node.fspath) @@ -567,8 +573,8 @@ def make_test_dir(request: t.Optional[pytest.FixtureRequest]): def fileutils() -> t.Type[FileUtils]: return FileUtils -class FileUtils: +class FileUtils: @staticmethod def _test_dir_path(caller_function: str, caller_fspath: str) -> str: caller_file_to_dir = os.path.splitext(str(caller_fspath))[0] @@ -617,7 +623,7 @@ def setup_test_colo( db_args: t.Dict[str, t.Any], colo_settings: t.Optional[t.Dict[str, t.Any]] = None, colo_model_name: t.Optional[str] = None, - port: t.Optional[int] = test_port + port: t.Optional[int] = test_port, ) -> Model: """Setup things needed for setting up the colo pinning tests""" From 0e25a4a414b6676011d7945a7226b6058f9f9578 Mon Sep 17 00:00:00 2001 From: Andrew Shao Date: Mon, 2 Oct 2023 18:34:07 -0500 Subject: [PATCH 05/64] Update tests to pass on_wlm --- conftest.py | 32 +++++++++++++++++++--------- tests/backends/test_dbmodel.py | 16 +++++++------- tests/backends/test_dbscript.py | 12 +++++------ tests/on_wlm/test_colocated_model.py | 28 +++++++++++++++++------- tests/test_colo_model_local.py | 15 ++++++++----- 5 files changed, 66 insertions(+), 37 deletions(-) diff --git a/conftest.py b/conftest.py index 69f712d6a..772a2f146 100644 --- a/conftest.py +++ b/conftest.py @@ -49,6 +49,8 @@ from subprocess import run import sys import typing as t +import warnings +import contextlib # pylint: disable=redefined-outer-name,invalid-name,global-statement @@ -678,22 +680,26 @@ def setup_test_colo( application_file: str, db_args: t.Dict[str, t.Any], colo_settings: t.Optional[t.Dict[str, t.Any]] = None, - colo_model_name: t.Optional[str] = None, - port: t.Optional[int] = test_port + colo_model_name: t.Optional[str] = "colocated_model", + port: t.Optional[int] = test_port, + on_wlm: t.Optional[bool] = False, + sr_test_script = fileutils.get_test_conf_path(application_file) ) -> Model: - """Setup things needed for setting up the colo pinning tests""" - # get test setup - test_dir = fileutils.make_test_dir(level=2) + """Setup database needed for the colo pinning tests""" - sr_test_script = fileutils.get_test_conf_path(application_file) + # get test setup + test_dir = fileutils.make_test_dir(level=level) + sr_test_script = fileutils.get_test_conf_path("send_data_local_smartredis.py") # Create an app with a colo_db which uses 1 db_cpu if colo_settings is None: colo_settings = exp.create_run_settings( exe=sys.executable, exe_args=[sr_test_script] ) - colo_name = colo_model_name if colo_model_name else "colocated_model" - colo_model = exp.create_model(colo_name, colo_settings) + if on_wlm: + colo_settings.set_tasks(1) + colo_settings.set_nodes(1) + colo_model = exp.create_model(colo_model_name, colo_settings) colo_model.set_path(test_dir) if db_type in ["tcp", "deprecated"]: @@ -707,8 +713,14 @@ def setup_test_colo( "deprecated": colo_model.colocate_db, "uds": 
colo_model.colocate_db_uds, } - - colocate_fun[db_type](**db_args) + with warnings.catch_warnings(): + if db_type == "deprecated": + warnings.filterwarnings( + "ignore", + message="`colocate_db` has been deprecated" + ) + colocate_fun[db_type](**db_args) + exp.generate(colo_model, overwrite=True) # assert model will launch with colocated db assert colo_model.colocated # Check to make sure that limit_db_cpus made it into the colo settings diff --git a/tests/backends/test_dbmodel.py b/tests/backends/test_dbmodel.py index 386631a50..57c4b4139 100644 --- a/tests/backends/test_dbmodel.py +++ b/tests/backends/test_dbmodel.py @@ -165,7 +165,7 @@ def test_tf_db_model(fileutils, wlmutils, mlutils): # Create RunSettings run_settings = exp.create_run_settings(exe=sys.executable, exe_args=test_script) run_settings.set_nodes(1) - run_settings.set_tasks_per_node(1) + run_settings.set_tasks(1) # Create Model smartsim_model = exp.create_model("smartsim_model", run_settings) @@ -241,7 +241,7 @@ def test_pt_db_model(fileutils, wlmutils, mlutils): # Create RunSettings run_settings = exp.create_run_settings(exe=sys.executable, exe_args=test_script) run_settings.set_nodes(1) - run_settings.set_tasks_per_node(1) + run_settings.set_tasks(1) # Create Model smartsim_model = exp.create_model("smartsim_model", run_settings) @@ -305,7 +305,7 @@ def test_db_model_ensemble(fileutils, wlmutils, mlutils): # Create RunSettings run_settings = exp.create_run_settings(exe=sys.executable, exe_args=test_script) run_settings.set_nodes(1) - run_settings.set_tasks_per_node(1) + run_settings.set_tasks(1) # Create ensemble smartsim_ensemble = exp.create_ensemble( @@ -403,7 +403,7 @@ def test_colocated_db_model_tf(fileutils, wlmutils, mlutils): # Create RunSettings colo_settings = exp.create_run_settings(exe=sys.executable, exe_args=test_script) colo_settings.set_nodes(1) - colo_settings.set_tasks_per_node(1) + colo_settings.set_tasks(1) # Create colocated Model colo_model = exp.create_model("colocated_model", colo_settings) @@ -474,7 +474,7 @@ def test_colocated_db_model_pytorch(fileutils, wlmutils, mlutils): # Create colocated RunSettings colo_settings = exp.create_run_settings(exe=sys.executable, exe_args=test_script) colo_settings.set_nodes(1) - colo_settings.set_tasks_per_node(1) + colo_settings.set_tasks(1) # Create colocated SmartSim Model colo_model = exp.create_model("colocated_model", colo_settings) @@ -534,7 +534,7 @@ def test_colocated_db_model_ensemble(fileutils, wlmutils, mlutils): # Create RunSettings for colocated model colo_settings = exp.create_run_settings(exe=sys.executable, exe_args=test_script) colo_settings.set_nodes(1) - colo_settings.set_tasks_per_node(1) + colo_settings.set_tasks(1) # Create ensemble of two identical models colo_ensemble: Ensemble = exp.create_ensemble( @@ -638,7 +638,7 @@ def test_colocated_db_model_ensemble_reordered(fileutils, wlmutils, mlutils): # Create colocated RunSettings colo_settings = exp.create_run_settings(exe=sys.executable, exe_args=test_script) colo_settings.set_nodes(1) - colo_settings.set_tasks_per_node(1) + colo_settings.set_tasks(1) # Create the ensemble of two identical SmartSim Model colo_ensemble = exp.create_ensemble( @@ -740,7 +740,7 @@ def test_colocated_db_model_errors(fileutils, wlmutils, mlutils): # Create colocated RunSettings colo_settings = exp.create_run_settings(exe=sys.executable, exe_args=test_script) colo_settings.set_nodes(1) - colo_settings.set_tasks_per_node(1) + colo_settings.set_tasks(1) # Create colocated SmartSim Model colo_model = 
exp.create_model("colocated_model", colo_settings) diff --git a/tests/backends/test_dbscript.py b/tests/backends/test_dbscript.py index c92be31de..af38a151a 100644 --- a/tests/backends/test_dbscript.py +++ b/tests/backends/test_dbscript.py @@ -80,7 +80,7 @@ def test_db_script(fileutils, wlmutils, mlutils): # Create the RunSettings run_settings = exp.create_run_settings(exe=sys.executable, exe_args=test_script) run_settings.set_nodes(1) - run_settings.set_tasks_per_node(1) + run_settings.set_tasks(1) # Create the SmartSim Model smartsim_model = exp.create_model("smartsim_model", run_settings) @@ -155,7 +155,7 @@ def test_db_script_ensemble(fileutils, wlmutils, mlutils): # Create RunSettings run_settings = exp.create_run_settings(exe=sys.executable, exe_args=test_script) run_settings.set_nodes(1) - run_settings.set_tasks_per_node(1) + run_settings.set_tasks(1) # Create Ensemble with two identical models ensemble = exp.create_ensemble( @@ -250,7 +250,7 @@ def test_colocated_db_script(fileutils, wlmutils, mlutils): # Create RunSettings colo_settings = exp.create_run_settings(exe=sys.executable, exe_args=test_script) colo_settings.set_nodes(1) - colo_settings.set_tasks_per_node(1) + colo_settings.set_tasks(1) # Create model with colocated database colo_model = exp.create_model("colocated_model", colo_settings) @@ -318,7 +318,7 @@ def test_colocated_db_script_ensemble(fileutils, wlmutils, mlutils): # Create RunSettings colo_settings = exp.create_run_settings(exe=sys.executable, exe_args=test_script) colo_settings.set_nodes(1) - colo_settings.set_tasks_per_node(1) + colo_settings.set_tasks(1) # Create SmartSim Ensemble with two identical models colo_ensemble = exp.create_ensemble( @@ -417,7 +417,7 @@ def test_colocated_db_script_ensemble_reordered(fileutils, wlmutils, mlutils): # Create RunSettings colo_settings = exp.create_run_settings(exe=sys.executable, exe_args=test_script) colo_settings.set_nodes(1) - colo_settings.set_tasks_per_node(1) + colo_settings.set_tasks(1) # Create Ensemble with two identical SmartSim Model colo_ensemble = exp.create_ensemble( @@ -514,7 +514,7 @@ def test_db_script_errors(fileutils, wlmutils, mlutils): # Create RunSettings colo_settings = exp.create_run_settings(exe=sys.executable, exe_args=test_script) colo_settings.set_nodes(1) - colo_settings.set_tasks_per_node(1) + colo_settings.set_tasks(1) # Create a SmartSim model with a colocated database colo_model = exp.create_model("colocated_model", colo_settings) diff --git a/tests/on_wlm/test_colocated_model.py b/tests/on_wlm/test_colocated_model.py index a38fabd06..78b678029 100644 --- a/tests/on_wlm/test_colocated_model.py +++ b/tests/on_wlm/test_colocated_model.py @@ -36,6 +36,9 @@ else: supported_dbs = ["uds", "tcp", "deprecated"] +# Set to true if DB logs should be generated for debugging +DEBUG_DB = False + # retrieved from pytest fixtures launcher = pytest.test_launcher if launcher not in pytest.wlm_options: @@ -45,15 +48,16 @@ def test_launch_colocated_model_defaults(fileutils, coloutils, db_type): """Test the launch of a model with a colocated database and local launcher""" - db_args = { } + db_args = { "debug":DEBUG_DB } - exp = Experiment("colocated_model_defaults", launcher=launcher) + exp = Experiment(f"colocated_model_defaults_{db_type}", launcher=launcher) colo_model = coloutils.setup_test_colo( fileutils, db_type, exp, "send_data_local_smartredis.py", db_args, + on_wlm=True ) assert colo_model.run_settings.colocated_db_settings["custom_pinning"] == "0" @@ -69,10 +73,11 @@ def 
test_launch_colocated_model_defaults(fileutils, coloutils, db_type): @pytest.mark.parametrize("db_type", supported_dbs) def test_colocated_model_disable_pinning(fileutils, coloutils, db_type): - exp = Experiment("colocated_model_pinning_auto_1cpu", launcher=launcher) + exp = Experiment(f"colocated_model_pinning_auto_1cpu_{db_type}", launcher=launcher) db_args = { "db_cpus": 1, "custom_pinning": [], + "debug":DEBUG_DB, } # Check to make sure that the CPU mask was correctly generated @@ -82,6 +87,7 @@ def test_colocated_model_disable_pinning(fileutils, coloutils, db_type): exp, "send_data_local_smartredis.py", db_args, + on_wlm=True ) assert colo_model.run_settings.colocated_db_settings["custom_pinning"] is None exp.start(colo_model, block=True) @@ -91,10 +97,11 @@ @pytest.mark.parametrize("db_type", supported_dbs) def test_colocated_model_pinning_auto_2cpu(fileutils, coloutils, db_type): - exp = Experiment("colocated_model_pinning_auto_2cpu", launcher=launcher) + exp = Experiment(f"colocated_model_pinning_auto_2cpu_{db_type}", launcher=launcher) db_args = { "db_cpus": 2, + "debug":DEBUG_DB } # Check to make sure that the CPU mask was correctly generated @@ -104,6 +111,7 @@ def test_colocated_model_pinning_auto_2cpu(fileutils, coloutils, db_type): exp, "send_data_local_smartredis.py", db_args, + on_wlm=True ) assert colo_model.run_settings.colocated_db_settings["custom_pinning"] == "0,1" exp.start(colo_model, block=True) @@ -115,11 +123,12 @@ def test_colocated_model_pinning_range(fileutils, coloutils, db_type): # Check to make sure that the CPU mask was correctly generated # Assume that there are at least 4 cpus on the node - exp = Experiment("colocated_model_pinning_manual", launcher=launcher) + exp = Experiment(f"colocated_model_pinning_manual_{db_type}", launcher=launcher) db_args = { "db_cpus": 4, - "custom_pinning": range(4) + "custom_pinning": range(4), + "debug":DEBUG_DB } colo_model = coloutils.setup_test_colo( @@ -128,6 +137,7 @@ def test_colocated_model_pinning_range(fileutils, coloutils, db_type): exp, "send_data_local_smartredis.py", db_args, + on_wlm=True ) assert colo_model.run_settings.colocated_db_settings["custom_pinning"] == "0,1,2,3" exp.start(colo_model, block=True) @@ -139,7 +149,7 @@ def test_colocated_model_pinning_list(fileutils, coloutils, db_type): # Check to make sure that the CPU mask was correctly generated # note we presume that this has more than 2 CPUs on the supercomputer node - exp = Experiment("colocated_model_pinning_manual", launcher=launcher) + exp = Experiment(f"colocated_model_pinning_manual_{db_type}", launcher=launcher) db_args = { "db_cpus": 2, @@ -152,6 +162,7 @@ exp, "send_data_local_smartredis.py", db_args, + on_wlm=True ) assert colo_model.run_settings.colocated_db_settings["custom_pinning"] == "0,2" exp.start(colo_model, block=True) @@ -163,7 +174,7 @@ def test_colocated_model_pinning_mixed(fileutils, coloutils, db_type): # Check to make sure that the CPU mask was correctly generated # note we presume that this has at least 4 CPUs on the supercomputer node - exp = Experiment("colocated_model_pinning_manual", launcher=launcher) + exp = Experiment(f"colocated_model_pinning_manual_{db_type}", launcher=launcher) db_args = { "db_cpus": 2, @@ -176,6 +187,7 @@ exp, "send_data_local_smartredis.py", db_args, + on_wlm=True ) assert
colo_model.run_settings.colocated_db_settings["custom_pinning"] == "0,1,3" exp.start(colo_model, block=True) diff --git a/tests/test_colo_model_local.py b/tests/test_colo_model_local.py index e0999299e..31fa1f2a7 100644 --- a/tests/test_colo_model_local.py +++ b/tests/test_colo_model_local.py @@ -116,7 +116,8 @@ def test_launch_colocated_model_defaults( db_args = {} - exp = Experiment("colocated_model_defaults", launcher=launcher) + test_dir = fileutils.make_test_dir() + exp = Experiment(f"colocated_model_defaults_{db_type}", test_dir, launcher=launcher) colo_model = coloutils.setup_test_colo( fileutils, db_type, @@ -187,7 +188,8 @@ def test_launch_multiple_colocated_models( def test_colocated_model_disable_pinning( fileutils, coloutils, db_type, launcher="local" ): - exp = Experiment("colocated_model_pinning_auto_1cpu", launcher=launcher) + test_dir = fileutils.make_test_dir() + exp = Experiment(f"colocated_model_pinning_auto_1cpu_{db_type}", test_dir, launcher=launcher) db_args = { "db_cpus": 1, "custom_pinning": [], @@ -210,7 +212,8 @@ def test_colocated_model_disable_pinning( def test_colocated_model_pinning_auto_2cpu( fileutils, coloutils, db_type, launcher="local" ): - exp = Experiment("colocated_model_pinning_auto_2cpu", launcher=launcher) + test_dir = fileutils.make_test_dir() + exp = Experiment(f"colocated_model_pinning_auto_2cpu_{db_type}", test_dir, launcher=launcher) db_args = { "db_cpus": 2, @@ -241,7 +244,8 @@ def test_colocated_model_pinning_auto_2cpu( def test_colocated_model_pinning_range(fileutils, coloutils, db_type, launcher="local"): # Check to make sure that the CPU mask was correctly generated - exp = Experiment("colocated_model_pinning_manual", launcher=launcher) + test_dir = fileutils.make_test_dir() + exp = Experiment(f"colocated_model_pinning_manual_{db_type}", test_dir, launcher=launcher) db_args = {"db_cpus": 2, "custom_pinning": range(2)} @@ -263,7 +267,8 @@ def test_colocated_model_pinning_range(fileutils, coloutils, db_type, launcher=" def test_colocated_model_pinning_list(fileutils, coloutils, db_type, launcher="local"): # Check to make sure that the CPU mask was correctly generated - exp = Experiment("colocated_model_pinning_manual", launcher=launcher) + test_dir = fileutils.make_test_dir() + exp = Experiment(f"colocated_model_pinning_manual_{db_type}", test_dir, launcher=launcher) db_args = {"db_cpus": 1, "custom_pinning": [1]} From c6930c019137ba454fe003ee6cb57c7037c784e1 Mon Sep 17 00:00:00 2001 From: Andrew Shao Date: Mon, 16 Oct 2023 16:31:02 -0500 Subject: [PATCH 06/64] Respond to review feedback --- conftest.py | 6 ++---- tests/on_wlm/test_colocated_model.py | 8 ++++---- 2 files changed, 6 insertions(+), 8 deletions(-) diff --git a/conftest.py b/conftest.py index 772a2f146..39129ebcb 100644 --- a/conftest.py +++ b/conftest.py @@ -50,7 +50,6 @@ import sys import typing as t import warnings -import contextlib # pylint: disable=redefined-outer-name,invalid-name,global-statement @@ -683,13 +682,12 @@ def setup_test_colo( colo_model_name: t.Optional[str] = "colocated_model", port: t.Optional[int] = test_port, on_wlm: t.Optional[bool] = False, - sr_test_script = fileutils.get_test_conf_path(application_file) ) -> Model: """Setup database needed for the colo pinning tests""" # get test setup - test_dir = fileutils.make_test_dir(level=level) - sr_test_script = fileutils.get_test_conf_path("send_data_local_smartredis.py") + test_dir = fileutils.make_test_dir(level=2) + sr_test_script = fileutils.get_test_conf_path(application_file) # Create an app with a 
colo_db which uses 1 db_cpu if colo_settings is None: diff --git a/tests/on_wlm/test_colocated_model.py b/tests/on_wlm/test_colocated_model.py index 78b678029..f152a199c 100644 --- a/tests/on_wlm/test_colocated_model.py +++ b/tests/on_wlm/test_colocated_model.py @@ -48,7 +48,7 @@ def test_launch_colocated_model_defaults(fileutils, coloutils, db_type): """Test the launch of a model with a colocated database and local launcher""" - db_args = { "debug":DEBUG_DB } + db_args = { "debug": DEBUG_DB } exp = Experiment(f"colocated_model_defaults_{db_type}", launcher=launcher) colo_model = coloutils.setup_test_colo( @@ -77,7 +77,7 @@ def test_colocated_model_disable_pinning(fileutils, coloutils, db_type): db_args = { "db_cpus": 1, "custom_pinning": [], - "debug":DEBUG_DB, + "debug": DEBUG_DB, } # Check to make sure that the CPU mask was correctly generated @@ -101,7 +101,7 @@ def test_colocated_model_pinning_auto_2cpu(fileutils, coloutils, db_type): db_args = { "db_cpus": 2, - "debug":DEBUG_DB + "debug": DEBUG_DB } # Check to make sure that the CPU mask was correctly generated @@ -128,7 +128,7 @@ def test_colocated_model_pinning_range(fileutils, coloutils, db_type): db_args = { "db_cpus": 4, "custom_pinning": range(4), - "debug":DEBUG_DB + "debug": DEBUG_DB } colo_model = coloutils.setup_test_colo( From 86e51f8e1fa12cfeee38662519e931ed9cb33812 Mon Sep 17 00:00:00 2001 From: Andrew Shao Date: Mon, 16 Oct 2023 17:01:22 -0500 Subject: [PATCH 07/64] Modify for mpirun with PBS --- conftest.py | 13 +++++++++++-- smartsim/_core/config/config.py | 5 +++++ smartsim/settings/pbsSettings.py | 21 ++++++++++++++++++--- tests/backends/test_dbmodel.py | 15 ++++++++++++--- tests/backends/test_dbscript.py | 12 ++++++++++-- tests/test_batch_settings.py | 5 ++--- 6 files changed, 58 insertions(+), 13 deletions(-) diff --git a/conftest.py b/conftest.py index 2d5142f9c..d211a40b3 100644 --- a/conftest.py +++ b/conftest.py @@ -65,10 +65,11 @@ test_alloc_specs_path = os.getenv("SMARTSIM_TEST_ALLOC_SPEC_SHEET_PATH", None) test_port = CONFIG.test_port test_account = CONFIG.test_account or "" +test_batch_resources = CONFIG.test_batch_resources # Fill this at runtime if needed test_hostlist = None - +has_aprun = shutil.which("aprun") is not None def get_account() -> str: return test_account @@ -86,7 +87,10 @@ def print_test_configuration() -> None: print("TEST_ALLOC_SPEC_SHEET_PATH:", test_alloc_specs_path) print("TEST_DIR:", test_dir) print("Test output will be located in TEST_DIR if there is a failure") - print("TEST_PORTS", ", ".join(str(port) for port in range(test_port, test_port+3))) + print("TEST_PORTS:", ", ".join(str(port) for port in range(test_port, test_port+3))) + if test_batch_resources: + print("TEST_BATCH_RESOURCES: ") + print(json.dumps(test_batch_resources, indent=2)) def pytest_configure() -> None: @@ -95,6 +99,7 @@ def pytest_configure() -> None: account = get_account() pytest.test_account = account pytest.test_device = test_device + pytest.has_aprun = has_aprun def pytest_sessionstart( @@ -225,6 +230,10 @@ def get_test_interface() -> t.List[str]: def get_test_hostlist() -> t.Optional[t.List[str]]: return get_hostlist() + @staticmethod + def get_batch_resources() -> t.Dict: + return test_batch_resources + @staticmethod def get_base_run_settings( exe: str, args: t.List[str], nodes: int = 1, ntasks: int = 1, **kwargs: t.Any diff --git a/smartsim/_core/config/config.py b/smartsim/_core/config/config.py index 3b0905021..2d0d0f2e7 100644 --- a/smartsim/_core/config/config.py +++ 
b/smartsim/_core/config/config.py @@ -24,6 +24,7 @@ # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +import json import os import psutil import typing as t @@ -171,6 +172,10 @@ def test_num_gpus(self) -> int: # pragma: no cover def test_port(self) -> int: # pragma: no cover return int(os.environ.get("SMARTSIM_TEST_PORT", 6780)) + @property + def test_batch_resources(self) -> t.Dict: # pragma: no cover + return json.loads(os.environ.get("SMARTSIM_TEST_BATCH_RESOURCES", "{}")) + @property def test_interface(self) -> t.List[str]: # pragma: no cover if interfaces_cfg := os.environ.get("SMARTSIM_TEST_INTERFACE", None): diff --git a/smartsim/settings/pbsSettings.py b/smartsim/settings/pbsSettings.py index e5e7f30e4..46d715265 100644 --- a/smartsim/settings/pbsSettings.py +++ b/smartsim/settings/pbsSettings.py @@ -29,7 +29,9 @@ from .._core.utils import init_default from ..error import SmartSimError from .base import BatchSettings +from ..log import get_logger +logger = get_logger(__name__) class QsubBatchSettings(BatchSettings): def __init__( @@ -189,8 +191,22 @@ def _create_resource_list(self) -> t.List[str]: res = [] # get select statement from resources or kwargs - if "select" in self.resources: + if ("select" in self.resources) and not ("nodes" in self.resources): res += [f"-l select={str(self.resources['select'])}"] + elif ("select" in self.resources) and ("nodes" in self.resources): + nselect = self.resources["select"] + if nselect == self._nodes: + logger.warning("select and nodes were both specified, specifying nodes") + res += [f"-l nodes={self._nodes}"] + else: + raise SmartSimError( + ( + "select and nodes were both specified, but do not have "
select={nselect} nodes={self.nodes}" + ) + ) + elif "nodes" in self.resources: + res += [f"-l nodes={self._nodes}"] else: select = "-l select=" if self._nodes: @@ -208,8 +224,6 @@ def _create_resource_list(self) -> t.List[str]: if "place" in self.resources: res += [f"-l place={str(self.resources['place'])}"] - else: - res += ["-l place=scatter"] # get time from resources or kwargs if "walltime" in self.resources: @@ -221,4 +235,5 @@ def _create_resource_list(self) -> t.List[str]: for resource, value in self.resources.items(): if resource not in ["select", "walltime", "place"]: res += [f"-l {resource}={str(value)}"] + print(res) return res diff --git a/tests/backends/test_dbmodel.py b/tests/backends/test_dbmodel.py index 05972fda1..4a18a4bc5 100644 --- a/tests/backends/test_dbmodel.py +++ b/tests/backends/test_dbmodel.py @@ -34,6 +34,7 @@ from smartsim.entity import Ensemble from smartsim.error.errors import SSUnsupportedError from smartsim.log import get_logger +from smartsim.settings import MpirunSettings, MpiexecSettings from smartsim.entity.dbobject import DBModel @@ -108,6 +109,11 @@ def forward(self, x): should_run_pt &= "torch" in installed_redisai_backends() +def choose_host(run_settings, wlmutils): + host = None + if isinstance(run_settings, (MpirunSettings, MpiexecSettings)): + host = wlmutils.get_test_hostlist()[0] + return host def save_tf_cnn(path, file_name): """Create a Keras CNN for testing purposes""" @@ -172,7 +178,8 @@ def test_tf_db_model(fileutils, wlmutils, mlutils): smartsim_model.set_path(test_dir) # Create database - db = exp.create_database(port=test_port, interface=test_interface) + host = choose_host(run_settings, wlmutils) + db = exp.create_database(port=test_port, interface=test_interface, hosts=host) exp.generate(db) # Create and save ML model to filesystem @@ -246,7 +253,8 @@ def test_pt_db_model(fileutils, wlmutils, mlutils): smartsim_model.set_path(test_dir) # Create database - db = exp.create_database(port=test_port, interface=test_interface) + host = choose_host(run_settings, wlmutils) + db = exp.create_database(port=test_port, interface=test_interface, hosts=host) exp.generate(db) # Create and save ML model to filesystem @@ -315,7 +323,8 @@ def test_db_model_ensemble(fileutils, wlmutils, mlutils): smartsim_model.set_path(test_dir) # Create database - db = exp.create_database(port=test_port, interface=test_interface) + host = choose_host(run_settings, wlmutils) + db = exp.create_database(port=test_port, interface=test_interface, hosts=host) exp.generate(db) # Create and save ML model to filesystem diff --git a/tests/backends/test_dbscript.py b/tests/backends/test_dbscript.py index 7577bec31..335e4dc81 100644 --- a/tests/backends/test_dbscript.py +++ b/tests/backends/test_dbscript.py @@ -32,6 +32,7 @@ from smartsim._core.utils import installed_redisai_backends from smartsim.error.errors import SSUnsupportedError from smartsim.log import get_logger +from smartsim.settings import MpirunSettings, MpiexecSettings from smartsim.entity.dbobject import DBScript @@ -46,6 +47,11 @@ should_run &= "torch" in installed_redisai_backends() +def choose_host(run_settings, wlmutils): + host = None + if isinstance(run_settings, (MpirunSettings, MpiexecSettings)): + host = wlmutils.get_test_hostlist()[0] + return host def timestwo(x): return 2 * x @@ -81,7 +87,8 @@ def test_db_script(fileutils, wlmutils, mlutils): smartsim_model.set_path(test_dir) # Create the SmartSim database - db = exp.create_database(port=test_port, interface=test_interface) + host = 
choose_host(run_settings, wlmutils) + db = exp.create_database(port=test_port, interface=test_interface, hosts=host) exp.generate(db) # Define the torch script string @@ -159,7 +166,8 @@ def test_db_script_ensemble(fileutils, wlmutils, mlutils): smartsim_model.set_path(test_dir) # Create SmartSim database - db = exp.create_database(port=test_port, interface=test_interface) + host = choose_host(run_settings, wlmutils) + db = exp.create_database(port=test_port, interface=test_interface, hosts=host) exp.generate(db) # Create the script string diff --git a/tests/test_batch_settings.py b/tests/test_batch_settings.py index abf685146..805edcb95 100644 --- a/tests/test_batch_settings.py +++ b/tests/test_batch_settings.py @@ -39,7 +39,6 @@ def test_create_pbs_batch(): assert isinstance(pbs_batch, QsubBatchSettings) assert args == [ "-l select=1:ncpus=10", - "-l place=scatter", "-l walltime=10:00:00", "-q default", "-A myproject", @@ -102,7 +101,7 @@ def test_existing_batch_args_mutation(): queue="default", batch_args=batch_args, ) - + # verify initial expectations assert "k1" in bsub.batch_args assert "k2" in bsub.batch_args @@ -129,7 +128,7 @@ def test_direct_set_batch_args_mutation(): queue="default", ) bsub.batch_args = batch_args - + # verify initial expectations assert "k1" in bsub.batch_args assert "k2" in bsub.batch_args From 89c20086acd83c99959e9517ba6fddcffbf87f8c Mon Sep 17 00:00:00 2001 From: Ale Rigazzi Date: Mon, 16 Oct 2023 18:06:52 -0500 Subject: [PATCH 08/64] Fix db shutdown and some fixtures --- conftest.py | 14 ++++---- smartsim/_core/control/controller.py | 30 ++++++++++++----- smartsim/_core/utils/redis.py | 48 +++++++++++++++------------- smartsim/database/orchestrator.py | 8 +++++ smartsim/entity/dbnode.py | 5 ++- smartsim/settings/slurmSettings.py | 2 +- tests/test_multidb.py | 4 ++- 7 files changed, 70 insertions(+), 41 deletions(-) diff --git a/conftest.py index cdd63c777..5bf670681 100644 --- a/conftest.py +++ b/conftest.py @@ -396,16 +396,14 @@ def local_db( @pytest.fixture def db( - fileutils: t.Type[FileUtils], wlmutils: t.Type[WLMUtils], request: t.Any + request: t.Any, wlmutils: t.Type[WLMUtils], make_test_dir: t.Any ) -> t.Generator[Orchestrator, None, None]: """Yield fixture for startup and teardown of an orchestrator""" launcher = wlmutils.get_test_launcher() exp_name = request.function.__name__ exp = Experiment(exp_name, launcher=launcher) - test_dir = fileutils.make_test_dir( - caller_function=exp_name, caller_fspath=request.fspath - ) + test_dir = make_test_dir db = wlmutils.get_orchestrator() db.set_path(test_dir) exp.start(db) @@ -441,7 +439,9 @@ def db_cluster( @pytest.fixture(scope="function", autouse=True) def environment_cleanup(monkeypatch: pytest.MonkeyPatch) -> None: - monkeypatch.delenv("SSDB", raising=False) + for key in os.environ.keys(): + if key.startswith("SSDB"): + monkeypatch.delenv(key, raising=False) monkeypatch.delenv("SSKEYIN", raising=False) monkeypatch.delenv("SSKEYOUT", raising=False) @@ -548,7 +548,7 @@ def is_accepted_char(char: str): @pytest.fixture def get_test_dir(request: t.Optional[pytest.FixtureRequest]): caller_function = _sanitize_caller_function(request.node.name) dir_path = FileUtils._test_dir_path(caller_function, request.node.fspath) if not os.path.exists(os.path.dirname(dir_path)): os.makedirs(os.path.dirname(dir_path)) return dir_path @pytest.fixture def make_test_dir(request: t.Optional[pytest.FixtureRequest]): - 
caller_function = request.node.name.replace("[", ".").replace("]", "") + caller_function = _sanitize_caller_function(request.node.name) dir_path = FileUtils._test_dir_path(caller_function, request.node.fspath) try: diff --git a/smartsim/_core/control/controller.py b/smartsim/_core/control/controller.py index 08c9f2bd8..0b27901f5 100644 --- a/smartsim/_core/control/controller.py +++ b/smartsim/_core/control/controller.py @@ -25,6 +25,7 @@ # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. from __future__ import annotations +import itertools import os.path as osp from os import environ @@ -36,8 +37,10 @@ from smartredis import Client, ConfigOptions +from smartsim._core.utils.network import get_ip_from_host + from ..._core.launcher.step import Step -from ..._core.utils.redis import db_is_active, set_ml_model, set_script, shutdown_db +from ..._core.utils.redis import db_is_active, set_ml_model, set_script, shutdown_db_node from ..._core.utils.helpers import ( unpack_db_identifier, unpack_colo_db_identifier, @@ -53,7 +56,7 @@ ) from ...log import get_logger from ...settings.base import BatchSettings -from ...status import STATUS_CANCELLED, STATUS_RUNNING, TERMINAL_STATUSES +from ...status import STATUS_CANCELLED, STATUS_FAILED, STATUS_RUNNING, TERMINAL_STATUSES from ...servertype import STANDALONE, CLUSTERED from ..config import CONFIG from ..launcher import ( @@ -212,12 +215,23 @@ def stop_db(self, db: Orchestrator) -> None: if db.batch: self.stop_entity(db) else: - shutdown_db(db.hosts, db.ports) - with JM_LOCK: - for entity in db: - job = self._jobs[entity.name] - job.set_status(STATUS_CANCELLED, "", 0, output=None, error=None) - self._jobs.move_to_completed(job) + for node in db.entities: + for host_ip, port in itertools.product( + (get_ip_from_host(host) for host in node.hosts), db.ports + ): + retcode, _, _ = shutdown_db_node(host_ip, port) + # Sometimes the DB will not shutdown (unless we force NOSAVE) + if retcode != 0: + self.stop_entity(node) + continue + + with JM_LOCK: + job = self._jobs[node.name] + job.set_status(STATUS_CANCELLED, "", 0, output=None, error=None) + self._jobs.move_to_completed(job) + + db.clear_hosts() + def stop_entity_list(self, entity_list: EntitySequence[SmartSimEntity]) -> None: """Stop an instance of an entity list diff --git a/smartsim/_core/utils/redis.py b/smartsim/_core/utils/redis.py index 6570ab28b..dd2ba5084 100644 --- a/smartsim/_core/utils/redis.py +++ b/smartsim/_core/utils/redis.py @@ -95,7 +95,7 @@ def check_cluster_status( :raises SmartSimError: If cluster status cannot be verified """ cluster_nodes = [ - ClusterNode(get_ip_from_host(host), port) + ClusterNode(get_ip_from_host(host), port) # type: ignore for host, port in product(hosts, ports) ] @@ -109,9 +109,9 @@ def check_cluster_status( # wait for cluster to spin up time.sleep(5) try: - redis_tester: "RedisCluster[t.Any]" = RedisCluster( + redis_tester: "RedisCluster[t.Any]" = RedisCluster( # type: ignore startup_nodes=cluster_nodes - ) + ) # type: ignore redis_tester.set("__test__", "__test__") redis_tester.delete("__test__") # type: ignore logger.debug("Cluster status verified") @@ -219,31 +219,33 @@ def set_script(db_script: DBScript, client: Client) -> None: raise error -def shutdown_db(hosts: t.List[str], ports: t.List[int]) -> None: # cov-wlm - """Send shutdown signal to cluster instances. +def shutdown_db_node(host_ip: str, port: int) -> t.Tuple[int, str, str]: # cov-wlm + """Send shutdown signal to DB node. 
Should only be used in the case where cluster deallocation needs to occur manually. Usually, the SmartSim task manager will take care of this automatically. - :param hosts: List of hostnames to connect to - :type hosts: List[str] - :param ports: List of ports for each hostname - :type ports: List[int] + :param host_ip: IP of host to connect to + :type host_ip: str + :param port: Port to which node is listening + :type port: int + :return: returncode, output, and error of the process + :rtype: tuple of (int, str, str) :raises SmartSimError: if cluster creation fails """ - for host_ip, port in itertools.product( - (get_ip_from_host(host) for host in hosts), ports - ): - # call cluster command - redis_cli = CONFIG.database_cli - cmd = [redis_cli, "-h", host_ip, "-p", str(port), "shutdown"] - returncode, out, err = execute_cmd( - cmd, proc_input="yes", shell=False, timeout=10 - ) - if returncode != 0: - logger.error(out) - logger.error(err) - else: - logger.debug(out) + # call cluster command + redis_cli = CONFIG.database_cli + cmd = [redis_cli, "-h", host_ip, "-p", str(port), "shutdown"] + returncode, out, err = execute_cmd( + cmd, proc_input="yes", shell=False, timeout=10 + ) + + if returncode != 0: + logger.error(out) + logger.error(err) + elif out: + logger.debug(out) + + return returncode, out, err \ No newline at end of file diff --git a/smartsim/database/orchestrator.py b/smartsim/database/orchestrator.py index 92453dc32..9406629ab 100644 --- a/smartsim/database/orchestrator.py +++ b/smartsim/database/orchestrator.py @@ -305,6 +305,14 @@ def hosts(self) -> t.List[str]: self._hosts = self._get_db_hosts() return self._hosts + def clear_hosts(self) -> None: + """Clears the list of hosts for this orchestrator. + """ + for node in self.entities: + node.clear_hosts() + + self._hosts = [] + def remove_stale_files(self) -> None: """Can be used to remove database files of a previous launch""" diff --git a/smartsim/entity/dbnode.py b/smartsim/entity/dbnode.py index 35445c42d..92df73d6a 100644 --- a/smartsim/entity/dbnode.py +++ b/smartsim/entity/dbnode.py @@ -97,6 +97,9 @@ def hosts(self) -> t.List[str]: self._hosts = self._parse_db_hosts() return self._hosts + def clear_hosts(self) -> None: + self._hosts = None + @property def is_mpmd(self) -> bool: try: @@ -227,7 +230,7 @@ def _parse_db_hosts(self) -> t.List[str]: @dataclass(frozen=True) class LaunchedShardData: - """Data class to write an parse data about a launched database shard""" + """Data class to write and parse data about a launched database shard""" name: str hostname: str diff --git a/smartsim/settings/slurmSettings.py b/smartsim/settings/slurmSettings.py index 60280fce9..8da8659e1 100644 --- a/smartsim/settings/slurmSettings.py +++ b/smartsim/settings/slurmSettings.py @@ -336,7 +336,7 @@ def check_env_vars(self) -> None: # If a variable is defined, it will take precedence over --export # we warn the user preexisting_var = os.environ.get(k, None) - if preexisting_var is not None: + if preexisting_var is not None and preexisting_var != v: msg = ( f"Variable {k} is set to {preexisting_var} in current " "environment. 
If the job is running in an interactive " diff --git a/tests/test_multidb.py index 1b75219b3..0d066f923 100644 --- a/tests/test_multidb.py +++ b/tests/test_multidb.py @@ -379,13 +379,15 @@ def test_multidb_standard_then_colo(fileutils, make_test_dir, wlmutils, coloutil exp.start(smartsim_model) exp.stop(db) + # test restart standard db exp.start(db) - exp.stop(db) exp.stop(smartsim_model) print(exp.summary()) + assert all(stat is not status.STATUS_FAILED for stat in exp.get_status(db, smartsim_model)) @pytest.mark.parametrize("db_type", supported_dbs) def test_multidb_colo_then_standard(fileutils, make_test_dir, wlmutils, coloutils, db_type): From 765c5719241ff9bba6ef1314f62ade84740a6673 Mon Sep 17 00:00:00 2001 From: Ale Rigazzi Date: Tue, 17 Oct 2023 14:02:03 -0500 Subject: [PATCH 09/64] Update DBModel tests --- tests/backends/test_dbmodel.py | 33 +++++++++++++++++++-------------- tests/backends/test_dbscript.py | 27 ++++++++++++--------------- 2 files changed, 31 insertions(+), 29 deletions(-) diff --git a/tests/backends/test_dbmodel.py index 652f11d66..d3ebdebae 100644 --- a/tests/backends/test_dbmodel.py +++ b/tests/backends/test_dbmodel.py @@ -169,7 +169,6 @@ def test_tf_db_model(fileutils, make_test_dir, wlmutils, mlutils): # Create Model smartsim_model = exp.create_model("smartsim_model", run_settings) - smartsim_model.set_path(test_dir) # Create database db = exp.create_database(port=test_port, interface=test_interface) @@ -208,6 +207,8 @@ def test_tf_db_model(fileutils, make_test_dir, wlmutils, mlutils): # Assert we have added both models assert len(smartsim_model._db_models) == 2 + exp.generate(smartsim_model) + # Launch and check successful completion try: exp.start(db, smartsim_model, block=True) @@ -243,7 +244,6 @@ def test_pt_db_model(fileutils, make_test_dir, wlmutils, mlutils): # Create Model smartsim_model = exp.create_model("smartsim_model", run_settings) - smartsim_model.set_path(test_dir) # Create database db = exp.create_database(port=test_port, interface=test_interface) @@ -271,6 +271,8 @@ def test_pt_db_model(fileutils, make_test_dir, wlmutils, mlutils): # Assert we have added both models assert len(smartsim_model._db_models) == 1 + exp.generate(smartsim_model) + # Launch and check successful completion try: exp.start(db, smartsim_model, block=True) @@ -308,11 +310,9 @@ def test_db_model_ensemble(fileutils, make_test_dir, wlmutils, mlutils): smartsim_ensemble = exp.create_ensemble( "smartsim_model", run_settings=run_settings, replicas=2 ) - smartsim_ensemble.set_path(test_dir) # Create Model smartsim_model = exp.create_model("smartsim_model", run_settings) - smartsim_model.set_path(test_dir) # Create database db = exp.create_database(port=test_port, interface=test_interface) @@ -366,6 +366,8 @@ def test_db_model_ensemble(fileutils, make_test_dir, wlmutils, mlutils): # Assert we have added two models to each entity assert all([len(entity._db_models) == 2 for entity in smartsim_ensemble]) + exp.generate(smartsim_ensemble) + # Launch and check successful completion try: exp.start(db, smartsim_ensemble, block=True) @@ -392,7 +394,7 @@ def test_colocated_db_model_tf(fileutils, make_test_dir, wlmutils, mlutils): test_script = fileutils.get_test_conf_path("run_tf_dbmodel_smartredis.py") # Create SmartSim Experiment - exp = Experiment(exp_name, launcher=test_launcher) + exp = Experiment(exp_name, launcher=test_launcher, exp_path=test_dir) # Create RunSettings colo_settings = 
exp.create_run_settings(exe=sys.executable, exe_args=test_script) @@ -401,7 +403,6 @@ def test_colocated_db_model_tf(fileutils, make_test_dir, wlmutils, mlutils): # Create colocated Model colo_model = exp.create_model("colocated_model", colo_settings) - colo_model.set_path(test_dir) colo_model.colocate_db_tcp( port=test_port, db_cpus=1, @@ -436,6 +437,8 @@ def test_colocated_db_model_tf(fileutils, make_test_dir, wlmutils, mlutils): # Assert we have added both models assert len(colo_model._db_models) == 2 + exp.generate(colo_model) + # Launch and check successful completion try: exp.start(colo_model, block=True) @@ -461,7 +464,7 @@ def test_colocated_db_model_pytorch(fileutils, make_test_dir, wlmutils, mlutils) test_script = fileutils.get_test_conf_path("run_pt_dbmodel_smartredis.py") # Create the SmartSim Experiment - exp = Experiment(exp_name, launcher=test_launcher) + exp = Experiment(exp_name, launcher=test_launcher, exp_path=test_dir) # Create colocated RunSettings colo_settings = exp.create_run_settings(exe=sys.executable, exe_args=test_script) @@ -470,7 +473,6 @@ def test_colocated_db_model_pytorch(fileutils, make_test_dir, wlmutils, mlutils) # Create colocated SmartSim Model colo_model = exp.create_model("colocated_model", colo_settings) - colo_model.set_path(test_dir) colo_model.colocate_db_tcp( port=test_port, db_cpus=1, @@ -478,6 +480,7 @@ def test_colocated_db_model_pytorch(fileutils, make_test_dir, wlmutils, mlutils) ifname=test_interface ) + # Create and save ML model to filesystem save_torch_cnn(test_dir, "model1.pt") model_file = test_dir + "/model1.pt" @@ -492,6 +495,8 @@ def test_colocated_db_model_pytorch(fileutils, make_test_dir, wlmutils, mlutils) # Assert we have added both models assert len(colo_model._db_models) == 1 + exp.generate(colo_model) + # Launch and check successful completion try: exp.start(colo_model, block=True) @@ -531,7 +536,6 @@ def test_colocated_db_model_ensemble(fileutils, make_test_dir, wlmutils, mlutils colo_ensemble: Ensemble = exp.create_ensemble( "colocated_ens", run_settings=colo_settings, replicas=2 ) - colo_ensemble.set_path(test_dir) # Create a third model with a colocated database colo_model = exp.create_model("colocated_model", colo_settings) @@ -593,6 +597,8 @@ def test_colocated_db_model_ensemble(fileutils, make_test_dir, wlmutils, mlutils outputs=outputs2, ) + exp.generate(colo_ensemble) + # Launch and check successful completion try: exp.start(colo_ensemble, block=True) @@ -621,7 +627,7 @@ def test_colocated_db_model_ensemble_reordered(fileutils, make_test_dir, wlmutil test_script = fileutils.get_test_conf_path("run_tf_dbmodel_smartredis.py") # Create the SmartSim Experiment - exp = Experiment(exp_name, launcher=test_launcher) + exp = Experiment(exp_name, launcher=test_launcher, exp_path=test_dir) # Create colocated RunSettings colo_settings = exp.create_run_settings(exe=sys.executable, exe_args=test_script) @@ -632,11 +638,9 @@ def test_colocated_db_model_ensemble_reordered(fileutils, make_test_dir, wlmutil colo_ensemble = exp.create_ensemble( "colocated_ens", run_settings=colo_settings, replicas=2 ) - colo_ensemble.set_path(test_dir) # Create colocated SmartSim Model colo_model = exp.create_model("colocated_model", colo_settings) - colo_model.set_path(test_dir) # Create and save ML model to filesystem model_file, inputs, outputs = save_tf_cnn(test_dir, "model1.pb") @@ -694,6 +698,8 @@ def test_colocated_db_model_ensemble_reordered(fileutils, make_test_dir, wlmutil outputs=outputs2, ) + exp.generate(colo_ensemble) + # Launch and 
check successful completion try: exp.start(colo_ensemble, block=True) @@ -720,7 +726,7 @@ def test_colocated_db_model_errors(fileutils, make_test_dir, wlmutils, mlutils): test_script = fileutils.get_test_conf_path("run_tf_dbmodel_smartredis.py") # Create SmartSim Experiment - exp = Experiment(exp_name, launcher=test_launcher) + exp = Experiment(exp_name, launcher=test_launcher, exp_path=test_dir) # Create colocated RunSettings colo_settings = exp.create_run_settings(exe=sys.executable, exe_args=test_script) @@ -752,7 +758,6 @@ def test_colocated_db_model_errors(fileutils, make_test_dir, wlmutils, mlutils): colo_ensemble = exp.create_ensemble( "colocated_ens", run_settings=colo_settings, replicas=2 ) - colo_ensemble.set_path(test_dir) # Colocate a db with each ensemble member for i, entity in enumerate(colo_ensemble): diff --git a/tests/backends/test_dbscript.py b/tests/backends/test_dbscript.py index 41f0d0b2f..fed2156be 100644 --- a/tests/backends/test_dbscript.py +++ b/tests/backends/test_dbscript.py @@ -158,11 +158,9 @@ def test_db_script_ensemble(fileutils, make_test_dir, wlmutils, mlutils): ensemble = exp.create_ensemble( "dbscript_ensemble", run_settings=run_settings, replicas=2 ) - ensemble.set_path(test_dir) # Create SmartSim model smartsim_model = exp.create_model("smartsim_model", run_settings) - smartsim_model.set_path(test_dir) # Create SmartSim database db = exp.create_database(port=test_port, interface=test_interface) @@ -212,6 +210,8 @@ def test_db_script_ensemble(fileutils, make_test_dir, wlmutils, mlutils): # Assert we have added all three models to entities in ensemble assert all([len(entity._db_scripts) == 3 for entity in ensemble]) + exp.generate(ensemble) + try: exp.start(db, ensemble, block=True) statuses = exp.get_status(ensemble) @@ -238,7 +238,7 @@ def test_colocated_db_script(fileutils, make_test_dir, wlmutils, mlutils): torch_script = fileutils.get_test_conf_path("torchscript.py") # Create the SmartSim Experiment - exp = Experiment(exp_name, launcher=test_launcher) + exp = Experiment(exp_name, launcher=test_launcher, exp_path=test_dir) # Create RunSettings colo_settings = exp.create_run_settings(exe=sys.executable, exe_args=test_script) @@ -247,7 +247,6 @@ def test_colocated_db_script(fileutils, make_test_dir, wlmutils, mlutils): # Create model with colocated database colo_model = exp.create_model("colocated_model", colo_settings) - colo_model.set_path(test_dir) colo_model.colocate_db_tcp( port=test_port, db_cpus=1, debug=True, ifname=test_interface ) @@ -273,6 +272,8 @@ def test_colocated_db_script(fileutils, make_test_dir, wlmutils, mlutils): # Assert we have added both models assert len(colo_model._db_scripts) == 2 + exp.generate(colo_model) + for db_script in colo_model._db_scripts: logger.debug(db_script) @@ -304,7 +305,7 @@ def test_colocated_db_script_ensemble(fileutils, make_test_dir, wlmutils, mlutil torch_script = fileutils.get_test_conf_path("torchscript.py") # Create SmartSim Experiment - exp = Experiment(exp_name, launcher=test_launcher) + exp = Experiment(exp_name, launcher=test_launcher, exp_path=test_dir) # Create RunSettings colo_settings = exp.create_run_settings(exe=sys.executable, exe_args=test_script) @@ -315,11 +316,9 @@ def test_colocated_db_script_ensemble(fileutils, make_test_dir, wlmutils, mlutil colo_ensemble = exp.create_ensemble( "colocated_ensemble", run_settings=colo_settings, replicas=2 ) - colo_ensemble.set_path(test_dir) # Create a SmartSim model colo_model = exp.create_model("colocated_model", colo_settings) - 
colo_model.set_path(test_dir) # Colocate a db with each ensemble entity and add a script # to each entity via file @@ -372,6 +371,8 @@ def test_colocated_db_script_ensemble(fileutils, make_test_dir, wlmutils, mlutil # Assert we have added both models to each entity assert all([len(entity._db_scripts) == 2 for entity in colo_ensemble]) + exp.generate(colo_ensemble) + # Launch and check successful completion try: exp.start(colo_ensemble, block=True) @@ -400,7 +401,7 @@ def test_colocated_db_script_ensemble_reordered(fileutils, make_test_dir, wlmuti torch_script = fileutils.get_test_conf_path("torchscript.py") # Create SmartSim Experiment - exp = Experiment(exp_name, launcher=test_launcher) + exp = Experiment(exp_name, launcher=test_launcher, exp_path=test_dir) # Create RunSettings colo_settings = exp.create_run_settings(exe=sys.executable, exe_args=test_script) @@ -411,11 +412,9 @@ def test_colocated_db_script_ensemble_reordered(fileutils, make_test_dir, wlmuti colo_ensemble = exp.create_ensemble( "colocated_ensemble", run_settings=colo_settings, replicas=2 ) - colo_ensemble.set_path(test_dir) # Create an additional SmartSim Model entity colo_model = exp.create_model("colocated_model", colo_settings) - colo_model.set_path(test_dir) # Add a script via string to the ensemble members torch_script_str = "def negate(x):\n\treturn torch.neg(x)\n" @@ -467,6 +466,8 @@ def test_colocated_db_script_ensemble_reordered(fileutils, make_test_dir, wlmuti # Assert we have added both models to each entity assert all([len(entity._db_scripts) == 2 for entity in colo_ensemble]) + exp.generate(colo_ensemble) + # Launch and check successful completion try: exp.start(colo_ensemble, block=True) @@ -491,10 +492,9 @@ def test_db_script_errors(fileutils, make_test_dir, wlmutils, mlutils): test_num_gpus = mlutils.get_test_num_gpus() test_dir = make_test_dir test_script = fileutils.get_test_conf_path("run_dbscript_smartredis.py") - torch_script = fileutils.get_test_conf_path("torchscript.py") # Create SmartSim experiment - exp = Experiment(exp_name, launcher=test_launcher) + exp = Experiment(exp_name, launcher=test_launcher, exp_path=test_dir) # Create RunSettings colo_settings = exp.create_run_settings(exe=sys.executable, exe_args=test_script) @@ -503,7 +503,6 @@ def test_db_script_errors(fileutils, make_test_dir, wlmutils, mlutils): # Create a SmartSim model with a colocated database colo_model = exp.create_model("colocated_model", colo_settings) - colo_model.set_path(test_dir) colo_model.colocate_db_tcp( port=test_port, db_cpus=1, @@ -526,7 +525,6 @@ def test_db_script_errors(fileutils, make_test_dir, wlmutils, mlutils): colo_ensemble = exp.create_ensemble( "colocated_ensemble", run_settings=colo_settings, replicas=2 ) - colo_ensemble.set_path(test_dir) # Add a colocated database for each ensemble member for i, entity in enumerate(colo_ensemble): @@ -552,7 +550,6 @@ def test_db_script_errors(fileutils, make_test_dir, wlmutils, mlutils): colo_ensemble = exp.create_ensemble( "colocated_ensemble", run_settings=colo_settings, replicas=2 ) - colo_ensemble.set_path(test_dir) # Add an in-memory function to the ensemble colo_ensemble.add_function( From ea6bac5b765d443f9d2e13f0bb6eddaa8d39647d Mon Sep 17 00:00:00 2001 From: Andrew Shao Date: Tue, 17 Oct 2023 14:27:32 -0500 Subject: [PATCH 10/64] Begin adding a context manager for orchestrators in multidb cases --- smartsim/entity/model.py | 7 ++ tests/full_wlm/test_generic_batch_launch.py | 15 +++ .../full_wlm/test_generic_orc_launch_batch.py | 30 ++++-- tests/test_multidb.py 
| 98 ++++++++----------- 4 files changed, 89 insertions(+), 61 deletions(-) diff --git a/smartsim/entity/model.py b/smartsim/entity/model.py index d9c752970..49fe7f3f1 100644 --- a/smartsim/entity/model.py +++ b/smartsim/entity/model.py @@ -27,6 +27,7 @@ from __future__ import annotations import collections.abc +import re import sys import typing as t import warnings @@ -253,6 +254,12 @@ def colocate_db_uds( :type kwargs: dict, optional """ + if not re.match(r"^[a-zA-Z0-9.:\,_\-/]*$", unix_socket): + raise ValueError( + f"Invalid name for unix socket: {unix_socket}. Must only " + "contain alphanumeric characters or . : _ - /" + ) + uds_options = { "unix_socket": unix_socket, "socket_permissions": socket_permissions, diff --git a/tests/full_wlm/test_generic_batch_launch.py b/tests/full_wlm/test_generic_batch_launch.py index 4beccd41b..e8de79782 100644 --- a/tests/full_wlm/test_generic_batch_launch.py +++ b/tests/full_wlm/test_generic_batch_launch.py @@ -29,11 +29,23 @@ import pytest from smartsim import Experiment, status +from smartsim.settings import QsubBatchSettings # retrieved from pytest fixtures if pytest.test_launcher not in pytest.wlm_options: pytestmark = pytest.mark.skip(reason="Not testing WLM integrations") +if (pytest.test_launcher == "pbs") and (not pytest.has_aprun): + pytestmark = pytest.mark.skip( + reason="Launching batch jobs is not supported on PBS without ALPS" + ) + +def add_batch_resources(wlmutils, batch_settings): + if isinstance(batch_settings, QsubBatchSettings): + print(wlmutils.get_batch_resources()) + for key, value in wlmutils.get_batch_resources().items(): + batch_settings.set_resource(key, value) + def test_batch_model(fileutils, wlmutils): """Test the launch of a manually construced batch model""" @@ -46,6 +58,7 @@ def test_batch_model(fileutils, wlmutils): batch_settings = exp.create_batch_settings(nodes=1, time="00:01:00") batch_settings.set_account(wlmutils.get_test_account()) + add_batch_resources(wlmutils, batch_settings) if wlmutils.get_test_launcher() == "cobalt": batch_settings.set_queue("debug-flat-quad") run_settings = wlmutils.get_run_settings("python", f"{script} --time=5") @@ -73,6 +86,7 @@ def test_batch_ensemble(fileutils, wlmutils): M2 = exp.create_model("m2", path=test_dir, run_settings=settings) batch = exp.create_batch_settings(nodes=1, time="00:01:00") + add_batch_resources(wlmutils, batch) batch.set_account(wlmutils.get_test_account()) if wlmutils.get_test_launcher() == "cobalt": @@ -96,6 +110,7 @@ def test_batch_ensemble_replicas(fileutils, wlmutils): settings = wlmutils.get_run_settings("python", f"{script} --time=5") batch = exp.create_batch_settings(nodes=1, time="00:01:00") + add_batch_resources(wlmutils, batch) batch.set_account(wlmutils.get_test_account()) if wlmutils.get_test_launcher() == "cobalt": diff --git a/tests/full_wlm/test_generic_orc_launch_batch.py b/tests/full_wlm/test_generic_orc_launch_batch.py index 7e5591a30..441e70fef 100644 --- a/tests/full_wlm/test_generic_orc_launch_batch.py +++ b/tests/full_wlm/test_generic_orc_launch_batch.py @@ -25,16 +25,22 @@ # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
import os.path as osp +import shutil import time import pytest from smartsim import Experiment, status +from smartsim.wlm import detect_launcher # retrieved from pytest fixtures if pytest.test_launcher not in pytest.wlm_options: pytestmark = pytest.mark.skip(reason="Not testing WLM integrations") +if (pytest.test_launcher == "pbs") and (not pytest.has_aprun): + pytestmark = pytest.mark.skip( + reason="Launching orchestrators in a batch job is not supported on PBS without ALPS" + ) def test_launch_orc_auto_batch(fileutils, wlmutils): """test single node orchestrator""" @@ -47,7 +53,10 @@ def test_launch_orc_auto_batch(fileutils, wlmutils): # batch = False to launch on existing allocation network_interface = wlmutils.get_test_interface() orc = exp.create_database( - wlmutils.get_test_port(), batch=True, interface=network_interface, single_cmd=False + wlmutils.get_test_port(), + batch=True, + interface=network_interface, + single_cmd=False ) orc.batch_settings.set_account(wlmutils.get_test_account()) @@ -55,7 +64,7 @@ def test_launch_orc_auto_batch(fileutils, wlmutils): orc.batch_settings.set_walltime("00:02:00") if wlmutils.get_test_launcher() == "cobalt": orc.batch_settings.set_queue("debug-flat-quad") - + orc.set_path(test_dir) exp.start(orc, block=True) @@ -83,7 +92,11 @@ def test_launch_cluster_orc_batch_single(fileutils, wlmutils): # batch = False to launch on existing allocation network_interface = wlmutils.get_test_interface() orc = exp.create_database( - wlmutils.get_test_port(), db_nodes=3, batch=True, interface=network_interface, single_cmd=True + wlmutils.get_test_port(), + db_nodes=3, + batch=True, + interface=network_interface, + single_cmd=True ) orc.batch_settings.set_account(wlmutils.get_test_account()) @@ -124,9 +137,9 @@ def test_launch_cluster_orc_batch_multi(fileutils, wlmutils): orc = exp.create_database( wlmutils.get_test_port(), db_nodes=3, batch=True, interface=network_interface, single_cmd=False ) - + orc.batch_settings.set_account(wlmutils.get_test_account()) - + orc.batch_settings.set_walltime("00:03:00") if wlmutils.get_test_launcher() == "cobalt": # As Cobalt won't allow us to run two @@ -158,7 +171,12 @@ def test_launch_cluster_orc_reconnect(fileutils, wlmutils): # batch = False to launch on existing allocation network_interface = wlmutils.get_test_interface() - orc = exp.create_database(wlmutils.get_test_port(), db_nodes=3, batch=True, interface=network_interface) + orc = exp.create_database( + wlmutils.get_test_port(), + db_nodes=3, + batch=True, + interface=network_interface + ) orc.set_path(test_dir) orc.batch_settings.set_account(wlmutils.get_test_account()) diff --git a/tests/test_multidb.py b/tests/test_multidb.py index 61c99247c..9ae1af00d 100644 --- a/tests/test_multidb.py +++ b/tests/test_multidb.py @@ -45,6 +45,15 @@ supported_dbs = ["uds", "tcp"] +@contextmanager +def start_in_context(exp, entity): + """Start entity in a context to ensure that it is always stopped""" + exp.generate(entity) + try: + exp.start(entity) + yield entity + finally: + exp.stop(entity) @pytest.mark.parametrize("db_type", supported_dbs) def test_db_identifier_standard_then_colo(fileutils, wlmutils, coloutils, db_type): @@ -62,15 +71,13 @@ def test_db_identifier_standard_then_colo(fileutils, wlmutils, coloutils, db_typ test_script = fileutils.get_test_conf_path("smartredis/db_id_err.py") # Create SmartSim Experiment - exp = Experiment(exp_name, launcher=test_launcher) + exp = Experiment(exp_name, test_dir, launcher=test_launcher) # create regular database orc = 
exp.create_database( - port=test_port, interface=test_interface, db_identifier="my_db" + port=test_port, interface=test_interface, db_identifier="my_db", + hosts=wlmutils.get_test_hostlist(), ) - - exp.generate(orc) - assert orc.name == "my_db" # create run settings @@ -80,7 +87,6 @@ def test_db_identifier_standard_then_colo(fileutils, wlmutils, coloutils, db_typ # # Create the SmartSim Model smartsim_model = exp.create_model("colocated_model", colo_settings) - smartsim_model.set_path(test_dir) db_args = { "port": test_port + 1, @@ -99,26 +105,19 @@ def test_db_identifier_standard_then_colo(fileutils, wlmutils, coloutils, db_typ assert smartsim_model.run_settings.colocated_db_settings["db_identifier"] == "my_db" - try: - exp.start(orc) + with start_in_context(exp, orc) as orc: with pytest.raises(SSDBIDConflictError) as ex: exp.start(smartsim_model, block=True) - assert ( "has already been used. Pass in a unique name for db_identifier" in ex.value.args[0] ) - except: - exp.stop(smartsim_model) - finally: - exp.stop(orc) - @pytest.mark.parametrize("db_type", supported_dbs) def test_db_identifier_colo_then_standard(fileutils, wlmutils, coloutils, db_type): """Test colocate_db_uds/colocate_db_tcp then create_database with database - identifiers. + identifiers. """ # Set experiment name @@ -132,7 +131,7 @@ def test_db_identifier_colo_then_standard(fileutils, wlmutils, coloutils, db_typ test_script = fileutils.get_test_conf_path("smartredis/db_id_err.py") # Create SmartSim Experiment - exp = Experiment(exp_name, launcher=test_launcher) + exp = Experiment(exp_name, test_dir, launcher=test_launcher) # Create run settings colo_settings = exp.create_run_settings("python", test_script) @@ -162,7 +161,8 @@ def test_db_identifier_colo_then_standard(fileutils, wlmutils, coloutils, db_typ # Create Database orc = exp.create_database( - port=test_port + 1, interface=test_interface, db_identifier="my_db" + port=test_port + 1, interface=test_interface, db_identifier="my_db", + hosts=wlmutils.get_test_hostlist(), ) exp.generate(orc) @@ -171,9 +171,6 @@ def test_db_identifier_colo_then_standard(fileutils, wlmutils, coloutils, db_typ exp.start(smartsim_model, block=True) exp.start(orc) - exp.stop(smartsim_model) - exp.stop(orc) - def test_db_identifier_standard_twice_not_unique(wlmutils): """Test uniqueness of db_identifier several calls to create_database, with non unique names, @@ -186,40 +183,36 @@ def test_db_identifier_standard_twice_not_unique(wlmutils): test_launcher = wlmutils.get_test_launcher() test_interface = wlmutils.get_test_interface() test_port = wlmutils.get_test_port() + test_dir = fileutils.make_test_dir() # Create SmartSim Experiment - exp = Experiment(exp_name, launcher=test_launcher) + exp = Experiment(exp_name, test_dir, launcher=test_launcher) # CREATE DATABASE with db_identifier orc = exp.create_database( - port=test_port, interface=test_interface, db_identifier="my_db" + port=test_port, interface=test_interface, db_identifier="my_db", + hosts=wlmutils.get_test_hostlist(), ) exp.generate(orc) assert orc.name == "my_db" orc2 = exp.create_database( - port=test_port + 1, interface=test_interface, db_identifier="my_db" + port=test_port + 1, interface=test_interface, db_identifier="my_db", + hosts=wlmutils.get_test_hostlist(), ) exp.generate(orc2) assert orc2.name == "my_db" # CREATE DATABASE with db_identifier - try: - exp.start(orc) + with start_in_context(exp, orc) as orc: with pytest.raises(SSDBIDConflictError) as ex: - exp.start(orc2) - - assert ( - "has already been used. 
Pass in a unique name for db_identifier" - in ex.value.args[0] - ) - except: - exp.stop(orc2) - finally: - exp.stop(orc) - # exp.stop(orc2) + with start_in_context(exp, orc2) as orc2: + assert ( + "has already been used. Pass in a unique name for db_identifier" + in ex.value.args[0] + ) def test_db_identifier_create_standard_once(fileutils, wlmutils): @@ -249,9 +242,6 @@ def test_db_identifier_create_standard_once(fileutils, wlmutils): exp.start(db) exp.stop(db) - print(exp.summary()) - - def test_multidb_create_standard_twice(fileutils, wlmutils): """Multiple calls to create database with unique db_identifiers""" @@ -268,26 +258,22 @@ def test_multidb_create_standard_twice(fileutils, wlmutils): # create and start an instance of the Orchestrator database db = exp.create_database( - port=test_port, interface=test_interface, db_identifier="testdb_reg" + port=test_port, interface=test_interface, db_identifier="testdb_reg", + hosts=wlmutils.get_test_hostlist(), ) - exp.generate(db) # create database with different db_id db2 = exp.create_database( - port=test_port + 1, interface=test_interface, db_identifier="testdb_reg2" + port=test_port + 1, interface=test_interface, db_identifier="testdb_reg2", + hosts=wlmutils.get_test_hostlist(), ) - exp.generate(db2) # launch - exp.start(db, db2) - exp.stop(db, db2) - - # test restart - exp.start(db, db2) - exp.stop(db, db2) - - print(exp.summary()) + with start_in_context(exp, db) as db, start_in_context(exp, db2) as db2: + print("Databases started") + with start_in_context(exp, db) as db, start_in_context(exp, db2) as db2: + print("Databases restarted") @pytest.mark.parametrize("db_type", supported_dbs) def test_multidb_colo_once(fileutils, wlmutils, coloutils, db_type): @@ -300,7 +286,7 @@ def test_multidb_colo_once(fileutils, wlmutils, coloutils, db_type): test_script = fileutils.get_test_conf_path("smartredis/dbid.py") # start a new Experiment for this section - exp = Experiment("test_multidb_colo_once", launcher=test_launcher) + exp = Experiment("test_multidb_colo_once", test_dir, launcher=test_launcher) # create run settings run_settings = exp.create_run_settings("python", test_script) @@ -356,7 +342,8 @@ def test_multidb_standard_then_colo(fileutils, wlmutils, coloutils, db_type): # create and start an instance of the Orchestrator database db = exp.create_database( - port=test_port, interface=test_interface, db_identifier="testdb_reg" + port=test_port, interface=test_interface, db_identifier="testdb_reg", + hosts=wlmutils.get_test_hostlist(), ) exp.generate(db) @@ -436,7 +423,8 @@ def test_multidb_colo_then_standard(fileutils, wlmutils, coloutils, db_type): # create and start an instance of the Orchestrator database db = exp.create_database( - port=test_port + 1, interface=test_interface, db_identifier="testdb_reg" + port=test_port + 1, interface=test_interface, db_identifier="testdb_reg", + hosts=wlmutils.get_test_hostlist(), ) exp.generate(db) @@ -466,7 +454,7 @@ def test_launch_cluster_orc_single_dbid(fileutils, wlmutils): exp_name = "test_launch_cluster_orc_single_dbid" launcher = wlmutils.get_test_launcher() - exp = Experiment(exp_name, launcher=launcher) + exp = Experiment(exp_name, test_dir, launcher=launcher) test_dir = fileutils.make_test_dir() # batch = False to launch on existing allocation From b46ccb93516879548eea8519128ed59978fc74cb Mon Sep 17 00:00:00 2001 From: Andrew Shao Date: Tue, 17 Oct 2023 19:33:57 -0500 Subject: [PATCH 11/64] More multidb tests wokring, stopping at test_multidb.py::test_multidb_create_standard_twice --- 
conftest.py | 6 +-- smartsim/_core/control/jobmanager.py | 25 +++++----- smartsim/status.py | 6 ++- tests/test_multidb.py | 68 +++++++++++++++------------- 4 files changed, 58 insertions(+), 47 deletions(-) diff --git a/conftest.py b/conftest.py index 89adf7139..26a6aac76 100644 --- a/conftest.py +++ b/conftest.py @@ -544,6 +544,7 @@ def _sanitize_caller_function(caller_function: str) -> str: # We split at the opening bracket, sanitize the string # to its right and then merge the function name and # the sanitized list with a dot. + caller_function = caller_function.replace("]","") caller_function_list = caller_function.split("[", maxsplit=1) def is_accepted_char(char: str): @@ -553,6 +554,7 @@ def is_accepted_char(char: str): caller_function_list[1] = "".join( filter(is_accepted_char, caller_function_list[1]) ) + return ".".join(caller_function_list) @@ -639,7 +641,7 @@ def setup_test_colo( """Setup database needed for the colo pinning tests""" # get test setup - test_dir = fileutils.make_test_dir(level=2) + test_dir = make_test_dir sr_test_script = fileutils.get_test_conf_path(application_file) # Create an app with a colo_db which uses 1 db_cpu @@ -651,7 +653,6 @@ def setup_test_colo( colo_settings.set_tasks(1) colo_settings.set_nodes(1) colo_model = exp.create_model(colo_model_name, colo_settings) - colo_model.set_path(test_dir) if db_type in ["tcp", "deprecated"]: db_args["port"] = port @@ -671,7 +672,6 @@ def setup_test_colo( message="`colocate_db` has been deprecated" ) colocate_fun[db_type](**db_args) - exp.generate(colo_model, overwrite=True) # assert model will launch with colocated db assert colo_model.colocated # Check to make sure that limit_db_cpus made it into the colo settings diff --git a/smartsim/_core/control/jobmanager.py b/smartsim/_core/control/jobmanager.py index bfffe58fb..2c73c5831 100644 --- a/smartsim/_core/control/jobmanager.py +++ b/smartsim/_core/control/jobmanager.py @@ -34,7 +34,7 @@ from ...entity import DBNode, SmartSimEntity, EntitySequence from ...error import SmartSimError from ...log import get_logger -from ...status import TERMINAL_STATUSES +from ...status import TERMINAL_STATUSES, STATUS_NEVER_STARTED from ..config import CONFIG from ..launcher import LocalLauncher, Launcher from ..utils.network import get_ip_from_host @@ -162,6 +162,13 @@ def __call__(self) -> t.Dict[str, Job]: all_jobs = {**self.jobs, **self.db_jobs} return all_jobs + def __contains__(self, key) -> bool: + try: + self[key] + return True + except KeyError as e: + return False + def add_job( self, job_name: str, @@ -243,17 +250,13 @@ def get_status( :returns: tuple of status """ with self._lock: - try: - if entity.name in self.completed: - return self.completed[entity.name].status - + if entity.name in self.completed: + return self.completed[entity.name].status + elif entity.name in self: job: Job = self[entity.name] # locked - except KeyError: - raise SmartSimError( - f"Entity {entity.name} has not been launched in this Experiment" - ) from None - - return job.status + return job.status + else: + return STATUS_NEVER_STARTED def set_launcher(self, launcher: Launcher) -> None: """Set the launcher of the job manager to a specific launcher instance diff --git a/smartsim/status.py b/smartsim/status.py index 4d1749e71..a9eff28eb 100644 --- a/smartsim/status.py +++ b/smartsim/status.py @@ -14,11 +14,11 @@ # and/or other materials provided with the distribution. 
# # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE # DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE # FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR # SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER # CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE @@ -32,6 +32,7 @@ STATUS_FAILED = "Failed" STATUS_NEW = "New" STATUS_PAUSED = "Paused" +STATUS_NEVER_STARTED = "NotStarted" # SmartSim status mapping SMARTSIM_STATUS = { @@ -41,6 +42,7 @@ "Cancelled": STATUS_CANCELLED, "Failed": STATUS_FAILED, "New": STATUS_NEW, + "NeverStarted": STATUS_NEVER_STARTED } # Status groupings diff --git a/tests/test_multidb.py b/tests/test_multidb.py index 6a751498f..552a10142 100644 --- a/tests/test_multidb.py +++ b/tests/test_multidb.py @@ -23,6 +23,7 @@ # CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +from contextlib import contextmanager import pytest from smartsim import Experiment, status @@ -42,12 +43,18 @@ @contextmanager def start_in_context(exp, entity): """Start entity in a context to ensure that it is always stopped""" - exp.generate(entity) + exp.generate(entity, overwrite=True) try: - exp.start(entity) yield entity finally: - exp.stop(entity) + if exp.get_status(entity) == status.STATUS_RUNNING: + exp.stop(entity) + +def choose_host(wlmutils, index=0): + hosts = wlmutils.get_test_hostlist() + if hosts: + hosts = hosts[index] + return hosts @pytest.mark.parametrize("db_type", supported_dbs) def test_db_identifier_standard_then_colo(fileutils, wlmutils, coloutils, db_type, make_test_dir): @@ -70,7 +77,7 @@ def test_db_identifier_standard_then_colo(fileutils, wlmutils, coloutils, db_typ # create regular database orc = exp.create_database( port=test_port, interface=test_interface, db_identifier="my_db", - hosts=wlmutils.get_test_hostlist(), + hosts=choose_host(wlmutils), ) assert orc.name == "my_db" @@ -99,11 +106,11 @@ def test_db_identifier_standard_then_colo(fileutils, wlmutils, coloutils, db_typ assert smartsim_model.run_settings.colocated_db_settings["db_identifier"] == "my_db" - exp.generate(orc, smartsim_model) - - with start_in_context(exp, orc) as orc: + with start_in_context(exp, orc) as orc, start_in_context(exp, smartsim_model) as smartsim_model: + exp.start(orc) with pytest.raises(SSDBIDConflictError) as ex: - exp.start(smartsim_model, block=True) + exp.start(smartsim_model) + assert ( "has already been used. 
Pass in a unique name for db_identifier" in ex.value.args[0] @@ -157,14 +164,15 @@ def test_db_identifier_colo_then_standard(fileutils, wlmutils, coloutils, db_typ # Create Database orc = exp.create_database( port=test_port + 1, interface=test_interface, db_identifier="my_db", - hosts=wlmutils.get_test_hostlist(), + hosts=choose_host(wlmutils), ) exp.generate(orc, smartsim_model) assert orc.name == "my_db" - exp.start(smartsim_model, block=True) - exp.start(orc) + with start_in_context(exp, orc) as orc, start_in_context(exp, smartsim_model) as smartsim_model: + exp.start(smartsim_model, block=True) + exp.start(orc) def test_db_identifier_standard_twice_not_unique(wlmutils, make_test_dir): @@ -178,7 +186,7 @@ def test_db_identifier_standard_twice_not_unique(wlmutils, make_test_dir): test_launcher = wlmutils.get_test_launcher() test_interface = wlmutils.get_test_interface() test_port = wlmutils.get_test_port() - test_dir = fileutils.make_test_dir() + test_dir = make_test_dir # Create SmartSim Experiment exp = Experiment(exp_name, launcher=test_launcher, exp_path=test_dir) @@ -186,28 +194,27 @@ def test_db_identifier_standard_twice_not_unique(wlmutils, make_test_dir): # CREATE DATABASE with db_identifier orc = exp.create_database( port=test_port, interface=test_interface, db_identifier="my_db", - hosts=wlmutils.get_test_hostlist(), + hosts=choose_host(wlmutils), ) - exp.generate(orc) assert orc.name == "my_db" orc2 = exp.create_database( port=test_port + 1, interface=test_interface, db_identifier="my_db", - hosts=wlmutils.get_test_hostlist(), + hosts=choose_host(wlmutils, index=1), ) - exp.generate(orc2) assert orc2.name == "my_db" # CREATE DATABASE with db_identifier - with start_in_context(exp, orc) as orc: + with start_in_context(exp, orc) as orc, start_in_context(exp, orc2): + exp.start(orc) with pytest.raises(SSDBIDConflictError) as ex: - with start_in_context(exp, orc2) as orc2: - assert ( - "has already been used. Pass in a unique name for db_identifier" - in ex.value.args[0] - ) + exp.start(orc2) + assert ( + "has already been used. 
Pass in a unique name for db_identifier" + in ex.value.args[0] + ) def test_db_identifier_create_standard_once(make_test_dir, wlmutils): @@ -231,13 +238,12 @@ def test_db_identifier_create_standard_once(make_test_dir, wlmutils): db_nodes=1, interface=test_interface, db_identifier="testdb_reg", + hosts=choose_host(wlmutils) ) - exp.generate(db) + with start_in_context(exp, db): + exp.start(db) - exp.start(db) - exp.stop(db) - -def test_multidb_create_standard_twice(fileutils, wlmutils): +def test_multidb_create_standard_twice(fileutils, wlmutils, make_test_dir): """Multiple calls to create database with unique db_identifiers""" # Retrieve parameters from testing environment @@ -254,21 +260,21 @@ def test_multidb_create_standard_twice(fileutils, wlmutils): # create and start an instance of the Orchestrator database db = exp.create_database( port=test_port, interface=test_interface, db_identifier="testdb_reg", - hosts=wlmutils.get_test_hostlist(), + hosts=choose_host(wlmutils,1), ) # create database with different db_id db2 = exp.create_database( port=test_port + 1, interface=test_interface, db_identifier="testdb_reg2", - hosts=wlmutils.get_test_hostlist(), + hosts=choose_host(wlmutils,2), ) # launch with start_in_context(exp, db) as db, start_in_context(exp, db2) as db2: - print("Databases started") + exp.start(db, db2) with start_in_context(exp, db) as db, start_in_context(exp, db2) as db2: - print("Databases restarted") + exp.start(db, db2) @pytest.mark.parametrize("db_type", supported_dbs) def test_multidb_colo_once(fileutils, make_test_dir, wlmutils, coloutils, db_type): From 30d3fe48c48469e6af1c9d95d372b6cf01bcf176 Mon Sep 17 00:00:00 2001 From: Ale Rigazzi Date: Wed, 18 Oct 2023 10:59:40 -0500 Subject: [PATCH 12/64] Fix start_in_context --- conftest.py | 1 - tests/test_multidb.py | 5 +++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/conftest.py b/conftest.py index 26a6aac76..d560c36aa 100644 --- a/conftest.py +++ b/conftest.py @@ -641,7 +641,6 @@ def setup_test_colo( """Setup database needed for the colo pinning tests""" # get test setup - test_dir = make_test_dir sr_test_script = fileutils.get_test_conf_path(application_file) # Create an app with a colo_db which uses 1 db_cpu diff --git a/tests/test_multidb.py b/tests/test_multidb.py index 552a10142..0329f8e0e 100644 --- a/tests/test_multidb.py +++ b/tests/test_multidb.py @@ -27,6 +27,7 @@ import pytest from smartsim import Experiment, status +from smartsim.entity.entity import SmartSimEntity from smartsim.error.errors import SSDBIDConflictError from smartsim.log import get_logger @@ -41,13 +42,13 @@ supported_dbs = ["uds", "tcp"] @contextmanager -def start_in_context(exp, entity): +def start_in_context(exp: Experiment, entity: SmartSimEntity): """Start entity in a context to ensure that it is always stopped""" exp.generate(entity, overwrite=True) try: yield entity finally: - if exp.get_status(entity) == status.STATUS_RUNNING: + if exp.get_status(entity)[0] == status.STATUS_RUNNING: exp.stop(entity) def choose_host(wlmutils, index=0): From 131310fb9a5131b55588d53c2192f96835015ffa Mon Sep 17 00:00:00 2001 From: Ale Rigazzi Date: Wed, 18 Oct 2023 12:04:56 -0500 Subject: [PATCH 13/64] Fix fixture usage --- smartsim/_core/config/config.py | 2 +- smartsim/_core/control/controller.py | 12 ++++++++---- smartsim/_core/control/jobmanager.py | 14 +++++++------- smartsim/_core/utils/redis.py | 7 +++---- smartsim/database/orchestrator.py | 12 ++++-------- smartsim/settings/pbsSettings.py | 5 +++-- tests/test_colo_model_local.py 
| 9 +++------ tests/test_experiment.py | 7 ++----- 8 files changed, 31 insertions(+), 37 deletions(-) diff --git a/smartsim/_core/config/config.py b/smartsim/_core/config/config.py index 130710472..9e4880ab2 100644 --- a/smartsim/_core/config/config.py +++ b/smartsim/_core/config/config.py @@ -181,7 +181,7 @@ def test_port(self) -> int: # pragma: no cover return int(os.environ.get("SMARTSIM_TEST_PORT", 6780)) @property - def test_batch_resources(self) -> t.Dict: # pragma: no cov + def test_batch_resources(self) -> t.Any: # pragma: no cover return json.loads(os.environ.get("SMARTSIM_TEST_BATCH_RESOURCES", "{}")) @property diff --git a/smartsim/_core/control/controller.py b/smartsim/_core/control/controller.py index 0b27901f5..cea2ed233 100644 --- a/smartsim/_core/control/controller.py +++ b/smartsim/_core/control/controller.py @@ -40,7 +40,12 @@ from smartsim._core.utils.network import get_ip_from_host from ..._core.launcher.step import Step -from ..._core.utils.redis import db_is_active, set_ml_model, set_script, shutdown_db_node +from ..._core.utils.redis import ( + db_is_active, + set_ml_model, + set_script, + shutdown_db_node, +) from ..._core.utils.helpers import ( unpack_db_identifier, unpack_colo_db_identifier, @@ -56,7 +61,7 @@ ) from ...log import get_logger from ...settings.base import BatchSettings -from ...status import STATUS_CANCELLED, STATUS_FAILED, STATUS_RUNNING, TERMINAL_STATUSES +from ...status import STATUS_CANCELLED, STATUS_RUNNING, TERMINAL_STATUSES from ...servertype import STANDALONE, CLUSTERED from ..config import CONFIG from ..launcher import ( @@ -217,7 +222,7 @@ def stop_db(self, db: Orchestrator) -> None: else: for node in db.entities: for host_ip, port in itertools.product( - (get_ip_from_host(host) for host in node.hosts), db.ports + (get_ip_from_host(host) for host in node.hosts), db.ports ): retcode, _, _ = shutdown_db_node(host_ip, port) # Sometimes the DB will not shutdown (unless we force NOSAVE) @@ -232,7 +237,6 @@ def stop_db(self, db: Orchestrator) -> None: db.clear_hosts() - def stop_entity_list(self, entity_list: EntitySequence[SmartSimEntity]) -> None: """Stop an instance of an entity list diff --git a/smartsim/_core/control/jobmanager.py b/smartsim/_core/control/jobmanager.py index 2c73c5831..eed82de92 100644 --- a/smartsim/_core/control/jobmanager.py +++ b/smartsim/_core/control/jobmanager.py @@ -32,7 +32,6 @@ from ...database import Orchestrator from ...entity import DBNode, SmartSimEntity, EntitySequence -from ...error import SmartSimError from ...log import get_logger from ...status import TERMINAL_STATUSES, STATUS_NEVER_STARTED from ..config import CONFIG @@ -162,11 +161,11 @@ def __call__(self) -> t.Dict[str, Job]: all_jobs = {**self.jobs, **self.db_jobs} return all_jobs - def __contains__(self, key) -> bool: + def __contains__(self, key: str) -> bool: try: - self[key] + _ = self[key] return True - except KeyError as e: + except KeyError: return False def add_job( @@ -252,11 +251,12 @@ def get_status( with self._lock: if entity.name in self.completed: return self.completed[entity.name].status - elif entity.name in self: + + if entity.name in self: job: Job = self[entity.name] # locked return job.status - else: - return STATUS_NEVER_STARTED + + return STATUS_NEVER_STARTED def set_launcher(self, launcher: Launcher) -> None: """Set the launcher of the job manager to a specific launcher instance diff --git a/smartsim/_core/utils/redis.py b/smartsim/_core/utils/redis.py index dd2ba5084..468009fe2 100644 --- a/smartsim/_core/utils/redis.py +++ 
b/smartsim/_core/utils/redis.py @@ -24,7 +24,6 @@ # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -import itertools import logging import redis import time @@ -109,11 +108,11 @@ def check_cluster_status( # wait for cluster to spin up time.sleep(5) try: - redis_tester: "RedisCluster[t.Any]" = RedisCluster( # type: ignore + redis_tester: "RedisCluster[t.Any]" = RedisCluster( startup_nodes=cluster_nodes ) # type: ignore redis_tester.set("__test__", "__test__") - redis_tester.delete("__test__") # type: ignore + redis_tester.delete("__test__") logger.debug("Cluster status verified") return except (ClusterDownError, RedisClusterException, redis.RedisError): @@ -248,4 +247,4 @@ def shutdown_db_node(host_ip: str, port: int) -> t.Tuple[int, str, str]: # cov- elif out: logger.debug(out) - return returncode, out, err \ No newline at end of file + return returncode, out, err diff --git a/smartsim/database/orchestrator.py b/smartsim/database/orchestrator.py index 9406629ab..28b354f28 100644 --- a/smartsim/database/orchestrator.py +++ b/smartsim/database/orchestrator.py @@ -506,12 +506,10 @@ def set_run_arg(self, arg: str, value: t.Optional[str] = None) -> None: mpmd.run_args[arg] = value def enable_checkpoints(self, frequency: int) -> None: - """Sets the database's save configuration to save the - DB every 'frequency' seconds given that at least one - write operation against the DB occurred in that time. - For example, if `frequency` is 900, then the database - will save to disk after 900 seconds if there is at least - 1 change to the dataset. + """Sets the database's save configuration to save the DB every 'frequency' + seconds given that at least one write operation against the DB occurred in + that time. E.g., if `frequency` is 900, then the database will save to disk + after 900 seconds if there is at least 1 change to the dataset. :param frequency: the given number of seconds before the DB saves :type frequency: int @@ -532,7 +530,6 @@ def set_max_memory(self, mem: str) -> None: :param mem: the desired max memory size e.g. 3gb :type mem: str - :raises SmartSimError: If 'mem' is an invalid memory value :raises SmartSimError: If database is not active """ @@ -545,7 +542,6 @@ def set_eviction_strategy(self, strategy: str) -> None: :param strategy: The max memory policy to use e.g. "volatile-lru", "allkeys-lru", etc. :type strategy: str - :raises SmartSimError: If 'strategy' is an invalid maxmemory policy :raises SmartSimError: If database is not active """ diff --git a/smartsim/settings/pbsSettings.py b/smartsim/settings/pbsSettings.py index 46d715265..1b337874e 100644 --- a/smartsim/settings/pbsSettings.py +++ b/smartsim/settings/pbsSettings.py @@ -33,6 +33,7 @@ logger = get_logger(__name__) + class QsubBatchSettings(BatchSettings): def __init__( self, @@ -191,7 +192,7 @@ def _create_resource_list(self) -> t.List[str]: res = [] # get select statement from resources or kwargs - if ("select" in self.resources) and not ("nodes" in self.resources): + if ("select" in self.resources) and "nodes" not in self.resources: res += [f"-l select={str(self.resources['select'])}"] elif ("select" in self.resources) and ("nodes" in self.resources): nselect = self.resources["select"] @@ -202,7 +203,7 @@ def _create_resource_list(self) -> t.List[str]: raise SmartSimError( ( "select and nodes were both specified, but do not have " - f"the same value. select={nselect} nodes={self.nodes}" + f"the same value. 
select={nselect} nodes={self._nodes}" ) ) elif "nodes" in self.resources: diff --git a/tests/test_colo_model_local.py b/tests/test_colo_model_local.py index 50e69fc61..db67ea0a6 100644 --- a/tests/test_colo_model_local.py +++ b/tests/test_colo_model_local.py @@ -110,13 +110,13 @@ def test_create_pinning_string(pin_list, num_cpus, expected): @pytest.mark.parametrize("db_type", supported_dbs) def test_launch_colocated_model_defaults( - fileutils, coloutils, db_type, launcher="local" + fileutils, make_test_dir, coloutils, db_type, launcher="local" ): """Test the launch of a model with a colocated database and local launcher""" db_args = {} - test_dir = fileutils.make_test_dir() + test_dir = make_test_dir exp = Experiment(f"colocated_model_defaults_{db_type}", test_dir, launcher=launcher) colo_model = coloutils.setup_test_colo( fileutils, @@ -133,6 +133,7 @@ def test_launch_colocated_model_defaults( assert ( colo_model.run_settings.colocated_db_settings["custom_pinning"] == true_pinning ) + exp.generate(colo_model) exp.start(colo_model, block=True) statuses = exp.get_status(colo_model) assert all([stat == status.STATUS_COMPLETED for stat in statuses]) @@ -188,7 +189,6 @@ def test_launch_multiple_colocated_models( def test_colocated_model_disable_pinning( fileutils, make_test_dir, coloutils, db_type, launcher="local" ): - test_dir = fileutils.make_test_dir() exp = Experiment(f"colocated_model_pinning_auto_1cpu_{db_type}", launcher=launcher, exp_path=make_test_dir) db_args = { "db_cpus": 1, @@ -213,7 +213,6 @@ def test_colocated_model_disable_pinning( def test_colocated_model_pinning_auto_2cpu( fileutils, make_test_dir, coloutils, db_type, launcher="local" ): - test_dir = fileutils.make_test_dir() exp = Experiment(f"colocated_model_pinning_auto_2cpu_{db_type}", launcher=launcher, exp_path=make_test_dir) db_args = { @@ -246,7 +245,6 @@ def test_colocated_model_pinning_auto_2cpu( def test_colocated_model_pinning_range(fileutils, make_test_dir, coloutils, db_type, launcher="local"): # Check to make sure that the CPU mask was correctly generated - test_dir = fileutils.make_test_dir() exp = Experiment(f"colocated_model_pinning_manual_{db_type}", launcher=launcher, exp_path=make_test_dir) db_args = {"db_cpus": 2, "custom_pinning": range(2)} @@ -270,7 +268,6 @@ def test_colocated_model_pinning_range(fileutils, make_test_dir, coloutils, db_t def test_colocated_model_pinning_list(fileutils, make_test_dir, coloutils, db_type, launcher="local"): # Check to make sure that the CPU mask was correctly generated - test_dir = fileutils.make_test_dir() exp = Experiment(f"colocated_model_pinning_manual_{db_type}", launcher=launcher, exp_path=make_test_dir) db_args = {"db_cpus": 1, "custom_pinning": [1]} diff --git a/tests/test_experiment.py b/tests/test_experiment.py index 64d526a42..1e0300c09 100644 --- a/tests/test_experiment.py +++ b/tests/test_experiment.py @@ -23,15 +23,13 @@ # CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
- -import contextlib - import pytest from smartsim import Experiment from smartsim.entity import Model from smartsim.error import SmartSimError from smartsim.settings import RunSettings +from smartsim.status import STATUS_NEVER_STARTED def test_model_prefix(make_test_dir): @@ -83,8 +81,7 @@ def test_status_typeerror(): def test_status_pre_launch(): model = Model("name", {}, "./", RunSettings("python")) exp = Experiment("test") - with pytest.raises(SmartSimError): - exp.get_status(model) + assert exp.get_status(model) == STATUS_NEVER_STARTED def test_bad_ensemble_init_no_rs(): From dc301bbb8b32cfc57a465141cd13985011f6bb4d Mon Sep 17 00:00:00 2001 From: Ale Rigazzi Date: Wed, 18 Oct 2023 12:12:48 -0500 Subject: [PATCH 14/64] Fix get_status --- tests/test_experiment.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_experiment.py b/tests/test_experiment.py index 1e0300c09..4148d01b1 100644 --- a/tests/test_experiment.py +++ b/tests/test_experiment.py @@ -81,7 +81,7 @@ def test_status_typeerror(): def test_status_pre_launch(): model = Model("name", {}, "./", RunSettings("python")) exp = Experiment("test") - assert exp.get_status(model) == STATUS_NEVER_STARTED + assert exp.get_status(model)[0] == STATUS_NEVER_STARTED def test_bad_ensemble_init_no_rs(): From 380c8eff19e8616929a0a78be3674529bed9f2c2 Mon Sep 17 00:00:00 2001 From: Ale Rigazzi Date: Wed, 18 Oct 2023 12:16:08 -0500 Subject: [PATCH 15/64] Fix mypy issues --- smartsim/_core/utils/redis.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/smartsim/_core/utils/redis.py b/smartsim/_core/utils/redis.py index 468009fe2..91a972d04 100644 --- a/smartsim/_core/utils/redis.py +++ b/smartsim/_core/utils/redis.py @@ -94,7 +94,7 @@ def check_cluster_status( :raises SmartSimError: If cluster status cannot be verified """ cluster_nodes = [ - ClusterNode(get_ip_from_host(host), port) # type: ignore + ClusterNode(get_ip_from_host(host), port) for host, port in product(hosts, ports) ] @@ -110,9 +110,9 @@ def check_cluster_status( try: redis_tester: "RedisCluster[t.Any]" = RedisCluster( startup_nodes=cluster_nodes - ) # type: ignore + ) redis_tester.set("__test__", "__test__") - redis_tester.delete("__test__") + redis_tester.delete("__test__") # type: ignore logger.debug("Cluster status verified") return except (ClusterDownError, RedisClusterException, redis.RedisError): From 62b79b0293b528618bb795ab2bdf9c4a3f73dde8 Mon Sep 17 00:00:00 2001 From: Andrew Shao Date: Wed, 18 Oct 2023 12:28:48 -0500 Subject: [PATCH 16/64] Fix a couple of tests --- tests/test_multidb.py | 46 ++++++++++++++++--------------------------- 1 file changed, 17 insertions(+), 29 deletions(-) diff --git a/tests/test_multidb.py b/tests/test_multidb.py index 552a10142..138da8624 100644 --- a/tests/test_multidb.py +++ b/tests/test_multidb.py @@ -41,13 +41,15 @@ supported_dbs = ["uds", "tcp"] @contextmanager -def start_in_context(exp, entity): +def make_entity_context(exp, entity): """Start entity in a context to ensure that it is always stopped""" exp.generate(entity, overwrite=True) try: yield entity finally: - if exp.get_status(entity) == status.STATUS_RUNNING: + print(exp.get_status(entity)[0]) + if exp.get_status(entity)[0] == status.STATUS_RUNNING: + print(f"Stopping {entity.name}") exp.stop(entity) def choose_host(wlmutils, index=0): @@ -106,7 +108,7 @@ def test_db_identifier_standard_then_colo(fileutils, wlmutils, coloutils, db_typ assert smartsim_model.run_settings.colocated_db_settings["db_identifier"] == "my_db" - with 
start_in_context(exp, orc) as orc, start_in_context(exp, smartsim_model) as smartsim_model: + with make_entity_context(exp, orc) as orc, make_entity_context(exp, smartsim_model) as smartsim_model: exp.start(orc) with pytest.raises(SSDBIDConflictError) as ex: exp.start(smartsim_model) @@ -170,7 +172,7 @@ def test_db_identifier_colo_then_standard(fileutils, wlmutils, coloutils, db_typ exp.generate(orc, smartsim_model) assert orc.name == "my_db" - with start_in_context(exp, orc) as orc, start_in_context(exp, smartsim_model) as smartsim_model: + with make_entity_context(exp, orc) as orc, make_entity_context(exp, smartsim_model) as smartsim_model: exp.start(smartsim_model, block=True) exp.start(orc) @@ -207,7 +209,7 @@ def test_db_identifier_standard_twice_not_unique(wlmutils, make_test_dir): assert orc2.name == "my_db" # CREATE DATABASE with db_identifier - with start_in_context(exp, orc) as orc, start_in_context(exp, orc2): + with make_entity_context(exp, orc) as orc, make_entity_context(exp, orc2): exp.start(orc) with pytest.raises(SSDBIDConflictError) as ex: exp.start(orc2) @@ -240,7 +242,7 @@ def test_db_identifier_create_standard_once(make_test_dir, wlmutils): db_identifier="testdb_reg", hosts=choose_host(wlmutils) ) - with start_in_context(exp, db): + with make_entity_context(exp, db): exp.start(db) def test_multidb_create_standard_twice(fileutils, wlmutils, make_test_dir): @@ -270,10 +272,10 @@ def test_multidb_create_standard_twice(fileutils, wlmutils, make_test_dir): ) # launch - with start_in_context(exp, db) as db, start_in_context(exp, db2) as db2: + with make_entity_context(exp, db) as db, make_entity_context(exp, db2) as db2: exp.start(db, db2) - with start_in_context(exp, db) as db, start_in_context(exp, db2) as db2: + with make_entity_context(exp, db) as db, make_entity_context(exp, db2) as db2: exp.start(db, db2) @pytest.mark.parametrize("db_type", supported_dbs) @@ -296,7 +298,6 @@ def test_multidb_colo_once(fileutils, make_test_dir, wlmutils, coloutils, db_typ # Create the SmartSim Model smartsim_model = exp.create_model("smartsim_model", run_settings) - smartsim_model.set_path(test_dir) db_args = { "port": test_port + 1, @@ -314,11 +315,8 @@ def test_multidb_colo_once(fileutils, make_test_dir, wlmutils, coloutils, db_typ db_args, ) - exp.generate(smartsim_model) - exp.start(smartsim_model) - - exp.stop(smartsim_model) - print(exp.summary()) + with make_entity_context(exp, smartsim_model): + exp.start(smartsim_model) @pytest.mark.parametrize("db_type", supported_dbs) @@ -345,12 +343,11 @@ def test_multidb_standard_then_colo(fileutils, make_test_dir, wlmutils, coloutil # create and generate an instance of the Orchestrator database db = exp.create_database( port=test_port, interface=test_interface, db_identifier="testdb_reg", - hosts=wlmutils.get_test_hostlist(), + hosts=choose_host(wlmutils), ) # Create the SmartSim Model smartsim_model = exp.create_model("smartsim_model", run_settings) - smartsim_model.set_path(test_dir) db_args = { "port": test_port + 1, @@ -367,20 +364,11 @@ def test_multidb_standard_then_colo(fileutils, make_test_dir, wlmutils, coloutil db_args, ) - exp.generate(db, smartsim_model) - exp.start(db) - exp.start(smartsim_model, block=True) - - # test restart colocated db - exp.start(smartsim_model) - - exp.stop(db) - # test restart standard db - exp.start(db) - exp.stop(db) - exp.stop(smartsim_model) - print(exp.summary()) + with make_entity_context(exp, db) as db, \ + make_entity_context(exp, smartsim_model) as smartsim_model: + exp.start(db) + 
exp.start(smartsim_model, block=True) assert all(stat is not status.STATUS_FAILED for stat in exp.get_status(db, smartsim_model)) From ee3920466de8e56a7a5698883a928ae3209307db Mon Sep 17 00:00:00 2001 From: Andrew Shao Date: Wed, 18 Oct 2023 16:07:24 -0500 Subject: [PATCH 17/64] tests are passing on PBS --- tests/test_configs/smartredis/dbid.py | 8 +- tests/test_configs/smartredis/multidbid.py | 6 +- tests/test_multidb.py | 134 ++++++++------------- 3 files changed, 50 insertions(+), 98 deletions(-) diff --git a/tests/test_configs/smartredis/dbid.py b/tests/test_configs/smartredis/dbid.py index 49955eb60..93df38cd7 100644 --- a/tests/test_configs/smartredis/dbid.py +++ b/tests/test_configs/smartredis/dbid.py @@ -39,8 +39,6 @@ args = parser.parse_args() env_vars = [ - "SSKEYIN_testdb_colo", - "SSKEYOUT_testdb_colo", "SSDB_testdb_colo", "SR_DB_TYPE_testdb_colo", ] @@ -49,8 +47,4 @@ opts1 = ConfigOptions.create_from_environment("testdb_colo") - client = Client(opts1, logger_name="SmartSim") - - - - + client = Client(opts1, logger_name="SmartSim") \ No newline at end of file diff --git a/tests/test_configs/smartredis/multidbid.py b/tests/test_configs/smartredis/multidbid.py index e174f9a40..9691515f4 100644 --- a/tests/test_configs/smartredis/multidbid.py +++ b/tests/test_configs/smartredis/multidbid.py @@ -40,12 +40,8 @@ args = parser.parse_args() env_vars = [ - "SSKEYIN_testdb_reg", - "SSKEYOUT_testdb_reg", "SSDB_testdb_reg", "SR_DB_TYPE_testdb_reg", - "SSKEYIN_testdb_colo", - "SSKEYOUT_testdb_colo", "SSDB_testdb_colo", "SR_DB_TYPE_testdb_colo", ] @@ -58,4 +54,4 @@ c1 = Client(opts1, logger_name="SmartSim") c2 = Client(opts2, logger_name="SmartSim") - + diff --git a/tests/test_multidb.py b/tests/test_multidb.py index a3e7fa9e5..448b02b49 100644 --- a/tests/test_multidb.py +++ b/tests/test_multidb.py @@ -37,10 +37,10 @@ logger = get_logger(__name__) -should_run = True - supported_dbs = ["uds", "tcp"] +on_wlm = pytest.test_launcher in pytest.wlm_options + @contextmanager def make_entity_context(exp: Experiment, entity: SmartSimEntity): """Start entity in a context to ensure that it is always stopped""" @@ -57,8 +57,13 @@ def choose_host(wlmutils, index=0): hosts = hosts[index] return hosts +def check_not_failed(exp, *args): + statuses = exp.get_status(*args) + assert all(stat is not status.STATUS_FAILED for stat in statuses) + @pytest.mark.parametrize("db_type", supported_dbs) -def test_db_identifier_standard_then_colo( - fileutils, wlmutils, coloutils, db_type, make_test_dir): +def test_db_identifier_standard_then_colo_error( + fileutils, wlmutils, coloutils, db_type, make_test_dir): """Test that it is possible to create_database then colocate_db_uds/colocate_db_tcp with unique db_identifiers""" @@ -77,37 +82,31 @@ def test_db_identifier_standard_then_colo(fileutils, wlmutils, coloutils, db_typ # create regular database orc = exp.create_database( - port=test_port, interface=test_interface, db_identifier="my_db", + port=test_port, interface=test_interface, db_identifier="testdb_colo", hosts=choose_host(wlmutils), ) - assert orc.name == "my_db" - - # create run settings - colo_settings = exp.create_run_settings("python", test_script) - colo_settings.set_nodes(1) - colo_settings.set_tasks_per_node(1) - - # # Create the SmartSim Model - smartsim_model = exp.create_model("colocated_model", colo_settings) + assert orc.name == "test_db_colo" db_args = { "port": test_port + 1, "db_cpus": 1, "debug": True, - "db_identifier": "my_db", + "db_identifier": "testdb_colo", } smartsim_model = 
coloutils.setup_test_colo( fileutils, db_type, exp, - "send_data_local_smartredis_with_dbid_error_test.py", + test_script, db_args, + on_wlm = on_wlm ) - assert smartsim_model.run_settings.colocated_db_settings["db_identifier"] == "my_db" + assert smartsim_model.run_settings.colocated_db_settings["db_identifier"] == "testdb_colo" - with make_entity_context(exp, orc) as orc, make_entity_context(exp, smartsim_model) as smartsim_model: + with make_entity_context(exp, orc) as orc, \ + make_entity_context(exp, smartsim_model) as smartsim_model: exp.start(orc) with pytest.raises(SSDBIDConflictError) as ex: exp.start(smartsim_model) @@ -116,6 +115,7 @@ def test_db_identifier_standard_then_colo(fileutils, wlmutils, coloutils, db_typ "has already been used. Pass in a unique name for db_identifier" in ex.value.args[0] ) + check_not_failed(exp, orc) @pytest.mark.parametrize("db_type", supported_dbs) @@ -132,49 +132,44 @@ def test_db_identifier_colo_then_standard(fileutils, wlmutils, coloutils, db_typ test_interface = wlmutils.get_test_interface() test_port = wlmutils.get_test_port() test_dir = make_test_dir - test_script = fileutils.get_test_conf_path("smartredis/db_id_err.py") + test_script = fileutils.get_test_conf_path("smartredis/dbid.py") # Create SmartSim Experiment exp = Experiment(exp_name, launcher=test_launcher, exp_path=test_dir) - # Create run settings - colo_settings = exp.create_run_settings("python", test_script) - colo_settings.set_nodes(1) - colo_settings.set_tasks_per_node(1) - - # Create the SmartSim Model - smartsim_model = exp.create_model("colocated_model", colo_settings) - db_args = { "port": test_port, "db_cpus": 1, "debug": True, - "db_identifier": "my_db", + "db_identifier": "testdb_colo", } smartsim_model = coloutils.setup_test_colo( fileutils, db_type, exp, - "send_data_local_smartredis_with_dbid_error_test.py", + test_script, db_args, + on_wlm = on_wlm, ) - assert smartsim_model.run_settings.colocated_db_settings["db_identifier"] == "my_db" + assert smartsim_model.run_settings.colocated_db_settings["db_identifier"] == "testdb_colo" # Create Database orc = exp.create_database( - port=test_port + 1, interface=test_interface, db_identifier="my_db", + port=test_port + 1, interface=test_interface, db_identifier="testdb_colo", hosts=choose_host(wlmutils), ) - exp.generate(orc, smartsim_model) - assert orc.name == "my_db" + assert orc.name == "testdb_colo" - with make_entity_context(exp, orc) as orc, make_entity_context(exp, smartsim_model) as smartsim_model: + with make_entity_context(exp, orc) as orc, \ + make_entity_context(exp, smartsim_model) as smartsim_model: exp.start(smartsim_model, block=True) exp.start(orc) + check_not_failed(exp, orc, smartsim_model) + def test_db_identifier_standard_twice_not_unique(wlmutils, make_test_dir): """Test uniqueness of db_identifier several calls to create_database, with non unique names, @@ -216,7 +211,7 @@ def test_db_identifier_standard_twice_not_unique(wlmutils, make_test_dir): "has already been used. 
Pass in a unique name for db_identifier" in ex.value.args[0] ) - + check_not_failed(exp, orc) def test_db_identifier_create_standard_once(make_test_dir, wlmutils): """One call to create database with a database identifier""" @@ -244,6 +239,8 @@ def test_db_identifier_create_standard_once(make_test_dir, wlmutils): with make_entity_context(exp, db): exp.start(db) + check_not_failed(exp, db) + def test_multidb_create_standard_twice(fileutils, wlmutils, make_test_dir): """Multiple calls to create database with unique db_identifiers""" @@ -290,14 +287,6 @@ def test_multidb_colo_once(fileutils, make_test_dir, wlmutils, coloutils, db_typ # start a new Experiment for this section exp = Experiment("test_multidb_colo_once", launcher=test_launcher, exp_path=test_dir) - # create run settings - run_settings = exp.create_run_settings("python", test_script) - run_settings.set_nodes(1) - run_settings.set_tasks_per_node(1) - - # Create the SmartSim Model - smartsim_model = exp.create_model("smartsim_model", run_settings) - db_args = { "port": test_port + 1, "db_cpus": 1, @@ -310,13 +299,16 @@ def test_multidb_colo_once(fileutils, make_test_dir, wlmutils, coloutils, db_typ fileutils, db_type, exp, - "send_data_local_smartredis_with_dbid.py", + test_script, db_args, + on_wlm = on_wlm, ) with make_entity_context(exp, smartsim_model): exp.start(smartsim_model) + check_not_failed(exp, smartsim_model) + @pytest.mark.parametrize("db_type", supported_dbs) def test_multidb_standard_then_colo(fileutils, make_test_dir, wlmutils, coloutils, db_type): @@ -334,20 +326,12 @@ def test_multidb_standard_then_colo(fileutils, make_test_dir, wlmutils, coloutil "test_multidb_standard_then_colo", exp_path=test_dir, launcher=test_launcher ) - # create run settings - run_settings = exp.create_run_settings("python", test_script) - run_settings.set_nodes(1) - run_settings.set_tasks_per_node(1) - # create and generate an instance of the Orchestrator database db = exp.create_database( port=test_port, interface=test_interface, db_identifier="testdb_reg", hosts=choose_host(wlmutils), ) - # Create the SmartSim Model - smartsim_model = exp.create_model("smartsim_model", run_settings) - db_args = { "port": test_port + 1, "db_cpus": 1, @@ -359,17 +343,17 @@ def test_multidb_standard_then_colo(fileutils, make_test_dir, wlmutils, coloutil fileutils, db_type, exp, - "send_data_local_smartredis_with_dbid.py", + test_script, db_args, + on_wlm = on_wlm, ) - with make_entity_context(exp, db) as db, \ make_entity_context(exp, smartsim_model) as smartsim_model: exp.start(db) exp.start(smartsim_model, block=True) - assert all(stat is not status.STATUS_FAILED for stat in exp.get_status(db, smartsim_model)) + check_not_failed(exp, smartsim_model, db) @pytest.mark.parametrize("db_type", supported_dbs) @@ -388,14 +372,6 @@ def test_multidb_colo_then_standard(fileutils, make_test_dir, wlmutils, coloutil "test_multidb_colo_then_standard", exp_path=test_dir, launcher=test_launcher ) - # create run settings - run_settings = exp.create_run_settings("python", test_script) - run_settings.set_nodes(1) - run_settings.set_tasks_per_node(1) - - smartsim_model = exp.create_model("smartsim_model", run_settings) - smartsim_model.set_path(test_dir) - db_args = { "port": test_port, "db_cpus": 1, @@ -408,31 +384,23 @@ def test_multidb_colo_then_standard(fileutils, make_test_dir, wlmutils, coloutil fileutils, db_type, exp, - "send_data_local_smartredis_with_dbid.py", + test_script, db_args, + on_wlm = on_wlm ) # create and start an instance of the Orchestrator database db 
= exp.create_database( port=test_port + 1, interface=test_interface, db_identifier="testdb_reg", - hosts=wlmutils.get_test_hostlist(), + hosts=choose_host(wlmutils), ) - exp.generate(db, smartsim_model) - - exp.start(db) - exp.start(smartsim_model) - - # test restart colocated db - exp.start(smartsim_model) - exp.stop(db) - - # test restart standard db - exp.start(db) + with make_entity_context(exp, db) as db, \ + make_entity_context(exp, smartsim_model) as smartsim_model: + exp.start(db) + exp.start(smartsim_model, block=True) - exp.stop(smartsim_model) - exp.stop(db) - print(exp.summary()) + check_not_failed(exp, db, smartsim_model) @pytest.mark.skipif( @@ -459,15 +427,9 @@ def test_launch_cluster_orc_single_dbid(make_test_dir, wlmutils): hosts=wlmutils.get_test_hostlist(), db_identifier="testdb_reg", ) - exp.generate(orc) - exp.start(orc, block=True) - statuses = exp.get_status(orc) - # don't use assert so that orc we don't leave an orphan process - if status.STATUS_FAILED in statuses: - exp.stop(orc) - assert False + with make_entity_context(exp, orc) as orc: + exp.start(orc, block=True) - exp.stop(orc) statuses = exp.get_status(orc) assert all([stat == status.STATUS_CANCELLED for stat in statuses]) From 9f8a6239cc8b610b9d82f8f523c0e031b3fc5c31 Mon Sep 17 00:00:00 2001 From: Andrew Shao Date: Wed, 18 Oct 2023 16:42:25 -0500 Subject: [PATCH 18/64] Fix one last typo --- tests/test_multidb.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_multidb.py b/tests/test_multidb.py index 448b02b49..912897779 100644 --- a/tests/test_multidb.py +++ b/tests/test_multidb.py @@ -85,7 +85,7 @@ def test_db_identifier_standard_then_colo_error( port=test_port, interface=test_interface, db_identifier="testdb_colo", hosts=choose_host(wlmutils), ) - assert orc.name == "test_db_colo" + assert orc.name == "testdb_colo" db_args = { "port": test_port + 1, From 3769c90fe72a0d25826dd74579d8b7b3af6042a6 Mon Sep 17 00:00:00 2001 From: Ale Rigazzi Date: Thu, 19 Oct 2023 09:06:48 -0500 Subject: [PATCH 19/64] Make reset_hosts work on LSF --- smartsim/_core/control/controller.py | 15 ++++++--------- smartsim/database/orchestrator.py | 21 +++++++++------------ tests/test_multidb.py | 3 +-- 3 files changed, 16 insertions(+), 23 deletions(-) diff --git a/smartsim/_core/control/controller.py b/smartsim/_core/control/controller.py index cea2ed233..f7cf7e373 100644 --- a/smartsim/_core/control/controller.py +++ b/smartsim/_core/control/controller.py @@ -25,44 +25,41 @@ # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
from __future__ import annotations -import itertools +import itertools import os.path as osp -from os import environ import pickle import signal import threading import time import typing as t +from os import environ from smartredis import Client, ConfigOptions from smartsim._core.utils.network import get_ip_from_host from ..._core.launcher.step import Step +from ..._core.utils.helpers import unpack_colo_db_identifier, unpack_db_identifier from ..._core.utils.redis import ( db_is_active, set_ml_model, set_script, shutdown_db_node, ) -from ..._core.utils.helpers import ( - unpack_db_identifier, - unpack_colo_db_identifier, -) from ...database import Orchestrator from ...entity import Ensemble, EntityList, EntitySequence, Model, SmartSimEntity from ...error import ( LauncherError, SmartSimError, + SSDBIDConflictError, SSInternalError, SSUnsupportedError, - SSDBIDConflictError, ) from ...log import get_logger +from ...servertype import CLUSTERED, STANDALONE from ...settings.base import BatchSettings from ...status import STATUS_CANCELLED, STATUS_RUNNING, TERMINAL_STATUSES -from ...servertype import STANDALONE, CLUSTERED from ..config import CONFIG from ..launcher import ( CobaltLauncher, @@ -235,7 +232,7 @@ def stop_db(self, db: Orchestrator) -> None: job.set_status(STATUS_CANCELLED, "", 0, output=None, error=None) self._jobs.move_to_completed(job) - db.clear_hosts() + db.reset_hosts() def stop_entity_list(self, entity_list: EntitySequence[SmartSimEntity]) -> None: """Stop an instance of an entity list diff --git a/smartsim/database/orchestrator.py b/smartsim/database/orchestrator.py index 28b354f28..589a5d821 100644 --- a/smartsim/database/orchestrator.py +++ b/smartsim/database/orchestrator.py @@ -198,6 +198,7 @@ def __init__( self.ports: t.List[int] = [] self.path = getcwd() self._hosts: t.List[str] = [] + self._user_hostlist: t.List[str] = [] if isinstance(interface, str): interface = [interface] self._interfaces = interface @@ -305,13 +306,15 @@ def hosts(self) -> t.List[str]: self._hosts = self._get_db_hosts() return self._hosts - def clear_hosts(self) -> None: - """Clears the list of hosts for this orchestrator. 
+ def reset_hosts(self) -> None: + """Clear hosts or reset them to last user choice """ for node in self.entities: node.clear_hosts() - self._hosts = [] + # This is only needed on LSF + if self._user_hostlist: + self.set_hosts(self._user_hostlist) def remove_stale_files(self) -> None: """Can be used to remove database files of a previous launch""" @@ -415,7 +418,7 @@ def set_walltime(self, walltime: str) -> None: if hasattr(self, "batch_settings") and self.batch_settings: self.batch_settings.set_walltime(walltime) - def set_hosts(self, host_list: t.List[str]) -> None: + def set_hosts(self, host_list: t.List[str] | str) -> None: """Specify the hosts for the ``Orchestrator`` to launch on :param host_list: list of host (compute node names) @@ -428,6 +431,7 @@ def set_hosts(self, host_list: t.List[str]) -> None: raise TypeError("host_list argument must be a list of strings") if not all(isinstance(host, str) for host in host_list): raise TypeError("host_list argument must be list of strings") + self._user_hostlist = host_list # TODO check length if self.batch: if hasattr(self, "batch_settings") and self.batch_settings: @@ -441,8 +445,7 @@ def set_hosts(self, host_list: t.List[str]) -> None: and isinstance(self.entities[0].run_settings, PalsMpiexecSettings) and self.entities[0].is_mpmd ): - # In this case, --hosts is a global option, we only set it to the - # first run command + # In this case, --hosts is a global option, set it to first run command self.entities[0].run_settings.set_hostlist(host_list) else: for host, db in zip(host_list, self.entities): @@ -694,7 +697,6 @@ def _build_run_settings_lsf( **_kwargs: t.Any # Needed to ensure no API break and do not want to # introduce that possibility, even if this method is # protected, without running the test suite. - # TODO: Test against an LSF system before merge! 
) -> t.Optional[JsrunSettings]: run_args = {} if run_args is None else run_args erf_rs: t.Optional[JsrunSettings] = None @@ -738,9 +740,6 @@ def _build_run_settings_lsf( return erf_rs - # Old pylint from TF 2.6.x does not understand that this argument list is - # equivalent to `(self, **kwargs)` - # # pylint: disable-next=arguments-differ def _initialize_entities( self, *, @@ -775,7 +774,6 @@ def _initialize_entities( start_script_args = self._get_start_script_args( db_node_name, port, cluster ) - # if only launching 1 db per command, we don't need a # list of exe args lists run_settings = self._build_run_settings( @@ -829,7 +827,6 @@ def _initialize_entities_mpmd( node = DBNode(self.name, self.path, run_settings, [port], output_files) self.entities.append(node) - self.ports = [port] def _get_start_script_args( diff --git a/tests/test_multidb.py b/tests/test_multidb.py index 912897779..d89e656d4 100644 --- a/tests/test_multidb.py +++ b/tests/test_multidb.py @@ -31,7 +31,6 @@ from smartsim.error.errors import SSDBIDConflictError from smartsim.log import get_logger -from smartsim.entity.dbobject import DBScript from smartredis import * @@ -241,7 +240,7 @@ def test_db_identifier_create_standard_once(make_test_dir, wlmutils): check_not_failed(exp, db) -def test_multidb_create_standard_twice(fileutils, wlmutils, make_test_dir): +def test_multidb_create_standard_twice(wlmutils, make_test_dir): """Multiple calls to create database with unique db_identifiers""" # Retrieve parameters from testing environment From c959e173ea973b3b7db3a16d0743449a1f6f14c4 Mon Sep 17 00:00:00 2001 From: Ale Rigazzi Date: Thu, 19 Oct 2023 10:57:08 -0500 Subject: [PATCH 20/64] Comply to mypy syntax for union --- smartsim/database/orchestrator.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/smartsim/database/orchestrator.py b/smartsim/database/orchestrator.py index 589a5d821..31b8defad 100644 --- a/smartsim/database/orchestrator.py +++ b/smartsim/database/orchestrator.py @@ -418,7 +418,7 @@ def set_walltime(self, walltime: str) -> None: if hasattr(self, "batch_settings") and self.batch_settings: self.batch_settings.set_walltime(walltime) - def set_hosts(self, host_list: t.List[str] | str) -> None: + def set_hosts(self, host_list: t.Union[t.List[str], str]) -> None: """Specify the hosts for the ``Orchestrator`` to launch on :param host_list: list of host (compute node names) From 336bebd6a3481c3d712bca622ee0d367613359df Mon Sep 17 00:00:00 2001 From: Al Rigazzi Date: Wed, 25 Oct 2023 20:25:58 +0200 Subject: [PATCH 21/64] Update signatures in conftest.py --- conftest.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/conftest.py b/conftest.py index d560c36aa..45701abbe 100644 --- a/conftest.py +++ b/conftest.py @@ -406,7 +406,7 @@ def local_db( @pytest.fixture def db( - request: t.Any, wlmutils: t.Type[WLMUtils], make_test_dir: t.Any + request: t.Any, wlmutils: t.Type[WLMUtils], make_test_dir: str ) -> t.Generator[Orchestrator, None, None]: """Yield fixture for startup and teardown of an orchestrator""" launcher = wlmutils.get_test_launcher() @@ -426,7 +426,7 @@ def db( @pytest.fixture def db_cluster( - make_test_dir: t.Any, wlmutils: t.Type[WLMUtils], request: t.Any + make_test_dir: str, wlmutils: t.Type[WLMUtils], request: t.Any ) -> t.Generator[Orchestrator, None, None]: """ Yield fixture for startup and teardown of a clustered orchestrator. 
@@ -548,7 +548,7 @@ def _sanitize_caller_function(caller_function: str) -> str: caller_function_list = caller_function.split("[", maxsplit=1) def is_accepted_char(char: str): - return char.isalnum or char in "-." + return char.isalnum() or char in "-._" if len(caller_function_list) > 1: caller_function_list[1] = "".join( @@ -559,9 +559,9 @@ def is_accepted_char(char: str): @pytest.fixture -def get_test_dir(request: t.Optional[pytest.FixtureRequest]): +def get_test_dir(request: pytest.FixtureRequest): caller_function = _sanitize_caller_function(request.node.name) - dir_path = FileUtils._test_dir_path(caller_function, request.node.fspath) + dir_path = FileUtils._test_dir_path(caller_function, str(request.path)) if not os.path.exists(os.path.dirname(dir_path)): os.makedirs(os.path.dirname(dir_path)) @@ -570,9 +570,9 @@ def get_test_dir(request: t.Optional[pytest.FixtureRequest]): @pytest.fixture -def make_test_dir(request: t.Optional[pytest.FixtureRequest]): +def make_test_dir(request: pytest.FixtureRequest): caller_function = _sanitize_caller_function(request.node.name) - dir_path = FileUtils._test_dir_path(caller_function, request.node.fspath) + dir_path = FileUtils._test_dir_path(caller_function, str(request.path)) try: os.makedirs(dir_path) From a343e2779ff8a45b7dad2ae9302a2541b5d5ae52 Mon Sep 17 00:00:00 2001 From: Al Rigazzi Date: Thu, 26 Oct 2023 16:37:18 +0200 Subject: [PATCH 22/64] Address reviewer's comments --- conftest.py | 46 +++++++------------ smartsim/_core/control/jobmanager.py | 2 +- smartsim/database/orchestrator.py | 2 +- smartsim/settings/pbsSettings.py | 1 - tests/backends/test_cli_mini_exp.py | 4 +- tests/backends/test_dataloader.py | 12 ++--- tests/backends/test_dbmodel.py | 32 ++++++------- tests/backends/test_dbscript.py | 24 +++++----- tests/backends/test_onnx.py | 4 +- tests/backends/test_tf.py | 8 ++-- tests/backends/test_torch.py | 4 +- tests/full_wlm/test_generic_batch_launch.py | 12 ++--- .../full_wlm/test_generic_orc_launch_batch.py | 19 ++++---- tests/full_wlm/test_mpmd.py | 4 +- tests/on_wlm/test_base_settings_on_wlm.py | 8 ++-- tests/on_wlm/test_colocated_model.py | 24 +++++----- tests/on_wlm/test_containers_wlm.py | 4 +- tests/on_wlm/test_generic_orc_launch.py | 12 ++--- tests/on_wlm/test_launch_errors.py | 8 ++-- tests/on_wlm/test_launch_ompi_lsf.py | 4 +- tests/on_wlm/test_local_step.py | 8 ++-- tests/on_wlm/test_restart.py | 4 +- .../test_simple_base_settings_on_wlm.py | 8 ++-- tests/on_wlm/test_simple_entity_launch.py | 12 ++--- tests/on_wlm/test_stop.py | 8 ++-- tests/test_colo_model_local.py | 24 +++++----- tests/test_config.py | 20 ++++---- tests/test_containers.py | 12 ++--- tests/test_dbnode.py | 4 +- tests/test_experiment.py | 16 +++---- tests/test_generator.py | 24 +++++----- tests/test_interrupt.py | 8 ++-- tests/test_launch_errors.py | 8 ++-- tests/test_local_launch.py | 8 ++-- tests/test_local_multi_run.py | 4 +- tests/test_local_restart.py | 8 ++-- tests/test_modelwriter.py | 16 +++---- tests/test_multidb.py | 36 +++++++-------- tests/test_orchestrator.py | 8 ++-- tests/test_pals_settings.py | 8 ++-- tests/test_reconnect_orchestrator.py | 4 +- tests/test_smartredis.py | 8 ++-- 42 files changed, 237 insertions(+), 253 deletions(-) diff --git a/conftest.py b/conftest.py index 45701abbe..c11d9b20c 100644 --- a/conftest.py +++ b/conftest.py @@ -28,7 +28,6 @@ import json import os -import inspect import pytest import psutil import shutil @@ -56,7 +55,7 @@ # Globals, yes, but its a testing file test_path = 
os.path.dirname(os.path.abspath(__file__)) -test_dir = os.path.join(test_path, "tests", "test_output") +test_output_root = os.path.join(test_path, "tests", "test_output") test_launcher = CONFIG.test_launcher test_device = CONFIG.test_device test_num_gpus = CONFIG.test_num_gpus @@ -84,7 +83,7 @@ def print_test_configuration() -> None: print("TEST_NETWORK_INTERFACE (WLM only):", test_nic) if test_alloc_specs_path: print("TEST_ALLOC_SPEC_SHEET_PATH:", test_alloc_specs_path) - print("TEST_DIR:", test_dir) + print("TEST_DIR:", test_output_root) print("Test output will be located in TEST_DIR if there is a failure") print( "TEST_PORTS:", ", ".join(str(port) for port in range(test_port, test_port + 3)) @@ -110,9 +109,9 @@ def pytest_sessionstart( Called after the Session object has been created and before performing collection and entering the run test loop. """ - if os.path.isdir(test_dir): - shutil.rmtree(test_dir) - os.makedirs(test_dir) + if os.path.isdir(test_output_root): + shutil.rmtree(test_output_root) + os.makedirs(test_output_root) print_test_configuration() @@ -124,7 +123,7 @@ def pytest_sessionfinish( returning the exit status to the system. """ if exitstatus == 0: - shutil.rmtree(test_dir) + shutil.rmtree(test_output_root) else: # kill all spawned processes in case of error kill_all_test_spawned_processes() @@ -387,13 +386,13 @@ def get_orchestrator(nodes: int = 1, batch: bool = False) -> Orchestrator: @pytest.fixture def local_db( - request: t.Any, wlmutils: t.Type[WLMUtils], make_test_dir: t.Any + request: t.Any, wlmutils: t.Type[WLMUtils], test_dir: str ) -> t.Generator[Orchestrator, None, None]: """Yield fixture for startup and teardown of an local orchestrator""" exp_name = request.function.__name__ exp = Experiment(exp_name, launcher="local") - test_dir = make_test_dir + test_dir = test_dir db = Orchestrator(port=wlmutils.get_test_port(), interface="lo") db.set_path(test_dir) exp.start(db) @@ -406,14 +405,14 @@ def local_db( @pytest.fixture def db( - request: t.Any, wlmutils: t.Type[WLMUtils], make_test_dir: str + request: t.Any, wlmutils: t.Type[WLMUtils], test_dir: str ) -> t.Generator[Orchestrator, None, None]: """Yield fixture for startup and teardown of an orchestrator""" launcher = wlmutils.get_test_launcher() exp_name = request.function.__name__ exp = Experiment(exp_name, launcher=launcher) - test_dir = make_test_dir + test_dir = test_dir db = wlmutils.get_orchestrator() db.set_path(test_dir) exp.start(db) @@ -426,7 +425,7 @@ def db( @pytest.fixture def db_cluster( - make_test_dir: str, wlmutils: t.Type[WLMUtils], request: t.Any + test_dir: str, wlmutils: t.Type[WLMUtils], request: t.Any ) -> t.Generator[Orchestrator, None, None]: """ Yield fixture for startup and teardown of a clustered orchestrator. 
@@ -436,7 +435,7 @@ def db_cluster( exp_name = request.function.__name__ exp = Experiment(exp_name, launcher=launcher) - test_dir = make_test_dir + test_dir = test_dir db = wlmutils.get_orchestrator(nodes=3) db.set_path(test_dir) exp.start(db) @@ -559,20 +558,9 @@ def is_accepted_char(char: str): @pytest.fixture -def get_test_dir(request: pytest.FixtureRequest): +def test_dir(request: pytest.FixtureRequest): caller_function = _sanitize_caller_function(request.node.name) - dir_path = FileUtils._test_dir_path(caller_function, str(request.path)) - - if not os.path.exists(os.path.dirname(dir_path)): - os.makedirs(os.path.dirname(dir_path)) - - return dir_path - - -@pytest.fixture -def make_test_dir(request: pytest.FixtureRequest): - caller_function = _sanitize_caller_function(request.node.name) - dir_path = FileUtils._test_dir_path(caller_function, str(request.path)) + dir_path = FileUtils.get_test_dir_path(caller_function, str(request.path)) try: os.makedirs(dir_path) @@ -588,10 +576,10 @@ def fileutils() -> t.Type[FileUtils]: class FileUtils: @staticmethod - def _test_dir_path(caller_function: str, caller_fspath: str) -> str: + def get_test_dir_path(caller_function: str, caller_fspath: str) -> str: caller_file_to_dir = os.path.splitext(str(caller_fspath))[0] - rel_path = os.path.relpath(caller_file_to_dir, os.path.dirname(test_dir)) - dir_path = os.path.join(test_dir, rel_path, caller_function) + rel_path = os.path.relpath(caller_file_to_dir, os.path.dirname(test_output_root)) + dir_path = os.path.join(test_output_root, rel_path, caller_function) return dir_path @staticmethod diff --git a/smartsim/_core/control/jobmanager.py b/smartsim/_core/control/jobmanager.py index eed82de92..dd3ebe405 100644 --- a/smartsim/_core/control/jobmanager.py +++ b/smartsim/_core/control/jobmanager.py @@ -163,7 +163,7 @@ def __call__(self) -> t.Dict[str, Job]: def __contains__(self, key: str) -> bool: try: - _ = self[key] + self[key] # pylint: disable=pointless-statement return True except KeyError: return False diff --git a/smartsim/database/orchestrator.py b/smartsim/database/orchestrator.py index 31b8defad..bfc9594f7 100644 --- a/smartsim/database/orchestrator.py +++ b/smartsim/database/orchestrator.py @@ -431,7 +431,7 @@ def set_hosts(self, host_list: t.Union[t.List[str], str]) -> None: raise TypeError("host_list argument must be a list of strings") if not all(isinstance(host, str) for host in host_list): raise TypeError("host_list argument must be list of strings") - self._user_hostlist = host_list + self._user_hostlist = host_list.copy() # TODO check length if self.batch: if hasattr(self, "batch_settings") and self.batch_settings: diff --git a/smartsim/settings/pbsSettings.py b/smartsim/settings/pbsSettings.py index 1b337874e..19b882f5d 100644 --- a/smartsim/settings/pbsSettings.py +++ b/smartsim/settings/pbsSettings.py @@ -236,5 +236,4 @@ def _create_resource_list(self) -> t.List[str]: for resource, value in self.resources.items(): if resource not in ["select", "walltime", "place"]: res += [f"-l {resource}={str(value)}"] - print(res) return res diff --git a/tests/backends/test_cli_mini_exp.py b/tests/backends/test_cli_mini_exp.py index ea9abe066..90d4dbd38 100644 --- a/tests/backends/test_cli_mini_exp.py +++ b/tests/backends/test_cli_mini_exp.py @@ -47,7 +47,7 @@ def test_cli_mini_exp_doesnt_error_out_with_dev_build( local_db, - get_test_dir, + test_dir, monkeypatch, ): """Presumably devs running the test suite have built SS correctly. 
@@ -71,7 +71,7 @@ def _mock_make_managed_local_orc(*a, **kw): smartsim._core._cli.validate.test_install( # Shouldn't matter bc we are stubbing creation of orc # but best to give it "correct" vals for safety - location=get_test_dir, + location=test_dir, port=db_port, # Always test on CPU, heads don't always have GPU device="CPU", diff --git a/tests/backends/test_dataloader.py b/tests/backends/test_dataloader.py index ea4859ad4..771c8937b 100644 --- a/tests/backends/test_dataloader.py +++ b/tests/backends/test_dataloader.py @@ -155,8 +155,8 @@ def train_tf(generator): @pytest.mark.skipif(not shouldrun_tf, reason="Test needs TensorFlow to run") -def test_tf_dataloaders(make_test_dir, wlmutils): - test_dir = make_test_dir +def test_tf_dataloaders(test_dir, wlmutils): + test_dir = test_dir exp = Experiment("test_tf_dataloaders", test_dir, launcher=wlmutils.get_test_launcher()) orc: Orchestrator = wlmutils.get_orchestrator() exp.generate(orc) @@ -221,8 +221,8 @@ def create_trainer_torch(experiment: Experiment, filedir, wlmutils): @pytest.mark.skipif(not shouldrun_torch, reason="Test needs Torch to run") -def test_torch_dataloaders(fileutils, make_test_dir, wlmutils): - test_dir = make_test_dir +def test_torch_dataloaders(fileutils, test_dir, wlmutils): + test_dir = test_dir exp = Experiment("test_tf_dataloaders", test_dir, launcher=wlmutils.get_test_launcher()) orc: Orchestrator = wlmutils.get_orchestrator() config_dir = fileutils.get_test_dir_path("ml") @@ -317,8 +317,8 @@ def test_data_info_repr(): @pytest.mark.skipif( not (shouldrun_torch or shouldrun_tf), reason="Requires TF or PyTorch" ) -def test_wrong_dataloaders(make_test_dir, wlmutils): - test_dir = make_test_dir +def test_wrong_dataloaders(test_dir, wlmutils): + test_dir = test_dir exp = Experiment("test-wrong-dataloaders", exp_path=test_dir, launcher=wlmutils.get_test_launcher()) orc = wlmutils.get_orchestrator() exp.generate(orc) diff --git a/tests/backends/test_dbmodel.py b/tests/backends/test_dbmodel.py index 591fd8512..4a94e1881 100644 --- a/tests/backends/test_dbmodel.py +++ b/tests/backends/test_dbmodel.py @@ -150,7 +150,7 @@ def save_torch_cnn(path, file_name): @pytest.mark.skipif(not should_run_tf, reason="Test needs TF to run") -def test_tf_db_model(fileutils, make_test_dir, wlmutils, mlutils): +def test_tf_db_model(fileutils, test_dir, wlmutils, mlutils): """Test TensorFlow DB Models on remote DB""" # Set experiment name @@ -162,7 +162,7 @@ def test_tf_db_model(fileutils, make_test_dir, wlmutils, mlutils): test_port = wlmutils.get_test_port() test_device = mlutils.get_test_device() test_num_gpus = mlutils.get_test_num_gpus() - test_dir = make_test_dir + test_dir = test_dir test_script = fileutils.get_test_conf_path("run_tf_dbmodel_smartredis.py") # Create the SmartSim Experiment @@ -228,7 +228,7 @@ def test_tf_db_model(fileutils, make_test_dir, wlmutils, mlutils): @pytest.mark.skipif(not should_run_pt, reason="Test needs PyTorch to run") -def test_pt_db_model(fileutils, make_test_dir, wlmutils, mlutils): +def test_pt_db_model(fileutils, test_dir, wlmutils, mlutils): """Test PyTorch DB Models on remote DB""" # Set experiment name @@ -240,7 +240,7 @@ def test_pt_db_model(fileutils, make_test_dir, wlmutils, mlutils): test_port = wlmutils.get_test_port() test_device = mlutils.get_test_device() test_num_gpus = mlutils.get_test_num_gpus() - test_dir = make_test_dir + test_dir = test_dir test_script = fileutils.get_test_conf_path("run_pt_dbmodel_smartredis.py") # Create the SmartSim Experiment @@ -294,7 +294,7 @@ def 
diff --git a/tests/backends/test_dbmodel.py b/tests/backends/test_dbmodel.py
index 591fd8512..4a94e1881 100644
--- a/tests/backends/test_dbmodel.py
+++ b/tests/backends/test_dbmodel.py
@@ -150,7 +150,7 @@ def save_torch_cnn(path, file_name):
 
 
 @pytest.mark.skipif(not should_run_tf, reason="Test needs TF to run")
-def test_tf_db_model(fileutils, make_test_dir, wlmutils, mlutils):
+def test_tf_db_model(fileutils, test_dir, wlmutils, mlutils):
     """Test TensorFlow DB Models on remote DB"""
 
     # Set experiment name
@@ -162,7 +162,7 @@ def test_tf_db_model(fileutils, make_test_dir, wlmutils, mlutils):
     test_port = wlmutils.get_test_port()
     test_device = mlutils.get_test_device()
     test_num_gpus = mlutils.get_test_num_gpus()
-    test_dir = make_test_dir
+    test_dir = test_dir
     test_script = fileutils.get_test_conf_path("run_tf_dbmodel_smartredis.py")
 
     # Create the SmartSim Experiment
@@ -228,7 +228,7 @@
 
 
 @pytest.mark.skipif(not should_run_pt, reason="Test needs PyTorch to run")
-def test_pt_db_model(fileutils, make_test_dir, wlmutils, mlutils):
+def test_pt_db_model(fileutils, test_dir, wlmutils, mlutils):
     """Test PyTorch DB Models on remote DB"""
 
     # Set experiment name
@@ -240,7 +240,7 @@ def test_pt_db_model(fileutils, make_test_dir, wlmutils, mlutils):
     test_port = wlmutils.get_test_port()
     test_device = mlutils.get_test_device()
     test_num_gpus = mlutils.get_test_num_gpus()
-    test_dir = make_test_dir
+    test_dir = test_dir
     test_script = fileutils.get_test_conf_path("run_pt_dbmodel_smartredis.py")
 
     # Create the SmartSim Experiment
@@ -294,7 +294,7 @@ def test_pt_db_model(fileutils, make_test_dir, wlmutils, mlutils):
 
 
 @pytest.mark.skipif(not should_run_tf, reason="Test needs TF to run")
-def test_db_model_ensemble(fileutils, make_test_dir, wlmutils, mlutils):
+def test_db_model_ensemble(fileutils, test_dir, wlmutils, mlutils):
     """Test DBModels on remote DB, with an ensemble"""
 
     # Set experiment name
@@ -306,7 +306,7 @@ def test_db_model_ensemble(fileutils, make_test_dir, wlmutils, mlutils):
     test_port = wlmutils.get_test_port()
     test_device = mlutils.get_test_device()
     test_num_gpus = mlutils.get_test_num_gpus()
-    test_dir = make_test_dir
+    test_dir = test_dir
     test_script = fileutils.get_test_conf_path("run_tf_dbmodel_smartredis.py")
 
     # Create the SmartSim Experiment
@@ -393,7 +393,7 @@ def test_db_model_ensemble(fileutils, make_test_dir, wlmutils, mlutils):
 
 
 @pytest.mark.skipif(not should_run_tf, reason="Test needs TF to run")
-def test_colocated_db_model_tf(fileutils, make_test_dir, wlmutils, mlutils):
+def test_colocated_db_model_tf(fileutils, test_dir, wlmutils, mlutils):
     """Test DB Models on colocated DB (TensorFlow backend)"""
 
     # Set experiment name
@@ -405,7 +405,7 @@ def test_colocated_db_model_tf(fileutils, make_test_dir, wlmutils, mlutils):
     test_port = wlmutils.get_test_port()
     test_device = mlutils.get_test_device()
     test_num_gpus = mlutils.get_test_num_gpus()
-    test_dir = make_test_dir
+    test_dir = test_dir
     test_script = fileutils.get_test_conf_path("run_tf_dbmodel_smartredis.py")
 
     # Create SmartSim Experience
@@ -465,7 +465,7 @@ def test_colocated_db_model_tf(fileutils, make_test_dir, wlmutils, mlutils):
     exp.stop(colo_model)
 
 
 @pytest.mark.skipif(not should_run_pt, reason="Test needs PyTorch to run")
-def test_colocated_db_model_pytorch(fileutils, make_test_dir, wlmutils, mlutils):
+def test_colocated_db_model_pytorch(fileutils, test_dir, wlmutils, mlutils):
     """Test DB Models on colocated DB (PyTorch backend)"""
 
     # Set experiment name
@@ -477,7 +477,7 @@ def test_colocated_db_model_pytorch(fileutils, make_test_dir, wlmutils, mlutils)
     test_port = wlmutils.get_test_port()
     test_device = mlutils.get_test_device()
     test_num_gpus = mlutils.get_test_num_gpus()
-    test_dir = make_test_dir
+    test_dir = test_dir
     test_script = fileutils.get_test_conf_path("run_pt_dbmodel_smartredis.py")
 
     # Create the SmartSim Experiment
@@ -525,7 +525,7 @@ def test_colocated_db_model_pytorch(fileutils, make_test_dir, wlmutils, mlutils)
 
 
 @pytest.mark.skipif(not should_run_tf, reason="Test needs TF to run")
-def test_colocated_db_model_ensemble(fileutils, make_test_dir, wlmutils, mlutils):
+def test_colocated_db_model_ensemble(fileutils, test_dir, wlmutils, mlutils):
     """Test DBModel on colocated ensembles, first colocating DB,
     then adding DBModel.
     """
@@ -539,7 +539,7 @@ def test_colocated_db_model_ensemble(fileutils, make_test_dir, wlmutils, mlutils
     test_port = wlmutils.get_test_port()
     test_device = mlutils.get_test_device()
     test_num_gpus = mlutils.get_test_num_gpus()
-    test_dir = make_test_dir
+    test_dir = test_dir
     test_script = fileutils.get_test_conf_path("run_tf_dbmodel_smartredis.py")
 
     # Create the SmartSim Experiment
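The colocated DBModel tests above all funnel through the same two steps: colocate a database with the model, then register an ML model for RedisAI to load at startup. A hedged sketch of that flow (the driver script, graph file, tensor names, and port are placeholders, not values from these tests):

    from smartsim import Experiment

    exp = Experiment("colo-demo", launcher="local")
    settings = exp.create_run_settings(exe="python", exe_args="driver.py")  # driver.py assumed
    colo_model = exp.create_model("colo_model", settings)
    colo_model.colocate_db_tcp(port=6780, db_cpus=1)

    # Ask RedisAI to serve a frozen TensorFlow graph under the key "cnn"
    colo_model.add_ml_model(
        "cnn",
        "TF",
        model_path="model.pb",   # assumed frozen-graph file on disk
        device="CPU",
        inputs=["args_0"],
        outputs=["Identity"],
    )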
""" @@ -644,7 +644,7 @@ def test_colocated_db_model_ensemble_reordered(fileutils, make_test_dir, wlmutil test_port = wlmutils.get_test_port() test_device = mlutils.get_test_device() test_num_gpus = mlutils.get_test_num_gpus() - test_dir = make_test_dir + test_dir = test_dir test_script = fileutils.get_test_conf_path("run_tf_dbmodel_smartredis.py") # Create the SmartSim Experiment @@ -734,7 +734,7 @@ def test_colocated_db_model_ensemble_reordered(fileutils, make_test_dir, wlmutil @pytest.mark.skipif(not should_run_tf, reason="Test needs TF to run") -def test_colocated_db_model_errors(fileutils, make_test_dir, wlmutils, mlutils): +def test_colocated_db_model_errors(fileutils, test_dir, wlmutils, mlutils): """Test error when colocated db model has no file.""" # Set experiment name @@ -746,7 +746,7 @@ def test_colocated_db_model_errors(fileutils, make_test_dir, wlmutils, mlutils): test_port = wlmutils.get_test_port() test_device = mlutils.get_test_device() test_num_gpus = mlutils.get_test_num_gpus() - test_dir = make_test_dir + test_dir = test_dir test_script = fileutils.get_test_conf_path("run_tf_dbmodel_smartredis.py") # Create SmartSim Experiment diff --git a/tests/backends/test_dbscript.py b/tests/backends/test_dbscript.py index 8b69ed5f3..6df908fb1 100644 --- a/tests/backends/test_dbscript.py +++ b/tests/backends/test_dbscript.py @@ -64,7 +64,7 @@ def timestwo(x): @pytest.mark.skipif(not should_run, reason="Test needs Torch to run") -def test_db_script(fileutils, make_test_dir, wlmutils, mlutils): +def test_db_script(fileutils, test_dir, wlmutils, mlutils): """Test DB scripts on remote DB""" # Set experiment name @@ -76,7 +76,7 @@ def test_db_script(fileutils, make_test_dir, wlmutils, mlutils): test_port = wlmutils.get_test_port() test_device = mlutils.get_test_device() test_num_gpus = mlutils.get_test_num_gpus() - test_dir = make_test_dir + test_dir = test_dir test_script = fileutils.get_test_conf_path("run_dbscript_smartredis.py") torch_script = fileutils.get_test_conf_path("torchscript.py") @@ -140,7 +140,7 @@ def test_db_script(fileutils, make_test_dir, wlmutils, mlutils): @pytest.mark.skipif(not should_run, reason="Test needs Torch to run") -def test_db_script_ensemble(fileutils, make_test_dir, wlmutils, mlutils): +def test_db_script_ensemble(fileutils, test_dir, wlmutils, mlutils): """Test DB scripts on remote DB""" # Set experiment name @@ -152,7 +152,7 @@ def test_db_script_ensemble(fileutils, make_test_dir, wlmutils, mlutils): test_port = wlmutils.get_test_port() test_device = mlutils.get_test_device() test_num_gpus = mlutils.get_test_num_gpus() - test_dir = make_test_dir + test_dir = test_dir test_script = fileutils.get_test_conf_path("run_dbscript_smartredis.py") torch_script = fileutils.get_test_conf_path("torchscript.py") @@ -236,7 +236,7 @@ def test_db_script_ensemble(fileutils, make_test_dir, wlmutils, mlutils): @pytest.mark.skipif(not should_run, reason="Test needs Torch to run") -def test_colocated_db_script(fileutils, make_test_dir, wlmutils, mlutils): +def test_colocated_db_script(fileutils, test_dir, wlmutils, mlutils): """Test DB Scripts on colocated DB""" # Set the experiment name @@ -248,7 +248,7 @@ def test_colocated_db_script(fileutils, make_test_dir, wlmutils, mlutils): test_port = wlmutils.get_test_port() test_device = mlutils.get_test_device() test_num_gpus = mlutils.get_test_num_gpus() - test_dir = make_test_dir + test_dir = test_dir test_script = fileutils.get_test_conf_path("run_dbscript_smartredis.py") torch_script = 
@@ -303,7 +303,7 @@ def test_colocated_db_script(fileutils, make_test_dir, wlmutils, mlutils):
 
 
 @pytest.mark.skipif(not should_run, reason="Test needs Torch to run")
-def test_colocated_db_script_ensemble(fileutils, make_test_dir, wlmutils, mlutils):
+def test_colocated_db_script_ensemble(fileutils, test_dir, wlmutils, mlutils):
     """Test DB Scripts on colocated DB from ensemble, first colocating DB,
     then adding script.
     """
@@ -317,7 +317,7 @@ def test_colocated_db_script_ensemble(fileutils, make_test_dir, wlmutil
     test_port = wlmutils.get_test_port()
     test_device = mlutils.get_test_device()
     test_num_gpus = mlutils.get_test_num_gpus()
-    test_dir = make_test_dir
+    test_dir = test_dir
     test_script = fileutils.get_test_conf_path("run_dbscript_smartredis.py")
     torch_script = fileutils.get_test_conf_path("torchscript.py")
 
@@ -403,7 +403,7 @@ def test_colocated_db_script_ensemble(fileutils, make_test_dir, wlmutil
 
 
 @pytest.mark.skipif(not should_run, reason="Test needs Torch to run")
-def test_colocated_db_script_ensemble_reordered(fileutils, make_test_dir, wlmutils, mlutils):
+def test_colocated_db_script_ensemble_reordered(fileutils, test_dir, wlmutils, mlutils):
     """Test DB Scripts on colocated DB from ensemble, first adding the
     script to the ensemble, then colocating the DB"""
 
@@ -416,7 +416,7 @@
     test_port = wlmutils.get_test_port()
     test_device = mlutils.get_test_device()
     test_num_gpus = mlutils.get_test_num_gpus()
-    test_dir = make_test_dir
+    test_dir = test_dir
     test_script = fileutils.get_test_conf_path("run_dbscript_smartredis.py")
     torch_script = fileutils.get_test_conf_path("torchscript.py")
 
@@ -501,7 +501,7 @@
 
 
 @pytest.mark.skipif(not should_run, reason="Test needs Torch to run")
-def test_db_script_errors(fileutils, make_test_dir, wlmutils, mlutils):
+def test_db_script_errors(fileutils, test_dir, wlmutils, mlutils):
     """Test DB Scripts error when setting a serialized function on colocated DB"""
 
     # Set Experiment name
@@ -513,7 +513,7 @@
     test_port = wlmutils.get_test_port()
     test_device = mlutils.get_test_device()
     test_num_gpus = mlutils.get_test_num_gpus()
-    test_dir = make_test_dir
+    test_dir = test_dir
     test_script = fileutils.get_test_conf_path("run_dbscript_smartredis.py")
 
     # Create SmartSim experiment
diff --git a/tests/backends/test_onnx.py b/tests/backends/test_onnx.py
index d0873af89..3226c5f57 100644
--- a/tests/backends/test_onnx.py
+++ b/tests/backends/test_onnx.py
@@ -56,7 +56,7 @@
 )
 
 
-def test_sklearn_onnx(make_test_dir, mlutils, wlmutils):
+def test_sklearn_onnx(test_dir, mlutils, wlmutils):
     """This test needs two free nodes, 1 for the db and 1 some sklearn models
 
     here we test the following sklearn models:
@@ -75,7 +75,7 @@ def test_sklearn_onnx(make_test_dir, mlutils, wlmutils):
     """
 
     exp_name = "test_sklearn_onnx"
-    test_dir = make_test_dir
+    test_dir = test_dir
     exp = Experiment(exp_name, exp_path=test_dir, launcher=wlmutils.get_test_launcher())
 
     test_device = mlutils.get_test_device()
diff --git a/tests/backends/test_tf.py b/tests/backends/test_tf.py
index d010933f2..a2c4ba8c0 100644
--- a/tests/backends/test_tf.py
+++ b/tests/backends/test_tf.py
@@ -50,7 +50,7 @@
     (not tf_backend_available) or (not tf_available),
     reason="Requires RedisAI TF backend",
 )
-def test_keras_model(make_test_dir, mlutils, wlmutils):
+def test_keras_model(test_dir, mlutils, wlmutils):
     """This test needs two free nodes, 1 for the db and 1 for a keras model script
 
     this test can run on CPU/GPU by setting SMARTSIM_TEST_DEVICE=GPU
@@ -61,7 +61,7 @@ def test_keras_model(make_test_dir, mlutils, wlmutils):
     """
 
     exp_name = "test_keras_model"
-    test_dir = make_test_dir
+    test_dir = test_dir
     exp = Experiment(exp_name, exp_path=test_dir, launcher=wlmutils.get_test_launcher())
 
     test_device = mlutils.get_test_device()
@@ -110,8 +110,8 @@ def create_tf_model():
 
 
 @pytest.mark.skipif(not tf_available, reason="Requires Tensorflow and Keras")
-def test_freeze_model(make_test_dir):
-    test_dir = make_test_dir
+def test_freeze_model(test_dir):
+    test_dir = test_dir
 
     model = create_tf_model()
     model_path, inputs, outputs = freeze_model(model, test_dir, "mnist.pb")
diff --git a/tests/backends/test_torch.py b/tests/backends/test_torch.py
index 2c104bce5..610fa50b0 100644
--- a/tests/backends/test_torch.py
+++ b/tests/backends/test_torch.py
@@ -48,7 +48,7 @@
 )
 
 
-def test_torch_model_and_script(make_test_dir, mlutils, wlmutils):
+def test_torch_model_and_script(test_dir, mlutils, wlmutils):
     """This test needs two free nodes, 1 for the db and 1 for a torch model script
 
     Here we test both the torchscipt API and the NN API from torch
@@ -61,7 +61,7 @@ def test_torch_model_and_script(make_test_dir, mlutils, wlmutils):
     """
 
     exp_name = "test_torch_model_and_script"
-    test_dir = make_test_dir
+    test_dir = test_dir
    exp = Experiment(exp_name, exp_path=test_dir, launcher=wlmutils.get_test_launcher())
 
     test_device = mlutils.get_test_device()
diff --git a/tests/full_wlm/test_generic_batch_launch.py b/tests/full_wlm/test_generic_batch_launch.py
index 41f32a101..e01596dc6 100644
--- a/tests/full_wlm/test_generic_batch_launch.py
+++ b/tests/full_wlm/test_generic_batch_launch.py
@@ -47,12 +47,12 @@ def add_batch_resources(wlmutils, batch_settings):
             batch_settings.set_resource(key, value)
 
 
-def test_batch_model(fileutils, make_test_dir, wlmutils):
+def test_batch_model(fileutils, test_dir, wlmutils):
     """Test the launch of a manually construced batch model"""
 
     exp_name = "test-batch-model"
     exp = Experiment(exp_name, launcher=wlmutils.get_test_launcher())
-    test_dir = make_test_dir
+    test_dir = test_dir
     script = fileutils.get_test_conf_path("sleep.py")
 
     batch_settings = exp.create_batch_settings(nodes=1, time="00:01:00")
@@ -73,12 +73,12 @@ def test_batch_model(fileutils, make_test_dir, wlmutils):
     assert statuses[0] == status.STATUS_COMPLETED
 
 
-def test_batch_ensemble(fileutils, make_test_dir, wlmutils):
+def test_batch_ensemble(fileutils, test_dir, wlmutils):
     """Test the launch of a manually constructed batch ensemble"""
 
     exp_name = "test-batch-ensemble"
     exp = Experiment(exp_name, launcher=wlmutils.get_test_launcher())
-    test_dir = make_test_dir
+    test_dir = test_dir
     script = fileutils.get_test_conf_path("sleep.py")
 
     settings = wlmutils.get_run_settings("python", f"{script} --time=5")
@@ -101,10 +101,10 @@ def test_batch_ensemble(fileutils, make_test_dir, wlmutils):
     assert all([stat == status.STATUS_COMPLETED for stat in statuses])
 
 
-def test_batch_ensemble_replicas(fileutils, make_test_dir, wlmutils):
+def test_batch_ensemble_replicas(fileutils, test_dir, wlmutils):
     exp_name = "test-batch-ensemble-replicas"
     exp = Experiment(exp_name, launcher=wlmutils.get_test_launcher())
-    test_dir = make_test_dir
+    test_dir = test_dir
     script = fileutils.get_test_conf_path("sleep.py")
 
     settings = wlmutils.get_run_settings("python", f"{script} --time=5")
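The full_wlm batch tests construct their entities by pairing batch settings with run settings. A condensed sketch of a manually constructed batch ensemble, assuming a Slurm-like system and a `sleep.py` script (both assumptions, not part of this diff):

    from smartsim import Experiment

    exp = Experiment("batch-demo", launcher="slurm")  # assumes Slurm is available
    batch = exp.create_batch_settings(nodes=1, time="00:01:00")
    rs = exp.create_run_settings(exe="python", exe_args="sleep.py --time=5")
    ensemble = exp.create_ensemble(
        "batch-ensemble", batch_settings=batch, run_settings=rs, replicas=2
    )
    exp.start(ensemble, block=True)  # one batch job wrapping both replicas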
diff --git a/tests/full_wlm/test_generic_orc_launch_batch.py b/tests/full_wlm/test_generic_orc_launch_batch.py
index ea52f84ae..3222e7860 100644
--- a/tests/full_wlm/test_generic_orc_launch_batch.py
+++ b/tests/full_wlm/test_generic_orc_launch_batch.py
@@ -25,13 +25,10 @@
 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 import os.path as osp
-import shutil
 import time
-
 import pytest
 
 from smartsim import Experiment, status
-from smartsim.wlm import detect_launcher
 
 # retrieved from pytest fixtures
 if pytest.test_launcher not in pytest.wlm_options:
@@ -42,13 +39,13 @@
     reason="Launching orchestrators in a batch job is not supported on PBS without ALPS"
 )
 
-def test_launch_orc_auto_batch(make_test_dir, wlmutils):
+def test_launch_orc_auto_batch(test_dir, wlmutils):
     """test single node orchestrator"""
     launcher = wlmutils.get_test_launcher()
 
     exp_name = "test-launch-auto-orc-batch"
     exp = Experiment(exp_name, launcher=launcher)
-    test_dir = make_test_dir
+    test_dir = test_dir
 
     # batch = False to launch on existing allocation
     network_interface = wlmutils.get_test_interface()
@@ -80,14 +77,14 @@
     assert all([stat == status.STATUS_CANCELLED for stat in statuses])
 
 
-def test_launch_cluster_orc_batch_single(make_test_dir, wlmutils):
+def test_launch_cluster_orc_batch_single(test_dir, wlmutils):
     """test clustered 3-node orchestrator with single command"""
     # TODO detect number of nodes in allocation and skip if not sufficent
     launcher = wlmutils.get_test_launcher()
 
     exp_name = "test-launch-auto-cluster-orc-batch-single"
     exp = Experiment(exp_name, launcher=launcher)
-    test_dir = make_test_dir
+    test_dir = test_dir
 
     # batch = False to launch on existing allocation
     network_interface = wlmutils.get_test_interface()
@@ -123,14 +120,14 @@
     assert all([stat == status.STATUS_CANCELLED for stat in statuses])
 
 
-def test_launch_cluster_orc_batch_multi(make_test_dir, wlmutils):
+def test_launch_cluster_orc_batch_multi(test_dir, wlmutils):
     """test clustered 3-node orchestrator"""
     # TODO detect number of nodes in allocation and skip if not sufficent
     launcher = wlmutils.get_test_launcher()
 
     exp_name = "test-launch-auto-cluster-orc-batch-multi"
     exp = Experiment(exp_name, launcher=launcher)
-    test_dir = make_test_dir
+    test_dir = test_dir
 
     # batch = False to launch on existing allocation
     network_interface = wlmutils.get_test_interface()
@@ -162,12 +159,12 @@
     assert all([stat == status.STATUS_CANCELLED for stat in statuses])
 
 
-def test_launch_cluster_orc_reconnect(make_test_dir, wlmutils):
+def test_launch_cluster_orc_reconnect(test_dir, wlmutils):
     """test reconnecting to clustered 3-node orchestrator"""
     launcher = wlmutils.get_test_launcher()
     exp_name = "test-launch-cluster-orc-batch-reconect"
     exp = Experiment(exp_name, launcher=launcher)
-    test_dir = make_test_dir
+    test_dir = test_dir
 
     # batch = False to launch on existing allocation
     network_interface = wlmutils.get_test_interface()
diff --git a/tests/full_wlm/test_mpmd.py b/tests/full_wlm/test_mpmd.py
index c8f92cae8..8b9c1f5b0 100644
--- a/tests/full_wlm/test_mpmd.py
+++ b/tests/full_wlm/test_mpmd.py
@@ -36,7 +36,7 @@
 pytestmark = pytest.mark.skip(reason="Not testing WLM integrations")
 
 
-def test_mpmd(fileutils, make_test_dir, wlmutils):
+def test_mpmd(fileutils, test_dir, wlmutils):
     """Run an MPMD model twice
 
     and check that it always gets executed the same way.
@@ -77,7 +77,7 @@ def prune_commands(launcher):
             f"MPMD on {launcher} only supported for run commands {by_launcher[launcher]}"
         )
 
-    test_dir = make_test_dir
+    test_dir = test_dir
     for run_command in run_commands:
         script = fileutils.get_test_conf_path("sleep.py")
         settings = exp.create_run_settings(
diff --git a/tests/on_wlm/test_base_settings_on_wlm.py b/tests/on_wlm/test_base_settings_on_wlm.py
index 7022884e7..03491ac8a 100644
--- a/tests/on_wlm/test_base_settings_on_wlm.py
+++ b/tests/on_wlm/test_base_settings_on_wlm.py
@@ -40,10 +40,10 @@
 pytestmark = pytest.mark.skip(reason="Not testing WLM integrations")
 
 
-def test_model_on_wlm(fileutils, make_test_dir, wlmutils):
+def test_model_on_wlm(fileutils, test_dir, wlmutils):
     exp_name = "test-base-settings-model-launch"
     exp = Experiment(exp_name, launcher=wlmutils.get_test_launcher())
-    test_dir = make_test_dir
+    test_dir = test_dir
     script = fileutils.get_test_conf_path("sleep.py")
 
     settings1 = wlmutils.get_base_run_settings("python", f"{script} --time=5")
@@ -58,10 +58,10 @@ def test_model_on_wlm(fileutils, make_test_dir, wlmutils):
     assert all([stat == status.STATUS_COMPLETED for stat in statuses])
 
 
-def test_model_stop_on_wlm(fileutils, make_test_dir, wlmutils):
+def test_model_stop_on_wlm(fileutils, test_dir, wlmutils):
     exp_name = "test-base-settings-model-stop"
     exp = Experiment(exp_name, launcher=wlmutils.get_test_launcher())
-    test_dir = make_test_dir
+    test_dir = test_dir
     script = fileutils.get_test_conf_path("sleep.py")
 
     settings1 = wlmutils.get_base_run_settings("python", f"{script} --time=5")
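The on_wlm variants of these tests differ from the local ones mainly in how run settings are sized for the allocation. A hedged sketch of the recurring launch-and-verify pattern (the launcher choice and script path are placeholders):

    from smartsim import Experiment, status

    exp = Experiment("wlm-demo", launcher="auto")  # auto-detects Slurm/PBS/LSF
    rs = exp.create_run_settings(exe="python", exe_args="sleep.py --time=5")
    rs.set_nodes(1)   # one node ...
    rs.set_tasks(1)   # ... and a single task, matching the colocated layout
    model = exp.create_model("wlm_model", rs)
    exp.start(model, block=True)
    assert all(stat == status.STATUS_COMPLETED for stat in exp.get_status(model))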
Experiment(f"colocated_model_pinning_auto_2cpu_{db_type}", launcher=launcher, exp_path=make_test_dir) + exp = Experiment(f"colocated_model_pinning_auto_2cpu_{db_type}", launcher=launcher, exp_path=test_dir) db_args = { "db_cpus": 2, @@ -121,11 +121,11 @@ def test_colocated_model_pinning_auto_2cpu(fileutils, make_test_dir, coloutils, assert all([stat == status.STATUS_COMPLETED for stat in statuses]) @pytest.mark.parametrize("db_type", supported_dbs) -def test_colocated_model_pinning_range(fileutils, make_test_dir, coloutils, db_type): +def test_colocated_model_pinning_range(fileutils, test_dir, coloutils, db_type): # Check to make sure that the CPU mask was correctly generated # Assume that there are at least 4 cpus on the node - exp = Experiment(f"colocated_model_pinning_manual_{db_type}", launcher=launcher, exp_path=make_test_dir) + exp = Experiment(f"colocated_model_pinning_manual_{db_type}", launcher=launcher, exp_path=test_dir) db_args = { "db_cpus": 4, @@ -148,11 +148,11 @@ def test_colocated_model_pinning_range(fileutils, make_test_dir, coloutils, db_t assert all([stat == status.STATUS_COMPLETED for stat in statuses]) @pytest.mark.parametrize("db_type", supported_dbs) -def test_colocated_model_pinning_list(fileutils, make_test_dir, coloutils, db_type): +def test_colocated_model_pinning_list(fileutils, test_dir, coloutils, db_type): # Check to make sure that the CPU mask was correctly generated # note we presume that this has more than 2 CPUs on the supercomputer node - exp = Experiment(f"colocated_model_pinning_manual_{db_type}", launcher=launcher, exp_path=make_test_dir) + exp = Experiment(f"colocated_model_pinning_manual_{db_type}", launcher=launcher, exp_path=test_dir) db_args = { "db_cpus": 2, @@ -174,11 +174,11 @@ def test_colocated_model_pinning_list(fileutils, make_test_dir, coloutils, db_ty assert all([stat == status.STATUS_COMPLETED for stat in statuses]) @pytest.mark.parametrize("db_type", supported_dbs) -def test_colocated_model_pinning_mixed(fileutils, make_test_dir, coloutils, db_type): +def test_colocated_model_pinning_mixed(fileutils, test_dir, coloutils, db_type): # Check to make sure that the CPU mask was correctly generated # note we presume that this at least 4 CPUs on the supercomputer node - exp = Experiment(f"colocated_model_pinning_manual_{db_type}", launcher=launcher, exp_path=make_test_dir) + exp = Experiment(f"colocated_model_pinning_manual_{db_type}", launcher=launcher, exp_path=test_dir) db_args = { "db_cpus": 2, diff --git a/tests/on_wlm/test_containers_wlm.py b/tests/on_wlm/test_containers_wlm.py index 46eb6d771..a221ab4c3 100644 --- a/tests/on_wlm/test_containers_wlm.py +++ b/tests/on_wlm/test_containers_wlm.py @@ -40,7 +40,7 @@ @pytest.mark.skipif(not singularity_exists, reason="Test needs singularity to run") -def test_singularity_wlm_smartredis(fileutils, make_test_dir, wlmutils): +def test_singularity_wlm_smartredis(fileutils, test_dir, wlmutils): """Run two processes, each process puts a tensor on the DB, then accesses the other process's tensor. Finally, the tensor is used to run a model. @@ -55,7 +55,7 @@ def test_singularity_wlm_smartredis(fileutils, make_test_dir, wlmutils): f"Test only runs on systems with PBS or Slurm as WLM. 
diff --git a/tests/on_wlm/test_containers_wlm.py b/tests/on_wlm/test_containers_wlm.py
index 46eb6d771..a221ab4c3 100644
--- a/tests/on_wlm/test_containers_wlm.py
+++ b/tests/on_wlm/test_containers_wlm.py
@@ -40,7 +40,7 @@
 
 
 @pytest.mark.skipif(not singularity_exists, reason="Test needs singularity to run")
-def test_singularity_wlm_smartredis(fileutils, make_test_dir, wlmutils):
+def test_singularity_wlm_smartredis(fileutils, test_dir, wlmutils):
     """Run two processes, each process puts a tensor on
     the DB, then accesses the other process's tensor.
     Finally, the tensor is used to run a model.
@@ -55,7 +55,7 @@ def test_singularity_wlm_smartredis(fileutils, make_test_dir, wlmutils):
             f"Test only runs on systems with PBS or Slurm as WLM. Current launcher: {launcher}"
         )
 
-    test_dir = make_test_dir
+    test_dir = test_dir
     exp = Experiment(
         "smartredis_ensemble_exchange", exp_path=test_dir, launcher=launcher
     )
diff --git a/tests/on_wlm/test_generic_orc_launch.py b/tests/on_wlm/test_generic_orc_launch.py
index 9533d5b4c..d087d1f8e 100644
--- a/tests/on_wlm/test_generic_orc_launch.py
+++ b/tests/on_wlm/test_generic_orc_launch.py
@@ -33,13 +33,13 @@
 pytestmark = pytest.mark.skip(reason="Not testing WLM integrations")
 
 
-def test_launch_orc_auto(make_test_dir, wlmutils):
+def test_launch_orc_auto(test_dir, wlmutils):
     """test single node orchestrator"""
     launcher = wlmutils.get_test_launcher()
 
     exp_name = "test-launch-auto-orc"
     exp = Experiment(exp_name, launcher=launcher)
-    test_dir = make_test_dir
+    test_dir = test_dir
 
     # batch = False to launch on existing allocation
     network_interface = wlmutils.get_test_interface()
@@ -65,14 +65,14 @@
     assert all([stat == status.STATUS_CANCELLED for stat in statuses])
 
 
-def test_launch_cluster_orc_single(make_test_dir, wlmutils):
+def test_launch_cluster_orc_single(test_dir, wlmutils):
     """test clustered 3-node orchestrator with single command"""
     # TODO detect number of nodes in allocation and skip if not sufficent
     launcher = wlmutils.get_test_launcher()
 
     exp_name = "test-launch-auto-cluster-orc-single"
     exp = Experiment(exp_name, launcher=launcher)
-    test_dir = make_test_dir
+    test_dir = test_dir
 
     # batch = False to launch on existing allocation
     network_interface = wlmutils.get_test_interface()
@@ -99,14 +99,14 @@
     assert all([stat == status.STATUS_CANCELLED for stat in statuses])
 
 
-def test_launch_cluster_orc_multi(make_test_dir, wlmutils):
+def test_launch_cluster_orc_multi(test_dir, wlmutils):
     """test clustered 3-node orchestrator with multiple commands"""
     # TODO detect number of nodes in allocation and skip if not sufficent
     launcher = wlmutils.get_test_launcher()
 
     exp_name = "test-launch-auto-cluster-orc-multi"
     exp = Experiment(exp_name, launcher=launcher)
-    test_dir = make_test_dir
+    test_dir = test_dir
 
     # batch = False to launch on existing allocation
     network_interface = wlmutils.get_test_interface()
diff --git a/tests/on_wlm/test_launch_errors.py b/tests/on_wlm/test_launch_errors.py
index 4fd4886f6..ef65ab3f6 100644
--- a/tests/on_wlm/test_launch_errors.py
+++ b/tests/on_wlm/test_launch_errors.py
@@ -36,12 +36,12 @@
 pytestmark = pytest.mark.skip(reason="Not testing WLM integrations")
 
 
-def test_failed_status(fileutils, make_test_dir, wlmutils):
+def test_failed_status(fileutils, test_dir, wlmutils):
     """Test when a failure occurs deep into model execution"""
 
     exp_name = "test-report-failure"
     exp = Experiment(exp_name, launcher=wlmutils.get_test_launcher())
-    test_dir = make_test_dir
+    test_dir = test_dir
     script = fileutils.get_test_conf_path("bad.py")
 
     settings = exp.create_run_settings(
@@ -58,7 +58,7 @@ def test_failed_status(fileutils, make_test_dir, wlmutils):
     assert stat[0] == status.STATUS_FAILED
 
 
-def test_bad_run_command_args(fileutils, make_test_dir, wlmutils):
+def test_bad_run_command_args(fileutils, test_dir, wlmutils):
     """Should fail because of incorrect arguments given to the
     run command
 
@@ -70,7 +70,7 @@ def test_bad_run_command_args(fileutils, make_test_dir, wlmutils):
 
     exp_name = "test-bad-run-command-args"
     exp = Experiment(exp_name, launcher=launcher)
-    test_dir = make_test_dir
+    test_dir = test_dir
 
     script = fileutils.get_test_conf_path("bad.py")
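test_failed_status relies on polling until the workload manager reports a terminal state. A minimal local-launcher sketch of the same assertion (the failing script is a placeholder assumed to exit non-zero):

    from smartsim import Experiment, status

    exp = Experiment("failure-demo", launcher="local")
    rs = exp.create_run_settings(exe="python", exe_args="bad.py")  # assumed to exit non-zero
    model = exp.create_model("bad_model", rs)
    exp.start(model, block=False)
    exp.poll(interval=5)  # returns once no jobs are left running
    assert exp.get_status(model)[0] == status.STATUS_FAILED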
diff --git a/tests/on_wlm/test_launch_ompi_lsf.py b/tests/on_wlm/test_launch_ompi_lsf.py
index 05fc9cd5f..7bb92102b 100644
--- a/tests/on_wlm/test_launch_ompi_lsf.py
+++ b/tests/on_wlm/test_launch_ompi_lsf.py
@@ -34,13 +34,13 @@
 
 
 @pytest.mark.skip("OpenMPI currently not working on LSF systems")
-def test_launch_openmpi_lsf(fileutils, make_test_dir, wlmutils):
+def test_launch_openmpi_lsf(fileutils, test_dir, wlmutils):
     launcher = wlmutils.get_test_launcher()
     if launcher != "lsf":
         pytest.skip("Test only runs on systems with LSF as WLM")
 
     exp_name = "test-launch-openmpi-lsf"
     exp = Experiment(exp_name, launcher=launcher)
-    test_dir = make_test_dir
+    test_dir = test_dir
 
     script = fileutils.get_test_conf_path("sleep.py")
     settings = exp.create_run_settings("python", script, "mpirun")
diff --git a/tests/on_wlm/test_local_step.py b/tests/on_wlm/test_local_step.py
index dd838b80e..c3da53cb5 100644
--- a/tests/on_wlm/test_local_step.py
+++ b/tests/on_wlm/test_local_step.py
@@ -40,13 +40,13 @@
 """
 
 
-def test_local_env_pass_implicit(fileutils, make_test_dir) -> None:
+def test_local_env_pass_implicit(fileutils, test_dir) -> None:
     """Ensure implicitly exported env is available to running task"""
     exp_value = str(uuid.uuid4())
     env_key = "test_local_env_pass_implicit"
     os.environ[env_key] = exp_value
 
-    test_dir = make_test_dir
+    test_dir = test_dir
     exp_dir = f"{test_dir}/exp"
     os.makedirs(exp_dir)
     script = fileutils.get_test_conf_path("check_env.py")
@@ -77,14 +77,14 @@ def test_local_env_pass_implicit(fileutils, make_test_dir) -> None:
     assert f"{env_key}=={exp_value}" in app_output
 
 
-def test_local_env_pass_explicit(fileutils, make_test_dir) -> None:
+def test_local_env_pass_explicit(fileutils, test_dir) -> None:
     """Ensure explicitly exported env is available to running task"""
     exp_value = str(uuid.uuid4())
     env_key = "test_local_env_pass_explicit"
 
     assert env_key not in os.environ
 
-    test_dir = make_test_dir
+    test_dir = test_dir
     script = fileutils.get_test_conf_path("check_env.py")
     exp_dir = f"{test_dir}/exp"
diff --git a/tests/on_wlm/test_restart.py b/tests/on_wlm/test_restart.py
index 4ef779d40..6dd3ba1c6 100644
--- a/tests/on_wlm/test_restart.py
+++ b/tests/on_wlm/test_restart.py
@@ -35,11 +35,11 @@
 pytestmark = pytest.mark.skip(reason="Not testing WLM integrations")
 
 
-def test_restart(fileutils, make_test_dir, wlmutils):
+def test_restart(fileutils, test_dir, wlmutils):
 
     exp_name = "test-restart"
     exp = Experiment(exp_name, launcher=wlmutils.get_test_launcher())
-    test_dir = make_test_dir
+    test_dir = test_dir
 
     script = fileutils.get_test_conf_path("sleep.py")
     settings = exp.create_run_settings("python", f"{script} --time=5")
diff --git a/tests/on_wlm/test_simple_base_settings_on_wlm.py b/tests/on_wlm/test_simple_base_settings_on_wlm.py
index 7f28b2080..382e4deb0 100644
--- a/tests/on_wlm/test_simple_base_settings_on_wlm.py
+++ b/tests/on_wlm/test_simple_base_settings_on_wlm.py
@@ -48,7 +48,7 @@
 pytestmark = pytest.mark.skip(reason="Not testing WLM integrations")
 
 
-def test_simple_model_on_wlm(fileutils, make_test_dir, wlmutils):
+def test_simple_model_on_wlm(fileutils, test_dir, wlmutils):
     launcher = wlmutils.get_test_launcher()
     if launcher not in ["pbs", "slurm", "cobalt", "lsf"]:
         pytest.skip(
@@ -57,7 +57,7 @@ def test_simple_model_on_wlm(fileutils, make_test_dir, wlmutils):
 
     exp_name = "test-simplebase-settings-model-launch"
     exp = Experiment(exp_name, launcher=wlmutils.get_test_launcher())
-    test_dir = make_test_dir
+    test_dir = test_dir
 
     script = fileutils.get_test_conf_path("sleep.py")
     settings = RunSettings("python", exe_args=f"{script} --time=5")
exe_args=f"{script} --time=5") @@ -69,7 +69,7 @@ def test_simple_model_on_wlm(fileutils, make_test_dir, wlmutils): assert exp.get_status(M)[0] == status.STATUS_COMPLETED -def test_simple_model_stop_on_wlm(fileutils, make_test_dir, wlmutils): +def test_simple_model_stop_on_wlm(fileutils, test_dir, wlmutils): launcher = wlmutils.get_test_launcher() if launcher not in ["pbs", "slurm", "cobalt", "lsf"]: pytest.skip( @@ -78,7 +78,7 @@ def test_simple_model_stop_on_wlm(fileutils, make_test_dir, wlmutils): exp_name = "test-simplebase-settings-model-stop" exp = Experiment(exp_name, launcher=wlmutils.get_test_launcher()) - test_dir = make_test_dir + test_dir = test_dir script = fileutils.get_test_conf_path("sleep.py") settings = RunSettings("python", exe_args=f"{script} --time=5") diff --git a/tests/on_wlm/test_simple_entity_launch.py b/tests/on_wlm/test_simple_entity_launch.py index f66971996..d049cc394 100644 --- a/tests/on_wlm/test_simple_entity_launch.py +++ b/tests/on_wlm/test_simple_entity_launch.py @@ -46,10 +46,10 @@ pytestmark = pytest.mark.skip(reason="Not testing WLM integrations") -def test_models(fileutils, make_test_dir, wlmutils): +def test_models(fileutils, test_dir, wlmutils): exp_name = "test-models-launch" exp = Experiment(exp_name, launcher=wlmutils.get_test_launcher()) - test_dir = make_test_dir + test_dir = test_dir script = fileutils.get_test_conf_path("sleep.py") settings = exp.create_run_settings("python", f"{script} --time=5") @@ -63,10 +63,10 @@ def test_models(fileutils, make_test_dir, wlmutils): assert all([stat == status.STATUS_COMPLETED for stat in statuses]) -def test_ensemble(fileutils, make_test_dir, wlmutils): +def test_ensemble(fileutils, test_dir, wlmutils): exp_name = "test-ensemble-launch" exp = Experiment(exp_name, launcher=wlmutils.get_test_launcher()) - test_dir = make_test_dir + test_dir = test_dir script = fileutils.get_test_conf_path("sleep.py") settings = exp.create_run_settings("python", f"{script} --time=5") @@ -80,12 +80,12 @@ def test_ensemble(fileutils, make_test_dir, wlmutils): assert all([stat == status.STATUS_COMPLETED for stat in statuses]) -def test_summary(fileutils, make_test_dir, wlmutils): +def test_summary(fileutils, test_dir, wlmutils): """Fairly rudimentary test of the summary dataframe""" exp_name = "test-launch-summary" exp = Experiment(exp_name, launcher=wlmutils.get_test_launcher()) - test_dir = make_test_dir + test_dir = test_dir sleep = fileutils.get_test_conf_path("sleep.py") bad = fileutils.get_test_conf_path("bad.py") diff --git a/tests/on_wlm/test_stop.py b/tests/on_wlm/test_stop.py index 13e6b54af..94169ec9d 100644 --- a/tests/on_wlm/test_stop.py +++ b/tests/on_wlm/test_stop.py @@ -42,10 +42,10 @@ pytestmark = pytest.mark.skip(reason="Not testing WLM integrations") -def test_stop_entity(fileutils, make_test_dir, wlmutils): +def test_stop_entity(fileutils, test_dir, wlmutils): exp_name = "test-launch-stop-model" exp = Experiment(exp_name, launcher=wlmutils.get_test_launcher()) - test_dir = make_test_dir + test_dir = test_dir script = fileutils.get_test_conf_path("sleep.py") settings = exp.create_run_settings("python", f"{script} --time=10") @@ -59,11 +59,11 @@ def test_stop_entity(fileutils, make_test_dir, wlmutils): assert exp.get_status(M1)[0] == status.STATUS_CANCELLED -def test_stop_entity_list(fileutils, make_test_dir, wlmutils): +def test_stop_entity_list(fileutils, test_dir, wlmutils): exp_name = "test-launch-stop-ensemble" exp = Experiment(exp_name, launcher=wlmutils.get_test_launcher()) - test_dir = make_test_dir + 
     script = fileutils.get_test_conf_path("sleep.py")
 
     settings = exp.create_run_settings("python", f"{script} --time=10")
diff --git a/tests/test_colo_model_local.py b/tests/test_colo_model_local.py
index db67ea0a6..29a96718a 100644
--- a/tests/test_colo_model_local.py
+++ b/tests/test_colo_model_local.py
@@ -110,13 +110,13 @@ def test_create_pinning_string(pin_list, num_cpus, expected):
 
 
 @pytest.mark.parametrize("db_type", supported_dbs)
 def test_launch_colocated_model_defaults(
-    fileutils, make_test_dir, coloutils, db_type, launcher="local"
+    fileutils, test_dir, coloutils, db_type, launcher="local"
 ):
     """Test the launch of a model with a colocated database and local launcher"""
 
     db_args = {}
 
-    test_dir = make_test_dir
+    test_dir = test_dir
     exp = Experiment(f"colocated_model_defaults_{db_type}", test_dir, launcher=launcher)
     colo_model = coloutils.setup_test_colo(
         fileutils,
@@ -146,14 +146,14 @@ def test_launch_colocated_model_defaults(
 
 
 @pytest.mark.parametrize("db_type", supported_dbs)
 def test_launch_multiple_colocated_models(
-    fileutils, make_test_dir, coloutils, wlmutils, db_type, launcher="local"
+    fileutils, test_dir, coloutils, wlmutils, db_type, launcher="local"
 ):
     """Test the concurrent launch of two models with a colocated database and local launcher
     """
 
     db_args = {}
 
-    exp = Experiment("multi_colo_models", launcher=launcher, exp_path=make_test_dir)
+    exp = Experiment("multi_colo_models", launcher=launcher, exp_path=test_dir)
     colo_models = [
         coloutils.setup_test_colo(
             fileutils,
@@ -187,9 +187,9 @@ def test_launch_multiple_colocated_models(
 
 
 @pytest.mark.parametrize("db_type", supported_dbs)
 def test_colocated_model_disable_pinning(
-    fileutils, make_test_dir, coloutils, db_type, launcher="local"
+    fileutils, test_dir, coloutils, db_type, launcher="local"
 ):
-    exp = Experiment(f"colocated_model_pinning_auto_1cpu_{db_type}", launcher=launcher, exp_path=make_test_dir)
+    exp = Experiment(f"colocated_model_pinning_auto_1cpu_{db_type}", launcher=launcher, exp_path=test_dir)
     db_args = {
         "db_cpus": 1,
         "custom_pinning": [],
@@ -211,9 +211,9 @@ def test_colocated_model_disable_pinning(
 
 
 @pytest.mark.parametrize("db_type", supported_dbs)
 def test_colocated_model_pinning_auto_2cpu(
-    fileutils, make_test_dir, coloutils, db_type, launcher="local"
+    fileutils, test_dir, coloutils, db_type, launcher="local"
 ):
-    exp = Experiment(f"colocated_model_pinning_auto_2cpu_{db_type}", launcher=launcher, exp_path=make_test_dir)
+    exp = Experiment(f"colocated_model_pinning_auto_2cpu_{db_type}", launcher=launcher, exp_path=test_dir)
 
     db_args = {
         "db_cpus": 2,
@@ -242,10 +242,10 @@ def test_colocated_model_pinning_auto_2cpu(
 
 
 @pytest.mark.skipif(is_mac, reason="unsupported on MacOSX")
 @pytest.mark.parametrize("db_type", supported_dbs)
-def test_colocated_model_pinning_range(fileutils, make_test_dir, coloutils, db_type, launcher="local"):
+def test_colocated_model_pinning_range(fileutils, test_dir, coloutils, db_type, launcher="local"):
     # Check to make sure that the CPU mask was correctly generated
 
-    exp = Experiment(f"colocated_model_pinning_manual_{db_type}", launcher=launcher, exp_path=make_test_dir)
+    exp = Experiment(f"colocated_model_pinning_manual_{db_type}", launcher=launcher, exp_path=test_dir)
 
     db_args = {"db_cpus": 2, "custom_pinning": range(2)}
@@ -265,10 +265,10 @@ def test_colocated_model_pinning_range(fileutils, make_test_dir, coloutils, db_t
 
 
 @pytest.mark.skipif(is_mac, reason="unsupported on MacOSX")
 @pytest.mark.parametrize("db_type", supported_dbs)
-def test_colocated_model_pinning_list(fileutils, make_test_dir, coloutils, db_type, launcher="local"):
+def test_colocated_model_pinning_list(fileutils, test_dir, coloutils, db_type, launcher="local"):
     # Check to make sure that the CPU mask was correctly generated
 
-    exp = Experiment(f"colocated_model_pinning_manual_{db_type}", launcher=launcher, exp_path=make_test_dir)
+    exp = Experiment(f"colocated_model_pinning_manual_{db_type}", launcher=launcher, exp_path=test_dir)
 
     db_args = {"db_cpus": 1, "custom_pinning": [1]}
diff --git a/tests/test_config.py b/tests/test_config.py
index 2af76dcd7..edd130171 100644
--- a/tests/test_config.py
+++ b/tests/test_config.py
@@ -82,9 +82,9 @@ def make_file(filepath: str) -> None:
     with open(filepath, "w+", encoding="utf-8") as dummy_file:
         dummy_file.write("dummy\n")
 
-def test_redisai_invalid_rai_path(get_test_dir, monkeypatch):
+def test_redisai_invalid_rai_path(test_dir, monkeypatch):
     """An invalid RAI_PATH and valid SMARTSIM_DEP_INSTALL_PATH should fail"""
-    test_dir = get_test_dir
+    test_dir = test_dir
     rai_file_path = os.path.join(test_dir, "lib", "mock-redisai.so")
     make_file(os.path.join(test_dir, "lib", "redisai.so"))
     env = get_redisai_env(rai_file_path, test_dir)
@@ -99,9 +99,9 @@ def test_redisai_invalid_rai_path(get_test_dir, monkeypatch):
         assert 'RedisAI dependency not found' in ex.value.args[0]
 
 
-def test_redisai_valid_rai_path(get_test_dir, monkeypatch):
+def test_redisai_valid_rai_path(test_dir, monkeypatch):
     """A valid RAI_PATH should override valid SMARTSIM_DEP_INSTALL_PATH and succeed"""
-    test_dir = get_test_dir
+    test_dir = test_dir
     rai_file_path = os.path.join(test_dir, "lib", "mock-redisai.so")
     make_file(rai_file_path)
 
@@ -114,9 +114,9 @@ def test_redisai_valid_rai_path(get_test_dir, monkeypatch):
     assert config.redisai == rai_file_path
 
 
-def test_redisai_invalid_lib_path(make_test_dir, monkeypatch):
+def test_redisai_invalid_lib_path(test_dir, monkeypatch):
     """Invalid RAI_PATH and invalid SMARTSIM_DEP_INSTALL_PATH should fail"""
-    test_dir = make_test_dir
+    test_dir = test_dir
 
     rai_file_path = f"{test_dir}/railib/redisai.so"
     env = get_redisai_env(rai_file_path, test_dir)
@@ -130,9 +130,9 @@
         assert 'RedisAI dependency not found' in ex.value.args[0]
 
 
-def test_redisai_valid_lib_path(get_test_dir, monkeypatch):
+def test_redisai_valid_lib_path(test_dir, monkeypatch):
     """Valid RAI_PATH and invalid SMARTSIM_DEP_INSTALL_PATH should succeed"""
-    test_dir = get_test_dir
+    test_dir = test_dir
     rai_file_path = os.path.join(test_dir, "lib", "mock-redisai.so")
     make_file(rai_file_path)
     env = get_redisai_env(rai_file_path, test_dir)
@@ -144,9 +144,9 @@
     assert config.redisai == rai_file_path
 
 
-def test_redisai_valid_lib_path_null_rai(get_test_dir, monkeypatch):
+def test_redisai_valid_lib_path_null_rai(test_dir, monkeypatch):
     """Missing RAI_PATH and valid SMARTSIM_DEP_INSTALL_PATH should succeed"""
-    test_dir = get_test_dir
+    test_dir = test_dir
     rai_file_path: t.Optional[str] = None
     lib_file_path = os.path.join(test_dir, "lib", "redisai.so")
     make_file(lib_file_path)
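All of the RedisAI-path tests above pivot on two environment variables read when the SmartSim config object is constructed. A hedged reduction of the valid-RAI_PATH case, reusing the test file's own `make_file` helper and `Config` usage (treat the exact import path and re-read semantics as assumptions):

    def test_rai_path_override(test_dir, monkeypatch):
        rai_path = os.path.join(test_dir, "lib", "mock-redisai.so")
        make_file(rai_path)  # touch a stand-in shared library
        monkeypatch.setenv("RAI_PATH", rai_path)
        monkeypatch.setenv("SMARTSIM_DEP_INSTALL_PATH", test_dir)
        config = Config()    # assumed to read the environment on construction
        assert config.redisai == rai_path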
diff --git a/tests/test_containers.py b/tests/test_containers.py
index f848f3663..e92620562 100644
--- a/tests/test_containers.py
+++ b/tests/test_containers.py
@@ -87,9 +87,9 @@ def test_singularity_commands(fileutils):
 
 
 @pytest.mark.skipif(not singularity_exists, reason="Test needs singularity to run")
-def test_singularity_basic(fileutils, make_test_dir):
+def test_singularity_basic(fileutils, test_dir):
     """Basic argument-less Singularity test"""
-    test_dir = make_test_dir
+    test_dir = test_dir
 
     container = Singularity(containerURI)
@@ -113,9 +113,9 @@ def test_singularity_basic(fileutils, make_test_dir):
 
 
 @pytest.mark.skipif(not singularity_exists, reason="Test needs singularity to run")
-def test_singularity_args(fileutils, make_test_dir):
+def test_singularity_args(fileutils, test_dir):
     """Test combinations of args and mount arguments for Singularity"""
-    test_dir = make_test_dir
+    test_dir = test_dir
     hometest_dir = os.path.join(str(Path.home()), "test")  # $HOME/test
     mount_paths = {test_dir + "/singularity_args": hometest_dir}
     container = Singularity(containerURI, args="--contain", mount=mount_paths)
@@ -140,7 +140,7 @@ def test_singularity_args(fileutils, make_test_dir):
 
 
 @pytest.mark.skipif(not singularity_exists, reason="Test needs singularity to run")
-def test_singularity_smartredis(make_test_dir, fileutils, wlmutils):
+def test_singularity_smartredis(test_dir, fileutils, wlmutils):
     """Run two processes, each process puts a tensor on
     the DB, then accesses the other process's tensor.
     Finally, the tensor is used to run a model.
@@ -148,7 +148,7 @@ def test_singularity_smartredis(make_test_dir, fileutils, wlmutils):
 
     Note: This is a containerized port of test_smartredis.py
     """
-    test_dir = make_test_dir
+    test_dir = test_dir
     exp = Experiment(
         "smartredis_ensemble_exchange", exp_path=test_dir, launcher="local"
     )
diff --git a/tests/test_dbnode.py b/tests/test_dbnode.py
index 434362a57..43c8f24b5 100644
--- a/tests/test_dbnode.py
+++ b/tests/test_dbnode.py
@@ -46,10 +46,10 @@ def test_parse_db_host_error():
         orc.entities[0].host
 
 
-def test_hosts(make_test_dir, wlmutils):
+def test_hosts(test_dir, wlmutils):
     exp_name = "test_hosts"
     exp = Experiment(exp_name)
-    test_dir = make_test_dir
+    test_dir = test_dir
 
     orc = Orchestrator(port=wlmutils.get_test_port(), interface="lo", launcher="local")
     orc.set_path(test_dir)
diff --git a/tests/test_experiment.py b/tests/test_experiment.py
index 4148d01b1..0fb6d9c34 100644
--- a/tests/test_experiment.py
+++ b/tests/test_experiment.py
@@ -32,10 +32,10 @@
 from smartsim.status import STATUS_NEVER_STARTED
 
 
-def test_model_prefix(make_test_dir):
+def test_model_prefix(test_dir):
     exp_name = "test_prefix"
     exp = Experiment(exp_name)
-    test_dir = make_test_dir
+    test_dir = test_dir
 
     model = exp.create_model(
         "model",
         path=test_dir,
@@ -105,10 +105,10 @@ def test_bad_ensemble_init_no_rs_bs():
         exp.create_ensemble("name")
 
 
-def test_stop_entity(make_test_dir):
+def test_stop_entity(test_dir):
     exp_name = "test_stop_entity"
     exp = Experiment(exp_name)
-    test_dir = make_test_dir
+    test_dir = test_dir
     m = exp.create_model("model", path=test_dir, run_settings=RunSettings("sleep", "5"))
     exp.start(m, block=False)
     assert exp.finished(m) == False
@@ -116,11 +116,11 @@ def test_stop_entity(make_test_dir):
     assert exp.finished(m) == True
 
 
-def test_poll(make_test_dir):
+def test_poll(test_dir):
     # Ensure that a SmartSimError is not raised
     exp_name = "test_exp_poll"
     exp = Experiment(exp_name)
-    test_dir = make_test_dir
+    test_dir = test_dir
     model = exp.create_model(
         "model", path=test_dir, run_settings=RunSettings("sleep", "5")
     )
@@ -129,10 +129,10 @@ def test_poll(make_test_dir):
     exp.stop(model)
 
 
-def test_summary(make_test_dir):
+def test_summary(test_dir):
     exp_name = "test_exp_summary"
     exp = Experiment(exp_name)
-    test_dir = make_test_dir
+    test_dir = test_dir
     m = exp.create_model(
         "model", path=test_dir, run_settings=RunSettings("echo", "Hello")
     )
"Hello") ) diff --git a/tests/test_generator.py b/tests/test_generator.py index 0719cd308..c496b9fc6 100644 --- a/tests/test_generator.py +++ b/tests/test_generator.py @@ -112,8 +112,8 @@ def test_ensemble_overwrite_error(fileutils, get_test_dir): gen.generate_experiment(ensemble) -def test_full_exp(fileutils, make_test_dir, wlmutils): - test_dir = make_test_dir +def test_full_exp(fileutils, test_dir, wlmutils): + test_dir = test_dir exp = Experiment("gen-test", test_dir, launcher="local") model = exp.create_model("model", run_settings=rs) @@ -141,12 +141,12 @@ def test_full_exp(fileutils, make_test_dir, wlmutils): assert osp.isfile(osp.join(test_dir, "model/sleep.py")) -def test_dir_files(fileutils, make_test_dir): +def test_dir_files(fileutils, test_dir): """test the generate of models with files that are directories with subdirectories and files """ - test_dir = make_test_dir + test_dir = test_dir exp = Experiment("gen-test", test_dir, launcher="local") params = {"THERMO": [10, 20, 30], "STEPS": [10, 20, 30]} @@ -164,10 +164,10 @@ def test_dir_files(fileutils, make_test_dir): assert osp.isfile(osp.join(model_path, "test.py")) -def test_print_files(fileutils, make_test_dir, capsys): +def test_print_files(fileutils, test_dir, capsys): """Test the stdout print of files attached to an ensemble""" - test_dir = make_test_dir + test_dir = test_dir exp = Experiment("print-attached-files-test", test_dir, launcher="local") ensemble = exp.create_ensemble("dir_test", replicas=1, run_settings=rs) @@ -245,9 +245,9 @@ def test_print_files(fileutils, make_test_dir, capsys): assert captured.out == expected_out_multi -def test_multiple_tags(fileutils, make_test_dir): +def test_multiple_tags(fileutils, test_dir): """Test substitution of multiple tagged parameters on same line""" - test_dir = make_test_dir + test_dir = test_dir exp = Experiment("test-multiple-tags", test_dir) model_params = {"port": 6379, "password": "unbreakable_password"} @@ -267,10 +267,10 @@ def test_multiple_tags(fileutils, make_test_dir): ) -def test_generation_log(fileutils, make_test_dir): +def test_generation_log(fileutils, test_dir): """Test that an error is issued when a tag is unused and make_fatal is True""" - test_dir = make_test_dir + test_dir = test_dir exp = Experiment("gen-log-test", test_dir, launcher="local") params = {"THERMO": [10, 20], "STEPS": [10, 20]} @@ -302,12 +302,12 @@ def not_header(line): ), ) -def test_config_dir(fileutils, make_test_dir): +def test_config_dir(fileutils, test_dir): """Test the generation and configuration of models with tagged files that are directories with subdirectories and files """ exp = Experiment("config-dir", launcher="local") - test_dir = make_test_dir + test_dir = test_dir gen = Generator(test_dir) params = {"PARAM0": [0, 1], "PARAM1": [2, 3]} diff --git a/tests/test_interrupt.py b/tests/test_interrupt.py index 529dc966f..0b6cf0a47 100644 --- a/tests/test_interrupt.py +++ b/tests/test_interrupt.py @@ -40,13 +40,13 @@ def keyboard_interrupt(pid): os.kill(pid, signal.SIGINT) -def test_interrupt_blocked_jobs(make_test_dir): +def test_interrupt_blocked_jobs(test_dir): """ Launches and polls a model and an ensemble with two more models. Once polling starts, the SIGINT signal is sent to the main thread, and consequently, all running jobs are killed. 
""" - test_dir = make_test_dir + test_dir = test_dir exp_name = "test_interrupt_blocked_jobs" exp = Experiment(exp_name, exp_path=test_dir) model = exp.create_model( @@ -77,7 +77,7 @@ def test_interrupt_blocked_jobs(make_test_dir): assert len(completed_jobs) == num_jobs -def test_interrupt_multi_experiment_unblocked_jobs(make_test_dir): +def test_interrupt_multi_experiment_unblocked_jobs(test_dir): """ Starts two Experiments, each having one model and an ensemble with two more models. Since @@ -85,7 +85,7 @@ def test_interrupt_multi_experiment_unblocked_jobs(make_test_dir): the SIGINT signal is sent, resulting in both Experiment's running jobs to be killed. """ - test_dir = make_test_dir + test_dir = test_dir exp_names = ["test_interrupt_jobs_0", "test_interrupt_jobs_1"] experiments = [Experiment(exp_names[i], exp_path=test_dir) for i in range(2)] jobs_per_experiment = [0] * len(experiments) diff --git a/tests/test_launch_errors.py b/tests/test_launch_errors.py index 2b311350d..bc428106c 100644 --- a/tests/test_launch_errors.py +++ b/tests/test_launch_errors.py @@ -43,10 +43,10 @@ def test_unsupported_run_settings(): exp.start(model) -def test_model_failure(fileutils, make_test_dir): +def test_model_failure(fileutils, test_dir): exp_name = "test-model-failure" exp = Experiment(exp_name, launcher="local") - test_dir = make_test_dir + test_dir = test_dir script = fileutils.get_test_conf_path("bad.py") settings = RunSettings("python", f"{script} --time=3") @@ -58,11 +58,11 @@ def test_model_failure(fileutils, make_test_dir): assert all([stat == status.STATUS_FAILED for stat in statuses]) -def test_orchestrator_relaunch(make_test_dir, wlmutils): +def test_orchestrator_relaunch(test_dir, wlmutils): """Test when users try to launch second orchestrator""" exp_name = "test-orc-on-relaunch" exp = Experiment(exp_name, launcher="local") - test_dir = make_test_dir + test_dir = test_dir orc = Orchestrator(port=wlmutils.get_test_port()) orc.set_path(test_dir) diff --git a/tests/test_local_launch.py b/tests/test_local_launch.py index 71642b739..4c2a0c76d 100644 --- a/tests/test_local_launch.py +++ b/tests/test_local_launch.py @@ -32,10 +32,10 @@ """ -def test_models(fileutils, make_test_dir): +def test_models(fileutils, test_dir): exp_name = "test-models-local-launch" exp = Experiment(exp_name, launcher="local") - test_dir = make_test_dir + test_dir = test_dir script = fileutils.get_test_conf_path("sleep.py") settings = exp.create_run_settings("python", f"{script} --time=3") @@ -48,10 +48,10 @@ def test_models(fileutils, make_test_dir): assert all([stat == status.STATUS_COMPLETED for stat in statuses]) -def test_ensemble(fileutils, make_test_dir): +def test_ensemble(fileutils, test_dir): exp_name = "test-ensemble-launch" exp = Experiment(exp_name, launcher="local") - test_dir = make_test_dir + test_dir = test_dir script = fileutils.get_test_conf_path("sleep.py") settings = exp.create_run_settings("python", f"{script} --time=3") diff --git a/tests/test_local_multi_run.py b/tests/test_local_multi_run.py index f9c4e0ad4..94e60e2c8 100644 --- a/tests/test_local_multi_run.py +++ b/tests/test_local_multi_run.py @@ -32,10 +32,10 @@ """ -def test_models(fileutils, make_test_dir): +def test_models(fileutils, test_dir): exp_name = "test-models-local-launch" exp = Experiment(exp_name, launcher="local") - test_dir = make_test_dir + test_dir = test_dir script = fileutils.get_test_conf_path("sleep.py") settings = exp.create_run_settings("python", f"{script} --time=5") diff --git a/tests/test_local_restart.py 
diff --git a/tests/test_local_restart.py b/tests/test_local_restart.py
index 625cecb5a..7cb9c015a 100644
--- a/tests/test_local_restart.py
+++ b/tests/test_local_restart.py
@@ -32,11 +32,11 @@
 """
 
 
-def test_restart(fileutils, make_test_dir):
+def test_restart(fileutils, test_dir):
 
     exp_name = "test-models-local-restart"
     exp = Experiment(exp_name, launcher="local")
-    test_dir = make_test_dir
+    test_dir = test_dir
     script = fileutils.get_test_conf_path("sleep.py")
 
     settings = exp.create_run_settings("python", f"{script} --time=3")
@@ -53,10 +53,10 @@ def test_restart(fileutils, make_test_dir):
     assert all([stat == status.STATUS_COMPLETED for stat in statuses])
 
 
-def test_ensemble(fileutils, make_test_dir):
+def test_ensemble(fileutils, test_dir):
     exp_name = "test-ensemble-restart"
     exp = Experiment(exp_name, launcher="local")
-    test_dir = make_test_dir
+    test_dir = test_dir
     script = fileutils.get_test_conf_path("sleep.py")
 
     settings = exp.create_run_settings("python", f"{script} --time=3")
diff --git a/tests/test_modelwriter.py b/tests/test_modelwriter.py
index 71dbb1d6f..ccd6d9c4e 100644
--- a/tests/test_modelwriter.py
+++ b/tests/test_modelwriter.py
@@ -40,8 +40,8 @@ def get_gen_file(fileutils, filename):
     return fileutils.get_test_conf_path(path.join("generator_files", filename))
 
 
-def test_write_easy_configs(fileutils, make_test_dir):
-    test_dir = make_test_dir
+def test_write_easy_configs(fileutils, test_dir):
+    test_dir = test_dir
 
     param_dict = {
         "5": 10,  # MOM_input
@@ -69,8 +69,8 @@ def test_write_easy_configs(fileutils, make_test_dir):
         assert filecmp.cmp(written, correct)
 
 
-def test_write_med_configs(fileutils, make_test_dir):
-    test_dir = make_test_dir
+def test_write_med_configs(fileutils, test_dir):
+    test_dir = test_dir
 
     param_dict = {
         "1 0 0 0": "3 0 0 0",  # in.ellipse.gayberne
@@ -101,10 +101,10 @@ def test_write_med_configs(fileutils, make_test_dir):
         assert filecmp.cmp(written, correct)
 
 
-def test_write_new_tag_configs(fileutils, make_test_dir):
+def test_write_new_tag_configs(fileutils, test_dir):
     """sets the tag to the dollar sign"""
 
-    test_dir = make_test_dir
+    test_dir = test_dir
 
     param_dict = {
         "1 0 0 0": "3 0 0 0",  # in.ellipse.gayberne
@@ -146,8 +146,8 @@ def test_mw_error_2():
         writer._write_changes("[not/a/path]")
 
 
-def test_write_mw_error_3(fileutils, make_test_dir):
-    test_dir = make_test_dir
+def test_write_mw_error_3(fileutils, test_dir):
+    test_dir = test_dir
 
     param_dict = {
         "5": 10,  # MOM_input
diff --git a/tests/test_multidb.py b/tests/test_multidb.py
index d89e656d4..1baa08029 100644
--- a/tests/test_multidb.py
+++ b/tests/test_multidb.py
@@ -62,7 +62,7 @@ def check_not_failed(exp, *args):
 
 
 @pytest.mark.parametrize("db_type", supported_dbs)
 def test_db_identifier_standard_then_colo_error(
-    fileutils, wlmutils, coloutils, db_type, make_test_dir):
+    fileutils, wlmutils, coloutils, db_type, test_dir):
     """Test that it is possible to create_database then colocate_db_uds/colocate_db_tcp
     with unique db_identifiers"""
 
@@ -73,7 +73,7 @@ def test_db_identifier_standard_then_colo_error(
     test_launcher = wlmutils.get_test_launcher()
     test_interface = wlmutils.get_test_interface()
     test_port = wlmutils.get_test_port()
-    test_dir = make_test_dir
+    test_dir = test_dir
     test_script = fileutils.get_test_conf_path("smartredis/db_id_err.py")
 
     # Create SmartSim Experiment
@@ -118,7 +118,7 @@
 
 
 @pytest.mark.parametrize("db_type", supported_dbs)
-def test_db_identifier_colo_then_standard(fileutils, wlmutils, coloutils, db_type, make_test_dir):
+def test_db_identifier_colo_then_standard(fileutils, wlmutils, coloutils, db_type, test_dir):
    """Test colocate_db_uds/colocate_db_tcp then create_database with database
    identifiers.
    """
@@ -130,7 +130,7 @@ def test_db_identifier_colo_then_standard(fileutils, wlmutils, coloutils, db_typ
     test_launcher = wlmutils.get_test_launcher()
     test_interface = wlmutils.get_test_interface()
     test_port = wlmutils.get_test_port()
-    test_dir = make_test_dir
+    test_dir = test_dir
     test_script = fileutils.get_test_conf_path("smartredis/dbid.py")
 
     # Create SmartSim Experiment
@@ -170,7 +170,7 @@ def test_db_identifier_colo_then_standard(fileutils, wlmutils, coloutils, db_typ
     check_not_failed(exp, orc, smartsim_model)
 
 
-def test_db_identifier_standard_twice_not_unique(wlmutils, make_test_dir):
+def test_db_identifier_standard_twice_not_unique(wlmutils, test_dir):
     """Test uniqueness of db_identifier several calls to create_database, with non unique names,
     checking error is raised before exp start is called"""
 
@@ -181,7 +181,7 @@
     test_launcher = wlmutils.get_test_launcher()
     test_interface = wlmutils.get_test_interface()
     test_port = wlmutils.get_test_port()
-    test_dir = make_test_dir
+    test_dir = test_dir
 
     # Create SmartSim Experiment
     exp = Experiment(exp_name, launcher=test_launcher, exp_path=test_dir)
@@ -212,7 +212,7 @@
     )
     check_not_failed(exp, orc)
 
-def test_db_identifier_create_standard_once(make_test_dir, wlmutils):
+def test_db_identifier_create_standard_once(test_dir, wlmutils):
     """One call to create database with a database identifier"""
 
     # Set experiment name
@@ -222,7 +222,7 @@ def test_db_identifier_create_standard_once(make_test_dir, wlmutils):
     test_launcher = wlmutils.get_test_launcher()
     test_interface = wlmutils.get_test_interface()
     test_port = wlmutils.get_test_port()
-    test_dir = make_test_dir
+    test_dir = test_dir
 
     # Create the SmartSim Experiment
     exp = Experiment(exp_name, exp_path=test_dir, launcher=test_launcher)
@@ -240,14 +240,14 @@
     check_not_failed(exp, db)
 
 
-def test_multidb_create_standard_twice(wlmutils, make_test_dir):
+def test_multidb_create_standard_twice(wlmutils, test_dir):
     """Multiple calls to create database with unique db_identifiers"""
 
     # Retrieve parameters from testing environment
     test_launcher = wlmutils.get_test_launcher()
     test_interface = wlmutils.get_test_interface()
     test_port = wlmutils.get_test_port()
-    test_dir = make_test_dir
+    test_dir = test_dir
 
     # start a new Experiment for this section
     exp = Experiment(
@@ -274,13 +274,13 @@
     exp.start(db, db2)
 
 
 @pytest.mark.parametrize("db_type", supported_dbs)
-def test_multidb_colo_once(fileutils, make_test_dir, wlmutils, coloutils, db_type):
+def test_multidb_colo_once(fileutils, test_dir, wlmutils, coloutils, db_type):
     """create one model with colocated database with db_identifier"""
 
     # Retrieve parameters from testing environment
     test_launcher = wlmutils.get_test_launcher()
     test_port = wlmutils.get_test_port()
-    test_dir = make_test_dir
+    test_dir = test_dir
     test_script = fileutils.get_test_conf_path("smartredis/dbid.py")
 
     # start a new Experiment for this section
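Every multidb test keys on `db_identifier`, which must be unique across the databases an experiment launches. A minimal sketch of the happy path (ports and names are placeholders; reusing an identifier is expected to raise before `start` is ever called):

    from smartsim import Experiment

    exp = Experiment("multidb-demo", launcher="local")
    db1 = exp.create_database(port=6780, interface="lo", db_identifier="testdb_reg")
    db2 = exp.create_database(port=6781, interface="lo", db_identifier="testdb_reg2")
    exp.start(db1, db2)   # both databases launch side by side
    exp.stop(db1, db2)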
test_multidb_standard_then_colo(fileutils, test_dir, wlmutils, coloutils, db_type): """Create regular database then colocate_db_tcp/uds with unique db_identifiers""" # Retrieve parameters from testing environment test_port = wlmutils.get_test_port() - test_dir = make_test_dir + test_dir = test_dir test_script = fileutils.get_test_conf_path("smartredis/multidbid.py") test_interface = wlmutils.get_test_interface() test_launcher = wlmutils.get_test_launcher() @@ -356,12 +356,12 @@ def test_multidb_standard_then_colo(fileutils, make_test_dir, wlmutils, coloutil @pytest.mark.parametrize("db_type", supported_dbs) -def test_multidb_colo_then_standard(fileutils, make_test_dir, wlmutils, coloutils, db_type): +def test_multidb_colo_then_standard(fileutils, test_dir, wlmutils, coloutils, db_type): """create regular database then colocate_db_tcp/uds with unique db_identifiers""" # Retrieve parameters from testing environment test_port = wlmutils.get_test_port() - test_dir = make_test_dir + test_dir = test_dir test_script = fileutils.get_test_conf_path("smartredis/multidbid.py") test_interface = wlmutils.get_test_interface() test_launcher = wlmutils.get_test_launcher() @@ -406,13 +406,13 @@ def test_multidb_colo_then_standard(fileutils, make_test_dir, wlmutils, coloutil pytest.test_launcher not in pytest.wlm_options, reason="Not testing WLM integrations", ) -def test_launch_cluster_orc_single_dbid(make_test_dir, wlmutils): +def test_launch_cluster_orc_single_dbid(test_dir, wlmutils): """test clustered 3-node orchestrator with single command with a database identifier""" # TODO detect number of nodes in allocation and skip if not sufficent exp_name = "test_launch_cluster_orc_single_dbid" launcher = wlmutils.get_test_launcher() - test_dir = make_test_dir + test_dir = test_dir exp = Experiment(exp_name, launcher=launcher, exp_path=test_dir) # batch = False to launch on existing allocation diff --git a/tests/test_orchestrator.py b/tests/test_orchestrator.py index 7b3b54be6..bc490f1e3 100644 --- a/tests/test_orchestrator.py +++ b/tests/test_orchestrator.py @@ -65,10 +65,10 @@ def test_inactive_orc_get_address(): db.get_address() -def test_orc_active_functions(make_test_dir, wlmutils): +def test_orc_active_functions(test_dir, wlmutils): exp_name = "test_orc_active_functions" exp = Experiment(exp_name, launcher="local") - test_dir = make_test_dir + test_dir = test_dir db = Orchestrator(port=wlmutils.get_test_port()) db.set_path(test_dir) @@ -93,10 +93,10 @@ def test_orc_active_functions(make_test_dir, wlmutils): db.get_address() -def test_multiple_interfaces(make_test_dir, wlmutils): +def test_multiple_interfaces(test_dir, wlmutils): exp_name = "test_multiple_interfaces" exp = Experiment(exp_name, launcher="local") - test_dir = make_test_dir + test_dir = test_dir net_if_addrs = psutil.net_if_addrs() net_if_addrs = [ diff --git a/tests/test_pals_settings.py b/tests/test_pals_settings.py index e70ec5d8a..3bd7a3cbc 100644 --- a/tests/test_pals_settings.py +++ b/tests/test_pals_settings.py @@ -115,7 +115,7 @@ def set_env_var_to_inherit(rs): ], ) def test_pbs_can_make_step_from_pals_settings_fmt_cmd( - monkeypatch, mock_mpiexec, make_test_dir, rs_mutation, run_args + monkeypatch, mock_mpiexec, test_dir, rs_mutation, run_args ): # Setup run settings exe_args = ["-c", """'print("Hello")'"""] @@ -126,7 +126,7 @@ def test_pbs_can_make_step_from_pals_settings_fmt_cmd( launcher = PBSLauncher() monkeypatch.setenv(f"PBS_JOBID", "mock-job") - wdir = make_test_dir + wdir = test_dir step = launcher.create_step("my_step", 
wdir, rs) assert isinstance(step, MpiexecStep) assert step.get_launch_cmd() == [ @@ -139,7 +139,7 @@ def test_pbs_can_make_step_from_pals_settings_fmt_cmd( ] -def test_pals_settings_can_be_correctly_made_mpmd(monkeypatch, make_test_dir, mock_mpiexec): +def test_pals_settings_can_be_correctly_made_mpmd(monkeypatch, test_dir, mock_mpiexec): # Setup run settings def make_rs(exe, exe_args): return PalsMpiexecSettings(exe, exe_args), [exe] + exe_args @@ -166,7 +166,7 @@ def set_tasks(rs, num): launcher = PBSLauncher() monkeypatch.setenv(f"PBS_JOBID", "mock-job") - wdir = make_test_dir + wdir = test_dir step = launcher.create_step("my_step", wdir, rs_1) assert isinstance(step, MpiexecStep) assert step.get_launch_cmd() == [ diff --git a/tests/test_reconnect_orchestrator.py b/tests/test_reconnect_orchestrator.py index 1c5502c6c..a378ad945 100644 --- a/tests/test_reconnect_orchestrator.py +++ b/tests/test_reconnect_orchestrator.py @@ -37,12 +37,12 @@ # use https://stackoverflow.com/questions/22627659/run-code-before-and-after-each-test-in-py-test -def test_local_orchestrator(make_test_dir, wlmutils): +def test_local_orchestrator(test_dir, wlmutils): """Test launching orchestrator locally""" global first_dir exp_name = "test-orc-launch-local" exp = Experiment(exp_name, launcher="local") - test_dir = make_test_dir + test_dir = test_dir first_dir = test_dir orc = Orchestrator(port=wlmutils.get_test_port()) diff --git a/tests/test_smartredis.py b/tests/test_smartredis.py index 2c52590dc..41cb55870 100644 --- a/tests/test_smartredis.py +++ b/tests/test_smartredis.py @@ -55,13 +55,13 @@ ) -def test_exchange(fileutils, make_test_dir, wlmutils): +def test_exchange(fileutils, test_dir, wlmutils): """Run two processes, each process puts a tensor on the DB, then accesses the other process's tensor. Finally, the tensor is used to run a model. """ - test_dir = make_test_dir + test_dir = test_dir exp = Experiment( "smartredis_ensemble_exchange", exp_path=test_dir, launcher="local" ) @@ -100,14 +100,14 @@ def test_exchange(fileutils, make_test_dir, wlmutils): exp.stop(orc) -def test_consumer(fileutils, make_test_dir, wlmutils): +def test_consumer(fileutils, test_dir, wlmutils): """Run three processes, each one of the first two processes puts a tensor on the DB; the third process accesses the tensors put by the two producers. Finally, the tensor is used to run a model by each producer and the consumer accesses the two results. 
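    The key exchange between producers and the consumer rides on the
    SSKEYIN/SSKEYOUT environment variables, which the controller builds
    from each entity's incoming entities and key-prefixing state. A rough
    sketch of the wiring (the ensemble is assumed to set this up):

        producer.enable_key_prefixing()              # producer lands in SSKEYOUT
        consumer.register_incoming_entity(producer)  # producer lands in consumer's SSKEYIN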
""" - test_dir = make_test_dir + test_dir = test_dir exp = Experiment( "smartredis_ensemble_consumer", exp_path=test_dir, launcher="local" ) From 20384717c1c55048d05709bc99036443b2a6938a Mon Sep 17 00:00:00 2001 From: Al Rigazzi Date: Thu, 26 Oct 2023 17:31:58 +0200 Subject: [PATCH 23/64] Fix name collision in FileUtils --- conftest.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/conftest.py b/conftest.py index c11d9b20c..4f1652754 100644 --- a/conftest.py +++ b/conftest.py @@ -560,7 +560,7 @@ def is_accepted_char(char: str): @pytest.fixture def test_dir(request: pytest.FixtureRequest): caller_function = _sanitize_caller_function(request.node.name) - dir_path = FileUtils.get_test_dir_path(caller_function, str(request.path)) + dir_path = FileUtils.get_test_output_path(caller_function, str(request.path)) try: os.makedirs(dir_path) @@ -576,7 +576,7 @@ def fileutils() -> t.Type[FileUtils]: class FileUtils: @staticmethod - def get_test_dir_path(caller_function: str, caller_fspath: str) -> str: + def get_test_output_path(caller_function: str, caller_fspath: str) -> str: caller_file_to_dir = os.path.splitext(str(caller_fspath))[0] rel_path = os.path.relpath(caller_file_to_dir, os.path.dirname(test_output_root)) dir_path = os.path.join(test_output_root, rel_path, caller_function) From 9ff094a11a838846c335fccec13a4f38fa486d62 Mon Sep 17 00:00:00 2001 From: Al Rigazzi Date: Thu, 26 Oct 2023 21:56:51 +0200 Subject: [PATCH 24/64] Fix fixture usage --- tests/test_generator.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/tests/test_generator.py b/tests/test_generator.py index c496b9fc6..16b75cca0 100644 --- a/tests/test_generator.py +++ b/tests/test_generator.py @@ -53,9 +53,9 @@ def get_gen_file(fileutils, filename): return fileutils.get_test_conf_path(osp.join("generator_files", filename)) -def test_ensemble(fileutils, get_test_dir): +def test_ensemble(fileutils, test_dir): exp = Experiment("gen-test", launcher="local") - test_dir = get_test_dir + test_dir = test_dir gen = Generator(test_dir) params = {"THERMO": [10, 20, 30], "STEPS": [10, 20, 30]} ensemble = exp.create_ensemble("test", params=params, run_settings=rs) @@ -70,9 +70,9 @@ def test_ensemble(fileutils, get_test_dir): assert osp.isdir(osp.join(test_dir, "test/test_" + str(i))) -def test_ensemble_overwrite(fileutils, get_test_dir): +def test_ensemble_overwrite(fileutils, test_dir): exp = Experiment("gen-test-overwrite", launcher="local") - test_dir = get_test_dir + test_dir = test_dir gen = Generator(test_dir, overwrite=True) params = {"THERMO": [10, 20, 30], "STEPS": [10, 20, 30]} @@ -93,9 +93,9 @@ def test_ensemble_overwrite(fileutils, get_test_dir): assert osp.isdir(osp.join(test_dir, "test/test_" + str(i))) -def test_ensemble_overwrite_error(fileutils, get_test_dir): +def test_ensemble_overwrite_error(fileutils, test_dir): exp = Experiment("gen-test-overwrite-error", launcher="local") - test_dir = get_test_dir + test_dir = test_dir gen = Generator(test_dir) params = {"THERMO": [10, 20, 30], "STEPS": [10, 20, 30]} From ac89651f0a92cbd8c7ca848d76ae469f3fc432fd Mon Sep 17 00:00:00 2001 From: Al Rigazzi Date: Fri, 27 Oct 2023 16:08:35 +0200 Subject: [PATCH 25/64] Fix lock scope --- smartsim/_core/control/controller.py | 22 +++++++++++----------- smartsim/_core/utils/redis.py | 5 +---- 2 files changed, 12 insertions(+), 15 deletions(-) diff --git a/smartsim/_core/control/controller.py b/smartsim/_core/control/controller.py index f7cf7e373..b70840a72 100644 --- 
a/smartsim/_core/control/controller.py +++ b/smartsim/_core/control/controller.py @@ -217,17 +217,17 @@ def stop_db(self, db: Orchestrator) -> None: if db.batch: self.stop_entity(db) else: - for node in db.entities: - for host_ip, port in itertools.product( - (get_ip_from_host(host) for host in node.hosts), db.ports - ): - retcode, _, _ = shutdown_db_node(host_ip, port) - # Sometimes the DB will not shutdown (unless we force NOSAVE) - if retcode != 0: - self.stop_entity(node) - continue - - with JM_LOCK: + with JM_LOCK: + for node in db.entities: + for host_ip, port in itertools.product( + (get_ip_from_host(host) for host in node.hosts), db.ports + ): + retcode, _, _ = shutdown_db_node(host_ip, port) + # Sometimes the DB will not shutdown (unless we force NOSAVE) + if retcode != 0: + self.stop_entity(node) + continue + job = self._jobs[node.name] job.set_status(STATUS_CANCELLED, "", 0, output=None, error=None) self._jobs.move_to_completed(job) diff --git a/smartsim/_core/utils/redis.py b/smartsim/_core/utils/redis.py index 91a972d04..e0e9868f8 100644 --- a/smartsim/_core/utils/redis.py +++ b/smartsim/_core/utils/redis.py @@ -222,7 +222,7 @@ def shutdown_db_node(host_ip: str, port: int) -> t.Tuple[int, str, str]: # cov- """Send shutdown signal to DB node. Should only be used in the case where cluster deallocation - needs to occur manually. Usually, the SmartSim task manager + needs to occur manually. Usually, the SmartSim job manager will take care of this automatically. :param host_ip: IP of host to connect to @@ -231,10 +231,7 @@ def shutdown_db_node(host_ip: str, port: int) -> t.Tuple[int, str, str]: # cov- :type ports: int :return: returncode, output, and error of the process :rtype: tuple of (int, str, str) - :raises SmartSimError: if cluster creation fails """ - - # call cluster command redis_cli = CONFIG.database_cli cmd = [redis_cli, "-h", host_ip, "-p", str(port), "shutdown"] returncode, out, err = execute_cmd( From f7be14a9213bec53035b0a8ef764bba24a3ce826 Mon Sep 17 00:00:00 2001 From: Andrew Shao Date: Mon, 30 Oct 2023 16:20:28 -0500 Subject: [PATCH 26/64] Replace repeated module level function choose_host was duplicated across test_dbmodel and test_dbscript. 
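Both copies implemented the same rule: pin the database to the first
host in the test hostlist for MPI-style run settings, and return None
(deferring placement to the launcher) otherwise:

    def choose_host(run_settings, wlmutils):
        host = None
        if isinstance(run_settings, (MpirunSettings, MpiexecSettings)):
            host = wlmutils.get_test_hostlist()[0]
        return host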
Move into wlmutils fixture as a static method --- conftest.py | 5 +++++ tests/backends/test_dbmodel.py | 12 +++--------- tests/backends/test_dbscript.py | 10 ++-------- 3 files changed, 10 insertions(+), 17 deletions(-) diff --git a/conftest.py b/conftest.py index 4f1652754..e616ff745 100644 --- a/conftest.py +++ b/conftest.py @@ -40,6 +40,7 @@ AprunSettings, JsrunSettings, MpirunSettings, + MpiexecSettings, PalsMpiexecSettings, RunSettings, ) @@ -383,6 +384,10 @@ def get_orchestrator(nodes: int = 1, batch: bool = False) -> Orchestrator: return Orchestrator(port=test_port, interface="lo") + @staticmethod + def choose_host(rs): + return get_hostlist()[0] if isinstance(rs, (MpirunSettings, MpiexecSettings)) else None + @pytest.fixture def local_db( diff --git a/tests/backends/test_dbmodel.py b/tests/backends/test_dbmodel.py index 4a94e1881..435a363bd 100644 --- a/tests/backends/test_dbmodel.py +++ b/tests/backends/test_dbmodel.py @@ -109,12 +109,6 @@ def forward(self, x): should_run_pt &= "torch" in installed_redisai_backends() -def choose_host(run_settings, wlmutils): - host = None - if isinstance(run_settings, (MpirunSettings, MpiexecSettings)): - host = wlmutils.get_test_hostlist()[0] - return host - def save_tf_cnn(path, file_name): """Create a Keras CNN for testing purposes""" from smartsim.ml.tf import freeze_model @@ -177,7 +171,7 @@ def test_tf_db_model(fileutils, test_dir, wlmutils, mlutils): smartsim_model = exp.create_model("smartsim_model", run_settings) # Create database - host = choose_host(run_settings, wlmutils) + host = wlmutils.choose_host(run_settings) db = exp.create_database(port=test_port, interface=test_interface, hosts=host) exp.generate(db) @@ -255,7 +249,7 @@ def test_pt_db_model(fileutils, test_dir, wlmutils, mlutils): smartsim_model = exp.create_model("smartsim_model", run_settings) # Create database - host = choose_host(run_settings, wlmutils) + host = wlmutils.choose_host(run_settings) db = exp.create_database(port=test_port, interface=test_interface, hosts=host) exp.generate(db) @@ -326,7 +320,7 @@ def test_db_model_ensemble(fileutils, test_dir, wlmutils, mlutils): smartsim_model = exp.create_model("smartsim_model", run_settings) # Create database - host = choose_host(run_settings, wlmutils) + host = wlmutils.choose_host(run_settings) db = exp.create_database(port=test_port, interface=test_interface, hosts=host) exp.generate(db) diff --git a/tests/backends/test_dbscript.py b/tests/backends/test_dbscript.py index 6df908fb1..dcd86ff50 100644 --- a/tests/backends/test_dbscript.py +++ b/tests/backends/test_dbscript.py @@ -53,12 +53,6 @@ should_run &= "torch" in installed_redisai_backends() -def choose_host(run_settings, wlmutils): - host = None - if isinstance(run_settings, (MpirunSettings, MpiexecSettings)): - host = wlmutils.get_test_hostlist()[0] - return host - def timestwo(x): return 2 * x @@ -93,7 +87,7 @@ def test_db_script(fileutils, test_dir, wlmutils, mlutils): smartsim_model.set_path(test_dir) # Create the SmartSim database - host = choose_host(run_settings, wlmutils) + host = wlmutils.choose_host(run_settings) db = exp.create_database(port=test_port, interface=test_interface, hosts=host) exp.generate(db) @@ -173,7 +167,7 @@ def test_db_script_ensemble(fileutils, test_dir, wlmutils, mlutils): smartsim_model = exp.create_model("smartsim_model", run_settings) # Create SmartSim database - host = choose_host(run_settings, wlmutils) + host = wlmutils.choose_host(run_settings) db = exp.create_database(port=test_port, interface=test_interface, hosts=host) 
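    # Note: choose_host returns a concrete host only for MpirunSettings and
    # MpiexecSettings; for every other run settings type it returns None and
    # database placement is left to the launcher.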
exp.generate(db) From e2b923868f184165df0858d70e5672ac8d41f6e8 Mon Sep 17 00:00:00 2001 From: Andrew Shao Date: Mon, 30 Oct 2023 16:23:13 -0500 Subject: [PATCH 27/64] Remove extraneous print in add_batch_resources --- tests/full_wlm/test_generic_batch_launch.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/full_wlm/test_generic_batch_launch.py b/tests/full_wlm/test_generic_batch_launch.py index e01596dc6..8ec67d712 100644 --- a/tests/full_wlm/test_generic_batch_launch.py +++ b/tests/full_wlm/test_generic_batch_launch.py @@ -42,7 +42,6 @@ def add_batch_resources(wlmutils, batch_settings): if isinstance(batch_settings, QsubBatchSettings): - print(wlmutils.get_batch_resources()) for key, value in wlmutils.get_batch_resources().items(): batch_settings.set_resource(key, value) From aa95d5ec944421d10e197ef110ac594101ea9aea Mon Sep 17 00:00:00 2001 From: Andrew Shao Date: Mon, 30 Oct 2023 17:14:33 -0500 Subject: [PATCH 28/64] Enforce type for batch resources --- smartsim/_core/config/config.py | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/smartsim/_core/config/config.py b/smartsim/_core/config/config.py index 9e4880ab2..b1f33ec8b 100644 --- a/smartsim/_core/config/config.py +++ b/smartsim/_core/config/config.py @@ -182,7 +182,17 @@ def test_port(self) -> int: # pragma: no cover @property def test_batch_resources(self) -> t.Any: # pragma: no cover - return json.loads(os.environ.get("SMARTSIM_TEST_BATCH_RESOURCES", "{}")) + resource_str = os.environ.get("SMARTSIM_TEST_BATCH_RESOURCES", "{}") + resources = json.loads(resource_str) + if not isinstance(resources, dict): + raise TypeError( + ( + "SMARTSIM_TEST_BATCH_RESOURCES was not interpreted as a " + "dictionary, check to make sure that it is a valid " + f"JSON string: {resource_str}" + ) + ) + return resources @property def test_interface(self) -> t.List[str]: # pragma: no cover From 9f4eac0af930c562c3aa09f526d6f21469a3636b Mon Sep 17 00:00:00 2001 From: Andrew Shao Date: Tue, 31 Oct 2023 12:44:38 -0500 Subject: [PATCH 29/64] Reset license text after inadvertent find/replace --- smartsim/status.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/smartsim/status.py b/smartsim/status.py index a9eff28eb..ba5f5076d 100644 --- a/smartsim/status.py +++ b/smartsim/status.py @@ -14,7 +14,7 @@ # and/or other materials provided with the distribution. # # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NEVER LIMITED TO, THE +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE # DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE # FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL From e697f58fd65b10e01b7021fde0fd530388da86cc Mon Sep 17 00:00:00 2001 From: Ale Rigazzi Date: Wed, 1 Nov 2023 13:39:04 -0500 Subject: [PATCH 30/64] Fix SS env vars --- smartsim/_core/control/controller.py | 21 +++++++++------------ smartsim/_core/control/jobmanager.py | 2 +- 2 files changed, 10 insertions(+), 13 deletions(-) diff --git a/smartsim/_core/control/controller.py b/smartsim/_core/control/controller.py index b70840a72..31e9f036c 100644 --- a/smartsim/_core/control/controller.py +++ b/smartsim/_core/control/controller.py @@ -528,25 +528,22 @@ def _prep_entity_client_env(self, entity: Model) -> None: for db_id, addresses in address_dict.items(): db_name, _ = unpack_db_identifier(db_id, "_") - if addresses: - if len(addresses) <= 128: - client_env[f"SSDB{db_name}"] = ",".join(addresses) - else: - # Cap max length of SSDB - client_env[f"SSDB{db_name}"] = ",".join(addresses[:128]) - if entity.incoming_entities: - client_env[f"SSKEYIN{db_name}"] = ",".join( - [in_entity.name for in_entity in entity.incoming_entities] - ) - if entity.query_key_prefixing(): - client_env[f"SSKEYOUT{db_name}"] = entity.name + # Cap max length of SSDB + client_env[f"SSDB{db_name}"] = ",".join(addresses[:128]) # Retrieve num_shards to append to client env client_env[f"SR_DB_TYPE{db_name}"] = ( CLUSTERED if len(addresses) > 1 else STANDALONE ) + if entity.incoming_entities: + client_env[f"SSKEYIN"] = ",".join( + [in_entity.name for in_entity in entity.incoming_entities] + ) + if entity.query_key_prefixing(): + client_env[f"SSKEYOUT"] = entity.name + # Set address to local if it's a colocated model if entity.colocated and entity.run_settings.colocated_db_settings is not None: db_name_colo = entity.run_settings.colocated_db_settings["db_identifier"] diff --git a/smartsim/_core/control/jobmanager.py b/smartsim/_core/control/jobmanager.py index dd3ebe405..75cb0ebc9 100644 --- a/smartsim/_core/control/jobmanager.py +++ b/smartsim/_core/control/jobmanager.py @@ -324,7 +324,7 @@ def get_db_host_addresses(self) -> t.Dict[str, t.List[str]]: ip_addr = get_ip_from_host(combine[0]) addresses.append(":".join((ip_addr, str(combine[1])))) - address_dict.update({db_entity.name: addresses}) + address_dict.update({db_entity.name: addresses}) return address_dict From cce73db692d0bc8b37ce668ec287f50e0e31b534 Mon Sep 17 00:00:00 2001 From: Ale Rigazzi Date: Wed, 1 Nov 2023 16:57:45 -0500 Subject: [PATCH 31/64] Use db_identifier --- smartsim/_core/control/controller.py | 14 ++++----- smartsim/_core/control/jobmanager.py | 2 +- smartsim/database/orchestrator.py | 18 ++++++----- tests/test_configs/smartredis/multidbid.py | 4 --- tests/test_multidb.py | 35 ++++++++++++++++++---- 5 files changed, 48 insertions(+), 25 deletions(-) diff --git a/smartsim/_core/control/controller.py b/smartsim/_core/control/controller.py index 31e9f036c..d20b16c59 100644 --- a/smartsim/_core/control/controller.py +++ b/smartsim/_core/control/controller.py @@ -532,23 +532,23 @@ def _prep_entity_client_env(self, entity: Model) -> None: # Cap max length of SSDB client_env[f"SSDB{db_name}"] = ",".join(addresses[:128]) - # Retrieve num_shards to append to client env - client_env[f"SR_DB_TYPE{db_name}"] = ( - CLUSTERED if len(addresses) > 1 else STANDALONE - ) + # Retrieve num_shards to append to client env + client_env[f"SR_DB_TYPE{db_name}"] = ( + CLUSTERED if len(addresses) > 1 else STANDALONE + ) if 
entity.incoming_entities: - client_env[f"SSKEYIN"] = ",".join( + client_env["SSKEYIN"] = ",".join( [in_entity.name for in_entity in entity.incoming_entities] ) if entity.query_key_prefixing(): - client_env[f"SSKEYOUT"] = entity.name + client_env["SSKEYOUT"] = entity.name # Set address to local if it's a colocated model if entity.colocated and entity.run_settings.colocated_db_settings is not None: db_name_colo = entity.run_settings.colocated_db_settings["db_identifier"] - for key in self._jobs.get_db_host_addresses(): + for key in address_dict: _, db_id = unpack_db_identifier(key, "_") if db_name_colo == db_id: raise SSDBIDConflictError( diff --git a/smartsim/_core/control/jobmanager.py b/smartsim/_core/control/jobmanager.py index 493f8bae8..efc43be85 100644 --- a/smartsim/_core/control/jobmanager.py +++ b/smartsim/_core/control/jobmanager.py @@ -324,7 +324,7 @@ def get_db_host_addresses(self) -> t.Dict[str, t.List[str]]: ip_addr = get_ip_from_host(combine[0]) addresses.append(":".join((ip_addr, str(combine[1])))) - address_dict.update({db_entity.name: addresses}) + address_dict.update({db_entity.db_identifier: addresses}) return address_dict diff --git a/smartsim/database/orchestrator.py b/smartsim/database/orchestrator.py index bfc9594f7..b2a5e4f4b 100644 --- a/smartsim/database/orchestrator.py +++ b/smartsim/database/orchestrator.py @@ -307,8 +307,7 @@ def hosts(self) -> t.List[str]: return self._hosts def reset_hosts(self) -> None: - """Clear hosts or reset them to last user choice - """ + """Clear hosts or reset them to last user choice""" for node in self.entities: node.clear_hosts() self._hosts = [] @@ -786,7 +785,7 @@ def _initialize_entities( run_settings, [port], [db_node_name + ".out"], - self.name, + db_node_name, ) self.entities.append(node) @@ -808,9 +807,7 @@ def _initialize_entities_mpmd( ) exe_args = " ".join(start_script_args) exe_args_mpmd.append(sh_split(exe_args)) - run_settings: t.Optional[RunSettings] = None - if self.launcher == "lsf": run_settings = self._build_run_settings_lsf( sys.executable, exe_args_mpmd, db_nodes=db_nodes, port=port, **kwargs @@ -821,11 +818,16 @@ def _initialize_entities_mpmd( sys.executable, exe_args_mpmd, db_nodes=db_nodes, port=port, **kwargs ) output_files = [self.name + ".out"] - if not run_settings: raise ValueError(f"Could not build run settings for {self.launcher}") - - node = DBNode(self.name, self.path, run_settings, [port], output_files) + node = DBNode( + self.name, + self.path, + run_settings, + [port], + output_files, + db_identifier=self.name + "_0", + ) self.entities.append(node) self.ports = [port] diff --git a/tests/test_configs/smartredis/multidbid.py b/tests/test_configs/smartredis/multidbid.py index 9691515f4..5f4806ac5 100644 --- a/tests/test_configs/smartredis/multidbid.py +++ b/tests/test_configs/smartredis/multidbid.py @@ -25,8 +25,6 @@ # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
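# This script reads the per-database environment variables exported by the
# controller: SSDB and SR_DB_TYPE, each suffixed with a db_identifier. A
# minimal sketch of the pattern, assuming the "testdb_colo" identifier used
# in test_multidb.py:
#
#     opts = ConfigOptions.create_from_environment("testdb_colo")
#     client = Client(opts, logger_name="SmartSim")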
import argparse - - import os from smartredis import ConfigOptions, Client @@ -53,5 +51,3 @@ c1 = Client(opts1, logger_name="SmartSim") c2 = Client(opts2, logger_name="SmartSim") - - diff --git a/tests/test_multidb.py b/tests/test_multidb.py index 1baa08029..a9da538be 100644 --- a/tests/test_multidb.py +++ b/tests/test_multidb.py @@ -27,6 +27,7 @@ import pytest from smartsim import Experiment, status +from smartsim.database import Orchestrator from smartsim.entity.entity import SmartSimEntity from smartsim.error.errors import SSDBIDConflictError from smartsim.log import get_logger @@ -406,18 +407,21 @@ def test_multidb_colo_then_standard(fileutils, test_dir, wlmutils, coloutils, db pytest.test_launcher not in pytest.wlm_options, reason="Not testing WLM integrations", ) -def test_launch_cluster_orc_single_dbid(test_dir, wlmutils): +@pytest.mark.parametrize("db_type", supported_dbs) +def test_launch_cluster_orc_single_dbid(test_dir, coloutils, fileutils, wlmutils, db_type): """test clustered 3-node orchestrator with single command with a database identifier""" # TODO detect number of nodes in allocation and skip if not sufficent exp_name = "test_launch_cluster_orc_single_dbid" launcher = wlmutils.get_test_launcher() + test_port = wlmutils.get_test_port() + test_script = fileutils.get_test_conf_path("smartredis/multidbid.py") test_dir = test_dir exp = Experiment(exp_name, launcher=launcher, exp_path=test_dir) # batch = False to launch on existing allocation network_interface = wlmutils.get_test_interface() - orc = exp.create_database( + orc: Orchestrator = exp.create_database( wlmutils.get_test_port(), db_nodes=3, batch=False, @@ -427,8 +431,29 @@ def test_launch_cluster_orc_single_dbid(test_dir, wlmutils): db_identifier="testdb_reg", ) - with make_entity_context(exp, orc) as orc: + db_args = { + "port": test_port, + "db_cpus": 1, + "debug": True, + "db_identifier": "testdb_colo", + } + + # Create model with colocated database + smartsim_model = coloutils.setup_test_colo( + fileutils, + db_type, + exp, + test_script, + db_args, + on_wlm = on_wlm + ) + + with make_entity_context(exp, orc) as orc, \ + make_entity_context(exp, smartsim_model) as smartsim_model: exp.start(orc, block=True) + exp.start(smartsim_model, block=True) + job_dict = exp._control._jobs.get_db_host_addresses() + print(job_dict) + assert len(job_dict[orc.entities[0].db_identifier]) == 3 - statuses = exp.get_status(orc) - assert all([stat == status.STATUS_CANCELLED for stat in statuses]) + check_not_failed(exp, orc, smartsim_model) From bd3ff355bca44c7f122574f4a89fcbaf3470c0b9 Mon Sep 17 00:00:00 2001 From: Al Rigazzi Date: Thu, 2 Nov 2023 00:41:14 +0100 Subject: [PATCH 32/64] Disable key prefixing for test colocated entities --- tests/backends/test_dbmodel.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/tests/backends/test_dbmodel.py b/tests/backends/test_dbmodel.py index 435a363bd..6cdbac5c0 100644 --- a/tests/backends/test_dbmodel.py +++ b/tests/backends/test_dbmodel.py @@ -583,6 +583,7 @@ def test_colocated_db_model_ensemble(fileutils, test_dir, wlmutils, mlutils): inputs=inputs2, outputs=outputs2, ) + entity.disable_key_prefixing() # Test adding a model from Ensemble object colo_ensemble.add_ml_model( @@ -693,13 +694,14 @@ def test_colocated_db_model_ensemble_reordered(fileutils, test_dir, wlmutils, ml inputs=inputs2, outputs=outputs2, ) + entity.disable_key_prefixing() # Add another ensemble member colo_ensemble.add_model(colo_model) # Colocate a database with the new ensemble member 
colo_model.colocate_db_tcp( - port=test_port + len(colo_ensemble), + port=test_port + len(colo_ensemble) - 1, db_cpus=1, debug=True, ifname=test_interface From 2eab46588e74aef82203b8574de09ad460a04b6c Mon Sep 17 00:00:00 2001 From: Al Rigazzi Date: Thu, 2 Nov 2023 19:04:58 +0100 Subject: [PATCH 33/64] Add test to validate uds socket file name --- tests/test_colo_model_local.py | 48 ++++++++++++++++++++++++++++------ 1 file changed, 40 insertions(+), 8 deletions(-) diff --git a/tests/test_colo_model_local.py b/tests/test_colo_model_local.py index 29a96718a..39d7c2dbe 100644 --- a/tests/test_colo_model_local.py +++ b/tests/test_colo_model_local.py @@ -148,8 +148,7 @@ def test_launch_colocated_model_defaults( def test_launch_multiple_colocated_models( fileutils, test_dir, coloutils, wlmutils, db_type, launcher="local" ): - """Test the concurrent launch of two models with a colocated database and local launcher - """ + """Test the concurrent launch of two models with a colocated database and local launcher""" db_args = {} @@ -189,7 +188,11 @@ def test_launch_multiple_colocated_models( def test_colocated_model_disable_pinning( fileutils, test_dir, coloutils, db_type, launcher="local" ): - exp = Experiment(f"colocated_model_pinning_auto_1cpu_{db_type}", launcher=launcher, exp_path=test_dir) + exp = Experiment( + f"colocated_model_pinning_auto_1cpu_{db_type}", + launcher=launcher, + exp_path=test_dir, + ) db_args = { "db_cpus": 1, "custom_pinning": [], @@ -213,7 +216,11 @@ def test_colocated_model_disable_pinning( def test_colocated_model_pinning_auto_2cpu( fileutils, test_dir, coloutils, db_type, launcher="local" ): - exp = Experiment(f"colocated_model_pinning_auto_2cpu_{db_type}", launcher=launcher, exp_path=test_dir) + exp = Experiment( + f"colocated_model_pinning_auto_2cpu_{db_type}", + launcher=launcher, + exp_path=test_dir, + ) db_args = { "db_cpus": 2, @@ -242,10 +249,16 @@ def test_colocated_model_pinning_auto_2cpu( @pytest.mark.skipif(is_mac, reason="unsupported on MacOSX") @pytest.mark.parametrize("db_type", supported_dbs) -def test_colocated_model_pinning_range(fileutils, test_dir, coloutils, db_type, launcher="local"): +def test_colocated_model_pinning_range( + fileutils, test_dir, coloutils, db_type, launcher="local" +): # Check to make sure that the CPU mask was correctly generated - exp = Experiment(f"colocated_model_pinning_manual_{db_type}", launcher=launcher, exp_path=test_dir) + exp = Experiment( + f"colocated_model_pinning_manual_{db_type}", + launcher=launcher, + exp_path=test_dir, + ) db_args = {"db_cpus": 2, "custom_pinning": range(2)} @@ -265,10 +278,16 @@ def test_colocated_model_pinning_range(fileutils, test_dir, coloutils, db_type, @pytest.mark.skipif(is_mac, reason="unsupported on MacOSX") @pytest.mark.parametrize("db_type", supported_dbs) -def test_colocated_model_pinning_list(fileutils, test_dir, coloutils, db_type, launcher="local"): +def test_colocated_model_pinning_list( + fileutils, test_dir, coloutils, db_type, launcher="local" +): # Check to make sure that the CPU mask was correctly generated - exp = Experiment(f"colocated_model_pinning_manual_{db_type}", launcher=launcher, exp_path=test_dir) + exp = Experiment( + f"colocated_model_pinning_manual_{db_type}", + launcher=launcher, + exp_path=test_dir, + ) db_args = {"db_cpus": 1, "custom_pinning": [1]} @@ -284,3 +303,16 @@ def test_colocated_model_pinning_list(fileutils, test_dir, coloutils, db_type, l exp.start(colo_model, block=True) statuses = exp.get_status(colo_model) assert all([stat == 
status.STATUS_COMPLETED for stat in statuses]) + + +def test_colo_uds_verifies_socket_file_name(test_dir, launcher="local"): + # Check to make sure that the CPU mask was correctly generated + + exp = Experiment(f"colo_uds_wrong_name", launcher=launcher, exp_path=test_dir) + + colo_settings = exp.create_run_settings(exe=sys.executable, exe_args=["--version"]) + + colo_model = exp.create_model("wrong_uds_socket_name", colo_settings) + + with pytest.raises(ValueError): + colo_model.colocate_db_uds(unix_socket="this is not a valid name!") From fcdc5db100449150e95ad3446fca4dcb83b53ef7 Mon Sep 17 00:00:00 2001 From: Al Rigazzi Date: Thu, 2 Nov 2023 19:08:00 +0100 Subject: [PATCH 34/64] Remove "test_dir = test_dir" --- conftest.py | 6 +++--- tests/backends/test_dataloader.py | 6 +++--- tests/backends/test_dbmodel.py | 16 ++++++++-------- tests/backends/test_dbscript.py | 12 ++++++------ tests/backends/test_onnx.py | 2 +- tests/backends/test_tf.py | 4 ++-- tests/backends/test_torch.py | 2 +- tests/full_wlm/test_generic_batch_launch.py | 6 +++--- .../full_wlm/test_generic_orc_launch_batch.py | 8 ++++---- tests/full_wlm/test_mpmd.py | 2 +- tests/on_wlm/test_base_settings_on_wlm.py | 4 ++-- tests/on_wlm/test_containers_wlm.py | 2 +- tests/on_wlm/test_generic_orc_launch.py | 6 +++--- tests/on_wlm/test_launch_errors.py | 4 ++-- tests/on_wlm/test_launch_ompi_lsf.py | 2 +- tests/on_wlm/test_local_step.py | 4 ++-- tests/on_wlm/test_restart.py | 2 +- .../on_wlm/test_simple_base_settings_on_wlm.py | 4 ++-- tests/on_wlm/test_simple_entity_launch.py | 6 +++--- tests/on_wlm/test_stop.py | 4 ++-- tests/test_colo_model_local.py | 2 +- tests/test_config.py | 10 +++++----- tests/test_containers.py | 6 +++--- tests/test_dbnode.py | 2 +- tests/test_experiment.py | 8 ++++---- tests/test_generator.py | 18 +++++++++--------- tests/test_interrupt.py | 4 ++-- tests/test_launch_errors.py | 4 ++-- tests/test_local_launch.py | 4 ++-- tests/test_local_multi_run.py | 2 +- tests/test_local_restart.py | 4 ++-- tests/test_modelwriter.py | 8 ++++---- tests/test_multidb.py | 18 +++++++++--------- tests/test_orchestrator.py | 4 ++-- tests/test_reconnect_orchestrator.py | 2 +- tests/test_smartredis.py | 4 ++-- 36 files changed, 101 insertions(+), 101 deletions(-) diff --git a/conftest.py b/conftest.py index e616ff745..e545b8a67 100644 --- a/conftest.py +++ b/conftest.py @@ -397,7 +397,7 @@ def local_db( exp_name = request.function.__name__ exp = Experiment(exp_name, launcher="local") - test_dir = test_dir + db = Orchestrator(port=wlmutils.get_test_port(), interface="lo") db.set_path(test_dir) exp.start(db) @@ -417,7 +417,7 @@ def db( exp_name = request.function.__name__ exp = Experiment(exp_name, launcher=launcher) - test_dir = test_dir + db = wlmutils.get_orchestrator() db.set_path(test_dir) exp.start(db) @@ -440,7 +440,7 @@ def db_cluster( exp_name = request.function.__name__ exp = Experiment(exp_name, launcher=launcher) - test_dir = test_dir + db = wlmutils.get_orchestrator(nodes=3) db.set_path(test_dir) exp.start(db) diff --git a/tests/backends/test_dataloader.py b/tests/backends/test_dataloader.py index 771c8937b..cb9ce0d35 100644 --- a/tests/backends/test_dataloader.py +++ b/tests/backends/test_dataloader.py @@ -156,7 +156,7 @@ def train_tf(generator): @pytest.mark.skipif(not shouldrun_tf, reason="Test needs TensorFlow to run") def test_tf_dataloaders(test_dir, wlmutils): - test_dir = test_dir + exp = Experiment("test_tf_dataloaders", test_dir, launcher=wlmutils.get_test_launcher()) orc: Orchestrator = 
wlmutils.get_orchestrator() exp.generate(orc) @@ -222,7 +222,7 @@ def create_trainer_torch(experiment: Experiment, filedir, wlmutils): @pytest.mark.skipif(not shouldrun_torch, reason="Test needs Torch to run") def test_torch_dataloaders(fileutils, test_dir, wlmutils): - test_dir = test_dir + exp = Experiment("test_tf_dataloaders", test_dir, launcher=wlmutils.get_test_launcher()) orc: Orchestrator = wlmutils.get_orchestrator() config_dir = fileutils.get_test_dir_path("ml") @@ -318,7 +318,7 @@ def test_data_info_repr(): not (shouldrun_torch or shouldrun_tf), reason="Requires TF or PyTorch" ) def test_wrong_dataloaders(test_dir, wlmutils): - test_dir = test_dir + exp = Experiment("test-wrong-dataloaders", exp_path=test_dir, launcher=wlmutils.get_test_launcher()) orc = wlmutils.get_orchestrator() exp.generate(orc) diff --git a/tests/backends/test_dbmodel.py b/tests/backends/test_dbmodel.py index 6cdbac5c0..bfaa56f06 100644 --- a/tests/backends/test_dbmodel.py +++ b/tests/backends/test_dbmodel.py @@ -156,7 +156,7 @@ def test_tf_db_model(fileutils, test_dir, wlmutils, mlutils): test_port = wlmutils.get_test_port() test_device = mlutils.get_test_device() test_num_gpus = mlutils.get_test_num_gpus() - test_dir = test_dir + test_script = fileutils.get_test_conf_path("run_tf_dbmodel_smartredis.py") # Create the SmartSim Experiment @@ -234,7 +234,7 @@ def test_pt_db_model(fileutils, test_dir, wlmutils, mlutils): test_port = wlmutils.get_test_port() test_device = mlutils.get_test_device() test_num_gpus = mlutils.get_test_num_gpus() - test_dir = test_dir + test_script = fileutils.get_test_conf_path("run_pt_dbmodel_smartredis.py") # Create the SmartSim Experiment @@ -300,7 +300,7 @@ def test_db_model_ensemble(fileutils, test_dir, wlmutils, mlutils): test_port = wlmutils.get_test_port() test_device = mlutils.get_test_device() test_num_gpus = mlutils.get_test_num_gpus() - test_dir = test_dir + test_script = fileutils.get_test_conf_path("run_tf_dbmodel_smartredis.py") # Create the SmartSim Experiment @@ -399,7 +399,7 @@ def test_colocated_db_model_tf(fileutils, test_dir, wlmutils, mlutils): test_port = wlmutils.get_test_port() test_device = mlutils.get_test_device() test_num_gpus = mlutils.get_test_num_gpus() - test_dir = test_dir + test_script = fileutils.get_test_conf_path("run_tf_dbmodel_smartredis.py") # Create SmartSim Experience @@ -471,7 +471,7 @@ def test_colocated_db_model_pytorch(fileutils, test_dir, wlmutils, mlutils): test_port = wlmutils.get_test_port() test_device = mlutils.get_test_device() test_num_gpus = mlutils.get_test_num_gpus() - test_dir = test_dir + test_script = fileutils.get_test_conf_path("run_pt_dbmodel_smartredis.py") # Create the SmartSim Experiment @@ -533,7 +533,7 @@ def test_colocated_db_model_ensemble(fileutils, test_dir, wlmutils, mlutils): test_port = wlmutils.get_test_port() test_device = mlutils.get_test_device() test_num_gpus = mlutils.get_test_num_gpus() - test_dir = test_dir + test_script = fileutils.get_test_conf_path("run_tf_dbmodel_smartredis.py") # Create the SmartSim Experiment @@ -639,7 +639,7 @@ def test_colocated_db_model_ensemble_reordered(fileutils, test_dir, wlmutils, ml test_port = wlmutils.get_test_port() test_device = mlutils.get_test_device() test_num_gpus = mlutils.get_test_num_gpus() - test_dir = test_dir + test_script = fileutils.get_test_conf_path("run_tf_dbmodel_smartredis.py") # Create the SmartSim Experiment @@ -742,7 +742,7 @@ def test_colocated_db_model_errors(fileutils, test_dir, wlmutils, mlutils): test_port = wlmutils.get_test_port() 
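    # These colocated tests attach the database directly to the model; the
    # general shape (port and interface come from wlmutils) is roughly:
    #
    #     colo_model.colocate_db_tcp(
    #         port=test_port, db_cpus=1, debug=True, ifname=test_interface
    #     )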
test_device = mlutils.get_test_device() test_num_gpus = mlutils.get_test_num_gpus() - test_dir = test_dir + test_script = fileutils.get_test_conf_path("run_tf_dbmodel_smartredis.py") # Create SmartSim Experiment diff --git a/tests/backends/test_dbscript.py b/tests/backends/test_dbscript.py index dcd86ff50..83a13999f 100644 --- a/tests/backends/test_dbscript.py +++ b/tests/backends/test_dbscript.py @@ -70,7 +70,7 @@ def test_db_script(fileutils, test_dir, wlmutils, mlutils): test_port = wlmutils.get_test_port() test_device = mlutils.get_test_device() test_num_gpus = mlutils.get_test_num_gpus() - test_dir = test_dir + test_script = fileutils.get_test_conf_path("run_dbscript_smartredis.py") torch_script = fileutils.get_test_conf_path("torchscript.py") @@ -146,7 +146,7 @@ def test_db_script_ensemble(fileutils, test_dir, wlmutils, mlutils): test_port = wlmutils.get_test_port() test_device = mlutils.get_test_device() test_num_gpus = mlutils.get_test_num_gpus() - test_dir = test_dir + test_script = fileutils.get_test_conf_path("run_dbscript_smartredis.py") torch_script = fileutils.get_test_conf_path("torchscript.py") @@ -242,7 +242,7 @@ def test_colocated_db_script(fileutils, test_dir, wlmutils, mlutils): test_port = wlmutils.get_test_port() test_device = mlutils.get_test_device() test_num_gpus = mlutils.get_test_num_gpus() - test_dir = test_dir + test_script = fileutils.get_test_conf_path("run_dbscript_smartredis.py") torch_script = fileutils.get_test_conf_path("torchscript.py") @@ -311,7 +311,7 @@ def test_colocated_db_script_ensemble(fileutils, test_dir, wlmutils, mlutils): test_port = wlmutils.get_test_port() test_device = mlutils.get_test_device() test_num_gpus = mlutils.get_test_num_gpus() - test_dir = test_dir + test_script = fileutils.get_test_conf_path("run_dbscript_smartredis.py") torch_script = fileutils.get_test_conf_path("torchscript.py") @@ -410,7 +410,7 @@ def test_colocated_db_script_ensemble_reordered(fileutils, test_dir, wlmutils, m test_port = wlmutils.get_test_port() test_device = mlutils.get_test_device() test_num_gpus = mlutils.get_test_num_gpus() - test_dir = test_dir + test_script = fileutils.get_test_conf_path("run_dbscript_smartredis.py") torch_script = fileutils.get_test_conf_path("torchscript.py") @@ -507,7 +507,7 @@ def test_db_script_errors(fileutils, test_dir, wlmutils, mlutils): test_port = wlmutils.get_test_port() test_device = mlutils.get_test_device() test_num_gpus = mlutils.get_test_num_gpus() - test_dir = test_dir + test_script = fileutils.get_test_conf_path("run_dbscript_smartredis.py") # Create SmartSim experiment diff --git a/tests/backends/test_onnx.py b/tests/backends/test_onnx.py index 3226c5f57..19c40017e 100644 --- a/tests/backends/test_onnx.py +++ b/tests/backends/test_onnx.py @@ -75,7 +75,7 @@ def test_sklearn_onnx(test_dir, mlutils, wlmutils): """ exp_name = "test_sklearn_onnx" - test_dir = test_dir + exp = Experiment(exp_name, exp_path=test_dir, launcher=wlmutils.get_test_launcher()) test_device = mlutils.get_test_device() diff --git a/tests/backends/test_tf.py b/tests/backends/test_tf.py index a2c4ba8c0..407d82c1e 100644 --- a/tests/backends/test_tf.py +++ b/tests/backends/test_tf.py @@ -61,7 +61,7 @@ def test_keras_model(test_dir, mlutils, wlmutils): """ exp_name = "test_keras_model" - test_dir = test_dir + exp = Experiment(exp_name, exp_path=test_dir, launcher=wlmutils.get_test_launcher()) test_device = mlutils.get_test_device() @@ -111,7 +111,7 @@ def create_tf_model(): @pytest.mark.skipif(not tf_available, reason="Requires Tensorflow and 
Keras") def test_freeze_model(test_dir): - test_dir = test_dir + model = create_tf_model() model_path, inputs, outputs = freeze_model(model, test_dir, "mnist.pb") diff --git a/tests/backends/test_torch.py b/tests/backends/test_torch.py index 610fa50b0..71a63adb9 100644 --- a/tests/backends/test_torch.py +++ b/tests/backends/test_torch.py @@ -61,7 +61,7 @@ def test_torch_model_and_script(test_dir, mlutils, wlmutils): """ exp_name = "test_torch_model_and_script" - test_dir = test_dir + exp = Experiment(exp_name, exp_path=test_dir, launcher=wlmutils.get_test_launcher()) test_device = mlutils.get_test_device() diff --git a/tests/full_wlm/test_generic_batch_launch.py b/tests/full_wlm/test_generic_batch_launch.py index 8ec67d712..157641422 100644 --- a/tests/full_wlm/test_generic_batch_launch.py +++ b/tests/full_wlm/test_generic_batch_launch.py @@ -51,7 +51,7 @@ def test_batch_model(fileutils, test_dir, wlmutils): exp_name = "test-batch-model" exp = Experiment(exp_name, launcher=wlmutils.get_test_launcher()) - test_dir = test_dir + script = fileutils.get_test_conf_path("sleep.py") batch_settings = exp.create_batch_settings(nodes=1, time="00:01:00") @@ -77,7 +77,7 @@ def test_batch_ensemble(fileutils, test_dir, wlmutils): exp_name = "test-batch-ensemble" exp = Experiment(exp_name, launcher=wlmutils.get_test_launcher()) - test_dir = test_dir + script = fileutils.get_test_conf_path("sleep.py") settings = wlmutils.get_run_settings("python", f"{script} --time=5") @@ -103,7 +103,7 @@ def test_batch_ensemble(fileutils, test_dir, wlmutils): def test_batch_ensemble_replicas(fileutils, test_dir, wlmutils): exp_name = "test-batch-ensemble-replicas" exp = Experiment(exp_name, launcher=wlmutils.get_test_launcher()) - test_dir = test_dir + script = fileutils.get_test_conf_path("sleep.py") settings = wlmutils.get_run_settings("python", f"{script} --time=5") diff --git a/tests/full_wlm/test_generic_orc_launch_batch.py b/tests/full_wlm/test_generic_orc_launch_batch.py index 3222e7860..3966d0382 100644 --- a/tests/full_wlm/test_generic_orc_launch_batch.py +++ b/tests/full_wlm/test_generic_orc_launch_batch.py @@ -45,7 +45,7 @@ def test_launch_orc_auto_batch(test_dir, wlmutils): exp_name = "test-launch-auto-orc-batch" exp = Experiment(exp_name, launcher=launcher) - test_dir = test_dir + # batch = False to launch on existing allocation network_interface = wlmutils.get_test_interface() @@ -84,7 +84,7 @@ def test_launch_cluster_orc_batch_single(test_dir, wlmutils): exp_name = "test-launch-auto-cluster-orc-batch-single" exp = Experiment(exp_name, launcher=launcher) - test_dir = test_dir + # batch = False to launch on existing allocation network_interface = wlmutils.get_test_interface() @@ -127,7 +127,7 @@ def test_launch_cluster_orc_batch_multi(test_dir, wlmutils): exp_name = "test-launch-auto-cluster-orc-batch-multi" exp = Experiment(exp_name, launcher=launcher) - test_dir = test_dir + # batch = False to launch on existing allocation network_interface = wlmutils.get_test_interface() @@ -164,7 +164,7 @@ def test_launch_cluster_orc_reconnect(test_dir, wlmutils): launcher = wlmutils.get_test_launcher() exp_name = "test-launch-cluster-orc-batch-reconect" exp = Experiment(exp_name, launcher=launcher) - test_dir = test_dir + # batch = False to launch on existing allocation network_interface = wlmutils.get_test_interface() diff --git a/tests/full_wlm/test_mpmd.py b/tests/full_wlm/test_mpmd.py index 8b9c1f5b0..14401351b 100644 --- a/tests/full_wlm/test_mpmd.py +++ b/tests/full_wlm/test_mpmd.py @@ -77,7 +77,7 @@ def 
prune_commands(launcher): f"MPMD on {launcher} only supported for run commands {by_launcher[launcher]}" ) - test_dir = test_dir + for run_command in run_commands: script = fileutils.get_test_conf_path("sleep.py") settings = exp.create_run_settings( diff --git a/tests/on_wlm/test_base_settings_on_wlm.py b/tests/on_wlm/test_base_settings_on_wlm.py index 03491ac8a..f324153c5 100644 --- a/tests/on_wlm/test_base_settings_on_wlm.py +++ b/tests/on_wlm/test_base_settings_on_wlm.py @@ -43,7 +43,7 @@ def test_model_on_wlm(fileutils, test_dir, wlmutils): exp_name = "test-base-settings-model-launch" exp = Experiment(exp_name, launcher=wlmutils.get_test_launcher()) - test_dir = test_dir + script = fileutils.get_test_conf_path("sleep.py") settings1 = wlmutils.get_base_run_settings("python", f"{script} --time=5") @@ -61,7 +61,7 @@ def test_model_on_wlm(fileutils, test_dir, wlmutils): def test_model_stop_on_wlm(fileutils, test_dir, wlmutils): exp_name = "test-base-settings-model-stop" exp = Experiment(exp_name, launcher=wlmutils.get_test_launcher()) - test_dir = test_dir + script = fileutils.get_test_conf_path("sleep.py") settings1 = wlmutils.get_base_run_settings("python", f"{script} --time=5") diff --git a/tests/on_wlm/test_containers_wlm.py b/tests/on_wlm/test_containers_wlm.py index a221ab4c3..fc9bc10d1 100644 --- a/tests/on_wlm/test_containers_wlm.py +++ b/tests/on_wlm/test_containers_wlm.py @@ -55,7 +55,7 @@ def test_singularity_wlm_smartredis(fileutils, test_dir, wlmutils): f"Test only runs on systems with PBS or Slurm as WLM. Current launcher: {launcher}" ) - test_dir = test_dir + exp = Experiment( "smartredis_ensemble_exchange", exp_path=test_dir, launcher=launcher ) diff --git a/tests/on_wlm/test_generic_orc_launch.py b/tests/on_wlm/test_generic_orc_launch.py index d087d1f8e..b5e2e4394 100644 --- a/tests/on_wlm/test_generic_orc_launch.py +++ b/tests/on_wlm/test_generic_orc_launch.py @@ -39,7 +39,7 @@ def test_launch_orc_auto(test_dir, wlmutils): exp_name = "test-launch-auto-orc" exp = Experiment(exp_name, launcher=launcher) - test_dir = test_dir + # batch = False to launch on existing allocation network_interface = wlmutils.get_test_interface() @@ -72,7 +72,7 @@ def test_launch_cluster_orc_single(test_dir, wlmutils): exp_name = "test-launch-auto-cluster-orc-single" exp = Experiment(exp_name, launcher=launcher) - test_dir = test_dir + # batch = False to launch on existing allocation network_interface = wlmutils.get_test_interface() @@ -106,7 +106,7 @@ def test_launch_cluster_orc_multi(test_dir, wlmutils): exp_name = "test-launch-auto-cluster-orc-multi" exp = Experiment(exp_name, launcher=launcher) - test_dir = test_dir + # batch = False to launch on existing allocation network_interface = wlmutils.get_test_interface() diff --git a/tests/on_wlm/test_launch_errors.py b/tests/on_wlm/test_launch_errors.py index ef65ab3f6..bba0b6b00 100644 --- a/tests/on_wlm/test_launch_errors.py +++ b/tests/on_wlm/test_launch_errors.py @@ -41,7 +41,7 @@ def test_failed_status(fileutils, test_dir, wlmutils): exp_name = "test-report-failure" exp = Experiment(exp_name, launcher=wlmutils.get_test_launcher()) - test_dir = test_dir + script = fileutils.get_test_conf_path("bad.py") settings = exp.create_run_settings( @@ -70,7 +70,7 @@ def test_bad_run_command_args(fileutils, test_dir, wlmutils): exp_name = "test-bad-run-command-args" exp = Experiment(exp_name, launcher=launcher) - test_dir = test_dir + script = fileutils.get_test_conf_path("bad.py") diff --git a/tests/on_wlm/test_launch_ompi_lsf.py 
b/tests/on_wlm/test_launch_ompi_lsf.py index 7bb92102b..d0bafe2eb 100644 --- a/tests/on_wlm/test_launch_ompi_lsf.py +++ b/tests/on_wlm/test_launch_ompi_lsf.py @@ -40,7 +40,7 @@ def test_launch_openmpi_lsf(fileutils, test_dir, wlmutils): pytest.skip("Test only runs on systems with LSF as WLM") exp_name = "test-launch-openmpi-lsf" exp = Experiment(exp_name, launcher=launcher) - test_dir = test_dir + script = fileutils.get_test_conf_path("sleep.py") settings = exp.create_run_settings("python", script, "mpirun") diff --git a/tests/on_wlm/test_local_step.py b/tests/on_wlm/test_local_step.py index c3da53cb5..c15e97642 100644 --- a/tests/on_wlm/test_local_step.py +++ b/tests/on_wlm/test_local_step.py @@ -46,7 +46,7 @@ def test_local_env_pass_implicit(fileutils, test_dir) -> None: env_key = "test_local_env_pass_implicit" os.environ[env_key] = exp_value - test_dir = test_dir + exp_dir = f"{test_dir}/exp" os.makedirs(exp_dir) script = fileutils.get_test_conf_path("check_env.py") @@ -84,7 +84,7 @@ def test_local_env_pass_explicit(fileutils, test_dir) -> None: assert env_key not in os.environ - test_dir = test_dir + script = fileutils.get_test_conf_path("check_env.py") exp_dir = f"{test_dir}/exp" diff --git a/tests/on_wlm/test_restart.py b/tests/on_wlm/test_restart.py index 6dd3ba1c6..72c2d2311 100644 --- a/tests/on_wlm/test_restart.py +++ b/tests/on_wlm/test_restart.py @@ -39,7 +39,7 @@ def test_restart(fileutils, test_dir, wlmutils): exp_name = "test-restart" exp = Experiment(exp_name, launcher=wlmutils.get_test_launcher()) - test_dir = test_dir + script = fileutils.get_test_conf_path("sleep.py") settings = exp.create_run_settings("python", f"{script} --time=5") diff --git a/tests/on_wlm/test_simple_base_settings_on_wlm.py b/tests/on_wlm/test_simple_base_settings_on_wlm.py index 382e4deb0..bfcac0d5e 100644 --- a/tests/on_wlm/test_simple_base_settings_on_wlm.py +++ b/tests/on_wlm/test_simple_base_settings_on_wlm.py @@ -57,7 +57,7 @@ def test_simple_model_on_wlm(fileutils, test_dir, wlmutils): exp_name = "test-simplebase-settings-model-launch" exp = Experiment(exp_name, launcher=wlmutils.get_test_launcher()) - test_dir = test_dir + script = fileutils.get_test_conf_path("sleep.py") settings = RunSettings("python", exe_args=f"{script} --time=5") @@ -78,7 +78,7 @@ def test_simple_model_stop_on_wlm(fileutils, test_dir, wlmutils): exp_name = "test-simplebase-settings-model-stop" exp = Experiment(exp_name, launcher=wlmutils.get_test_launcher()) - test_dir = test_dir + script = fileutils.get_test_conf_path("sleep.py") settings = RunSettings("python", exe_args=f"{script} --time=5") diff --git a/tests/on_wlm/test_simple_entity_launch.py b/tests/on_wlm/test_simple_entity_launch.py index 00645a329..c6146d517 100644 --- a/tests/on_wlm/test_simple_entity_launch.py +++ b/tests/on_wlm/test_simple_entity_launch.py @@ -49,7 +49,7 @@ def test_models(fileutils, test_dir, wlmutils): exp_name = "test-models-launch" exp = Experiment(exp_name, launcher=wlmutils.get_test_launcher()) - test_dir = test_dir + script = fileutils.get_test_conf_path("sleep.py") settings = exp.create_run_settings("python", f"{script} --time=5") @@ -66,7 +66,7 @@ def test_models(fileutils, test_dir, wlmutils): def test_ensemble(fileutils, test_dir, wlmutils): exp_name = "test-ensemble-launch" exp = Experiment(exp_name, launcher=wlmutils.get_test_launcher()) - test_dir = test_dir + script = fileutils.get_test_conf_path("sleep.py") settings = exp.create_run_settings("python", f"{script} --time=5") @@ -85,7 +85,7 @@ def test_summary(fileutils, 
test_dir, wlmutils): exp_name = "test-launch-summary" exp = Experiment(exp_name, launcher=wlmutils.get_test_launcher()) - test_dir = test_dir + sleep = fileutils.get_test_conf_path("sleep.py") bad = fileutils.get_test_conf_path("bad.py") diff --git a/tests/on_wlm/test_stop.py b/tests/on_wlm/test_stop.py index 94169ec9d..36f8c5400 100644 --- a/tests/on_wlm/test_stop.py +++ b/tests/on_wlm/test_stop.py @@ -45,7 +45,7 @@ def test_stop_entity(fileutils, test_dir, wlmutils): exp_name = "test-launch-stop-model" exp = Experiment(exp_name, launcher=wlmutils.get_test_launcher()) - test_dir = test_dir + script = fileutils.get_test_conf_path("sleep.py") settings = exp.create_run_settings("python", f"{script} --time=10") @@ -63,7 +63,7 @@ def test_stop_entity_list(fileutils, test_dir, wlmutils): exp_name = "test-launch-stop-ensemble" exp = Experiment(exp_name, launcher=wlmutils.get_test_launcher()) - test_dir = test_dir + script = fileutils.get_test_conf_path("sleep.py") settings = exp.create_run_settings("python", f"{script} --time=10") diff --git a/tests/test_colo_model_local.py b/tests/test_colo_model_local.py index 39d7c2dbe..6ac6838d7 100644 --- a/tests/test_colo_model_local.py +++ b/tests/test_colo_model_local.py @@ -116,7 +116,7 @@ def test_launch_colocated_model_defaults( db_args = {} - test_dir = test_dir + exp = Experiment(f"colocated_model_defaults_{db_type}", test_dir, launcher=launcher) colo_model = coloutils.setup_test_colo( fileutils, diff --git a/tests/test_config.py b/tests/test_config.py index edd130171..1c4f918b1 100644 --- a/tests/test_config.py +++ b/tests/test_config.py @@ -84,7 +84,7 @@ def make_file(filepath: str) -> None: def test_redisai_invalid_rai_path(test_dir, monkeypatch): """An invalid RAI_PATH and valid SMARTSIM_DEP_INSTALL_PATH should fail""" - test_dir = test_dir + rai_file_path = os.path.join(test_dir, "lib", "mock-redisai.so") make_file(os.path.join(test_dir, "lib", "redisai.so")) env = get_redisai_env(rai_file_path, test_dir) @@ -101,7 +101,7 @@ def test_redisai_invalid_rai_path(test_dir, monkeypatch): def test_redisai_valid_rai_path(test_dir, monkeypatch): """A valid RAI_PATH should override valid SMARTSIM_DEP_INSTALL_PATH and succeed""" - test_dir = test_dir + rai_file_path = os.path.join(test_dir, "lib", "mock-redisai.so") make_file(rai_file_path) @@ -116,7 +116,7 @@ def test_redisai_valid_rai_path(test_dir, monkeypatch): def test_redisai_invalid_lib_path(test_dir, monkeypatch): """Invalid RAI_PATH and invalid SMARTSIM_DEP_INSTALL_PATH should fail""" - test_dir = test_dir + rai_file_path = f"{test_dir}/railib/redisai.so" env = get_redisai_env(rai_file_path, test_dir) @@ -132,7 +132,7 @@ def test_redisai_invalid_lib_path(test_dir, monkeypatch): def test_redisai_valid_lib_path(test_dir, monkeypatch): """Valid RAI_PATH and invalid SMARTSIM_DEP_INSTALL_PATH should succeed""" - test_dir = test_dir + rai_file_path = os.path.join(test_dir, "lib", "mock-redisai.so") make_file(rai_file_path) env = get_redisai_env(rai_file_path, test_dir) @@ -146,7 +146,7 @@ def test_redisai_valid_lib_path(test_dir, monkeypatch): def test_redisai_valid_lib_path_null_rai(test_dir, monkeypatch): """Missing RAI_PATH and valid SMARTSIM_DEP_INSTALL_PATH should succeed""" - test_dir = test_dir + rai_file_path: t.Optional[str] = None lib_file_path = os.path.join(test_dir, "lib", "redisai.so") make_file(lib_file_path) diff --git a/tests/test_containers.py b/tests/test_containers.py index e92620562..c3afa394d 100644 --- a/tests/test_containers.py +++ b/tests/test_containers.py @@ -89,7 +89,7 @@ 
def test_singularity_commands(fileutils): @pytest.mark.skipif(not singularity_exists, reason="Test needs singularity to run") def test_singularity_basic(fileutils, test_dir): """Basic argument-less Singularity test""" - test_dir = test_dir + container = Singularity(containerURI) @@ -115,7 +115,7 @@ def test_singularity_basic(fileutils, test_dir): @pytest.mark.skipif(not singularity_exists, reason="Test needs singularity to run") def test_singularity_args(fileutils, test_dir): """Test combinations of args and mount arguments for Singularity""" - test_dir = test_dir + hometest_dir = os.path.join(str(Path.home()), "test") # $HOME/test mount_paths = {test_dir + "/singularity_args": hometest_dir} container = Singularity(containerURI, args="--contain", mount=mount_paths) @@ -148,7 +148,7 @@ def test_singularity_smartredis(test_dir, fileutils, wlmutils): Note: This is a containerized port of test_smartredis.py """ - test_dir = test_dir + exp = Experiment( "smartredis_ensemble_exchange", exp_path=test_dir, launcher="local" ) diff --git a/tests/test_dbnode.py b/tests/test_dbnode.py index 43c8f24b5..092ebf9fe 100644 --- a/tests/test_dbnode.py +++ b/tests/test_dbnode.py @@ -49,7 +49,7 @@ def test_parse_db_host_error(): def test_hosts(test_dir, wlmutils): exp_name = "test_hosts" exp = Experiment(exp_name) - test_dir = test_dir + orc = Orchestrator(port=wlmutils.get_test_port(), interface="lo", launcher="local") orc.set_path(test_dir) diff --git a/tests/test_experiment.py b/tests/test_experiment.py index a77b6316e..7a3e2a764 100644 --- a/tests/test_experiment.py +++ b/tests/test_experiment.py @@ -35,7 +35,7 @@ def test_model_prefix(test_dir): exp_name = "test_prefix" exp = Experiment(exp_name) - test_dir = test_dir + model = exp.create_model( "model", path=test_dir, @@ -108,7 +108,7 @@ def test_bad_ensemble_init_no_rs_bs(): def test_stop_entity(test_dir): exp_name = "test_stop_entity" exp = Experiment(exp_name) - test_dir = test_dir + m = exp.create_model("model", path=test_dir, run_settings=RunSettings("sleep", "5")) exp.start(m, block=False) assert exp.finished(m) == False @@ -120,7 +120,7 @@ def test_poll(test_dir): # Ensure that a SmartSimError is not raised exp_name = "test_exp_poll" exp = Experiment(exp_name) - test_dir = test_dir + model = exp.create_model( "model", path=test_dir, run_settings=RunSettings("sleep", "5") ) @@ -132,7 +132,7 @@ def test_poll(test_dir): def test_summary(test_dir): exp_name = "test_exp_summary" exp = Experiment(exp_name) - test_dir = test_dir + m = exp.create_model( "model", path=test_dir, run_settings=RunSettings("echo", "Hello") ) diff --git a/tests/test_generator.py b/tests/test_generator.py index 16b75cca0..586c9a96d 100644 --- a/tests/test_generator.py +++ b/tests/test_generator.py @@ -55,7 +55,7 @@ def get_gen_file(fileutils, filename): def test_ensemble(fileutils, test_dir): exp = Experiment("gen-test", launcher="local") - test_dir = test_dir + gen = Generator(test_dir) params = {"THERMO": [10, 20, 30], "STEPS": [10, 20, 30]} ensemble = exp.create_ensemble("test", params=params, run_settings=rs) @@ -72,7 +72,7 @@ def test_ensemble(fileutils, test_dir): def test_ensemble_overwrite(fileutils, test_dir): exp = Experiment("gen-test-overwrite", launcher="local") - test_dir = test_dir + gen = Generator(test_dir, overwrite=True) params = {"THERMO": [10, 20, 30], "STEPS": [10, 20, 30]} @@ -95,7 +95,7 @@ def test_ensemble_overwrite(fileutils, test_dir): def test_ensemble_overwrite_error(fileutils, test_dir): exp = Experiment("gen-test-overwrite-error", launcher="local") 
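    # Generator(test_dir) leaves overwrite at its default of False, so a
    # second generation into the same directory is expected to fail; a
    # sketch of the failure path this test exercises (the exact exception
    # type is assumed to be FileExistsError):
    #
    #     gen.generate_experiment(ensemble)  # first call succeeds
    #     gen.generate_experiment(ensemble)  # second call raises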
- test_dir = test_dir + gen = Generator(test_dir) params = {"THERMO": [10, 20, 30], "STEPS": [10, 20, 30]} @@ -113,7 +113,7 @@ def test_ensemble_overwrite_error(fileutils, test_dir): def test_full_exp(fileutils, test_dir, wlmutils): - test_dir = test_dir + exp = Experiment("gen-test", test_dir, launcher="local") model = exp.create_model("model", run_settings=rs) @@ -146,7 +146,7 @@ def test_dir_files(fileutils, test_dir): are directories with subdirectories and files """ - test_dir = test_dir + exp = Experiment("gen-test", test_dir, launcher="local") params = {"THERMO": [10, 20, 30], "STEPS": [10, 20, 30]} @@ -167,7 +167,7 @@ def test_dir_files(fileutils, test_dir): def test_print_files(fileutils, test_dir, capsys): """Test the stdout print of files attached to an ensemble""" - test_dir = test_dir + exp = Experiment("print-attached-files-test", test_dir, launcher="local") ensemble = exp.create_ensemble("dir_test", replicas=1, run_settings=rs) @@ -247,7 +247,7 @@ def test_print_files(fileutils, test_dir, capsys): def test_multiple_tags(fileutils, test_dir): """Test substitution of multiple tagged parameters on same line""" - test_dir = test_dir + exp = Experiment("test-multiple-tags", test_dir) model_params = {"port": 6379, "password": "unbreakable_password"} @@ -270,7 +270,7 @@ def test_multiple_tags(fileutils, test_dir): def test_generation_log(fileutils, test_dir): """Test that an error is issued when a tag is unused and make_fatal is True""" - test_dir = test_dir + exp = Experiment("gen-log-test", test_dir, launcher="local") params = {"THERMO": [10, 20], "STEPS": [10, 20]} @@ -307,7 +307,7 @@ def test_config_dir(fileutils, test_dir): tagged files that are directories with subdirectories and files """ exp = Experiment("config-dir", launcher="local") - test_dir = test_dir + gen = Generator(test_dir) params = {"PARAM0": [0, 1], "PARAM1": [2, 3]} diff --git a/tests/test_interrupt.py b/tests/test_interrupt.py index 0b6cf0a47..7e18fa98f 100644 --- a/tests/test_interrupt.py +++ b/tests/test_interrupt.py @@ -46,7 +46,7 @@ def test_interrupt_blocked_jobs(test_dir): Once polling starts, the SIGINT signal is sent to the main thread, and consequently, all running jobs are killed. """ - test_dir = test_dir + exp_name = "test_interrupt_blocked_jobs" exp = Experiment(exp_name, exp_path=test_dir) model = exp.create_model( @@ -85,7 +85,7 @@ def test_interrupt_multi_experiment_unblocked_jobs(test_dir): the SIGINT signal is sent, resulting in both Experiment's running jobs to be killed. 
""" - test_dir = test_dir + exp_names = ["test_interrupt_jobs_0", "test_interrupt_jobs_1"] experiments = [Experiment(exp_names[i], exp_path=test_dir) for i in range(2)] jobs_per_experiment = [0] * len(experiments) diff --git a/tests/test_launch_errors.py b/tests/test_launch_errors.py index bc428106c..cdff2b068 100644 --- a/tests/test_launch_errors.py +++ b/tests/test_launch_errors.py @@ -46,7 +46,7 @@ def test_unsupported_run_settings(): def test_model_failure(fileutils, test_dir): exp_name = "test-model-failure" exp = Experiment(exp_name, launcher="local") - test_dir = test_dir + script = fileutils.get_test_conf_path("bad.py") settings = RunSettings("python", f"{script} --time=3") @@ -62,7 +62,7 @@ def test_orchestrator_relaunch(test_dir, wlmutils): """Test when users try to launch second orchestrator""" exp_name = "test-orc-on-relaunch" exp = Experiment(exp_name, launcher="local") - test_dir = test_dir + orc = Orchestrator(port=wlmutils.get_test_port()) orc.set_path(test_dir) diff --git a/tests/test_local_launch.py b/tests/test_local_launch.py index 4c2a0c76d..7c78f20ae 100644 --- a/tests/test_local_launch.py +++ b/tests/test_local_launch.py @@ -35,7 +35,7 @@ def test_models(fileutils, test_dir): exp_name = "test-models-local-launch" exp = Experiment(exp_name, launcher="local") - test_dir = test_dir + script = fileutils.get_test_conf_path("sleep.py") settings = exp.create_run_settings("python", f"{script} --time=3") @@ -51,7 +51,7 @@ def test_models(fileutils, test_dir): def test_ensemble(fileutils, test_dir): exp_name = "test-ensemble-launch" exp = Experiment(exp_name, launcher="local") - test_dir = test_dir + script = fileutils.get_test_conf_path("sleep.py") settings = exp.create_run_settings("python", f"{script} --time=3") diff --git a/tests/test_local_multi_run.py b/tests/test_local_multi_run.py index 94e60e2c8..1a4d15c37 100644 --- a/tests/test_local_multi_run.py +++ b/tests/test_local_multi_run.py @@ -35,7 +35,7 @@ def test_models(fileutils, test_dir): exp_name = "test-models-local-launch" exp = Experiment(exp_name, launcher="local") - test_dir = test_dir + script = fileutils.get_test_conf_path("sleep.py") settings = exp.create_run_settings("python", f"{script} --time=5") diff --git a/tests/test_local_restart.py b/tests/test_local_restart.py index 7cb9c015a..518cbdc52 100644 --- a/tests/test_local_restart.py +++ b/tests/test_local_restart.py @@ -36,7 +36,7 @@ def test_restart(fileutils, test_dir): exp_name = "test-models-local-restart" exp = Experiment(exp_name, launcher="local") - test_dir = test_dir + script = fileutils.get_test_conf_path("sleep.py") settings = exp.create_run_settings("python", f"{script} --time=3") @@ -56,7 +56,7 @@ def test_restart(fileutils, test_dir): def test_ensemble(fileutils, test_dir): exp_name = "test-ensemble-restart" exp = Experiment(exp_name, launcher="local") - test_dir = test_dir + script = fileutils.get_test_conf_path("sleep.py") settings = exp.create_run_settings("python", f"{script} --time=3") diff --git a/tests/test_modelwriter.py b/tests/test_modelwriter.py index ccd6d9c4e..8413eaba5 100644 --- a/tests/test_modelwriter.py +++ b/tests/test_modelwriter.py @@ -41,7 +41,7 @@ def get_gen_file(fileutils, filename): return fileutils.get_test_conf_path(path.join("generator_files", filename)) def test_write_easy_configs(fileutils, test_dir): - test_dir = test_dir + param_dict = { "5": 10, # MOM_input @@ -70,7 +70,7 @@ def test_write_easy_configs(fileutils, test_dir): def test_write_med_configs(fileutils, test_dir): - test_dir = test_dir + param_dict = 
{ "1 0 0 0": "3 0 0 0", # in.ellipse.gayberne @@ -104,7 +104,7 @@ def test_write_med_configs(fileutils, test_dir): def test_write_new_tag_configs(fileutils, test_dir): """sets the tag to the dollar sign""" - test_dir = test_dir + param_dict = { "1 0 0 0": "3 0 0 0", # in.ellipse.gayberne @@ -147,7 +147,7 @@ def test_mw_error_2(): def test_write_mw_error_3(fileutils, test_dir): - test_dir = test_dir + param_dict = { "5": 10, # MOM_input diff --git a/tests/test_multidb.py b/tests/test_multidb.py index a9da538be..d21859b1c 100644 --- a/tests/test_multidb.py +++ b/tests/test_multidb.py @@ -74,7 +74,7 @@ def test_db_identifier_standard_then_colo_error( test_launcher = wlmutils.get_test_launcher() test_interface = wlmutils.get_test_interface() test_port = wlmutils.get_test_port() - test_dir = test_dir + test_script = fileutils.get_test_conf_path("smartredis/db_id_err.py") # Create SmartSim Experiment @@ -131,7 +131,7 @@ def test_db_identifier_colo_then_standard(fileutils, wlmutils, coloutils, db_typ test_launcher = wlmutils.get_test_launcher() test_interface = wlmutils.get_test_interface() test_port = wlmutils.get_test_port() - test_dir = test_dir + test_script = fileutils.get_test_conf_path("smartredis/dbid.py") # Create SmartSim Experiment @@ -182,7 +182,7 @@ def test_db_identifier_standard_twice_not_unique(wlmutils, test_dir): test_launcher = wlmutils.get_test_launcher() test_interface = wlmutils.get_test_interface() test_port = wlmutils.get_test_port() - test_dir = test_dir + # Create SmartSim Experiment exp = Experiment(exp_name, launcher=test_launcher, exp_path=test_dir) @@ -223,7 +223,7 @@ def test_db_identifier_create_standard_once(test_dir, wlmutils): test_launcher = wlmutils.get_test_launcher() test_interface = wlmutils.get_test_interface() test_port = wlmutils.get_test_port() - test_dir = test_dir + # Create the SmartSim Experiment exp = Experiment(exp_name, exp_path=test_dir, launcher=test_launcher) @@ -248,7 +248,7 @@ def test_multidb_create_standard_twice(wlmutils, test_dir): test_launcher = wlmutils.get_test_launcher() test_interface = wlmutils.get_test_interface() test_port = wlmutils.get_test_port() - test_dir = test_dir + # start a new Experiment for this section exp = Experiment( @@ -281,7 +281,7 @@ def test_multidb_colo_once(fileutils, test_dir, wlmutils, coloutils, db_type): # Retrieve parameters from testing environment test_launcher = wlmutils.get_test_launcher() test_port = wlmutils.get_test_port() - test_dir = test_dir + test_script = fileutils.get_test_conf_path("smartredis/dbid.py") # start a new Experiment for this section @@ -316,7 +316,7 @@ def test_multidb_standard_then_colo(fileutils, test_dir, wlmutils, coloutils, db # Retrieve parameters from testing environment test_port = wlmutils.get_test_port() - test_dir = test_dir + test_script = fileutils.get_test_conf_path("smartredis/multidbid.py") test_interface = wlmutils.get_test_interface() test_launcher = wlmutils.get_test_launcher() @@ -362,7 +362,7 @@ def test_multidb_colo_then_standard(fileutils, test_dir, wlmutils, coloutils, db # Retrieve parameters from testing environment test_port = wlmutils.get_test_port() - test_dir = test_dir + test_script = fileutils.get_test_conf_path("smartredis/multidbid.py") test_interface = wlmutils.get_test_interface() test_launcher = wlmutils.get_test_launcher() @@ -416,7 +416,7 @@ def test_launch_cluster_orc_single_dbid(test_dir, coloutils, fileutils, wlmutils launcher = wlmutils.get_test_launcher() test_port = wlmutils.get_test_port() test_script = 
fileutils.get_test_conf_path("smartredis/multidbid.py") - test_dir = test_dir + exp = Experiment(exp_name, launcher=launcher, exp_path=test_dir) # batch = False to launch on existing allocation diff --git a/tests/test_orchestrator.py b/tests/test_orchestrator.py index bc490f1e3..84a027cda 100644 --- a/tests/test_orchestrator.py +++ b/tests/test_orchestrator.py @@ -68,7 +68,7 @@ def test_inactive_orc_get_address(): def test_orc_active_functions(test_dir, wlmutils): exp_name = "test_orc_active_functions" exp = Experiment(exp_name, launcher="local") - test_dir = test_dir + db = Orchestrator(port=wlmutils.get_test_port()) db.set_path(test_dir) @@ -96,7 +96,7 @@ def test_orc_active_functions(test_dir, wlmutils): def test_multiple_interfaces(test_dir, wlmutils): exp_name = "test_multiple_interfaces" exp = Experiment(exp_name, launcher="local") - test_dir = test_dir + net_if_addrs = psutil.net_if_addrs() net_if_addrs = [ diff --git a/tests/test_reconnect_orchestrator.py b/tests/test_reconnect_orchestrator.py index a378ad945..b0881924b 100644 --- a/tests/test_reconnect_orchestrator.py +++ b/tests/test_reconnect_orchestrator.py @@ -42,7 +42,7 @@ def test_local_orchestrator(test_dir, wlmutils): global first_dir exp_name = "test-orc-launch-local" exp = Experiment(exp_name, launcher="local") - test_dir = test_dir + first_dir = test_dir orc = Orchestrator(port=wlmutils.get_test_port()) diff --git a/tests/test_smartredis.py b/tests/test_smartredis.py index 41cb55870..a9fa38177 100644 --- a/tests/test_smartredis.py +++ b/tests/test_smartredis.py @@ -61,7 +61,7 @@ def test_exchange(fileutils, test_dir, wlmutils): Finally, the tensor is used to run a model. """ - test_dir = test_dir + exp = Experiment( "smartredis_ensemble_exchange", exp_path=test_dir, launcher="local" ) @@ -107,7 +107,7 @@ def test_consumer(fileutils, test_dir, wlmutils): Finally, the tensor is used to run a model by each producer and the consumer accesses the two results. 
""" - test_dir = test_dir + exp = Experiment( "smartredis_ensemble_consumer", exp_path=test_dir, launcher="local" ) From 8524819c485d3c36e4a457719065e6f73f6311f6 Mon Sep 17 00:00:00 2001 From: Andrew Shao Date: Thu, 2 Nov 2023 13:38:54 -0500 Subject: [PATCH 35/64] Fix typehinting --- conftest.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/conftest.py b/conftest.py index e545b8a67..1db40b5af 100644 --- a/conftest.py +++ b/conftest.py @@ -626,7 +626,7 @@ def setup_test_colo( exp: Experiment, application_file: str, db_args: t.Dict[str, t.Any], - colo_settings: t.Optional[t.Dict[str, t.Any]] = None, + colo_settings: t.Optional[RunSettings] = None, colo_model_name: t.Optional[str] = "colocated_model", port: t.Optional[int] = test_port, on_wlm: t.Optional[bool] = False, From be38db2e85843b1c3f4f605092b7ce8fdf553fb8 Mon Sep 17 00:00:00 2001 From: Andrew Shao Date: Thu, 2 Nov 2023 19:36:03 -0500 Subject: [PATCH 36/64] Address feedback from @drozt --- smartsim/database/orchestrator.py | 2 +- smartsim/experiment.py | 2 +- smartsim/settings/base.py | 21 +++++++++++++++++++++ smartsim/settings/pbsSettings.py | 10 +++++++++- tests/test_batch_settings.py | 8 ++++++++ tests/test_multidb.py | 31 ++++++++++++++++--------------- 6 files changed, 56 insertions(+), 18 deletions(-) diff --git a/smartsim/database/orchestrator.py b/smartsim/database/orchestrator.py index b2a5e4f4b..cc850ceba 100644 --- a/smartsim/database/orchestrator.py +++ b/smartsim/database/orchestrator.py @@ -156,7 +156,7 @@ def __init__( run_command: str = "auto", db_nodes: int = 1, batch: bool = False, - hosts: t.Optional[t.List[str]] = None, + hosts: t.Optional[t.Union[t.List[str], str]] = None, account: t.Optional[str] = None, time: t.Optional[str] = None, alloc: t.Optional[str] = None, diff --git a/smartsim/experiment.py b/smartsim/experiment.py index 9c616d83c..ebc8f4a85 100644 --- a/smartsim/experiment.py +++ b/smartsim/experiment.py @@ -696,7 +696,7 @@ def create_database( port: int = 6379, db_nodes: int = 1, batch: bool = False, - hosts: t.Optional[t.List[str]] = None, + hosts: t.Optional[t.Union[t.List[str], str]] = None, run_command: str = "auto", interface: str = "ipogif0", account: t.Optional[str] = None, diff --git a/smartsim/settings/base.py b/smartsim/settings/base.py index b1e57ad37..5fcf5cae2 100644 --- a/smartsim/settings/base.py +++ b/smartsim/settings/base.py @@ -586,11 +586,32 @@ def __init__( self._batch_cmd = batch_cmd self.batch_args = batch_args or {} self._preamble: t.List[str] = [] + self._nodes: t.Optional[int] = None self.set_nodes(kwargs.get("nodes", None)) self.set_walltime(kwargs.get("time", None)) self.set_queue(kwargs.get("queue", None)) self.set_account(kwargs.get("account", None)) + @property + def nodes(self) -> t.Optional[int]: + return self._nodes + + + @nodes.setter + def nodes(self, num_nodes: t.Optional[t.Union[int, str]]) -> None: + if num_nodes: + if isinstance(num_nodes, int): + self._nodes = num_nodes + elif isinstance(num_nodes, str): + self._nodes = int(num_nodes) + else: + raise TypeError( + "Nodes must be an int or a string interpretable as an int" + ) + else: + self._nodes = None + + @property def batch_cmd(self) -> str: """Return the batch command diff --git a/smartsim/settings/pbsSettings.py b/smartsim/settings/pbsSettings.py index 19b882f5d..5f0181ef2 100644 --- a/smartsim/settings/pbsSettings.py +++ b/smartsim/settings/pbsSettings.py @@ -73,6 +73,13 @@ def __init__( self._nodes: t.Optional[int] = None self._ncpus = ncpus + if resources and "nodes" in resources and 
nodes is not None:
+            if nodes != resources["nodes"]:
+                raise ValueError(
+                    "nodes was specified as a kwarg and also in the resources "
+                    f"but are not the same value: {nodes=} {resources['nodes']=}"
+                )
+
         # time, queue, nodes, and account set in parent class init
         super().__init__(
             "qsub",
@@ -86,6 +93,7 @@ def __init__(
         self.resources = init_default({}, resources, dict)
         self._hosts: t.List[str] = []
 
+
     def set_nodes(self, num_nodes: int) -> None:
         """Set the number of nodes for this batch job
 
@@ -96,7 +104,7 @@ def set_nodes(self, num_nodes: int) -> None:
         :type num_nodes: int
         """
         if num_nodes:
-            self._nodes = int(num_nodes)
+            self._nodes = num_nodes
 
     def set_hostlist(self, host_list: t.Union[str, t.List[str]]) -> None:
         """Specify the hostlist for this job
diff --git a/tests/test_batch_settings.py b/tests/test_batch_settings.py
index 805edcb95..689792649 100644
--- a/tests/test_batch_settings.py
+++ b/tests/test_batch_settings.py
@@ -191,3 +191,11 @@ def test_preamble():
     bsub.add_preamble(["first line", "last line"])
 
     assert len(list(bsub.preamble)) == 4
+
+def test_qsub_batch_nodes():
+    """
+    Test specifying nodes as a kwarg and in resources
+    """
+    with pytest.raises(ValueError):
+        QsubBatchSettings(nodes=1, resources={"nodes":2})
+    QsubBatchSettings(nodes=1, resources={"nodes":1})
diff --git a/tests/test_multidb.py b/tests/test_multidb.py
index d21859b1c..2454da361 100644
--- a/tests/test_multidb.py
+++ b/tests/test_multidb.py
@@ -54,8 +54,9 @@ def make_entity_context(exp: Experiment, entity: SmartSimEntity):
 
 def choose_host(wlmutils, index=0):
     hosts = wlmutils.get_test_hostlist()
     if hosts:
-        hosts = hosts[index]
-    return hosts
+        return hosts[index]
+    else:
+        return None
 
 def check_not_failed(exp, *args):
     statuses = exp.get_status(*args)
@@ -105,8 +106,8 @@ def test_db_identifier_standard_then_colo_error(
 
     assert smartsim_model.run_settings.colocated_db_settings["db_identifier"] == "testdb_colo"
 
-    with make_entity_context(exp, orc) as orc, \
-        make_entity_context(exp, smartsim_model) as smartsim_model:
+    with make_entity_context(exp, orc), \
+        make_entity_context(exp, smartsim_model):
         exp.start(orc)
         with pytest.raises(SSDBIDConflictError) as ex:
             exp.start(smartsim_model)
@@ -163,8 +164,8 @@ def test_db_identifier_colo_then_standard(fileutils, wlmutils, coloutils, db_typ
 
     assert orc.name == "testdb_colo"
 
-    with make_entity_context(exp, orc) as orc, \
-        make_entity_context(exp, smartsim_model) as smartsim_model:
+    with make_entity_context(exp, orc), \
+        make_entity_context(exp, smartsim_model):
         exp.start(smartsim_model, block=True)
         exp.start(orc)
 
@@ -203,7 +204,7 @@ def test_db_identifier_standard_twice_not_unique(wlmutils, test_dir):
     assert orc2.name == "my_db"
 
     # CREATE DATABASE with db_identifier
-    with make_entity_context(exp, orc) as orc, make_entity_context(exp, orc2):
+    with make_entity_context(exp, orc), make_entity_context(exp, orc2):
         exp.start(orc)
         with pytest.raises(SSDBIDConflictError) as ex:
             exp.start(orc2)
@@ -268,10 +269,10 @@ def test_multidb_create_standard_twice(wlmutils, test_dir):
     )
 
     # launch
-    with make_entity_context(exp, db) as db, make_entity_context(exp, db2) as db2:
+    with make_entity_context(exp, db), make_entity_context(exp, db2):
         exp.start(db, db2)
 
-    with make_entity_context(exp, db) as db, make_entity_context(exp, db2) as db2:
+    with make_entity_context(exp, db), make_entity_context(exp, db2):
         exp.start(db, db2)
 
 @pytest.mark.parametrize("db_type", supported_dbs)
@@ -348,8 +349,8 @@ def test_multidb_standard_then_colo(fileutils, test_dir, wlmutils, coloutils, db
         on_wlm 
= on_wlm, ) - with make_entity_context(exp, db) as db, \ - make_entity_context(exp, smartsim_model) as smartsim_model: + with make_entity_context(exp, db), \ + make_entity_context(exp, smartsim_model): exp.start(db) exp.start(smartsim_model, block=True) @@ -395,8 +396,8 @@ def test_multidb_colo_then_standard(fileutils, test_dir, wlmutils, coloutils, db hosts=choose_host(wlmutils), ) - with make_entity_context(exp, db) as db, \ - make_entity_context(exp, smartsim_model) as smartsim_model: + with make_entity_context(exp, db), \ + make_entity_context(exp, smartsim_model): exp.start(db) exp.start(smartsim_model, block=True) @@ -448,8 +449,8 @@ def test_launch_cluster_orc_single_dbid(test_dir, coloutils, fileutils, wlmutils on_wlm = on_wlm ) - with make_entity_context(exp, orc) as orc, \ - make_entity_context(exp, smartsim_model) as smartsim_model: + with make_entity_context(exp, orc), \ + make_entity_context(exp, smartsim_model): exp.start(orc, block=True) exp.start(smartsim_model, block=True) job_dict = exp._control._jobs.get_db_host_addresses() From 99f47f81c5e81e9244d3050c561396f4dd356f60 Mon Sep 17 00:00:00 2001 From: Al Rigazzi Date: Fri, 3 Nov 2023 15:08:44 +0100 Subject: [PATCH 37/64] Separate db name and db id --- smartsim/_core/control/controller.py | 4 ++-- smartsim/_core/utils/helpers.py | 24 ++++++++++++------------ smartsim/database/orchestrator.py | 4 ++-- 3 files changed, 16 insertions(+), 16 deletions(-) diff --git a/smartsim/_core/control/controller.py b/smartsim/_core/control/controller.py index d20b16c59..66bb86044 100644 --- a/smartsim/_core/control/controller.py +++ b/smartsim/_core/control/controller.py @@ -341,9 +341,9 @@ def _launch(self, manifest: Manifest) -> None: for orchestrator in manifest.dbs: for key in self._jobs.get_db_host_addresses(): _, db_id = unpack_db_identifier(key, "_") - if orchestrator.name == db_id: + if orchestrator.db_identifier == db_id: raise SSDBIDConflictError( - f"Database identifier {orchestrator.name}" + f"Database identifier {orchestrator.db_identifier}" " has already been used. Pass in a unique" " name for db_identifier" ) diff --git a/smartsim/_core/utils/helpers.py b/smartsim/_core/utils/helpers.py index 5d6b6d769..e69cbdcce 100644 --- a/smartsim/_core/utils/helpers.py +++ b/smartsim/_core/utils/helpers.py @@ -38,28 +38,28 @@ def unpack_db_identifier(db_id: str, token: str) -> t.Tuple[str, str]: - """Unpack the unformatted database identifier using the token, - and format for env variable suffix - :db_id: the unformatted database identifier eg. identifier_1_0 - :token: character '_' or '-' to use to unpack the database identifier - :return: db suffix, and formatted db_id eg. _identifier_1, identifier_1 + """Unpack the unformatted database identifier + and format for env variable suffix using the token + :param db_id: the unformatted database identifier eg. identifier_1 + :type db_id: str + :param token: character to use to construct the db suffix + :type token: str + :return: db id suffix and formatted db_id e.g. 
("_identifier_1", "identifier_1") + :rtype: (str, str) """ if db_id == "orchestrator": return "", "" - db_id = "_".join(db_id.split(token)[:-1]) - # if unpacked db_id is default, return empty - if db_id == "orchestrator": - # if db_id is default after unpack, return empty - return "", "" - db_name_suffix = "_" + db_id + db_name_suffix = token + db_id return db_name_suffix, db_id def unpack_colo_db_identifier(db_id: str) -> str: """Create database identifier suffix for colocated database - :db_id: the unformatted database identifier + :param db_id: the unformatted database identifier + :type db_id: str :return: db suffix + :rtype: str """ return "_" + db_id if db_id else "" diff --git a/smartsim/database/orchestrator.py b/smartsim/database/orchestrator.py index cc850ceba..f650e7889 100644 --- a/smartsim/database/orchestrator.py +++ b/smartsim/database/orchestrator.py @@ -785,7 +785,7 @@ def _initialize_entities( run_settings, [port], [db_node_name + ".out"], - db_node_name, + self.db_identifier, ) self.entities.append(node) @@ -826,7 +826,7 @@ def _initialize_entities_mpmd( run_settings, [port], output_files, - db_identifier=self.name + "_0", + db_identifier=self.db_identifier, ) self.entities.append(node) self.ports = [port] From a9b3fff14b19012cf18f916a37ca6cc3923e9dd1 Mon Sep 17 00:00:00 2001 From: Ale Rigazzi Date: Fri, 3 Nov 2023 12:17:16 -0500 Subject: [PATCH 38/64] Add test for db ids and names --- smartsim/database/orchestrator.py | 4 ++-- tests/test_dbnode.py | 8 ++++++++ 2 files changed, 10 insertions(+), 2 deletions(-) diff --git a/smartsim/database/orchestrator.py b/smartsim/database/orchestrator.py index f650e7889..ca70800d1 100644 --- a/smartsim/database/orchestrator.py +++ b/smartsim/database/orchestrator.py @@ -817,11 +817,11 @@ def _initialize_entities_mpmd( run_settings = self._build_run_settings( sys.executable, exe_args_mpmd, db_nodes=db_nodes, port=port, **kwargs ) - output_files = [self.name + ".out"] + output_files = [self.name + "_0.out"] if not run_settings: raise ValueError(f"Could not build run settings for {self.launcher}") node = DBNode( - self.name, + self.name + "_0", self.path, run_settings, [port], diff --git a/tests/test_dbnode.py b/tests/test_dbnode.py index 092ebf9fe..e78021643 100644 --- a/tests/test_dbnode.py +++ b/tests/test_dbnode.py @@ -123,3 +123,11 @@ def test_set_host(): orc = Orchestrator() orc.entities[0].set_hosts(["host"]) assert orc.entities[0].host == "host" + + +@pytest.mark.parametrize("nodes, mpmd", [[3, False], [3,True], [1, False]]) +def test_db_id_and_name(mpmd, nodes): + orc = Orchestrator(db_identifier="test_db", db_nodes=nodes, single_cmd=mpmd, launcher="slurm") + for i, node in enumerate(orc.entities): + assert node.name == f"{orc.name}_{i}" + assert node.db_identifier == orc.db_identifier From f664477d332906c7333c84001ec88c7ecb0fa8b2 Mon Sep 17 00:00:00 2001 From: Ale Rigazzi Date: Fri, 3 Nov 2023 13:50:15 -0500 Subject: [PATCH 39/64] Fix db node test for local --- tests/test_dbnode.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/tests/test_dbnode.py b/tests/test_dbnode.py index e78021643..5891d9ae7 100644 --- a/tests/test_dbnode.py +++ b/tests/test_dbnode.py @@ -37,7 +37,6 @@ from smartsim.entity.dbnode import DBNode, LaunchedShardData from smartsim.error.errors import SmartSimError - def test_parse_db_host_error(): orc = Orchestrator() orc.entities[0].path = "not/a/path" @@ -126,8 +125,10 @@ def test_set_host(): @pytest.mark.parametrize("nodes, mpmd", [[3, False], [3,True], [1, False]]) -def 
test_db_id_and_name(mpmd, nodes, wlmutils):
-    orc = Orchestrator(db_identifier="test_db", db_nodes=nodes, single_cmd=mpmd, launcher="slurm")
+    if nodes > 1 and wlmutils.get_test_launcher() not in pytest.wlm_options:
+        pytest.skip(reason="Clustered DB can only be checked on WLMs")
+    orc = Orchestrator(db_identifier="test_db", db_nodes=nodes, single_cmd=mpmd, launcher=wlmutils.get_test_launcher())
     for i, node in enumerate(orc.entities):
         assert node.name == f"{orc.name}_{i}"
         assert node.db_identifier == orc.db_identifier

From 1b29f120f43394ba332f89a07910336ab2b52a39 Mon Sep 17 00:00:00 2001
From: Al Rigazzi
Date: Fri, 10 Nov 2023 00:18:31 +0100
Subject: [PATCH 40/64] Address reviewer's comments

---
 smartsim/_core/control/jobmanager.py |  4 +++-
 smartsim/database/orchestrator.py    | 30 +++++++++++++---------------
 2 files changed, 17 insertions(+), 17 deletions(-)

diff --git a/smartsim/_core/control/jobmanager.py b/smartsim/_core/control/jobmanager.py
index efc43be85..46e3d09bb 100644
--- a/smartsim/_core/control/jobmanager.py
+++ b/smartsim/_core/control/jobmanager.py
@@ -324,7 +324,9 @@ def get_db_host_addresses(self) -> t.Dict[str, t.List[str]]:
                     ip_addr = get_ip_from_host(combine[0])
                     addresses.append(":".join((ip_addr, str(combine[1]))))
 
-            address_dict.update({db_entity.db_identifier: addresses})
+            dict_entry: t.List[str] = address_dict.get(db_entity.db_identifier, [])
+            dict_entry.extend(addresses)
+            address_dict[db_entity.db_identifier] = dict_entry
 
         return address_dict
 
diff --git a/smartsim/database/orchestrator.py b/smartsim/database/orchestrator.py
index ca70800d1..c721d9d16 100644
--- a/smartsim/database/orchestrator.py
+++ b/smartsim/database/orchestrator.py
@@ -60,7 +60,6 @@
 
 logger = get_logger(__name__)
 
-
 by_launcher: t.Dict[str, t.List[str]] = {
     "slurm": ["srun", "mpirun", "mpiexec"],
     "pbs": ["aprun", "mpirun", "mpiexec"],
@@ -70,7 +69,6 @@
     "local": [""],
 }
 
-
 def _detect_command(launcher: str) -> str:
     if launcher in by_launcher:
         for cmd in by_launcher[launcher]:
@@ -85,7 +83,6 @@ def _detect_command(launcher: str) -> str:
     )
     raise SmartSimError(msg)
 
-
 def _autodetect(launcher: str, run_command: str) -> t.Tuple[str, str]:
     """Automatically detect the launcher and run command to use"""
     if launcher == "auto":
@@ -96,7 +93,6 @@ def _autodetect(launcher: str, run_command: str) -> t.Tuple[str, str]:
 
     return launcher, run_command
 
-
 def _check_run_command(launcher: str, run_command: str) -> None:
     """Check that the run command is supported by the launcher"""
     if run_command not in by_launcher[launcher]:
@@ -107,7 +103,6 @@ def _check_run_command(launcher: str, run_command: str) -> None:
         )
         raise SmartSimError(msg)
 
-
 def _get_single_command(run_command: str, batch: bool, single_cmd: bool) -> bool:
     if not single_cmd:
         return single_cmd
@@ -133,7 +128,6 @@ def _get_single_command(run_command: str, batch: bool, single_cmd: bool) -> bool
 
     return single_cmd
 
-
 def _check_local_constraints(launcher: str, batch: bool) -> None:
     """Check that the local launcher is not launched with invalid batch config"""
     if launcher == "local" and batch:
@@ -187,14 +181,9 @@ def __init__(
         :type intra_op_threads: int, optional
         """
         self.launcher, self.run_command = _autodetect(launcher, run_command)
-
         _check_run_command(self.launcher, self.run_command)
         _check_local_constraints(self.launcher, batch)
-
        single_cmd = _get_single_command(self.run_command, batch, single_cmd)
-
-        self.db_identifier = db_identifier
-
         self.ports: t.List[int] = []
         self.path = getcwd()
         self._hosts: t.List[str] = [] 
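# --- Editor's aside, not part of the patch: the jobmanager.py hunk in this
# commit switches get_db_host_addresses() from overwriting to accumulating
# addresses per db_identifier. A minimal, self-contained sketch of that
# accumulation pattern (the shard addresses here are made up for illustration):
#
#     import typing as t
#
#     address_dict: t.Dict[str, t.List[str]] = {}
#     shards = [("testdb", ["10.0.0.1:6379"]), ("testdb", ["10.0.0.2:6379"])]
#     for db_identifier, addresses in shards:
#         dict_entry: t.List[str] = address_dict.get(db_identifier, [])
#         dict_entry.extend(addresses)
#         address_dict[db_identifier] = dict_entry
#     assert address_dict == {"testdb": ["10.0.0.1:6379", "10.0.0.2:6379"]}
#
# Accumulating is what lets every shard of a multi-node Orchestrator be
# reported under the single db_identifier exposed by the property added below.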
@@ -214,8 +203,8 @@ def __init__( cpus_per_shard = int(kwargs.pop("cpus_per_shard", 4)) super().__init__( - db_identifier, - self.path, + name=db_identifier, + path=self.path, port=port, interface=interface, db_nodes=db_nodes, @@ -269,6 +258,15 @@ def __init__( self._reserved_batch_args: t.Dict[t.Type[BatchSettings], t.List[str]] = {} self._fill_reserved() + @property + def db_identifier(self) -> str: + """Return the DB identifier, which is common to a DB and all of its nodes + + :return: DB identifier + :rtype: str + """ + return self.name + @property def num_shards(self) -> int: """Return the number of DB shards contained in the orchestrator. @@ -795,7 +793,7 @@ def _initialize_entities_mpmd( self, *, db_nodes: int = 1, port: int = 6379, **kwargs: t.Any ) -> None: cluster = db_nodes >= 3 - + mpmd_node_name = self.name + "_0" exe_args_mpmd: t.List[t.List[str]] = [] for db_id in range(db_nodes): @@ -817,11 +815,11 @@ def _initialize_entities_mpmd( run_settings = self._build_run_settings( sys.executable, exe_args_mpmd, db_nodes=db_nodes, port=port, **kwargs ) - output_files = [self.name + "_0.out"] + output_files = [mpmd_node_name + ".out"] if not run_settings: raise ValueError(f"Could not build run settings for {self.launcher}") node = DBNode( - self.name + "_0", + mpmd_node_name, self.path, run_settings, [port], From bebbb04c77b316a3cd3a5b2e4f68a23f742c239f Mon Sep 17 00:00:00 2001 From: Ale Rigazzi Date: Thu, 23 Nov 2023 15:54:58 -0600 Subject: [PATCH 41/64] Make socket filename unique in tests --- conftest.py | 9 +++++++-- tests/test_colo_model_local.py | 6 +++--- tests/test_multidb.py | 3 --- 3 files changed, 10 insertions(+), 8 deletions(-) diff --git a/conftest.py b/conftest.py index 1db40b5af..463142afa 100644 --- a/conftest.py +++ b/conftest.py @@ -48,7 +48,9 @@ from smartsim.error import SSConfigError from subprocess import run import sys +import tempfile import typing as t +import uuid import warnings @@ -551,7 +553,7 @@ def _sanitize_caller_function(caller_function: str) -> str: caller_function = caller_function.replace("]","") caller_function_list = caller_function.split("[", maxsplit=1) - def is_accepted_char(char: str): + def is_accepted_char(char: str) -> bool: return char.isalnum() or char in "-._" if len(caller_function_list) > 1: @@ -650,7 +652,10 @@ def setup_test_colo( db_args["port"] = port db_args["ifname"] = "lo" if db_type == "uds" and colo_model_name is not None: - db_args["unix_socket"] = f"/tmp/{colo_model_name}.socket" + tmp_dir = os.path.join(tempfile.gettempdir()) + socket_suffix = str(uuid.uuid4())[:7] + db_args["unix_socket"] = os.path.join(tmp_dir, + f"{colo_model_name}_{socket_suffix}.socket") colocate_fun: t.Dict[str, t.Callable[..., None]] = { "tcp": colo_model.colocate_db_tcp, diff --git a/tests/test_colo_model_local.py b/tests/test_colo_model_local.py index 6ac6838d7..f7c7dbd55 100644 --- a/tests/test_colo_model_local.py +++ b/tests/test_colo_model_local.py @@ -136,12 +136,12 @@ def test_launch_colocated_model_defaults( exp.generate(colo_model) exp.start(colo_model, block=True) statuses = exp.get_status(colo_model) - assert all([stat == status.STATUS_COMPLETED for stat in statuses]) + assert all(stat == status.STATUS_COMPLETED for stat in statuses) # test restarting the colocated model exp.start(colo_model, block=True) statuses = exp.get_status(colo_model) - assert all([stat == status.STATUS_COMPLETED for stat in statuses]) + assert all(stat == status.STATUS_COMPLETED for stat in statuses) @pytest.mark.parametrize("db_type", supported_dbs) @@ -176,7 
+176,7 @@ def test_launch_multiple_colocated_models( exp.generate(*colo_models) exp.start(*colo_models, block=True) statuses = exp.get_status(*colo_models) - assert all([stat == status.STATUS_COMPLETED for stat in statuses]) + assert all(stat == status.STATUS_COMPLETED for stat in statuses) # test restarting the colocated model exp.start(*colo_models, block=True) diff --git a/tests/test_multidb.py b/tests/test_multidb.py index 2454da361..965fe746f 100644 --- a/tests/test_multidb.py +++ b/tests/test_multidb.py @@ -33,8 +33,6 @@ from smartsim.log import get_logger -from smartredis import * - logger = get_logger(__name__) supported_dbs = ["uds", "tcp"] @@ -454,7 +452,6 @@ def test_launch_cluster_orc_single_dbid(test_dir, coloutils, fileutils, wlmutils exp.start(orc, block=True) exp.start(smartsim_model, block=True) job_dict = exp._control._jobs.get_db_host_addresses() - print(job_dict) assert len(job_dict[orc.entities[0].db_identifier]) == 3 check_not_failed(exp, orc, smartsim_model) From 0b67eddb4745ec641aa7efa4cf11dfb4085c4e49 Mon Sep 17 00:00:00 2001 From: Ale Rigazzi Date: Fri, 24 Nov 2023 06:15:13 -0600 Subject: [PATCH 42/64] Fix smartredis test scripts --- tests/on_wlm/test_colocated_model.py | 4 ++-- tests/test_configs/send_data_local_smartredis.py | 3 +-- tests/test_configs/send_data_local_smartredis_with_dbid.py | 4 +--- .../send_data_local_smartredis_with_dbid_error_test.py | 4 +--- 4 files changed, 5 insertions(+), 10 deletions(-) diff --git a/tests/on_wlm/test_colocated_model.py b/tests/on_wlm/test_colocated_model.py index 83eaf4bd7..e05816be4 100644 --- a/tests/on_wlm/test_colocated_model.py +++ b/tests/on_wlm/test_colocated_model.py @@ -63,12 +63,12 @@ def test_launch_colocated_model_defaults(fileutils, test_dir, coloutils, db_type assert colo_model.run_settings.colocated_db_settings["custom_pinning"] == "0" exp.start(colo_model, block=True) statuses = exp.get_status(colo_model) - assert all([stat == status.STATUS_COMPLETED for stat in statuses]) + assert all([stat == status.STATUS_COMPLETED for stat in statuses]), f"Statuses: {(stat for stat in statuses)}" # test restarting the colocated model exp.start(colo_model, block=True) statuses = exp.get_status(colo_model) - assert all([stat == status.STATUS_COMPLETED for stat in statuses]) + assert all([stat == status.STATUS_COMPLETED for stat in statuses]), f"Statuses: {(stat for stat in statuses)}" @pytest.mark.parametrize("db_type", supported_dbs) def test_colocated_model_disable_pinning(fileutils, test_dir, coloutils, db_type): diff --git a/tests/test_configs/send_data_local_smartredis.py b/tests/test_configs/send_data_local_smartredis.py index 48acf0915..b7c0a6782 100644 --- a/tests/test_configs/send_data_local_smartredis.py +++ b/tests/test_configs/send_data_local_smartredis.py @@ -27,7 +27,6 @@ import numpy as np from smartredis import Client - def main(): # address should be set as we are launching through # SmartSim. @@ -39,7 +38,7 @@ def main(): returned = client.get_tensor("test_array") np.testing.assert_array_equal(array, returned) - print(f"Test worked! Sent and received array: {str(array)}") + print(f"Test worked! 
Sent {str(array)} and received {str(returned)}") if __name__ == "__main__": diff --git a/tests/test_configs/send_data_local_smartredis_with_dbid.py b/tests/test_configs/send_data_local_smartredis_with_dbid.py index fecbe28ae..675f9f7c7 100644 --- a/tests/test_configs/send_data_local_smartredis_with_dbid.py +++ b/tests/test_configs/send_data_local_smartredis_with_dbid.py @@ -26,8 +26,6 @@ import numpy as np from smartredis import Client, ConfigOptions -from os import environ - def main(): # address should be set as we are launching through @@ -41,7 +39,7 @@ def main(): returned = client.get_tensor("test_array") np.testing.assert_array_equal(array, returned) - print(f"Test worked! Sent and received array: {str(array)}") + print(f"Test worked! Sent {str(array)} and received {str(returned)}") if __name__ == "__main__": diff --git a/tests/test_configs/send_data_local_smartredis_with_dbid_error_test.py b/tests/test_configs/send_data_local_smartredis_with_dbid_error_test.py index 4167503d3..f06a2a94e 100644 --- a/tests/test_configs/send_data_local_smartredis_with_dbid_error_test.py +++ b/tests/test_configs/send_data_local_smartredis_with_dbid_error_test.py @@ -26,8 +26,6 @@ import numpy as np from smartredis import Client, ConfigOptions -from os import environ - def main(): # address should be set as we are launching through @@ -43,7 +41,7 @@ def main(): returned = client.get_tensor("test_array") np.testing.assert_array_equal(array, returned) - print(f"Test worked! Sent and received array: {str(array)}") + print(f"Test worked! Sent {str(array)} and received {str(returned)}") if __name__ == "__main__": From 05fe0b29351ff84e485baa1f7626c9bb433336ac Mon Sep 17 00:00:00 2001 From: Ale Rigazzi Date: Fri, 24 Nov 2023 07:57:11 -0600 Subject: [PATCH 43/64] Make some asserts more helpful --- tests/on_wlm/test_colocated_model.py | 14 +++++++------- tests/test_colo_model_local.py | 2 +- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/tests/on_wlm/test_colocated_model.py b/tests/on_wlm/test_colocated_model.py index e05816be4..0f6d1fccd 100644 --- a/tests/on_wlm/test_colocated_model.py +++ b/tests/on_wlm/test_colocated_model.py @@ -63,12 +63,12 @@ def test_launch_colocated_model_defaults(fileutils, test_dir, coloutils, db_type assert colo_model.run_settings.colocated_db_settings["custom_pinning"] == "0" exp.start(colo_model, block=True) statuses = exp.get_status(colo_model) - assert all([stat == status.STATUS_COMPLETED for stat in statuses]), f"Statuses: {(stat for stat in statuses)}" + assert all(stat == status.STATUS_COMPLETED for stat in statuses), f"Statuses: {statuses}" # test restarting the colocated model exp.start(colo_model, block=True) statuses = exp.get_status(colo_model) - assert all([stat == status.STATUS_COMPLETED for stat in statuses]), f"Statuses: {(stat for stat in statuses)}" + assert all(stat == status.STATUS_COMPLETED for stat in statuses), f"Statuses: {statuses}" @pytest.mark.parametrize("db_type", supported_dbs) def test_colocated_model_disable_pinning(fileutils, test_dir, coloutils, db_type): @@ -93,7 +93,7 @@ def test_colocated_model_disable_pinning(fileutils, test_dir, coloutils, db_type exp.generate(colo_model) exp.start(colo_model, block=True) statuses = exp.get_status(colo_model) - assert all([stat == status.STATUS_COMPLETED for stat in statuses]) + assert all(stat == status.STATUS_COMPLETED for stat in statuses), f"Statuses: {statuses}" @pytest.mark.parametrize("db_type", supported_dbs) def test_colocated_model_pinning_auto_2cpu(fileutils, test_dir, coloutils, 
db_type): @@ -118,7 +118,7 @@ def test_colocated_model_pinning_auto_2cpu(fileutils, test_dir, coloutils, db_ty exp.generate(colo_model) exp.start(colo_model, block=True) statuses = exp.get_status(colo_model) - assert all([stat == status.STATUS_COMPLETED for stat in statuses]) + assert all(stat == status.STATUS_COMPLETED for stat in statuses), f"Statuses: {statuses}" @pytest.mark.parametrize("db_type", supported_dbs) def test_colocated_model_pinning_range(fileutils, test_dir, coloutils, db_type): @@ -145,7 +145,7 @@ def test_colocated_model_pinning_range(fileutils, test_dir, coloutils, db_type): exp.generate(colo_model) exp.start(colo_model, block=True) statuses = exp.get_status(colo_model) - assert all([stat == status.STATUS_COMPLETED for stat in statuses]) + assert all(stat == status.STATUS_COMPLETED for stat in statuses), f"Statuses: {statuses}" @pytest.mark.parametrize("db_type", supported_dbs) def test_colocated_model_pinning_list(fileutils, test_dir, coloutils, db_type): @@ -171,7 +171,7 @@ def test_colocated_model_pinning_list(fileutils, test_dir, coloutils, db_type): exp.generate(colo_model) exp.start(colo_model, block=True) statuses = exp.get_status(colo_model) - assert all([stat == status.STATUS_COMPLETED for stat in statuses]) + assert all(stat == status.STATUS_COMPLETED for stat in statuses), f"Statuses: {statuses}" @pytest.mark.parametrize("db_type", supported_dbs) def test_colocated_model_pinning_mixed(fileutils, test_dir, coloutils, db_type): @@ -197,4 +197,4 @@ def test_colocated_model_pinning_mixed(fileutils, test_dir, coloutils, db_type): exp.generate(colo_model) exp.start(colo_model, block=True) statuses = exp.get_status(colo_model) - assert all([stat == status.STATUS_COMPLETED for stat in statuses]) + assert all(stat == status.STATUS_COMPLETED for stat in statuses), f"Statuses: {statuses}" diff --git a/tests/test_colo_model_local.py b/tests/test_colo_model_local.py index f7c7dbd55..1bc292015 100644 --- a/tests/test_colo_model_local.py +++ b/tests/test_colo_model_local.py @@ -141,7 +141,7 @@ def test_launch_colocated_model_defaults( # test restarting the colocated model exp.start(colo_model, block=True) statuses = exp.get_status(colo_model) - assert all(stat == status.STATUS_COMPLETED for stat in statuses) + assert all(stat == status.STATUS_COMPLETED for stat in statuses), f"Statuses {statuses}" @pytest.mark.parametrize("db_type", supported_dbs) From e66c65cce1f4312abc657923ee2b50769c0b47e7 Mon Sep 17 00:00:00 2001 From: Al Rigazzi Date: Sat, 25 Nov 2023 02:00:12 -0600 Subject: [PATCH 44/64] Patch TF multigpu tests --- tests/backends/test_dbmodel.py | 12 ++++++------ tests/backends/test_dbscript.py | 3 +-- 2 files changed, 7 insertions(+), 8 deletions(-) diff --git a/tests/backends/test_dbmodel.py b/tests/backends/test_dbmodel.py index bfaa56f06..7c0fce0d9 100644 --- a/tests/backends/test_dbmodel.py +++ b/tests/backends/test_dbmodel.py @@ -155,7 +155,7 @@ def test_tf_db_model(fileutils, test_dir, wlmutils, mlutils): test_interface = wlmutils.get_test_interface() test_port = wlmutils.get_test_port() test_device = mlutils.get_test_device() - test_num_gpus = mlutils.get_test_num_gpus() + test_num_gpus = 1 # TF backend fails on multiple GPUs test_script = fileutils.get_test_conf_path("run_tf_dbmodel_smartredis.py") @@ -299,7 +299,7 @@ def test_db_model_ensemble(fileutils, test_dir, wlmutils, mlutils): test_interface = wlmutils.get_test_interface() test_port = wlmutils.get_test_port() test_device = mlutils.get_test_device() - test_num_gpus = mlutils.get_test_num_gpus() + 
test_num_gpus = 1 # TF backend fails on multiple GPUs test_script = fileutils.get_test_conf_path("run_tf_dbmodel_smartredis.py") @@ -398,7 +398,7 @@ def test_colocated_db_model_tf(fileutils, test_dir, wlmutils, mlutils): test_interface = wlmutils.get_test_interface() test_port = wlmutils.get_test_port() test_device = mlutils.get_test_device() - test_num_gpus = mlutils.get_test_num_gpus() + test_num_gpus = 1 # TF backend fails on multiple GPUs test_script = fileutils.get_test_conf_path("run_tf_dbmodel_smartredis.py") @@ -532,7 +532,7 @@ def test_colocated_db_model_ensemble(fileutils, test_dir, wlmutils, mlutils): test_interface = wlmutils.get_test_interface() test_port = wlmutils.get_test_port() test_device = mlutils.get_test_device() - test_num_gpus = mlutils.get_test_num_gpus() + test_num_gpus = 1 # TF backend fails on multiple GPUs test_script = fileutils.get_test_conf_path("run_tf_dbmodel_smartredis.py") @@ -638,7 +638,7 @@ def test_colocated_db_model_ensemble_reordered(fileutils, test_dir, wlmutils, ml test_interface = wlmutils.get_test_interface() test_port = wlmutils.get_test_port() test_device = mlutils.get_test_device() - test_num_gpus = mlutils.get_test_num_gpus() + test_num_gpus = 1 # TF backend fails on multiple GPUs test_script = fileutils.get_test_conf_path("run_tf_dbmodel_smartredis.py") @@ -741,7 +741,7 @@ def test_colocated_db_model_errors(fileutils, test_dir, wlmutils, mlutils): test_interface = wlmutils.get_test_interface() test_port = wlmutils.get_test_port() test_device = mlutils.get_test_device() - test_num_gpus = mlutils.get_test_num_gpus() + test_num_gpus = 1 # TF backend fails on multiple GPUs test_script = fileutils.get_test_conf_path("run_tf_dbmodel_smartredis.py") diff --git a/tests/backends/test_dbscript.py b/tests/backends/test_dbscript.py index 83a13999f..b3cc3550d 100644 --- a/tests/backends/test_dbscript.py +++ b/tests/backends/test_dbscript.py @@ -84,12 +84,11 @@ def test_db_script(fileutils, test_dir, wlmutils, mlutils): # Create the SmartSim Model smartsim_model = exp.create_model("smartsim_model", run_settings) - smartsim_model.set_path(test_dir) # Create the SmartSim database host = wlmutils.choose_host(run_settings) db = exp.create_database(port=test_port, interface=test_interface, hosts=host) - exp.generate(db) + exp.generate(db, smartsim_model) # Define the torch script string torch_script_str = "def negate(x):\n\treturn torch.neg(x)\n" From a3842a4e53ea0c3ad4d0f8f46d41da6c5b07e076 Mon Sep 17 00:00:00 2001 From: Al Rigazzi Date: Sat, 25 Nov 2023 02:14:00 -0600 Subject: [PATCH 45/64] Add info about num_test_devices --- conftest.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/conftest.py b/conftest.py index 463142afa..805c463a1 100644 --- a/conftest.py +++ b/conftest.py @@ -82,7 +82,10 @@ def print_test_configuration() -> None: print("TEST_LAUNCHER:", test_launcher) if test_account != "": print("TEST_ACCOUNT:", test_account) - print("TEST_DEVICE:", test_device) + test_device_msg = f"TEST_DEVICE: {test_device}" + if test_device == "GPU": + test_device_msg += f"x{test_num_gpus}" + print(test_device_msg) print("TEST_NETWORK_INTERFACE (WLM only):", test_nic) if test_alloc_specs_path: print("TEST_ALLOC_SPEC_SHEET_PATH:", test_alloc_specs_path) From 05093b73bc9374b085b0ceb3901c9593876c8e31 Mon Sep 17 00:00:00 2001 From: Al Rigazzi Date: Sun, 26 Nov 2023 06:09:02 -0600 Subject: [PATCH 46/64] Add details to failing asserts in test_dbmodel --- .gitignore | 1 + conftest.py | 4 ++-- tests/backends/test_dbmodel.py | 15 +++++++-------- 3 
files changed, 10 insertions(+), 10 deletions(-) diff --git a/.gitignore b/.gitignore index 3c1f7db48..96dbd3fc1 100644 --- a/.gitignore +++ b/.gitignore @@ -22,6 +22,7 @@ venv/ .venv/ env/ .env/ +.env # written upon install smartsim/version.py diff --git a/conftest.py b/conftest.py index 805c463a1..75ad865f8 100644 --- a/conftest.py +++ b/conftest.py @@ -60,7 +60,7 @@ test_path = os.path.dirname(os.path.abspath(__file__)) test_output_root = os.path.join(test_path, "tests", "test_output") test_launcher = CONFIG.test_launcher -test_device = CONFIG.test_device +test_device = CONFIG.test_device.upper() test_num_gpus = CONFIG.test_num_gpus test_nic = CONFIG.test_interface test_alloc_specs_path = os.getenv("SMARTSIM_TEST_ALLOC_SPEC_SHEET_PATH", None) @@ -655,7 +655,7 @@ def setup_test_colo( db_args["port"] = port db_args["ifname"] = "lo" if db_type == "uds" and colo_model_name is not None: - tmp_dir = os.path.join(tempfile.gettempdir()) + tmp_dir = tempfile.gettempdir() socket_suffix = str(uuid.uuid4())[:7] db_args["unix_socket"] = os.path.join(tmp_dir, f"{colo_model_name}_{socket_suffix}.socket") diff --git a/tests/backends/test_dbmodel.py b/tests/backends/test_dbmodel.py index 7c0fce0d9..d04ef364b 100644 --- a/tests/backends/test_dbmodel.py +++ b/tests/backends/test_dbmodel.py @@ -34,7 +34,6 @@ from smartsim.entity import Ensemble from smartsim.error.errors import SSUnsupportedError from smartsim.log import get_logger -from smartsim.settings import MpirunSettings, MpiexecSettings from smartsim.entity.dbobject import DBModel @@ -216,7 +215,7 @@ def test_tf_db_model(fileutils, test_dir, wlmutils, mlutils): try: exp.start(db, smartsim_model, block=True) statuses = exp.get_status(smartsim_model) - assert all([stat == status.STATUS_COMPLETED for stat in statuses]) + assert all(stat == status.STATUS_COMPLETED for stat in statuses), f"Statuses: {statuses}" finally: exp.stop(db) @@ -282,7 +281,7 @@ def test_pt_db_model(fileutils, test_dir, wlmutils, mlutils): try: exp.start(db, smartsim_model, block=True) statuses = exp.get_status(smartsim_model) - assert all([stat == status.STATUS_COMPLETED for stat in statuses]) + assert all(stat == status.STATUS_COMPLETED for stat in statuses), f"Statuses: {statuses}" finally: exp.stop(db) @@ -381,7 +380,7 @@ def test_db_model_ensemble(fileutils, test_dir, wlmutils, mlutils): try: exp.start(db, smartsim_ensemble, block=True) statuses = exp.get_status(smartsim_ensemble) - assert all([stat == status.STATUS_COMPLETED for stat in statuses]) + assert all(stat == status.STATUS_COMPLETED for stat in statuses), f"Statuses: {statuses}" finally: exp.stop(db) @@ -454,7 +453,7 @@ def test_colocated_db_model_tf(fileutils, test_dir, wlmutils, mlutils): try: exp.start(colo_model, block=True) statuses = exp.get_status(colo_model) - assert all([stat == status.STATUS_COMPLETED for stat in statuses]) + assert all(stat == status.STATUS_COMPLETED for stat in statuses), f"Statuses: {statuses}" finally: exp.stop(colo_model) @@ -513,7 +512,7 @@ def test_colocated_db_model_pytorch(fileutils, test_dir, wlmutils, mlutils): try: exp.start(colo_model, block=True) statuses = exp.get_status(colo_model) - assert all([stat == status.STATUS_COMPLETED for stat in statuses]) + assert all(stat == status.STATUS_COMPLETED for stat in statuses), f"Statuses: {statuses}" finally: exp.stop(colo_model) @@ -619,7 +618,7 @@ def test_colocated_db_model_ensemble(fileutils, test_dir, wlmutils, mlutils): try: exp.start(colo_ensemble, block=True) statuses = exp.get_status(colo_ensemble) - assert all([stat == 
status.STATUS_COMPLETED for stat in statuses]) + assert all(stat == status.STATUS_COMPLETED for stat in statuses), f"Statuses: {statuses}" finally: exp.stop(colo_ensemble) @@ -724,7 +723,7 @@ def test_colocated_db_model_ensemble_reordered(fileutils, test_dir, wlmutils, ml try: exp.start(colo_ensemble, block=True) statuses = exp.get_status(colo_ensemble) - assert all([stat == status.STATUS_COMPLETED for stat in statuses]) + assert all(stat == status.STATUS_COMPLETED for stat in statuses), f"Statuses: {statuses}" finally: exp.stop(colo_ensemble) From 1e101e7fbe311276e25a4db80f79fbe88925829e Mon Sep 17 00:00:00 2001 From: Al Rigazzi Date: Sun, 26 Nov 2023 07:59:27 -0600 Subject: [PATCH 47/64] Add mem cap to dataloader tests --- tests/backends/test_dataloader.py | 13 +++++++++++++ tests/backends/test_dbmodel.py | 8 ++++---- 2 files changed, 17 insertions(+), 4 deletions(-) diff --git a/tests/backends/test_dataloader.py b/tests/backends/test_dataloader.py index cb9ce0d35..281f78bc4 100644 --- a/tests/backends/test_dataloader.py +++ b/tests/backends/test_dataloader.py @@ -35,16 +35,29 @@ from smartsim.experiment import Experiment from smartsim.ml.data import DataInfo, TrainingDataUploader from smartsim.status import STATUS_COMPLETED +from smartsim.log import get_logger + +logger = get_logger(__name__) shouldrun_tf = True if shouldrun_tf: try: from tensorflow import keras + import tensorflow as tf from smartsim.ml.tf import DynamicDataGenerator as TFDataGenerator from smartsim.ml.tf import StaticDataGenerator as TFStaticDataGenerator except: shouldrun_tf = False + else: + if pytest.test_device == "GPU": + try: + for device in tf.config.list_physical_devices('GPU'): + tf.config.set_logical_device_configuration( + device, + [tf.config.LogicalDeviceConfiguration(memory_limit=5_000)]) + except: + logger.warning("Could not set TF max memory limit for GPU") shouldrun_torch = True if shouldrun_torch: diff --git a/tests/backends/test_dbmodel.py b/tests/backends/test_dbmodel.py index d04ef364b..2d83ebf17 100644 --- a/tests/backends/test_dbmodel.py +++ b/tests/backends/test_dbmodel.py @@ -62,10 +62,10 @@ def call(self, x): return y if pytest.test_device == "GPU": try: - physical_devices = tf.config.list_physical_devices('GPU') - tf.config.set_logical_device_configuration( - physical_devices[0], - [tf.config.LogicalDeviceConfiguration(memory_limit=5_000)]) + for device in tf.config.list_physical_devices('GPU'): + tf.config.set_logical_device_configuration( + device, + [tf.config.LogicalDeviceConfiguration(memory_limit=5_000)]) except: logger.warning("Could not set TF max memory limit for GPU") From 94d47902feaa9ba7676275a473de1fb10ffd9bab Mon Sep 17 00:00:00 2001 From: Al Rigazzi Date: Sun, 26 Nov 2023 08:56:32 -0600 Subject: [PATCH 48/64] Fix number of devices if not GPU --- tests/backends/test_dbmodel.py | 4 ++-- tests/backends/test_dbscript.py | 12 ++++++------ 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/tests/backends/test_dbmodel.py b/tests/backends/test_dbmodel.py index 2d83ebf17..ad99d0038 100644 --- a/tests/backends/test_dbmodel.py +++ b/tests/backends/test_dbmodel.py @@ -232,7 +232,7 @@ def test_pt_db_model(fileutils, test_dir, wlmutils, mlutils): test_interface = wlmutils.get_test_interface() test_port = wlmutils.get_test_port() test_device = mlutils.get_test_device() - test_num_gpus = mlutils.get_test_num_gpus() + test_num_gpus = mlutils.get_test_num_gpus() if pytest.test_device == "GPU" else 1 test_script = fileutils.get_test_conf_path("run_pt_dbmodel_smartredis.py") @@ -469,7 
+469,7 @@ def test_colocated_db_model_pytorch(fileutils, test_dir, wlmutils, mlutils): test_interface = wlmutils.get_test_interface() test_port = wlmutils.get_test_port() test_device = mlutils.get_test_device() - test_num_gpus = mlutils.get_test_num_gpus() + test_num_gpus = mlutils.get_test_num_gpus() if pytest.test_device == "GPU" else 1 test_script = fileutils.get_test_conf_path("run_pt_dbmodel_smartredis.py") diff --git a/tests/backends/test_dbscript.py b/tests/backends/test_dbscript.py index b3cc3550d..67e8eb700 100644 --- a/tests/backends/test_dbscript.py +++ b/tests/backends/test_dbscript.py @@ -69,7 +69,7 @@ def test_db_script(fileutils, test_dir, wlmutils, mlutils): test_interface = wlmutils.get_test_interface() test_port = wlmutils.get_test_port() test_device = mlutils.get_test_device() - test_num_gpus = mlutils.get_test_num_gpus() + test_num_gpus = mlutils.get_test_num_gpus() if pytest.test_device == "GPU" else 1 test_script = fileutils.get_test_conf_path("run_dbscript_smartredis.py") torch_script = fileutils.get_test_conf_path("torchscript.py") @@ -144,7 +144,7 @@ def test_db_script_ensemble(fileutils, test_dir, wlmutils, mlutils): test_interface = wlmutils.get_test_interface() test_port = wlmutils.get_test_port() test_device = mlutils.get_test_device() - test_num_gpus = mlutils.get_test_num_gpus() + test_num_gpus = mlutils.get_test_num_gpus() if pytest.test_device == "GPU" else 1 test_script = fileutils.get_test_conf_path("run_dbscript_smartredis.py") torch_script = fileutils.get_test_conf_path("torchscript.py") @@ -240,7 +240,7 @@ def test_colocated_db_script(fileutils, test_dir, wlmutils, mlutils): test_interface = wlmutils.get_test_interface() test_port = wlmutils.get_test_port() test_device = mlutils.get_test_device() - test_num_gpus = mlutils.get_test_num_gpus() + test_num_gpus = mlutils.get_test_num_gpus() if pytest.test_device == "GPU" else 1 test_script = fileutils.get_test_conf_path("run_dbscript_smartredis.py") torch_script = fileutils.get_test_conf_path("torchscript.py") @@ -309,7 +309,7 @@ def test_colocated_db_script_ensemble(fileutils, test_dir, wlmutils, mlutils): test_interface = wlmutils.get_test_interface() test_port = wlmutils.get_test_port() test_device = mlutils.get_test_device() - test_num_gpus = mlutils.get_test_num_gpus() + test_num_gpus = mlutils.get_test_num_gpus() if pytest.test_device == "GPU" else 1 test_script = fileutils.get_test_conf_path("run_dbscript_smartredis.py") torch_script = fileutils.get_test_conf_path("torchscript.py") @@ -408,7 +408,7 @@ def test_colocated_db_script_ensemble_reordered(fileutils, test_dir, wlmutils, m test_interface = wlmutils.get_test_interface() test_port = wlmutils.get_test_port() test_device = mlutils.get_test_device() - test_num_gpus = mlutils.get_test_num_gpus() + test_num_gpus = mlutils.get_test_num_gpus() if pytest.test_device == "GPU" else 1 test_script = fileutils.get_test_conf_path("run_dbscript_smartredis.py") torch_script = fileutils.get_test_conf_path("torchscript.py") @@ -505,7 +505,7 @@ def test_db_script_errors(fileutils, test_dir, wlmutils, mlutils): test_interface = wlmutils.get_test_interface() test_port = wlmutils.get_test_port() test_device = mlutils.get_test_device() - test_num_gpus = mlutils.get_test_num_gpus() + test_num_gpus = mlutils.get_test_num_gpus() if pytest.test_device == "GPU" else 1 test_script = fileutils.get_test_conf_path("run_dbscript_smartredis.py") From 9af0c75cf275b49a7c928c1cb763c167039f39a8 Mon Sep 17 00:00:00 2001 From: Al Rigazzi Date: Mon, 27 Nov 2023 11:06:50 -0600 
Subject: [PATCH 49/64] MyPy

---
 smartsim/_core/control/jobmanager.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/smartsim/_core/control/jobmanager.py b/smartsim/_core/control/jobmanager.py
index 46e3d09bb..edb807064 100644
--- a/smartsim/_core/control/jobmanager.py
+++ b/smartsim/_core/control/jobmanager.py
@@ -315,7 +315,7 @@ def get_db_host_addresses(self) -> t.Dict[str, t.List[str]]:
 
         :return: dictionary of host ip addresses
         :rtype: Dict[str, list]"""
-        address_dict = {}
+        address_dict: t.Dict[str, t.List[str]] = {}
         for db_job in self.db_jobs.values():
             addresses = []
             if isinstance(db_job.entity, (DBNode, Orchestrator)):

From 762db800d06d18c4cb78275221c1b81fb701a455 Mon Sep 17 00:00:00 2001
From: Andrew Shao
Date: Thu, 30 Nov 2023 19:00:48 -0600
Subject: [PATCH 50/64] Spawn in TF saving/serializing in a new process to
 avoid a locked GPU

---
 smartsim/_core/_cli/validate.py |  63 ++++++++-----------
 smartsim/ml/tf/utils.py         | 103 ++++++++++++++++++++++----------
 2 files changed, 96 insertions(+), 70 deletions(-)

diff --git a/smartsim/_core/_cli/validate.py b/smartsim/_core/_cli/validate.py
index 78db15516..580e45b0f 100644
--- a/smartsim/_core/_cli/validate.py
+++ b/smartsim/_core/_cli/validate.py
@@ -52,10 +52,6 @@
 
 if t.TYPE_CHECKING:
-    # Pylint disables needed for old version of pylint w/ TF 2.6.2
-    # pylint: disable-next=unused-import
-    from multiprocessing.connection import Connection
-
     # pylint: disable-next=unsubscriptable-object
     _TemporaryDirectory = tempfile.TemporaryDirectory[str]
 else:
@@ -86,16 +82,23 @@ def execute(args: argparse.Namespace, /) -> int:
     """Validate the SmartSim installation works as expected given a
     simple experiment
     """
+    from importlib.util import find_spec
+
+    torch_available = find_spec("torch")
+    tensorflow_available = find_spec("tensorflow")
+    onnx_available = find_spec("skl2onnx") and find_spec("sklearn")
+
     backends = installed_redisai_backends()
+    has_tf = False
     try:
         with _VerificationTempDir(dir=os.getcwd()) as temp_dir:
             test_install(
                 location=temp_dir,
                 port=args.port,
                 device=args.device.upper(),
-                with_tf="tensorflow" in backends,
-                with_pt="torch" in backends,
-                with_onnx="onnxruntime" in backends,
+                with_tf="tensorflow" in backends and tensorflow_available,
+                with_pt="torch" in backends and torch_available,
+                with_onnx="onnxruntime" in backends and onnx_available,
             )
     except Exception as e:
         logger.error(
@@ -146,12 +149,18 @@ def test_install(
     if with_tf:
         logger.info("Verifying TensorFlow Backend")
         _test_tf_install(client, location, device)
+    else:
+        logger.warning("TensorFlow not available. Skipping test")
     if with_pt:
         logger.info("Verifying Torch Backend")
         _test_torch_install(client, device)
+    else:
+        logger.warning("Torch not available. Skipping test")
     if with_onnx:
         logger.info("Verifying ONNX Backend")
         _test_onnx_install(client, device)
+    else:
+        logger.warning("ONNX not available. 
Skipping test") @contextmanager @@ -178,39 +187,10 @@ def _find_free_port() -> int: def _test_tf_install(client: Client, tmp_dir: str, device: _TCapitalDeviceStr) -> None: - recv_conn, send_conn = mp.Pipe(duplex=False) - # Build the model in a subproc so that keras does not hog the gpu - proc = mp.Process(target=_build_tf_frozen_model, args=(send_conn, tmp_dir)) - proc.start() - - # do not need the sending connection in this proc anymore - send_conn.close() - - proc.join(timeout=120) - if proc.is_alive(): - proc.terminate() - raise Exception("Failed to build a simple keras model within 2 minutes") - try: - model_path, inputs, outputs = recv_conn.recv() - except EOFError as e: - raise Exception( - "Failed to receive serialized model from subprocess. " - "Is the `tensorflow` python package installed?" - ) from e - - client.set_model_from_file( - "keras-fcn", model_path, "TF", device=device, inputs=inputs, outputs=outputs - ) - client.put_tensor("keras-input", np.random.rand(1, 28, 28).astype(np.float32)) - client.run_model("keras-fcn", inputs=["keras-input"], outputs=["keras-output"]) - client.get_tensor("keras-output") - - -def _build_tf_frozen_model(conn: "Connection", tmp_dir: str) -> None: from tensorflow import keras - from smartsim.ml.tf import freeze_model + # Build a small TF model and freeze it fcn = keras.Sequential( layers=[ keras.layers.InputLayer(input_shape=(28, 28), name="input"), @@ -224,7 +204,14 @@ def _build_tf_frozen_model(conn: "Connection", tmp_dir: str) -> None: optimizer="adam", loss="sparse_categorical_crossentropy", metrics=["accuracy"] ) model_path, inputs, outputs = freeze_model(fcn, tmp_dir, "keras_model.pb") - conn.send((model_path, inputs, outputs)) + + # Try to set the model and use it + client.set_model_from_file( + "keras-fcn", model_path, "TF", device=device, inputs=inputs, outputs=outputs + ) + client.put_tensor("keras-input", np.random.rand(1, 28, 28).astype(np.float32)) + client.run_model("keras-fcn", inputs=["keras-input"], outputs=["keras-output"]) + client.get_tensor("keras-output") def _test_torch_install(client: Client, device: _TCapitalDeviceStr) -> None: diff --git a/smartsim/ml/tf/utils.py b/smartsim/ml/tf/utils.py index 7ef8fb3c6..f975b08ba 100644 --- a/smartsim/ml/tf/utils.py +++ b/smartsim/ml/tf/utils.py @@ -24,6 +24,7 @@ # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +from __future__ import annotations from pathlib import Path import tensorflow as tf @@ -32,35 +33,43 @@ from tensorflow.python.framework.convert_to_constants import ( convert_variables_to_constants_v2, ) +import multiprocessing as mp +if t.TYPE_CHECKING: + from multiprocessing.connection import Connection -def freeze_model( - model: keras.Model, output_dir: str, file_name: str -) -> t.Tuple[str, t.List[str], t.List[str]]: - """Freeze a Keras or TensorFlow Graph +def _serialize_internals(connection: "Connection", model: keras.Model) -> None: - to use a Keras or TensorFlow model in SmartSim, the model - must be frozen and the inputs and outputs provided to the - smartredis.client.set_model_from_file() method. 
+ full_model = tf.function(model) + full_model = full_model.get_concrete_function( + tf.TensorSpec(model.inputs[0].shape, model.inputs[0].dtype) + ) - This utiliy function provides everything users need to take - a trained model and put it inside an ``orchestrator`` instance + frozen_func = convert_variables_to_constants_v2(full_model) + frozen_func.graph.as_graph_def() - :param model: TensorFlow or Keras model - :type model: tf.Module - :param output_dir: output dir to save model file to - :type output_dir: str - :param file_name: name of model file to create - :type file_name: str - :return: path to model file, model input layer names, model output layer names - :rtype: str, list[str], list[str] + input_names = [x.name.split(":")[0] for x in frozen_func.inputs] + output_names = [x.name.split(":")[0] for x in frozen_func.outputs] + + model_serialized = frozen_func.graph.as_graph_def().SerializeToString( + deterministic=True + ) + + connection.send((model_serialized, input_names, output_names)) + connection.close() + +def _freeze_internals( + connection: "Connection", model: keras.Model, output_dir: str, file_name: str +) -> None: + """ + Needed to run the freezing in separate process + to avoid locking up the GPU """ - # TODO figure out why layer names don't match up to - # specified name in Model init. if not file_name.endswith(".pb"): file_name = file_name + ".pb" + full_model = tf.function(model) full_model = full_model.get_concrete_function( tf.TensorSpec(model.inputs[0].shape, model.inputs[0].dtype) @@ -79,6 +88,42 @@ def freeze_model( as_text=False, ) model_file_path = str(Path(output_dir, file_name).resolve()) + connection.send((model_file_path, input_names, output_names)) + connection.close() + +def freeze_model( + model: keras.Model, output_dir: str, file_name: str +) -> t.Tuple[str, t.List[str], t.List[str]]: + """Freeze a Keras or TensorFlow Graph + + to use a Keras or TensorFlow model in SmartSim, the model + must be frozen and the inputs and outputs provided to the + smartredis.client.set_model_from_file() method. + + This utiliy function provides everything users need to take + a trained model and put it inside an ``orchestrator`` instance + + :param model: TensorFlow or Keras model + :type model: tf.Module + :param output_dir: output dir to save model file to + :type output_dir: str + :param file_name: name of model file to create + :type file_name: str + :return: path to model file, model input layer names, model output layer names + :rtype: str, list[str], list[str] + """ + # TODO figure out why layer names don't match up to + # specified name in Model init. 
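The hunk continues below by wiring the freeze through a one-shot pipe to a child process. Reduced to a self-contained sketch of that pattern (names are illustrative and not part of this diff; standard library only):

import multiprocessing as mp
from multiprocessing.connection import Connection


def _worker(conn: Connection) -> None:
    # stand-in for the heavy TF freeze; results travel back over the pipe
    conn.send(("model.pb", ["input"], ["output"]))
    conn.close()


if __name__ == "__main__":
    parent_conn, child_conn = mp.Pipe()
    proc = mp.Process(target=_worker, args=(child_conn,))
    proc.start()
    # recv before join: a payload that fills the pipe buffer would otherwise deadlock join
    payload, inputs, outputs = parent_conn.recv()
    proc.join()

The recv-before-join ordering mirrors what the patched freeze_model below does.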
+ + + parent_connection, child_connection = mp.Pipe() + graph_freeze_process = mp.Process( + target=_freeze_internals, + args=(child_connection, model, output_dir, file_name) + ) + graph_freeze_process.start() + model_file_path, input_names, output_names = parent_connection.recv() + graph_freeze_process.join() return model_file_path, input_names, output_names @@ -98,19 +143,13 @@ def serialize_model(model: keras.Model) -> t.Tuple[str, t.List[str], t.List[str] :rtype: str, list[str], list[str] """ - full_model = tf.function(model) - full_model = full_model.get_concrete_function( - tf.TensorSpec(model.inputs[0].shape, model.inputs[0].dtype) - ) - - frozen_func = convert_variables_to_constants_v2(full_model) - frozen_func.graph.as_graph_def() - input_names = [x.name.split(":")[0] for x in frozen_func.inputs] - output_names = [x.name.split(":")[0] for x in frozen_func.outputs] - - model_serialized = frozen_func.graph.as_graph_def().SerializeToString( - deterministic=True + parent_connection, child_connection = mp.Pipe() + graph_freeze_process = mp.Process( + target=_serialize_internals, + args=(child_connection, model) ) - + graph_freeze_process.start() + model_serialized, input_names, output_names = parent_connection.recv() + graph_freeze_process.join() return model_serialized, input_names, output_names From 31520a9e981ea0dec1bccd775c60412c2b1c5451 Mon Sep 17 00:00:00 2001 From: Andrew Shao Date: Fri, 1 Dec 2023 14:49:33 -0600 Subject: [PATCH 51/64] Revert "Spawn in TF saving/serializing in a new process to avoid a locked GPU" This reverts commit 762db800d06d18c4cb78275221c1b81fb701a455. --- smartsim/_core/_cli/validate.py | 63 +++++++++++-------- smartsim/ml/tf/utils.py | 103 ++++++++++---------------------- 2 files changed, 70 insertions(+), 96 deletions(-) diff --git a/smartsim/_core/_cli/validate.py b/smartsim/_core/_cli/validate.py index 580e45b0f..78db15516 100644 --- a/smartsim/_core/_cli/validate.py +++ b/smartsim/_core/_cli/validate.py @@ -52,6 +52,10 @@ if t.TYPE_CHECKING: + # Pylint disables needed for old version of pylint w/ TF 2.6.2 + # pylint: disable-next=unused-import + from multiprocessing.connection import Connection + # pylint: disable-next=unsubscriptable-object _TemporaryDirectory = tempfile.TemporaryDirectory[str] else: @@ -82,23 +86,16 @@ def execute(args: argparse.Namespace, /) -> int: """Validate the SmartSim installation works as expected given a simple experiment """ - from importlib.util import find_spec - - torch_available = find_spec("torch") - tensorflow_available = find_spec("tensorflow") - onnx_available = find_spec("skl2onnx") and find_spec("sklearn") - backends = installed_redisai_backends() - has_tf = False try: with _VerificationTempDir(dir=os.getcwd()) as temp_dir: test_install( location=temp_dir, port=args.port, device=args.device.upper(), - with_tf="tensorflow" in backends and torch_available, - with_pt="torch" in backends and tensorflow_available, - with_onnx="onnxruntime" in backends and onnx_available, + with_tf="tensorflow" in backends, + with_pt="torch" in backends, + with_onnx="onnxruntime" in backends, ) except Exception as e: logger.error( @@ -149,18 +146,12 @@ def test_install( if with_tf: logger.info("Verifying TensorFlow Backend") _test_tf_install(client, location, device) - else: - logger.warning("Tensorflow not available. Skipping test") if with_pt: logger.info("Verifying Torch Backend") _test_torch_install(client, device) - else: - logger.warning("Torch not available. 
Skipping test") if with_onnx: logger.info("Verifying ONNX Backend") _test_onnx_install(client, device) - else: - logger.warning("ONNX not available. Skipping test") @contextmanager @@ -187,10 +178,39 @@ def _find_free_port() -> int: def _test_tf_install(client: Client, tmp_dir: str, device: _TCapitalDeviceStr) -> None: + recv_conn, send_conn = mp.Pipe(duplex=False) + # Build the model in a subproc so that keras does not hog the gpu + proc = mp.Process(target=_build_tf_frozen_model, args=(send_conn, tmp_dir)) + proc.start() + + # do not need the sending connection in this proc anymore + send_conn.close() + + proc.join(timeout=120) + if proc.is_alive(): + proc.terminate() + raise Exception("Failed to build a simple keras model within 2 minutes") + try: + model_path, inputs, outputs = recv_conn.recv() + except EOFError as e: + raise Exception( + "Failed to receive serialized model from subprocess. " + "Is the `tensorflow` python package installed?" + ) from e + + client.set_model_from_file( + "keras-fcn", model_path, "TF", device=device, inputs=inputs, outputs=outputs + ) + client.put_tensor("keras-input", np.random.rand(1, 28, 28).astype(np.float32)) + client.run_model("keras-fcn", inputs=["keras-input"], outputs=["keras-output"]) + client.get_tensor("keras-output") + + +def _build_tf_frozen_model(conn: "Connection", tmp_dir: str) -> None: from tensorflow import keras + from smartsim.ml.tf import freeze_model - # Build a small TF model and freeze it fcn = keras.Sequential( layers=[ keras.layers.InputLayer(input_shape=(28, 28), name="input"), @@ -204,14 +224,7 @@ def _test_tf_install(client: Client, tmp_dir: str, device: _TCapitalDeviceStr) - optimizer="adam", loss="sparse_categorical_crossentropy", metrics=["accuracy"] ) model_path, inputs, outputs = freeze_model(fcn, tmp_dir, "keras_model.pb") - - # Try to set the model and use it - client.set_model_from_file( - "keras-fcn", model_path, "TF", device=device, inputs=inputs, outputs=outputs - ) - client.put_tensor("keras-input", np.random.rand(1, 28, 28).astype(np.float32)) - client.run_model("keras-fcn", inputs=["keras-input"], outputs=["keras-output"]) - client.get_tensor("keras-output") + conn.send((model_path, inputs, outputs)) def _test_torch_install(client: Client, device: _TCapitalDeviceStr) -> None: diff --git a/smartsim/ml/tf/utils.py b/smartsim/ml/tf/utils.py index f975b08ba..7ef8fb3c6 100644 --- a/smartsim/ml/tf/utils.py +++ b/smartsim/ml/tf/utils.py @@ -24,7 +24,6 @@ # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
-from __future__ import annotations from pathlib import Path import tensorflow as tf @@ -33,63 +32,7 @@ from tensorflow.python.framework.convert_to_constants import ( convert_variables_to_constants_v2, ) -import multiprocessing as mp -if t.TYPE_CHECKING: - from multiprocessing.connection import Connection - -def _serialize_internals(connection: "Connection", model: keras.Model) -> None: - - full_model = tf.function(model) - full_model = full_model.get_concrete_function( - tf.TensorSpec(model.inputs[0].shape, model.inputs[0].dtype) - ) - - frozen_func = convert_variables_to_constants_v2(full_model) - frozen_func.graph.as_graph_def() - - input_names = [x.name.split(":")[0] for x in frozen_func.inputs] - output_names = [x.name.split(":")[0] for x in frozen_func.outputs] - - model_serialized = frozen_func.graph.as_graph_def().SerializeToString( - deterministic=True - ) - - connection.send((model_serialized, input_names, output_names)) - connection.close() - -def _freeze_internals( - connection: "Connection", model: keras.Model, output_dir: str, file_name: str -) -> None: - """ - Needed to run the freezing in separate process - to avoid locking up the GPU - """ - - if not file_name.endswith(".pb"): - file_name = file_name + ".pb" - - - full_model = tf.function(model) - full_model = full_model.get_concrete_function( - tf.TensorSpec(model.inputs[0].shape, model.inputs[0].dtype) - ) - - frozen_func = convert_variables_to_constants_v2(full_model) - frozen_func.graph.as_graph_def() - - input_names = [x.name.split(":")[0] for x in frozen_func.inputs] - output_names = [x.name.split(":")[0] for x in frozen_func.outputs] - - tf.io.write_graph( - graph_or_graph_def=frozen_func.graph, - logdir=output_dir, - name=file_name, - as_text=False, - ) - model_file_path = str(Path(output_dir, file_name).resolve()) - connection.send((model_file_path, input_names, output_names)) - connection.close() def freeze_model( model: keras.Model, output_dir: str, file_name: str @@ -115,15 +58,27 @@ def freeze_model( # TODO figure out why layer names don't match up to # specified name in Model init. 
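Before the restored implementation, typical downstream usage of freeze_model with SmartRedis, sketched with illustrative names (assumes a compiled keras.Model bound to `model` and an Orchestrator reachable at the given address):

from smartredis import Client

from smartsim.ml.tf import freeze_model

# freeze the model to disk, then hand the file and layer names to the database
model_path, inputs, outputs = freeze_model(model, ".", "keras_model.pb")
client = Client(address="127.0.0.1:6780", cluster=False)
client.set_model_from_file(
    "keras-fcn", model_path, "TF", device="CPU", inputs=inputs, outputs=outputs
)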
+ if not file_name.endswith(".pb"): + file_name = file_name + ".pb" - parent_connection, child_connection = mp.Pipe() - graph_freeze_process = mp.Process( - target=_freeze_internals, - args=(child_connection, model, output_dir, file_name) + full_model = tf.function(model) + full_model = full_model.get_concrete_function( + tf.TensorSpec(model.inputs[0].shape, model.inputs[0].dtype) ) - graph_freeze_process.start() - model_file_path, input_names, output_names = parent_connection.recv() - graph_freeze_process.join() + + frozen_func = convert_variables_to_constants_v2(full_model) + frozen_func.graph.as_graph_def() + + input_names = [x.name.split(":")[0] for x in frozen_func.inputs] + output_names = [x.name.split(":")[0] for x in frozen_func.outputs] + + tf.io.write_graph( + graph_or_graph_def=frozen_func.graph, + logdir=output_dir, + name=file_name, + as_text=False, + ) + model_file_path = str(Path(output_dir, file_name).resolve()) return model_file_path, input_names, output_names @@ -143,13 +98,19 @@ def serialize_model(model: keras.Model) -> t.Tuple[str, t.List[str], t.List[str] :rtype: str, list[str], list[str] """ + full_model = tf.function(model) + full_model = full_model.get_concrete_function( + tf.TensorSpec(model.inputs[0].shape, model.inputs[0].dtype) + ) + + frozen_func = convert_variables_to_constants_v2(full_model) + frozen_func.graph.as_graph_def() - parent_connection, child_connection = mp.Pipe() - graph_freeze_process = mp.Process( - target=_serialize_internals, - args=(child_connection, model) + input_names = [x.name.split(":")[0] for x in frozen_func.inputs] + output_names = [x.name.split(":")[0] for x in frozen_func.outputs] + + model_serialized = frozen_func.graph.as_graph_def().SerializeToString( + deterministic=True ) - graph_freeze_process.start() - model_serialized, input_names, output_names = parent_connection.recv() - graph_freeze_process.join() + return model_serialized, input_names, output_names From b703bc9ae3f9aba1ba414d41e50a6d7a16275e90 Mon Sep 17 00:00:00 2001 From: Andrew Shao Date: Wed, 6 Dec 2023 19:24:53 -0600 Subject: [PATCH 52/64] Simplify the logic in QsubBatchSettings Much of the need to check the type and value of the the nodes property in QsubBatchSettings is because there are two technically valid, but not quite equivalent ways of setting the number of nodes. Now, we check at various points that the both 'select' and 'nodes' is not set. Additionally, both routes can be used to set the internal _nodes property if it needs to be accessed within Python --- smartsim/settings/base.py | 21 +------ smartsim/settings/pbsSettings.py | 102 ++++++++++++++++++------------- smartsim/status.py | 6 +- tests/test_pbs_settings.py | 91 +++++++++++++++++++++++++++ 4 files changed, 153 insertions(+), 67 deletions(-) create mode 100644 tests/test_pbs_settings.py diff --git a/smartsim/settings/base.py b/smartsim/settings/base.py index 5fcf5cae2..bf725ea51 100644 --- a/smartsim/settings/base.py +++ b/smartsim/settings/base.py @@ -38,6 +38,7 @@ class SettingsBase: ... 
+# pylint: disable=too-many-public-methods
 class RunSettings(SettingsBase):
     # pylint: disable=unused-argument

@@ -592,26 +593,6 @@ def __init__(
         self.set_queue(kwargs.get("queue", None))
         self.set_account(kwargs.get("account", None))

-    @property
-    def nodes(self) -> t.Optional[int]:
-        return self._nodes
-
-
-    @nodes.setter
-    def nodes(self, num_nodes: t.Optional[t.Union[int, str]]) -> None:
-        if num_nodes:
-            if isinstance(num_nodes, int):
-                self._nodes = num_nodes
-            elif isinstance(num_nodes, str):
-                self._nodes = int(num_nodes)
-            else:
-                raise TypeError(
-                    "Nodes must be an int or a string interpretable as an int"
-                )
-        else:
-            self._nodes = None
-
     @property
     def batch_cmd(self) -> str:
         """Return the batch command

diff --git a/smartsim/settings/pbsSettings.py b/smartsim/settings/pbsSettings.py
index 5f0181ef2..2771f7bf5 100644
--- a/smartsim/settings/pbsSettings.py
+++ b/smartsim/settings/pbsSettings.py
@@ -27,9 +27,9 @@
 import typing as t

 from .._core.utils import init_default
-from ..error import SmartSimError
-from .base import BatchSettings
+from ..error import SSConfigError
 from ..log import get_logger
+from .base import BatchSettings

 logger = get_logger(__name__)

@@ -72,13 +72,7 @@ def __init__(
         self._time: t.Optional[str] = None
         self._nodes: t.Optional[int] = None
         self._ncpus = ncpus
-
-        if resources and "nodes" in resources and nodes is not None:
-            if nodes != resources["nodes"]:
-                raise ValueError(
-                    "nodes was specified as a kwarg and also in the resources "
-                    f"but are not the same value: {nodes=} {resources['nodes']=}"
-                )
+        self.resources = init_default({}, resources, dict)

         # time, queue, nodes, and account set in parent class init
         super().__init__(
@@ -90,21 +84,33 @@
             time=time,
             **kwargs,
         )
-        self.resources = init_default({}, resources, dict)
-        self._hosts: t.List[str] = []
+        self._sanity_check_resources()
+        # Set the number of nodes if it was specified, note this needs
+        # to be done after the super init because nodes might also be set
+        self._nodes = self.resources.get("nodes", None) or self.resources.get(
+            "select", None
+        )
+
+        self._hosts: t.List[str] = []

     def set_nodes(self, num_nodes: int) -> None:
         """Set the number of nodes for this batch job

-        If a select argument is provided in ``QsubBatchSettings.resources``
-        this value will be overridden
+        In PBS, 'select' is the more primitive way of describing how
+        many nodes to allocate for the job. 'nodes' is equivalent to
+        'select' with a 'place' statement. Assuming that only advanced
+        users would use 'set_resource' instead, defining the number of
+        nodes here sets the 'nodes' resource.

         :param num_nodes: number of nodes
         :type num_nodes: int
         """
+
         if num_nodes:
             self._nodes = num_nodes
+            self.set_resource("nodes", self._nodes)
+            self._sanity_check_resources()

     def set_hostlist(self, host_list: t.Union[str, t.List[str]]) -> None:
         """Specify the hostlist for this job
@@ -185,6 +186,11 @@ def set_resource(self, resource_name: str, value: str) -> None:
         """
         # TODO add error checking here
         # TODO include option to overwrite place (warning for orchestrator?)
         self.resources[resource_name] = value
+        self._sanity_check_resources()
+        # Capture the case where someone is setting the number of nodes
+        # through 'select' or 'nodes'
+        if resource_name in ["select", "nodes"] and value:
+            self._nodes = int(value)

     def format_batch_args(self) -> t.List[str]:
         """Get the formatted batch arguments for a preview
@@ -196,40 +207,43 @@ def format_batch_args(self) -> t.List[str]:
             opts += [" ".join((prefix + opt, str(value)))]
         return opts

+    def _sanity_check_resources(self) -> None:
+        """Check that only select or nodes was specified in resources
+
+        Note: For PBS Pro, nodes is equivalent to 'select' and 'place' so
+        they are not quite synonyms. Here we assume that only one of the
+        two should ever be specified.
+        """
+
+        has_select = self.resources.get("select", None)
+        has_nodes = self.resources.get("nodes", None)
+
+        if has_select and has_nodes:
+            raise SSConfigError(
+                "'select' and 'nodes' cannot both be specified. This can happen "
+                "if nodes were specified using the 'set_nodes' method and"
+                "'select' was set using 'set_resource'. Please only specify one."
+            )
+
     def _create_resource_list(self) -> t.List[str]:
+
+        self._sanity_check_resources()
         res = []

-        # get select statement from resources or kwargs
-        if ("select" in self.resources) and "nodes" not in self.resources:
-            res += [f"-l select={str(self.resources['select'])}"]
-        elif ("select" in self.resources) and ("nodes" in self.resources):
-            nselect = self.resources["select"]
-            if nselect == self._nodes:
-                logger.warning("select and nodes were both specified, specifying nodes")
-                res += [f"-l nodes={self._nodes}"]
-            else:
-                raise SmartSimError(
-                    (
-                        "select and nodes were both specified, but do not have "
-                        f"the same value. select={nselect} nodes={self._nodes}"
-                    )
-                )
-        elif "nodes" in self.resources:
-            res += [f"-l nodes={self._nodes}"]
+        # Construct the basic select/nodes statement
+        if self.resources.get("select", None):
+            select_command = f"-l select={self.resources['select']}"
+        elif self.resources.get("nodes", None):
+            select_command = f"-l nodes={self.resources['nodes']}"
         else:
-            select = "-l select="
-            if self._nodes:
-                select += str(self._nodes)
-            else:
-                raise SmartSimError(
-                    "Insufficient resource specification: no nodes or select statement"
-                )
-            if self._ncpus:
-                select += f":ncpus={self._ncpus}"
-            if self._hosts:
-                hosts = ["=".join(("host", str(host))) for host in self._hosts]
-                select += f":{'+'.join(hosts)}"
-            res += [select]
+            raise SSConfigError(
+                "Insufficient resource specification: no nodes or select statement"
+            )
+        if self._ncpus:
+            select_command += f":ncpus={self._ncpus}"
+        if self._hosts:
+            hosts = ["=".join(("host", str(host))) for host in self._hosts]
+            select_command += f":{'+'.join(hosts)}"
+        res += [select_command]

         if "place" in self.resources:
             res += [f"-l place={str(self.resources['place'])}"]
@@ -242,6 +256,6 @@ def _create_resource_list(self) -> t.List[str]:
             res += [f"-l walltime={self._time}"]

         for resource, value in self.resources.items():
-            if resource not in ["select", "walltime", "place"]:
+            if resource not in ["nodes", "select", "walltime", "place"]:
                 res += [f"-l {resource}={str(value)}"]
         return res

diff --git a/smartsim/status.py b/smartsim/status.py
index ba5f5076d..74d440b8e 100644
--- a/smartsim/status.py
+++ b/smartsim/status.py
@@ -18,7 +18,7 @@
 # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 # DISCLAIMED.
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE # FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -# DAMAGES (INCLUDING, BUT NEVER LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR # SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER # CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE @@ -32,7 +32,7 @@ STATUS_FAILED = "Failed" STATUS_NEW = "New" STATUS_PAUSED = "Paused" -STATUS_NEVER_STARTED = "NotStarted" +STATUS_NEVER_STARTED = "NeverStarted" # SmartSim status mapping SMARTSIM_STATUS = { @@ -42,7 +42,7 @@ "Cancelled": STATUS_CANCELLED, "Failed": STATUS_FAILED, "New": STATUS_NEW, - "NeverStarted": STATUS_NEVER_STARTED + "NeverStarted": STATUS_NEVER_STARTED, } # Status groupings diff --git a/tests/test_pbs_settings.py b/tests/test_pbs_settings.py new file mode 100644 index 000000000..a33fb5d2d --- /dev/null +++ b/tests/test_pbs_settings.py @@ -0,0 +1,91 @@ +# BSD 2-Clause License +# +# Copyright (c) 2021-2023, Hewlett Packard Enterprise +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
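For orientation before the tests: the two routes for requesting nodes that they exercise, sketched with illustrative values (mixing the two raises SSConfigError):

from smartsim.settings import QsubBatchSettings

# route 1: the 'nodes' resource, via the convenience setter
s1 = QsubBatchSettings(ncpus=36)
s1.set_nodes(4)               # _create_resource_list() -> ["-l nodes=4:ncpus=36"]

# route 2: the more primitive 'select' resource, set directly
s2 = QsubBatchSettings(ncpus=36)
s2.set_resource("select", 4)  # _create_resource_list() -> ["-l select=4:ncpus=36"]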
+ +import pytest + +from smartsim.error import SSConfigError +from smartsim.settings import QsubBatchSettings + +# The tests in this file belong to the group_b group +pytestmark = pytest.mark.group_b + + +def test_node_formatting(): + def validate_settings(settings, spec, num_nodes, num_cpus): + assert settings._create_resource_list() == [ + f"-l {spec}={num_nodes}:ncpus={num_cpus}" + ] + assert settings._ncpus == num_cpus + assert settings._nodes == num_nodes + + num_nodes = 10 + num_cpus = 36 + + # Test by specifying the number of nodes via setting a resource + for spec in ["nodes", "select"]: + # Test by setting nodes + settings = QsubBatchSettings() + settings.set_resource(spec, num_nodes) + settings.set_ncpus(36) + validate_settings(settings, spec, num_nodes, num_cpus) + + # Test when setting nodes through the constructor + settings = QsubBatchSettings(ncpus=num_cpus, nodes=num_nodes) + validate_settings(settings, "nodes", num_nodes, num_cpus) + + # Test when setting nodes through the constructor via resource + settings = QsubBatchSettings(ncpus=num_cpus, resources={"nodes": num_nodes}) + validate_settings(settings, "nodes", num_nodes, num_cpus) + + # Test when setting select through the constructor via resource + settings = QsubBatchSettings(ncpus=num_cpus, resources={"select": num_nodes}) + validate_settings(settings, "select", num_nodes, num_cpus) + + +def test_select_nodes_error(): + + # # Test failure on initialization + with pytest.raises(SSConfigError): + QsubBatchSettings(nodes=10, resources={"select": 10}) + + # Test setting via nodes and then select + settings = QsubBatchSettings() + settings.set_nodes(10) + with pytest.raises(SSConfigError): + settings.set_resource("select", 10) + + # Manually put "select" in the resource dictionary and + # make sure the resource formatter catches the error + settings = QsubBatchSettings() + settings.resources = {"nodes": 10, "select": 20} + with pytest.raises(SSConfigError): + settings._create_resource_list() + + # # Test setting via select and then nodes + settings = QsubBatchSettings() + settings.set_resource("select", 10) + with pytest.raises(SSConfigError): + settings.set_nodes(10) From 48defa2c26f009c4c626c2c0dde5292a87ad3413 Mon Sep 17 00:00:00 2001 From: Andrew Shao Date: Thu, 7 Dec 2023 19:02:01 -0600 Subject: [PATCH 53/64] Delete extraneous scripts --- .../send_data_local_smartredis_with_dbid.py | 46 ------------------ ...a_local_smartredis_with_dbid_error_test.py | 48 ------------------- 2 files changed, 94 deletions(-) delete mode 100644 tests/test_configs/send_data_local_smartredis_with_dbid.py delete mode 100644 tests/test_configs/send_data_local_smartredis_with_dbid_error_test.py diff --git a/tests/test_configs/send_data_local_smartredis_with_dbid.py b/tests/test_configs/send_data_local_smartredis_with_dbid.py deleted file mode 100644 index 675f9f7c7..000000000 --- a/tests/test_configs/send_data_local_smartredis_with_dbid.py +++ /dev/null @@ -1,46 +0,0 @@ -# BSD 2-Clause License -# -# Copyright (c) 2021-2023, Hewlett Packard Enterprise -# All rights reserved. -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions are met: -# -# 1. Redistributions of source code must retain the above copyright notice, this -# list of conditions and the following disclaimer. -# -# 2. 
Redistributions in binary form must reproduce the above copyright notice, -# this list of conditions and the following disclaimer in the documentation -# and/or other materials provided with the distribution. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE -# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -import numpy as np -from smartredis import Client, ConfigOptions - -def main(): - # address should be set as we are launching through - # SmartSim. - - opts1 = ConfigOptions.create_from_environment("testdb_colo") - client = Client(opts1, logger_name="SmartSim") - - array = np.array([1, 2, 3, 4]) - client.put_tensor("test_array", array) - returned = client.get_tensor("test_array") - - np.testing.assert_array_equal(array, returned) - print(f"Test worked! Sent {str(array)} and received {str(returned)}") - - -if __name__ == "__main__": - main() \ No newline at end of file diff --git a/tests/test_configs/send_data_local_smartredis_with_dbid_error_test.py b/tests/test_configs/send_data_local_smartredis_with_dbid_error_test.py deleted file mode 100644 index f06a2a94e..000000000 --- a/tests/test_configs/send_data_local_smartredis_with_dbid_error_test.py +++ /dev/null @@ -1,48 +0,0 @@ -# BSD 2-Clause License -# -# Copyright (c) 2021-2023, Hewlett Packard Enterprise -# All rights reserved. -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions are met: -# -# 1. Redistributions of source code must retain the above copyright notice, this -# list of conditions and the following disclaimer. -# -# 2. Redistributions in binary form must reproduce the above copyright notice, -# this list of conditions and the following disclaimer in the documentation -# and/or other materials provided with the distribution. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE -# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -import numpy as np -from smartredis import Client, ConfigOptions - -def main(): - # address should be set as we are launching through - # SmartSim. 
- - opts1 = ConfigOptions.create_from_environment("my_db") - opts2 = ConfigOptions.create_from_environment("my_db") - client = Client(opts1, logger_name="SmartSim") - client = Client(opts2, logger_name="SmartSim") - - array = np.array([1, 2, 3, 4]) - client.put_tensor("test_array", array) - returned = client.get_tensor("test_array") - - np.testing.assert_array_equal(array, returned) - print(f"Test worked! Sent {str(array)} and received {str(returned)}") - - -if __name__ == "__main__": - main() From b929ba860d39a3871dacc615d3b53d767f356b3c Mon Sep 17 00:00:00 2001 From: Andrew Shao Date: Thu, 7 Dec 2023 19:28:28 -0600 Subject: [PATCH 54/64] Refactor QsubBatchSettings resources Further refactors the way that QsubBatchSettings is used and accessed to streamline the logic and make it fail faster if users attempt to set the number of nodes in multiple different ways --- .gitignore | 1 - conftest.py | 6 +-- smartsim/database/orchestrator.py | 3 +- smartsim/settings/base.py | 1 - smartsim/settings/pbsSettings.py | 66 ++++++++++++++++++------------- tests/backends/test_dataloader.py | 2 +- tests/test_pbs_settings.py | 9 +++-- 7 files changed, 49 insertions(+), 39 deletions(-) diff --git a/.gitignore b/.gitignore index 96dbd3fc1..3c1f7db48 100644 --- a/.gitignore +++ b/.gitignore @@ -22,7 +22,6 @@ venv/ .venv/ env/ .env/ -.env # written upon install smartsim/version.py diff --git a/conftest.py b/conftest.py index 75ad865f8..bcbc4e367 100644 --- a/conftest.py +++ b/conftest.py @@ -632,9 +632,9 @@ def setup_test_colo( application_file: str, db_args: t.Dict[str, t.Any], colo_settings: t.Optional[RunSettings] = None, - colo_model_name: t.Optional[str] = "colocated_model", - port: t.Optional[int] = test_port, - on_wlm: t.Optional[bool] = False, + colo_model_name: str = "colocated_model", + port: int = test_port, + on_wlm: bool = False, ) -> Model: """Setup database needed for the colo pinning tests""" diff --git a/smartsim/database/orchestrator.py b/smartsim/database/orchestrator.py index c721d9d16..0a7ae23fe 100644 --- a/smartsim/database/orchestrator.py +++ b/smartsim/database/orchestrator.py @@ -185,7 +185,6 @@ def __init__( _check_local_constraints(self.launcher, batch) single_cmd = _get_single_command(self.run_command, batch, single_cmd) self.ports: t.List[int] = [] - self.path = getcwd() self._hosts: t.List[str] = [] self._user_hostlist: t.List[str] = [] if isinstance(interface, str): @@ -204,7 +203,7 @@ def __init__( super().__init__( name=db_identifier, - path=self.path, + path=getcwd(), port=port, interface=interface, db_nodes=db_nodes, diff --git a/smartsim/settings/base.py b/smartsim/settings/base.py index bf725ea51..9d3b5e01a 100644 --- a/smartsim/settings/base.py +++ b/smartsim/settings/base.py @@ -587,7 +587,6 @@ def __init__( self._batch_cmd = batch_cmd self.batch_args = batch_args or {} self._preamble: t.List[str] = [] - self._nodes: t.Optional[int] = None self.set_nodes(kwargs.get("nodes", None)) self.set_walltime(kwargs.get("time", None)) self.set_queue(kwargs.get("queue", None)) diff --git a/smartsim/settings/pbsSettings.py b/smartsim/settings/pbsSettings.py index 2771f7bf5..585103793 100644 --- a/smartsim/settings/pbsSettings.py +++ b/smartsim/settings/pbsSettings.py @@ -69,10 +69,17 @@ def __init__( :param batch_args: overrides for PBS batch arguments, defaults to None :type batch_args: dict[str, str], optional """ - self._time: t.Optional[str] = None - self._nodes: t.Optional[int] = None + self._ncpus = ncpus - self.resources = init_default({}, resources, dict) + self._resources = 
resources or {} + + resource_nodes = self.resources.get("nodes", None) + + if nodes and resource_nodes: + raise ValueError( + "nodes was incorrectly specified as its own kwarg and also in the " + "resource kwarg." + ) # time, queue, nodes, and account set in parent class init super().__init__( @@ -85,14 +92,17 @@ def __init__( **kwargs, ) + self._hosts: t.List[str] = [] + + @property + def resources(self): + return self._resources.copy() + + @resources.setter + def resources(self, resources: dict[str, str | int]): + self._resources = resources.copy() self._sanity_check_resources() - # Set the number of nodes if it was specified, note this needs - # to be done after the super init because nodes might also be set - self._nodes = self.resources.get("nodes", None) or self.resources.get( - "select", None - ) - self._hosts: t.List[str] = [] def set_nodes(self, num_nodes: int) -> None: """Set the number of nodes for this batch job @@ -108,8 +118,7 @@ def set_nodes(self, num_nodes: int) -> None: """ if num_nodes: - self._nodes = num_nodes - self.set_resource("nodes", self._nodes) + self.set_resource("nodes", num_nodes) self._sanity_check_resources() def set_hostlist(self, host_list: t.Union[str, t.List[str]]) -> None: @@ -140,7 +149,7 @@ def set_walltime(self, walltime: str) -> None: :type walltime: str """ if walltime: - self._time = walltime + self.set_resource("walltime", walltime) def set_queue(self, queue: str) -> None: """Set the queue for the batch job @@ -185,7 +194,7 @@ def set_resource(self, resource_name: str, value: str) -> None: """ # TODO add error checking here # TODO include option to overwrite place (warning for orchestrator?) - self.resources[resource_name] = value + self._resources[resource_name] = value self._sanity_check_resources() # Capture the case where someone is setting the number of nodes # through 'select' or 'nodes' @@ -220,7 +229,7 @@ def _sanity_check_resources(self) -> None: if has_select and has_nodes: raise SSConfigError( "'select' and 'nodes' cannot both be specified. This can happen " - "if nodes were specified using the 'set_nodes' method and" + "if nodes were specified using the 'set_nodes' method and " "'select' was set using 'set_resource'. Please only specify one." 
) @@ -229,11 +238,14 @@ def _create_resource_list(self) -> t.List[str]: self._sanity_check_resources() res = [] + # Pop off some specific keywords that need to be treated separately + resources = self.resources # Note this is a copy so not modifying original + # Construct the basic select/nodes statement - if self.resources.get("select", None): - select_command = f"-l select={self.resources['select']}" - elif self.resources.get("nodes", None): - select_command = f"-l nodes={self.resources['nodes']}" + if select := resources.pop("select", None): + select_command = f"-l select={select}" + elif nodes := resources.pop("nodes", None): + select_command = f"-l nodes={nodes}" else: raise SSConfigError( "Insufficient resource specification: no nodes or select statement" @@ -245,17 +257,15 @@ def _create_resource_list(self) -> t.List[str]: select_command += f":{'+'.join(hosts)}" res += [select_command] - if "place" in self.resources: - res += [f"-l place={str(self.resources['place'])}"] + if place := resources.pop("place", None): + res += [f"-l place={place}"] # get time from resources or kwargs - if "walltime" in self.resources: - res += [f"-l walltime={str(self.resources['walltime'])}"] - else: - if self._time: - res += [f"-l walltime={self._time}"] + if walltime := resources.pop("walltime", None): + res += [f"-l walltime={walltime}"] + + # All other "standard" resource specs + for resource, value in resources.items(): + res += [f"-l {resource}={str(value)}"] - for resource, value in self.resources.items(): - if resource not in ["nodes", "select", "walltime", "place"]: - res += [f"-l {resource}={str(value)}"] return res diff --git a/tests/backends/test_dataloader.py b/tests/backends/test_dataloader.py index 281f78bc4..fd28eb10e 100644 --- a/tests/backends/test_dataloader.py +++ b/tests/backends/test_dataloader.py @@ -56,7 +56,7 @@ tf.config.set_logical_device_configuration( device, [tf.config.LogicalDeviceConfiguration(memory_limit=5_000)]) - except: + except Exception: logger.warning("Could not set TF max memory limit for GPU") shouldrun_torch = True diff --git a/tests/test_pbs_settings.py b/tests/test_pbs_settings.py index a33fb5d2d..8a5776008 100644 --- a/tests/test_pbs_settings.py +++ b/tests/test_pbs_settings.py @@ -39,7 +39,6 @@ def validate_settings(settings, spec, num_nodes, num_cpus): f"-l {spec}={num_nodes}:ncpus={num_cpus}" ] assert settings._ncpus == num_cpus - assert settings._nodes == num_nodes num_nodes = 10 num_cpus = 36 @@ -80,12 +79,16 @@ def test_select_nodes_error(): # Manually put "select" in the resource dictionary and # make sure the resource formatter catches the error settings = QsubBatchSettings() - settings.resources = {"nodes": 10, "select": 20} with pytest.raises(SSConfigError): - settings._create_resource_list() + settings.resources = {"nodes": 10, "select": 20} # # Test setting via select and then nodes settings = QsubBatchSettings() settings.set_resource("select", 10) with pytest.raises(SSConfigError): settings.set_nodes(10) + +def test_resources_is_a_copy(): + settings = QsubBatchSettings() + resources = settings.resources + assert resources is not settings._resources \ No newline at end of file From a0d8328cb38b0fe8f51ffbd68a1e43b1937e92b0 Mon Sep 17 00:00:00 2001 From: Ale Rigazzi Date: Fri, 8 Dec 2023 03:20:59 -0600 Subject: [PATCH 55/64] Merge branch 'develop' into fix_tests --- .gitignore | 5 + conftest.py | 42 +- doc/index.rst | 6 + doc/smartdashboard.rst | 7 + docker/docs/dev/Dockerfile | 6 + pyproject.toml | 1 + setup.py | 1 + 
smartsim/_core/_cli/__main__.py | 19 +- smartsim/_core/_cli/build.py | 10 +- smartsim/_core/_cli/clean.py | 9 +- smartsim/_core/_cli/cli.py | 57 +- smartsim/_core/_cli/dbcli.py | 10 +- smartsim/_core/_cli/info.py | 7 +- smartsim/_core/_cli/plugin.py | 55 + smartsim/_core/_cli/site.py | 6 +- smartsim/_core/_cli/utils.py | 7 +- smartsim/_core/_cli/validate.py | 11 +- smartsim/_core/config/config.py | 13 +- smartsim/_core/control/controller.py | 215 +++- smartsim/_core/control/job.py | 38 +- smartsim/_core/control/jobmanager.py | 22 +- smartsim/_core/control/manifest.py | 145 ++- smartsim/_core/entrypoints/indirect.py | 242 ++++ .../_core/entrypoints/telemetrymonitor.py | 691 ++++++++++ .../_core/launcher/cobalt/cobaltLauncher.py | 5 +- smartsim/_core/launcher/launcher.py | 12 +- smartsim/_core/launcher/local/local.py | 43 +- smartsim/_core/launcher/lsf/lsfLauncher.py | 9 +- smartsim/_core/launcher/pbs/pbsLauncher.py | 5 +- .../_core/launcher/slurm/slurmLauncher.py | 5 +- smartsim/_core/launcher/step/alpsStep.py | 6 +- smartsim/_core/launcher/step/localStep.py | 9 +- smartsim/_core/launcher/step/lsfStep.py | 3 +- smartsim/_core/launcher/step/mpiStep.py | 8 +- smartsim/_core/launcher/step/slurmStep.py | 10 +- smartsim/_core/launcher/step/step.py | 66 +- smartsim/_core/utils/helpers.py | 45 +- smartsim/_core/utils/serialize.py | 246 ++++ smartsim/entity/dbnode.py | 14 +- smartsim/error/errors.py | 30 +- smartsim/experiment.py | 35 + smartsim/log.py | 10 +- smartsim/wlm/slurm.py | 3 +- tests/backends/test_dbmodel.py | 1 + tests/full_wlm/test_generic_batch_launch.py | 15 +- .../full_wlm/test_generic_orc_launch_batch.py | 12 +- tests/full_wlm/test_mpmd.py | 3 +- tests/on_wlm/test_base_settings_on_wlm.py | 10 +- tests/on_wlm/test_colocated_model.py | 32 +- tests/on_wlm/test_generic_orc_launch.py | 9 +- tests/on_wlm/test_launch_errors.py | 8 +- tests/on_wlm/test_launch_ompi_lsf.py | 3 +- tests/on_wlm/test_restart.py | 5 +- .../test_simple_base_settings_on_wlm.py | 10 +- tests/on_wlm/test_simple_entity_launch.py | 15 +- tests/on_wlm/test_stop.py | 10 +- tests/test_cli.py | 130 +- tests/test_colo_model_local.py | 31 +- tests/test_config.py | 54 + tests/test_configs/echo.py | 42 + tests/test_configs/printing_model.py | 18 + .../telemetry/colocatedmodel.json | 69 + .../test_configs/telemetry/db_and_model.json | 86 ++ .../telemetry/db_and_model_1run.json | 79 ++ tests/test_configs/telemetry/ensembles.json | 329 +++++ .../test_configs/telemetry/serialmodels.json | 186 +++ tests/test_configs/telemetry/telemetry.json | 946 ++++++++++++++ tests/test_controller.py | 68 + tests/test_controller_errors.py | 2 +- tests/test_dbnode.py | 3 +- tests/test_experiment.py | 26 +- tests/test_generator.py | 6 +- tests/test_helpers.py | 15 + tests/test_indirect.py | 192 +++ tests/test_launch_errors.py | 6 +- tests/test_local_launch.py | 6 +- tests/test_local_multi_run.py | 3 +- tests/test_local_restart.py | 6 +- tests/test_manifest.py | 73 +- tests/test_model.py | 7 +- tests/test_multidb.py | 24 +- tests/test_orchestrator.py | 6 +- tests/test_pals_settings.py | 13 + tests/test_reconnect_orchestrator.py | 7 +- tests/test_serialize.py | 171 +++ tests/test_telemetry_monitor.py | 1121 +++++++++++++++++ 86 files changed, 5711 insertions(+), 336 deletions(-) create mode 100644 doc/smartdashboard.rst create mode 100644 smartsim/_core/_cli/plugin.py create mode 100644 smartsim/_core/entrypoints/indirect.py create mode 100644 smartsim/_core/entrypoints/telemetrymonitor.py create mode 100644 smartsim/_core/utils/serialize.py create 
mode 100644 tests/test_configs/echo.py create mode 100644 tests/test_configs/printing_model.py create mode 100644 tests/test_configs/telemetry/colocatedmodel.json create mode 100644 tests/test_configs/telemetry/db_and_model.json create mode 100644 tests/test_configs/telemetry/db_and_model_1run.json create mode 100644 tests/test_configs/telemetry/ensembles.json create mode 100644 tests/test_configs/telemetry/serialmodels.json create mode 100644 tests/test_configs/telemetry/telemetry.json create mode 100644 tests/test_controller.py create mode 100644 tests/test_indirect.py create mode 100644 tests/test_serialize.py create mode 100644 tests/test_telemetry_monitor.py diff --git a/.gitignore b/.gitignore index 96dbd3fc1..bc6b52293 100644 --- a/.gitignore +++ b/.gitignore @@ -32,3 +32,8 @@ smartsim/_core/bin/*-cli # created upon install smartsim/_core/lib + +**/manifest/ +**/*.err +**/*.out +**/.smartsim/* diff --git a/conftest.py b/conftest.py index 75ad865f8..381a072d8 100644 --- a/conftest.py +++ b/conftest.py @@ -66,7 +66,7 @@ test_alloc_specs_path = os.getenv("SMARTSIM_TEST_ALLOC_SPEC_SHEET_PATH", None) test_port = CONFIG.test_port test_account = CONFIG.test_account or "" -test_batch_resources = CONFIG.test_batch_resources +test_batch_resources: t.Dict[t.Any,t.Any] = CONFIG.test_batch_resources # Fill this at runtime if needed test_hostlist = None @@ -390,9 +390,13 @@ def get_orchestrator(nodes: int = 1, batch: bool = False) -> Orchestrator: return Orchestrator(port=test_port, interface="lo") @staticmethod - def choose_host(rs): - return get_hostlist()[0] if isinstance(rs, (MpirunSettings, MpiexecSettings)) else None + def choose_host(rs: RunSettings) -> t.Optional[str]: + if isinstance(rs, (MpirunSettings, MpiexecSettings)): + hl = get_hostlist() + if hl is not None: + return hl[0] + return None @pytest.fixture def local_db( @@ -401,8 +405,7 @@ def local_db( """Yield fixture for startup and teardown of an local orchestrator""" exp_name = request.function.__name__ - exp = Experiment(exp_name, launcher="local") - + exp = Experiment(exp_name, launcher="local", exp_path=test_dir) db = Orchestrator(port=wlmutils.get_test_port(), interface="lo") db.set_path(test_dir) exp.start(db) @@ -421,8 +424,7 @@ def db( launcher = wlmutils.get_test_launcher() exp_name = request.function.__name__ - exp = Experiment(exp_name, launcher=launcher) - + exp = Experiment(exp_name, launcher=launcher, exp_path=test_dir) db = wlmutils.get_orchestrator() db.set_path(test_dir) exp.start(db) @@ -444,8 +446,7 @@ def db_cluster( launcher = wlmutils.get_test_launcher() exp_name = request.function.__name__ - exp = Experiment(exp_name, launcher=launcher) - + exp = Experiment(exp_name, launcher=launcher, exp_path=test_dir) db = wlmutils.get_orchestrator(nodes=3) db.set_path(test_dir) exp.start(db) @@ -568,7 +569,7 @@ def is_accepted_char(char: str) -> bool: @pytest.fixture -def test_dir(request: pytest.FixtureRequest): +def test_dir(request: pytest.FixtureRequest) -> str: caller_function = _sanitize_caller_function(request.node.name) dir_path = FileUtils.get_test_output_path(caller_function, str(request.path)) @@ -602,6 +603,27 @@ def get_test_dir_path(dirname: str) -> str: dir_path = os.path.join(test_path, "tests", "test_configs", dirname) return dir_path + @staticmethod + def make_test_file(file_name: str, file_dir: str, file_content: t.Optional[str] = None) -> str: + """Create a dummy file in the test output directory. + + :param file_name: name of file to create, e.g. 
"file.txt" + :type file_name: str + :param file_dir: path + :type file_dir: str + :return: String path to test output file + :rtype: str + """ + file_path = os.path.join(file_dir, file_name) + os.makedirs(file_dir) + with open(file_path, "w+", encoding="utf-8") as dummy_file: + if not file_content: + dummy_file.write("dummy\n") + else: + dummy_file.write(file_content) + + return file_path + @pytest.fixture def mlutils() -> t.Type[MLUtils]: diff --git a/doc/index.rst b/doc/index.rst index d61fdb1ce..13d509257 100644 --- a/doc/index.rst +++ b/doc/index.rst @@ -48,6 +48,12 @@ sr_runtime api/smartredis_api +.. toctree:: + :maxdepth: 2 + :caption: SmartDashboard + + smartdashboard + .. toctree:: :maxdepth: 2 :caption: Reference diff --git a/doc/smartdashboard.rst b/doc/smartdashboard.rst new file mode 100644 index 000000000..532fa6db0 --- /dev/null +++ b/doc/smartdashboard.rst @@ -0,0 +1,7 @@ + +************** +SmartDashboard +************** + +.. include:: ../smartdashboard/doc/overview.rst + :start-line: 4 \ No newline at end of file diff --git a/docker/docs/dev/Dockerfile b/docker/docs/dev/Dockerfile index a27ae03c1..57fee67c9 100644 --- a/docker/docs/dev/Dockerfile +++ b/docker/docs/dev/Dockerfile @@ -52,6 +52,12 @@ RUN git clone https://github.com/CrayLabs/SmartRedis.git --branch develop --dept && python -m pip install . \ && rm -rf ~/.cache/pip +# Install smartdashboard +RUN git clone https://github.com/CrayLabs/SmartDashboard.git --branch develop --depth=1 smartdashboard \ + && cd smartdashboard \ + && python -m pip install . \ + && rm -rf ~/.cache/pip + RUN cd doc/tutorials/ && \ ln -s ../../tutorials/* . diff --git a/pyproject.toml b/pyproject.toml index 24c12d8b6..cd517abb5 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -107,6 +107,7 @@ module = [ "keras", "torch", "smartsim.ml.torch.*", # must solve/ignore inheritance issues + "watchdog", ] ignore_missing_imports = true ignore_errors = true diff --git a/setup.py b/setup.py index 66cc7f879..d38918f68 100644 --- a/setup.py +++ b/setup.py @@ -167,6 +167,7 @@ def has_ext_modules(_placeholder): "tqdm>=4.50.2", "filelock>=3.4.2", "protobuf~=3.20", + "watchdog>=3.0.0", ] # Add SmartRedis at specific version diff --git a/smartsim/_core/_cli/__main__.py b/smartsim/_core/_cli/__main__.py index 68d22d14f..399ca3b03 100644 --- a/smartsim/_core/_cli/__main__.py +++ b/smartsim/_core/_cli/__main__.py @@ -24,14 +24,31 @@ # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+import os import sys from smartsim._core._cli.cli import default_cli +from smartsim._core._cli.utils import SMART_LOGGER_FORMAT +from smartsim.error.errors import SmartSimCLIActionCancelled +from smartsim.log import get_logger + + +logger = get_logger("Smart", fmt=SMART_LOGGER_FORMAT) def main() -> int: smart_cli = default_cli() - return smart_cli.execute(sys.argv) + exception_trace_back_msg = "SmartSim exited with the following exception info:" + + try: + return smart_cli.execute(sys.argv) + except SmartSimCLIActionCancelled as ssi: + logger.info(str(ssi)) + logger.debug(exception_trace_back_msg, exc_info=ssi) + except KeyboardInterrupt as e: + logger.info("SmartSim was terminated by user") + logger.debug(exception_trace_back_msg, exc_info=e) + return os.EX_OK if __name__ == "__main__": diff --git a/smartsim/_core/_cli/build.py b/smartsim/_core/_cli/build.py index b2df26412..e3ba444ad 100644 --- a/smartsim/_core/_cli/build.py +++ b/smartsim/_core/_cli/build.py @@ -356,7 +356,9 @@ def _format_incompatible_python_env_message( ) -def execute(args: argparse.Namespace) -> int: +def execute( + args: argparse.Namespace, _unparsed_args: t.Optional[t.List[str]] = None, / +) -> int: verbose = args.v keydb = args.keydb device: _TDeviceStr = args.device @@ -416,7 +418,7 @@ def execute(args: argparse.Namespace) -> int: ) except (SetupError, BuildError) as e: logger.error(str(e)) - return 1 + return os.EX_SOFTWARE backends = installed_redisai_backends() backends_str = ", ".join(s.capitalize() for s in backends) if backends else "No" @@ -431,10 +433,10 @@ def execute(args: argparse.Namespace) -> int: check_py_onnx_version(versions) except (SetupError, BuildError) as e: logger.error(str(e)) - return 1 + return os.EX_SOFTWARE logger.info("SmartSim build complete!") - return 0 + return os.EX_OK def configure_parser(parser: argparse.ArgumentParser) -> None: diff --git a/smartsim/_core/_cli/clean.py b/smartsim/_core/_cli/clean.py index fcf051f0c..d8a85f8a9 100644 --- a/smartsim/_core/_cli/clean.py +++ b/smartsim/_core/_cli/clean.py @@ -25,6 +25,7 @@ # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. import argparse +import typing as t from smartsim._core._cli.utils import clean, get_install_path @@ -39,10 +40,14 @@ def configure_parser(parser: argparse.ArgumentParser) -> None: ) -def execute(args: argparse.Namespace) -> int: +def execute( + args: argparse.Namespace, _unparsed_args: t.Optional[t.List[str]] = None, / +) -> int: return clean(get_install_path() / "_core", _all=args.clobber) -def execute_all(args: argparse.Namespace) -> int: +def execute_all( + args: argparse.Namespace, _unparsed_args: t.Optional[t.List[str]] = None, / +) -> int: args.clobber = True return execute(args) diff --git a/smartsim/_core/_cli/cli.py b/smartsim/_core/_cli/cli.py index ce2376c15..3d50765fb 100644 --- a/smartsim/_core/_cli/cli.py +++ b/smartsim/_core/_cli/cli.py @@ -27,6 +27,7 @@ # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
import argparse +import os import typing as t from smartsim._core._cli.build import configure_parser as build_parser @@ -41,46 +42,68 @@ execute as validate_execute, configure_parser as validate_parser, ) +from smartsim._core._cli.plugin import plugins from smartsim._core._cli.utils import MenuItemConfig class SmartCli: def __init__(self, menu: t.List[MenuItemConfig]) -> None: - self.menu: t.Dict[str, MenuItemConfig] = {item.command: item for item in menu} - parser = argparse.ArgumentParser( + self.menu: t.Dict[str, MenuItemConfig] = {} + self.parser = argparse.ArgumentParser( prog="smart", description="SmartSim command line interface", ) - self.parser = parser - self.args: t.Optional[argparse.Namespace] = None - subparsers = parser.add_subparsers( + self.subparsers = self.parser.add_subparsers( dest="command", required=True, metavar="", help="Available commands", ) - for cmd, item in self.menu.items(): - parser = subparsers.add_parser( - cmd, description=item.description, help=item.description - ) - if item.configurator: - item.configurator(parser) + self.register_menu_items(menu) + self.register_menu_items([plugin() for plugin in plugins]) def execute(self, cli_args: t.List[str]) -> int: if len(cli_args) < 2: self.parser.print_help() - return 0 + return os.EX_USAGE - app_args = cli_args[1:] - self.args = self.parser.parse_args(app_args) + app_args = cli_args[1:] # exclude the path to executable + subcommand = cli_args[1] # first positional arg is the subcommand - if not (menu_item := self.menu.get(app_args[0], None)): + menu_item = self.menu.get(subcommand, None) + if not menu_item: self.parser.print_help() - return 0 + return os.EX_USAGE + + args = argparse.Namespace() + unparsed_args = [] + + if menu_item.is_plugin: + unparsed_args = app_args[1:] + else: + args = self.parser.parse_args(app_args) + + return menu_item.handler(args, unparsed_args) + + def _register_menu_item(self, item: MenuItemConfig) -> None: + parser = self.subparsers.add_parser( + item.command, description=item.description, help=item.description + ) + if item.configurator: + item.configurator(parser) + + if item.command in self.menu: + raise ValueError( + f"{item.command} cannot overwrite existing CLI command" + ) + + self.menu[item.command] = item - return menu_item.handler(self.args) + def register_menu_items(self, menu_items: t.List[MenuItemConfig]) -> None: + for item in menu_items: + self._register_menu_item(item) def default_cli() -> SmartCli: diff --git a/smartsim/_core/_cli/dbcli.py b/smartsim/_core/_cli/dbcli.py index 22a376588..ce0975bc4 100644 --- a/smartsim/_core/_cli/dbcli.py +++ b/smartsim/_core/_cli/dbcli.py @@ -25,13 +25,17 @@ # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
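With the SmartCli refactor above, a conforming menu item can be sketched as follows (command name and handler body illustrative):

import argparse
import os
import typing as t

from smartsim._core._cli.utils import MenuItemConfig


def hello_execute(
    _args: argparse.Namespace, _unparsed_args: t.Optional[t.List[str]] = None, /
) -> int:
    print("hello from a custom smart subcommand")
    return os.EX_OK


hello_item = MenuItemConfig("hello", "Example smart subcommand", hello_execute)

Passing a list of such items to SmartCli (or register_menu_items) makes the command dispatchable; plugins differ only in setting is_plugin=True and receiving the unparsed argument list.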
import argparse +import os +import typing as t from smartsim._core._cli.utils import get_db_path -def execute(_args: argparse.Namespace) -> int: +def execute( + _args: argparse.Namespace, _unparsed_args: t.Optional[t.List[str]] = None, / +) -> int: if db_path := get_db_path(): print(db_path) - return 0 + return os.EX_OK print("Database (Redis or KeyDB) dependencies not found") - return 1 + return os.EX_SOFTWARE diff --git a/smartsim/_core/_cli/info.py b/smartsim/_core/_cli/info.py index 35ee9b9ec..c08fcb1a3 100644 --- a/smartsim/_core/_cli/info.py +++ b/smartsim/_core/_cli/info.py @@ -1,5 +1,6 @@ import argparse import importlib.metadata +import os import pathlib import typing as t @@ -12,7 +13,9 @@ _MISSING_DEP = _helpers.colorize("Not Installed", "red") -def execute(_args: argparse.Namespace, /) -> int: +def execute( + _args: argparse.Namespace, _unparsed_args: t.Optional[t.List[str]] = None, / +) -> int: print("\nSmart Python Packages:") print( tabulate( @@ -66,7 +69,7 @@ def execute(_args: argparse.Namespace, /) -> int: ), end="\n\n", ) - return 0 + return os.EX_OK def _fmt_installed_db(db_path: t.Optional[pathlib.Path]) -> str: diff --git a/smartsim/_core/_cli/plugin.py b/smartsim/_core/_cli/plugin.py new file mode 100644 index 000000000..b263fe8b2 --- /dev/null +++ b/smartsim/_core/_cli/plugin.py @@ -0,0 +1,55 @@ +import argparse +import importlib.util +import os +import sys +import subprocess as sp +import typing as t + +import smartsim.log +from smartsim._core._cli.utils import MenuItemConfig, SMART_LOGGER_FORMAT +from smartsim.error.errors import SmartSimCLIActionCancelled + +_LOGGER = smartsim.log.get_logger("Smart", fmt=SMART_LOGGER_FORMAT) + + +def dynamic_execute( + cmd: str, plugin_name: str +) -> t.Callable[[argparse.Namespace, t.List[str]], int]: + def process_execute( + _args: argparse.Namespace, unparsed_args: t.List[str], / + ) -> int: + try: + spec = importlib.util.find_spec(cmd) + if spec is None: + raise AttributeError + except (ModuleNotFoundError, AttributeError): + _LOGGER.error(f"{cmd} plugin not found. Please ensure it is installed") + return os.EX_CONFIG + + combined_cmd = [sys.executable, "-m", cmd] + unparsed_args + + try: + completed_proc = sp.run(combined_cmd, check=False) + except KeyboardInterrupt as ex: + msg = f"{plugin_name} terminated by user" + raise SmartSimCLIActionCancelled(msg) from ex + return completed_proc.returncode + + return process_execute + + +def dashboard() -> MenuItemConfig: + return MenuItemConfig( + "dashboard", + ( + "Start the SmartSim dashboard to monitor experiment output from a " + "graphical user interface. This requires that the SmartSim Dashboard " + "Package be installed. For more information please visit " + "https://github.com/CrayLabs/SmartDashboard" + ), + dynamic_execute("smartdashboard", "Dashboard"), + is_plugin=True, + ) + + +plugins = (dashboard,) diff --git a/smartsim/_core/_cli/site.py b/smartsim/_core/_cli/site.py index 5fe667cde..c86e0341b 100644 --- a/smartsim/_core/_cli/site.py +++ b/smartsim/_core/_cli/site.py @@ -25,10 +25,12 @@ # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
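The `dashboard` plugin above boils down to two steps: probe for the module with `importlib.util.find_spec`, then re-invoke it under the current interpreter with the argument list untouched. A reduced sketch of that forwarding pattern (the `--port` flag is hypothetical):

import importlib.util
import subprocess
import sys
import typing as t

def forward_to_module(module: str, extra_args: t.List[str]) -> int:
    # probe for the plugin first so a missing install yields a clear
    # configuration error instead of a stack trace from the subprocess
    if importlib.util.find_spec(module) is None:
        print(f"{module} plugin not found", file=sys.stderr)
        return 78  # equal to os.EX_CONFIG on Unix
    completed = subprocess.run(
        [sys.executable, "-m", module, *extra_args], check=False
    )
    return completed.returncode

# forward_to_module("smartdashboard", ["--port", "8501"])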
import argparse +import os +import typing as t from smartsim._core._cli.utils import get_install_path -def execute(_args: argparse.Namespace) -> int: +def execute(_args: argparse.Namespace, _unparsed_args: t.List[str], /) -> int: print(get_install_path()) - return 0 + return os.EX_OK diff --git a/smartsim/_core/_cli/utils.py b/smartsim/_core/_cli/utils.py index 0be1b6ac9..d7b0f410d 100644 --- a/smartsim/_core/_cli/utils.py +++ b/smartsim/_core/_cli/utils.py @@ -25,6 +25,7 @@ # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. import importlib.util +import os import shutil import subprocess as sp import sys @@ -110,7 +111,7 @@ def clean(core_path: Path, _all: bool = False) -> int: if removed: logger.info("Successfully removed SmartSim database installation") - return 0 + return os.EX_OK def get_db_path() -> t.Optional[Path]: @@ -121,7 +122,7 @@ def get_db_path() -> t.Optional[Path]: return None -_CliHandler = t.Callable[[Namespace], int] +_CliHandler = t.Callable[[Namespace, t.List[str]], int] _CliParseConfigurator = t.Callable[[ArgumentParser], None] @@ -132,8 +133,10 @@ def __init__( description: str, handler: _CliHandler, configurator: t.Optional[_CliParseConfigurator] = None, + is_plugin: bool = False ): self.command = cmd self.description = description self.handler = handler self.configurator = configurator + self.is_plugin = is_plugin diff --git a/smartsim/_core/_cli/validate.py b/smartsim/_core/_cli/validate.py index 78db15516..c796fc616 100644 --- a/smartsim/_core/_cli/validate.py +++ b/smartsim/_core/_cli/validate.py @@ -82,7 +82,9 @@ def __exit__( self._finalizer.detach() # type: ignore[attr-defined] -def execute(args: argparse.Namespace, /) -> int: +def execute( + args: argparse.Namespace, _unparsed_args: t.Optional[t.List[str]] = None, / +) -> int: """Validate the SmartSim installation works as expected given a simple experiment """ @@ -101,10 +103,10 @@ def execute(args: argparse.Namespace, /) -> int: logger.error( "SmartSim failed to run a simple experiment!\n" f"Experiment failed due to the following exception:\n{e}\n\n" - f"Output files are available at `{temp_dir}`" + f"Output files are available at `{temp_dir}`", exc_info=True ) - return 2 - return 0 + return os.EX_SOFTWARE + return os.EX_OK def configure_parser(parser: argparse.ArgumentParser) -> None: @@ -138,6 +140,7 @@ def test_install( with_onnx: bool, ) -> None: exp = Experiment("ValidationExperiment", exp_path=location, launcher="local") + exp.disable_telemetry() port = _find_free_port() if port is None else port with _make_managed_local_orc(exp, port) as client: logger.info("Verifying Tensor Transfer") diff --git a/smartsim/_core/config/config.py b/smartsim/_core/config/config.py index b1f33ec8b..7eb847cbd 100644 --- a/smartsim/_core/config/config.py +++ b/smartsim/_core/config/config.py @@ -181,7 +181,7 @@ def test_port(self) -> int: # pragma: no cover return int(os.environ.get("SMARTSIM_TEST_PORT", 6780)) @property - def test_batch_resources(self) -> t.Any: # pragma: no cover + def test_batch_resources(self) -> t.Dict[t.Any,t.Any]: # pragma: no cover resource_str = os.environ.get("SMARTSIM_TEST_BATCH_RESOURCES", "{}") resources = json.loads(resource_str) if not isinstance(resources, dict): @@ -219,6 +219,17 @@ def test_account(self) -> t.Optional[str]: # pragma: no cover # no account by default return os.environ.get("SMARTSIM_TEST_ACCOUNT", None) + @property + def telemetry_frequency(self) -> int: + return int(os.environ.get("SMARTSIM_TELEMETRY_FREQUENCY", 5)) + + @property + def 
telemetry_enabled(self) -> bool: + return int(os.environ.get("SMARTSIM_FLAG_TELEMETRY", "0")) > 0 + + @property + def telemetry_cooldown(self) -> int: + return int(os.environ.get("SMARTSIM_TELEMETRY_COOLDOWN", 90)) @lru_cache(maxsize=128, typed=False) def get_config() -> Config: diff --git a/smartsim/_core/control/controller.py b/smartsim/_core/control/controller.py index 66bb86044..14a38cfa7 100644 --- a/smartsim/_core/control/controller.py +++ b/smartsim/_core/control/controller.py @@ -28,8 +28,11 @@ import itertools import os.path as osp +import pathlib import pickle import signal +import subprocess +import sys import threading import time import typing as t @@ -48,7 +51,13 @@ shutdown_db_node, ) from ...database import Orchestrator -from ...entity import Ensemble, EntityList, EntitySequence, Model, SmartSimEntity +from ...entity import ( + Ensemble, + EntityList, + EntitySequence, + Model, + SmartSimEntity, +) from ...error import ( LauncherError, SmartSimError, @@ -58,7 +67,6 @@ ) from ...log import get_logger from ...servertype import CLUSTERED, STANDALONE -from ...settings.base import BatchSettings from ...status import STATUS_CANCELLED, STATUS_RUNNING, TERMINAL_STATUSES from ..config import CONFIG from ..launcher import ( @@ -69,10 +77,14 @@ SlurmLauncher, ) from ..launcher.launcher import Launcher -from ..utils import check_cluster_status, create_cluster +from ..utils import check_cluster_status, create_cluster, serialize from .job import Job from .jobmanager import JobManager -from .manifest import Manifest +from .manifest import LaunchedManifest, LaunchedManifestBuilder, Manifest + +if t.TYPE_CHECKING: + from ..utils.serialize import TStepLaunchMetaData + logger = get_logger(__name__) @@ -94,9 +106,15 @@ def __init__(self, launcher: str = "local") -> None: """ self._jobs = JobManager(JM_LOCK) self.init_launcher(launcher) + self._telemetry_monitor: t.Optional[subprocess.Popen[bytes]] = None def start( - self, manifest: Manifest, block: bool = True, kill_on_interrupt: bool = True + self, + exp_name: str, + exp_path: str, + manifest: Manifest, + block: bool = True, + kill_on_interrupt: bool = True, ) -> None: """Start the passed SmartSim entities @@ -109,12 +127,20 @@ def start( self._jobs.kill_on_interrupt = kill_on_interrupt # register custom signal handler for ^C (SIGINT) signal.signal(signal.SIGINT, self._jobs.signal_interrupt) - self._launch(manifest) + launched = self._launch(exp_name, exp_path, manifest) # start the job manager thread if not already started if not self._jobs.actively_monitoring: self._jobs.start() + serialize.save_launch_manifest( + launched.map(_look_up_launched_data(self._launcher)) + ) + + # launch a telemetry monitor to track job progress + if CONFIG.telemetry_enabled: + self._start_telemetry_monitor(exp_path) + # block until all non-database jobs are complete if block: # poll handles its own keyboard interrupt as @@ -327,16 +353,25 @@ def init_launcher(self, launcher: str) -> None: else: raise TypeError("Must provide a 'launcher' argument") - def _launch(self, manifest: Manifest) -> None: + def _launch( + self, exp_name: str, exp_path: str, manifest: Manifest + ) -> LaunchedManifest[t.Tuple[str, Step]]: """Main launching function of the controller Orchestrators are always launched first so that the address of the database can be given to following entities + :param exp_name: The name of the launching experiment + :type exp_name: str + :param exp_path: path to location of ``Experiment`` directory if generated + :type exp_path: str :param manifest: 
Manifest of deployables to launch :type manifest: Manifest """ + manifest_builder = LaunchedManifestBuilder[t.Tuple[str, Step]]( + exp_name=exp_name, exp_path=exp_path, launcher_name=str(self._launcher) + ) # Loop over deployables to launch and launch multiple orchestrators for orchestrator in manifest.dbs: for key in self._jobs.get_db_host_addresses(): @@ -354,7 +389,7 @@ def _launch(self, manifest: Manifest) -> None: raise SmartSimError( "Local launcher does not support multi-host orchestrators" ) - self._launch_orchestrator(orchestrator) + self._launch_orchestrator(orchestrator, manifest_builder) if self.orchestrator_active: self._set_dbobjects(manifest) @@ -363,33 +398,51 @@ def _launch(self, manifest: Manifest) -> None: steps: t.List[ t.Tuple[Step, t.Union[SmartSimEntity, EntitySequence[SmartSimEntity]]] ] = [] - all_entity_lists = manifest.ensembles - for elist in all_entity_lists: + for elist in manifest.ensembles: + ens_telem_dir = manifest_builder.run_telemetry_subdirectory / "ensemble" if elist.batch: - batch_step = self._create_batch_job_step(elist) + batch_step, substeps = self._create_batch_job_step(elist, ens_telem_dir) + manifest_builder.add_ensemble( + elist, [(batch_step.name, step) for step in substeps] + ) steps.append((batch_step, elist)) else: - job_steps = [(self._create_job_step(e), e) for e in elist.entities] + # if ensemble is to be run as separate job steps, aka not in a batch + job_steps = [ + (self._create_job_step(e, ens_telem_dir / elist.name), e) + for e in elist.entities + ] + manifest_builder.add_ensemble( + elist, [(step.name, step) for step, _ in job_steps] + ) steps.extend(job_steps) # models themselves cannot be batch steps. If batch settings are # attached, wrap them in an anonymous batch job step for model in manifest.models: + model_telem_dir = manifest_builder.run_telemetry_subdirectory / "model" if model.batch_settings: - anon_entity_list = _AnonymousBatchJob( - model.name, model.path, model.batch_settings + anon_entity_list = _AnonymousBatchJob(model) + batch_step, _ = self._create_batch_job_step( + anon_entity_list, model_telem_dir ) - anon_entity_list.entities.append(model) - batch_step = self._create_batch_job_step(anon_entity_list) + manifest_builder.add_model(model, (batch_step.name, batch_step)) steps.append((batch_step, model)) else: - job_step = self._create_job_step(model) + job_step = self._create_job_step(model, model_telem_dir) + manifest_builder.add_model(model, (job_step.name, job_step)) steps.append((job_step, model)) # launch steps for step, entity in steps: self._launch_step(step, entity) - def _launch_orchestrator(self, orchestrator: Orchestrator) -> None: + return manifest_builder.finalize() + + def _launch_orchestrator( + self, + orchestrator: Orchestrator, + manifest_builder: LaunchedManifestBuilder[t.Tuple[str, Step]], + ) -> None: """Launch an Orchestrator instance This function will launch the Orchestrator instance and @@ -398,16 +451,32 @@ def _launch_orchestrator(self, orchestrator: Orchestrator) -> None: :param orchestrator: orchestrator to launch :type orchestrator: Orchestrator + :param manifest_builder: A `LaunchedManifestBuilder` to record the + names and `Step`s of the launched orchestrator + :type manifest_builder: LaunchedManifestBuilder[tuple[str, Step]] """ orchestrator.remove_stale_files() + orc_telem_dir = manifest_builder.run_telemetry_subdirectory / "database" + # if the orchestrator was launched as a batch workload if orchestrator.batch: - orc_batch_step = self._create_batch_job_step(orchestrator) +
orc_batch_step, substeps = self._create_batch_job_step( + orchestrator, orc_telem_dir + ) + manifest_builder.add_database( + orchestrator, [(orc_batch_step.name, step) for step in substeps] + ) self._launch_step(orc_batch_step, orchestrator) # if orchestrator was run on existing allocation, locally, or in allocation else: - db_steps = [(self._create_job_step(db), db) for db in orchestrator.entities] + db_steps = [ + (self._create_job_step(db, orc_telem_dir / orchestrator.name), db) + for db in orchestrator.entities + ] + manifest_builder.add_database( + orchestrator, [(step.name, step) for step, _ in db_steps] + ) for db_step in db_steps: self._launch_step(*db_step) @@ -477,35 +546,52 @@ def _launch_step( self._jobs.add_job(job_step.name, job_id, entity, is_task) def _create_batch_job_step( - self, entity_list: t.Union[Orchestrator, Ensemble, _AnonymousBatchJob] - ) -> Step: + self, + entity_list: t.Union[Orchestrator, Ensemble, _AnonymousBatchJob], + telemetry_dir: pathlib.Path, + ) -> t.Tuple[Step, t.List[Step]]: """Use launcher to create batch job step :param entity_list: EntityList to launch as batch :type entity_list: EntityList + :param telemetry_dir: Path to a directory in which the batch job step + may write telemetry events + :type telemetry_dir: pathlib.Path + :return: batch job step instance and a list of run steps to be + executed within the batch job + :rtype: tuple[Step, list[Step]] """ if not entity_list.batch_settings: raise ValueError( "EntityList must have batch settings to be launched as batch" ) + telemetry_dir = telemetry_dir / entity_list.name batch_step = self._launcher.create_step( entity_list.name, entity_list.path, entity_list.batch_settings ) + batch_step.meta["entity_type"] = str(type(entity_list).__name__).lower() + batch_step.meta["status_dir"] = str(telemetry_dir) + + substeps = [] for entity in entity_list.entities: # tells step creation not to look for an allocation entity.run_settings.in_batch = True - step = self._create_job_step(entity) + step = self._create_job_step(entity, telemetry_dir) + substeps.append(step) batch_step.add_to_batch(step) - return batch_step + return batch_step, substeps - def _create_job_step(self, entity: SmartSimEntity) -> Step: + def _create_job_step( + self, entity: SmartSimEntity, telemetry_dir: pathlib.Path + ) -> Step: """Create job steps for all entities with the launcher :param entity: an entity to create a step for :type entity: SmartSimEntity + :param telemetry_dir: Path to a directory in which the job step + may write telemetry events + :type telemetry_dir: pathlib.Path :return: the job step :rtype: Step """ @@ -514,6 +600,10 @@ def _create_job_step(self, entity: SmartSimEntity) -> Step: self._prep_entity_client_env(entity) step = self._launcher.create_step(entity.name, entity.path, entity.run_settings) + + step.meta["entity_type"] = str(type(entity).__name__).lower() + step.meta["status_dir"] = str(telemetry_dir / entity.name) + return step def _prep_entity_client_env(self, entity: Model) -> None: @@ -751,13 +841,74 @@ def _set_dbobjects(self, manifest: Manifest) -> None: if db_script not in ensemble.db_scripts: set_script(db_script, client) + def _start_telemetry_monitor(self, exp_dir: str) -> None: + """Spawns a telemetry monitor process to keep track of the lifetimes + of the processes launched through this controller.
+ + :param exp_dir: An experiment directory + :type exp_dir: str + """ + logger.debug("Starting telemetry monitor process") + if ( + self._telemetry_monitor is None + or self._telemetry_monitor.returncode is not None + ): + cmd = [ + sys.executable, + "-m", + "smartsim._core.entrypoints.telemetrymonitor", + "-exp_dir", + exp_dir, + "-frequency", + str(CONFIG.telemetry_frequency), + "-cooldown", + str(CONFIG.telemetry_cooldown), + ] + # pylint: disable-next=consider-using-with + self._telemetry_monitor = subprocess.Popen( + cmd, + stderr=sys.stderr, + stdout=sys.stdout, + cwd=str(pathlib.Path(__file__).parent.parent.parent), + shell=False, + ) + class _AnonymousBatchJob(EntityList[Model]): - def __init__( - self, name: str, path: str, batch_settings: BatchSettings, **kwargs: t.Any - ) -> None: - super().__init__(name, path) - self.batch_settings = batch_settings + @staticmethod + def _validate(model: Model) -> None: + if model.batch_settings is None: + msg = "Unable to create _AnonymousBatchJob without batch_settings" + raise SmartSimError(msg) + + def __init__(self, model: Model) -> None: + self._validate(model) + super().__init__(model.name, model.path) + self.entities = [model] + self.batch_settings = model.batch_settings def _initialize_entities(self, **kwargs: t.Any) -> None: ... + + +def _look_up_launched_data( + launcher: Launcher, +) -> t.Callable[[t.Tuple[str, Step]], "TStepLaunchMetaData"]: + def _unpack_launched_data(data: t.Tuple[str, Step]) -> "TStepLaunchMetaData": + # NOTE: we cannot assume that the name of the launched step + # ``launched_step_name`` is equal to the name of the step referring to + # the entity ``step.name`` as is the case when an entity list is + # launched as a batch job + launched_step_name, step = data + launched_step_map = launcher.step_mapping[launched_step_name] + out_file, err_file = step.get_output_files() + return ( + launched_step_map.step_id, + launched_step_map.task_id, + launched_step_map.managed, + out_file, + err_file, + pathlib.Path(step.meta.get("status_dir", step.cwd)), + ) + + return _unpack_launched_data diff --git a/smartsim/_core/control/job.py b/smartsim/_core/control/job.py index 2842c3c14..3a54c0d00 100644 --- a/smartsim/_core/control/job.py +++ b/smartsim/_core/control/job.py @@ -27,10 +27,44 @@ import time import typing as t +from dataclasses import dataclass from ...entity import SmartSimEntity, EntitySequence from ...status import STATUS_NEW +@dataclass(frozen=True) +class _JobKey(): + step_id: str + task_id: str + + +class JobEntity: + """API required for a job processed in the JobManager with support for + telemetry monitoring + """ + + def __init__(self) -> None: + self.name: str = "" + self.path: str = "" + self.step_id: str = "" + self.task_id: str = "" + self.type: str = "" + self.timestamp: int = 0 + self.status_dir: str = "" + + @property + def is_db(self) -> bool: + return self.type in ["orchestrator", "dbnode"] + + @property + def is_managed(self) -> bool: + return bool(self.step_id) + + @property + def key(self) -> _JobKey: + return _JobKey(self.step_id, self.task_id) + + class Job: """Keep track of various information for the controller. 
In doing so, continuously add various fields of information @@ -42,7 +76,7 @@ def __init__( self, job_name: str, job_id: t.Optional[str], - entity: t.Union[SmartSimEntity, EntitySequence[SmartSimEntity]], + entity: t.Union[SmartSimEntity, EntitySequence[SmartSimEntity], JobEntity], launcher: str, is_task: bool, ) -> None: @@ -53,7 +87,7 @@ def __init__( :param job_id: The id associated with the job :type job_id: str :param entity: The SmartSim entity(list) associated with the job - :type entity: SmartSimEntity | EntitySequence + :type entity: SmartSimEntity | EntitySequence | JobEntity :param launcher: Launcher job was started with :type launcher: str :param is_task: process monitored by TaskManager (True) or the WLM (False) diff --git a/smartsim/_core/control/jobmanager.py b/smartsim/_core/control/jobmanager.py index edb807064..2d3995943 100644 --- a/smartsim/_core/control/jobmanager.py +++ b/smartsim/_core/control/jobmanager.py @@ -24,9 +24,11 @@ # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + import itertools import time import typing as t +from collections import ChainMap from threading import Thread, RLock from types import FrameType @@ -37,7 +39,8 @@ from ..config import CONFIG from ..launcher import LocalLauncher, Launcher from ..utils.network import get_ip_from_host -from .job import Job +from .job import Job, JobEntity + logger = get_logger(__name__) @@ -144,13 +147,8 @@ def __getitem__(self, entity_name: str) -> Job: :rtype: Job """ with self._lock: - if entity_name in self.db_jobs: - return self.db_jobs[entity_name] - if entity_name in self.jobs: - return self.jobs[entity_name] - if entity_name in self.completed: - return self.completed[entity_name] - raise KeyError + entities = ChainMap(self.db_jobs, self.jobs, self.completed) + return entities[entity_name] def __call__(self) -> t.Dict[str, Job]: """Returns a dictionary of all jobs for the () operator @@ -172,7 +170,7 @@ def add_job( self, job_name: str, job_id: t.Optional[str], - entity: t.Union[SmartSimEntity, EntitySequence[SmartSimEntity]], + entity: t.Union[SmartSimEntity, EntitySequence[SmartSimEntity], JobEntity], is_task: bool = True, ) -> None: """Add a job to the job manager which holds specific jobs by type. @@ -191,7 +189,8 @@ def add_job( job = Job(job_name, job_id, entity, launcher, is_task) if isinstance(entity, (DBNode, Orchestrator)): self.db_jobs[entity.name] = job - + elif isinstance(entity, JobEntity) and entity.is_db: + self.db_jobs[entity.name] = job else: self.jobs[entity.name] = job @@ -313,7 +312,8 @@ def get_db_host_addresses(self) -> t.Dict[str, t.List[str]]: for corresponding database identifiers :return: dictionary of host ip addresses - :rtype: Dict[str, list]""" + :rtype: Dict[str, list] + """ address_dict: t.Dict[str, t.List[str]] = {} for db_job in self.db_jobs.values(): diff --git a/smartsim/_core/control/manifest.py b/smartsim/_core/control/manifest.py index 65aa8a898..ec1d79165 100644 --- a/smartsim/_core/control/manifest.py +++ b/smartsim/_core/control/manifest.py @@ -24,12 +24,22 @@ # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
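The `ChainMap` introduced in `__getitem__` above replaces three hand-written lookups: the first mapping that contains the key wins, and a miss raises `KeyError` exactly as the old code did. For example:

from collections import ChainMap

db_jobs = {"orchestrator_0": "db job"}
jobs = {"colocated_model": "active job"}
completed = {"colocated_model": "stale finished job"}

# searched left to right, so an active job shadows a completed one
entities = ChainMap(db_jobs, jobs, completed)
print(entities["colocated_model"])  # -> "active job"

try:
    entities["unknown"]
except KeyError:
    print("unknown entity, matching the old explicit raise")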
+import pathlib import typing as t +from dataclasses import dataclass, field from ...database import Orchestrator -from ...entity import EntitySequence, SmartSimEntity, Model, Ensemble +from ...entity import DBNode, Ensemble, EntitySequence, Model, SmartSimEntity from ...error import SmartSimError -from ..utils.helpers import fmt_dict +from ..utils import helpers as _helpers +from ..utils import serialize as _serialize + +_T = t.TypeVar("_T") +_U = t.TypeVar("_U") +_AtomicLaunchableT = t.TypeVar("_AtomicLaunchableT", Model, DBNode) + +if t.TYPE_CHECKING: + import os class Manifest: @@ -92,7 +102,6 @@ def all_entity_lists(self) -> t.List[EntitySequence[SmartSimEntity]]: """ _all_entity_lists: t.List[EntitySequence[SmartSimEntity]] = list(self.ensembles) - for db in self.dbs: _all_entity_lists.append(db) @@ -150,7 +159,7 @@ def __str__(self) -> str: output += f"{model.batch_settings}\n" output += f"{model.run_settings}\n" if model.params: - output += f"Parameters: \n{fmt_dict(model.params)}\n" + output += f"Parameters: \n{_helpers.fmt_dict(model.params)}\n" output += "\n" for adb in self.dbs: @@ -214,3 +223,131 @@ def has_db_scripts( # `has_db_objects` should be False here return has_db_objects + + + +class _LaunchedManifestMetadata(t.NamedTuple): + run_id: str + exp_name: str + exp_path: str + launcher_name: str + + @property + def exp_telemetry_subdirectory(self) -> pathlib.Path: + return _format_exp_telemetry_path(self.exp_path) + + @property + def run_telemetry_subdirectory(self) -> pathlib.Path: + return _format_run_telemetry_path(self.exp_path, self.exp_name, self.run_id) + + @property + def manifest_file_path(self) -> pathlib.Path: + return self.exp_telemetry_subdirectory / _serialize.MANIFEST_FILENAME + + +@dataclass(frozen=True) +class LaunchedManifest(t.Generic[_T]): + """Immutable manifest mapping launched entities or collections of launched + entities to other pieces of external data. This is commonly used to map a + launch-able entity to its constructed ``Step`` instance without assuming + that ``step.name == job.name`` or querying the ``JobManager`` which itself + can be ephemeral. + """ + + metadata: _LaunchedManifestMetadata + models: t.Tuple[t.Tuple[Model, _T], ...] + ensembles: t.Tuple[t.Tuple[Ensemble, t.Tuple[t.Tuple[Model, _T], ...]], ...] + databases: t.Tuple[t.Tuple[Orchestrator, t.Tuple[t.Tuple[DBNode, _T], ...]], ...] + + def map(self, func: t.Callable[[_T], _U]) -> "LaunchedManifest[_U]": + def _map_entity_data( + fn: t.Callable[[_T], _U], + entity_list: t.Sequence[t.Tuple[_AtomicLaunchableT, _T]], + ) -> t.Tuple[t.Tuple[_AtomicLaunchableT, _U], ...]: + return tuple((entity, fn(data)) for entity, data in entity_list) + + return LaunchedManifest( + metadata=self.metadata, + models=_map_entity_data(func, self.models), + ensembles=tuple( + (ens, _map_entity_data(func, model_data)) + for ens, model_data in self.ensembles + ), + databases=tuple( + (db_, _map_entity_data(func, node_data)) + for db_, node_data in self.databases + ), + ) + + +@dataclass(frozen=True) +class LaunchedManifestBuilder(t.Generic[_T]): + """A class comprised of mutable collections of SmartSim entities that is + used to build a ``LaunchedManifest`` while going through the launching + process. 
+ """ + + exp_name: str + exp_path: str + launcher_name: str + run_id: str = field(default_factory=_helpers.create_short_id_str) + + _models: t.List[t.Tuple[Model, _T]] = field(default_factory=list, init=False) + _ensembles: t.List[t.Tuple[Ensemble, t.Tuple[t.Tuple[Model, _T], ...]]] = field( + default_factory=list, init=False + ) + _databases: t.List[ + t.Tuple[Orchestrator, t.Tuple[t.Tuple[DBNode, _T], ...]] + ] = field(default_factory=list, init=False) + + @property + def exp_telemetry_subdirectory(self) -> pathlib.Path: + return _format_exp_telemetry_path(self.exp_path) + + @property + def run_telemetry_subdirectory(self) -> pathlib.Path: + return _format_run_telemetry_path(self.exp_path, self.exp_name, self.run_id) + + def add_model(self, model: Model, data: _T) -> None: + self._models.append((model, data)) + + def add_ensemble(self, ens: Ensemble, data: t.Sequence[_T]) -> None: + self._ensembles.append((ens, self._entities_to_data(ens.entities, data))) + + def add_database(self, db_: Orchestrator, data: t.Sequence[_T]) -> None: + self._databases.append((db_, self._entities_to_data(db_.entities, data))) + + @staticmethod + def _entities_to_data( + entities: t.Sequence[_AtomicLaunchableT], data: t.Sequence[_T] + ) -> t.Tuple[t.Tuple[_AtomicLaunchableT, _T], ...]: + if not entities: + raise ValueError("Cannot map data to an empty entity sequence") + if len(entities) != len(data): + raise ValueError( + f"Cannot map data sequence of length {len(data)} to entity " + f"sequence of length {len(entities)}" + ) + return tuple(zip(entities, data)) + + def finalize(self) -> LaunchedManifest[_T]: + return LaunchedManifest( + metadata=_LaunchedManifestMetadata( + self.run_id, self.exp_name, self.exp_path, self.launcher_name + ), + models=tuple(self._models), + ensembles=tuple(self._ensembles), + databases=tuple(self._databases), + ) + + +def _format_exp_telemetry_path( + exp_path: t.Union[str, "os.PathLike[str]"] +) -> pathlib.Path: + return pathlib.Path(exp_path, _serialize.TELMON_SUBDIR) + + +def _format_run_telemetry_path( + exp_path: t.Union[str, "os.PathLike[str]"], exp_name: str, run_id: str +) -> pathlib.Path: + return _format_exp_telemetry_path(exp_path) / f"{exp_name}/{run_id}" diff --git a/smartsim/_core/entrypoints/indirect.py b/smartsim/_core/entrypoints/indirect.py new file mode 100644 index 000000000..18d27601f --- /dev/null +++ b/smartsim/_core/entrypoints/indirect.py @@ -0,0 +1,242 @@ +# BSD 2-Clause License +# +# Copyright (c) 2021-2023 Hewlett Packard Enterprise +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import argparse +import logging +import os +import pathlib +import signal +import sys +import typing as t +from types import FrameType + +import coloredlogs +import psutil + +import smartsim.log +from smartsim._core.entrypoints.telemetrymonitor import track_event +from smartsim._core.utils.helpers import decode_cmd, get_ts + +STEP_PID: t.Optional[int] = None +logger = smartsim.log.get_logger(__name__) + +# kill is not catchable +SIGNALS = [signal.SIGINT, signal.SIGTERM, signal.SIGQUIT, signal.SIGABRT] + + +def main( + cmd: str, + etype: str, + cwd: str, + status_dir: str, +) -> int: + """The main function of the entrypoint. This function takes an encoded step + command and runs it in a subprocess. In the background, this entrypoint + will then monitor the subprocess and write out status events such as when + the subprocess has started or stopped and write these events to a status + directory. + """ + global STEP_PID # pylint: disable=global-statement + proxy_pid = os.getpid() + + status_path = pathlib.Path(status_dir) + if not status_path.exists(): + status_path.mkdir(parents=True, exist_ok=True) + + if not cmd.strip(): + raise ValueError("Invalid cmd supplied") + + cleaned_cmd = decode_cmd(cmd) + ret_code: int = 1 + logger.debug("Indirect step starting") + + start_detail = f"Proxy process {proxy_pid}" + start_rc: t.Optional[int] = None + + try: + process = psutil.Popen( + cleaned_cmd, + cwd=cwd, + stdout=sys.stdout, + stderr=sys.stderr, + ) + STEP_PID = process.pid + logger.info(f"Indirect proxy {proxy_pid} child process {STEP_PID} started") + start_detail += f" started child process {STEP_PID}" + + except Exception as ex: + start_detail += f" failed to start child process. {ex}" + start_rc = 1 + logger.error("Failed to create process", exc_info=True) + cleanup() + return 1 + finally: + track_event( + get_ts(), + proxy_pid, + "", # step_id for unmanaged task is always empty + etype, + "start", + status_path, + logger, + detail=start_detail, + return_code=start_rc, + ) + + logger.info(f"Waiting for child process {STEP_PID} to complete") + ret_code = process.wait() + + logger.info( + f"Indirect proxy {proxy_pid} child process {STEP_PID} complete." + f" return code: {ret_code}" + ) + msg = f"Process {STEP_PID} finished with return code: {ret_code}" + track_event( + get_ts(), + proxy_pid, + "", # step_id for unmanaged task is always empty + etype, + "stop", + status_path, + logger, + detail=msg, + return_code=ret_code, + ) + cleanup() + + return ret_code + + +def cleanup() -> None: + """Perform cleanup required for clean termination""" + logger.info("Performing cleanup") + global STEP_PID # pylint: disable=global-statement + if STEP_PID is None: + return + + try: + # attempt to stop the subprocess performing step-execution + if psutil.pid_exists(STEP_PID): + process = psutil.Process(STEP_PID) + process.terminate() + except psutil.NoSuchProcess: + # swallow exception to avoid overwriting outputs from cmd + ... 
+ + except OSError as ex: + logger.warning(f"Failed to clean up step executor gracefully: {ex}") + finally: + STEP_PID = None + + +def handle_signal(signo: int, _frame: t.Optional[FrameType]) -> None: + """Helper function to ensure clean process termination""" + logger.info(f"handling signal {signo}") + if not signo: + logger.warning("Received signal with no signo") + + cleanup() + + +def register_signal_handlers() -> None: + """Register a signal handling function for all termination events""" + for sig in SIGNALS: + signal.signal(sig, handle_signal) + + +def get_parser() -> argparse.ArgumentParser: + parser = argparse.ArgumentParser( + prefix_chars="+", description="SmartSim Step Executor" + ) + parser.add_argument( + "+name", type=str, help="Name of the step being executed", required=True + ) + parser.add_argument( + "+command", type=str, help="The command to execute", required=True + ) + parser.add_argument( + "+entity_type", + type=str, + help="The type of entity related to the step", + required=True, + ) + parser.add_argument( + "+working_dir", + type=str, + help="The working directory of the executable", + required=True, + ) + parser.add_argument( + "+telemetry_dir", + type=str, + help="Directory for telemetry output", + required=True, + ) + return parser + + +if __name__ == "__main__": + arg_parser = get_parser() + os.environ["PYTHONUNBUFFERED"] = "1" + parsed_args = arg_parser.parse_args() + + # Set up a local private logger for when this module is run as an entry point + level = logger.getEffectiveLevel() + logger = logging.getLogger(f"{__name__}.{parsed_args.name}") + logger.propagate = False + logger.setLevel(level) + + fh = logging.FileHandler(f"{parsed_args.name}.indirect.log") + coloredlogs.HostNameFilter.install(fh) + fh.setFormatter( + logging.Formatter( + smartsim.log.DEFAULT_LOG_FORMAT, + datefmt=smartsim.log.DEFAULT_DATE_FORMAT, + ) + ) + logger.addHandler(fh) + + try: + logger.debug("Starting indirect step execution") + + # make sure to register the cleanup before starting the process + # so our signal handler will be able to stop the child process. + register_signal_handlers() + + rc = main( + cmd=parsed_args.command, + etype=parsed_args.entity_type, + cwd=parsed_args.working_dir, + status_dir=parsed_args.telemetry_dir, + ) + sys.exit(rc) + + # catch any remaining failure so the proxy exits non-zero and the + # error is recorded in the step log instead of being lost + except Exception as e: + logger.exception(f"An unexpected error caused step execution to fail: {e}") + sys.exit(1) diff --git a/smartsim/_core/entrypoints/telemetrymonitor.py b/smartsim/_core/entrypoints/telemetrymonitor.py new file mode 100644 index 000000000..cb80e6918 --- /dev/null +++ b/smartsim/_core/entrypoints/telemetrymonitor.py @@ -0,0 +1,691 @@ +# BSD 2-Clause License +# +# Copyright (c) 2021-2023 Hewlett Packard Enterprise +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution.
+# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import argparse +import json +import logging +import os +import pathlib +import signal +import sys +import threading +import time +import typing as t + +from dataclasses import dataclass, field +from types import FrameType + +from watchdog.observers import Observer +from watchdog.observers.api import BaseObserver +from watchdog.events import PatternMatchingEventHandler, LoggingEventHandler +from watchdog.events import FileCreatedEvent, FileModifiedEvent + +from smartsim._core.config import CONFIG +from smartsim._core.control.job import JobEntity, _JobKey +from smartsim._core.control.jobmanager import JobManager +from smartsim._core.launcher.stepInfo import StepInfo + + +from smartsim._core.launcher.cobalt.cobaltLauncher import CobaltLauncher +from smartsim._core.launcher.launcher import Launcher +from smartsim._core.launcher.local.local import LocalLauncher +from smartsim._core.launcher.lsf.lsfLauncher import LSFLauncher +from smartsim._core.launcher.pbs.pbsLauncher import PBSLauncher +from smartsim._core.launcher.slurm.slurmLauncher import SlurmLauncher +from smartsim._core.utils.helpers import get_ts +from smartsim._core.utils.serialize import TELMON_SUBDIR, MANIFEST_FILENAME + +from smartsim.error.errors import SmartSimError +from smartsim.status import STATUS_COMPLETED, TERMINAL_STATUSES + + +"""Telemetry Monitor entrypoint""" + +# kill is not catchable +SIGNALS = [signal.SIGINT, signal.SIGQUIT, signal.SIGTERM, signal.SIGABRT] +_EventClass = t.Literal["start", "stop", "timestep"] +_MAX_MANIFEST_LOAD_ATTEMPTS: t.Final[int] = 6 + + +@dataclass +class Run: + """Model containing entities of an individual start call for an experiment""" + + timestamp: int + models: t.List[JobEntity] + orchestrators: t.List[JobEntity] + ensembles: t.List[JobEntity] + + def flatten( + self, filter_fn: t.Optional[t.Callable[[JobEntity], bool]] = None + ) -> t.List[JobEntity]: + """Flatten runs into a list of SmartSimEntity run events""" + entities = self.models + self.orchestrators + self.ensembles + if filter_fn: + entities = [entity for entity in entities if filter_fn(entity)] + return entities + + +@dataclass +class RuntimeManifest: + """The runtime manifest holds meta information about the experiment entities created + at runtime to satisfy the experiment requirements. 
+ """ + + name: str + path: pathlib.Path + launcher: str + runs: t.List[Run] = field(default_factory=list) + + +def _hydrate_persistable( + persistable_entity: t.Dict[str, t.Any], + entity_type: str, + exp_dir: str, +) -> JobEntity: + """Populate JobEntity instance with supplied metdata and instance details""" + entity = JobEntity() + + metadata = persistable_entity["telemetry_metadata"] + status_dir = pathlib.Path(metadata.get("status_dir")) + + entity.type = entity_type + entity.name = persistable_entity["name"] + entity.step_id = str(metadata.get("step_id") or "") + entity.task_id = str(metadata.get("task_id") or "") + entity.timestamp = int(persistable_entity.get("timestamp", "0")) + entity.path = str(exp_dir) + entity.status_dir = str(status_dir) + + return entity + + +def hydrate_persistable( + entity_type: str, + persistable_entity: t.Dict[str, t.Any], + exp_dir: pathlib.Path, +) -> t.List[JobEntity]: + """Map entity data persisted in a manifest file to an object""" + entities = [] + + # an entity w/parent key creates persistables for entities it contains + parent_keys = {"shards", "models"} + parent_keys = parent_keys.intersection(persistable_entity.keys()) + if parent_keys: + container = "shards" if "shards" in parent_keys else "models" + child_type = "orchestrator" if container == "shards" else "model" + for child_entity in persistable_entity[container]: + entity = _hydrate_persistable(child_entity, child_type, str(exp_dir)) + entities.append(entity) + + return entities + + entity = _hydrate_persistable(persistable_entity, entity_type, str(exp_dir)) + entities.append(entity) + return entities + + +def hydrate_persistables( + entity_type: str, + run: t.Dict[str, t.Any], + exp_dir: pathlib.Path, +) -> t.Dict[str, t.List[JobEntity]]: + """Map a collection of entity data persisted in a manifest file to an object""" + persisted: t.Dict[str, t.List[JobEntity]] = { + "model": [], + "orchestrator": [], + } + for item in run[entity_type]: + entities = hydrate_persistable(entity_type, item, exp_dir) + for new_entity in entities: + persisted[new_entity.type].append(new_entity) + + return persisted + + +def hydrate_runs( + persisted_runs: t.List[t.Dict[str, t.Any]], exp_dir: pathlib.Path +) -> t.List[Run]: + """Map run data persisted in a manifest file to an object""" + the_runs: t.List[Run] = [] + for run_instance in persisted_runs: + run_entities: t.Dict[str, t.List[JobEntity]] = { + "model": [], + "orchestrator": [], + "ensemble": [], + } + + for key in run_entities: + _entities = hydrate_persistables(key, run_instance, exp_dir) + for entity_type, new_entities in _entities.items(): + if new_entities: + run_entities[entity_type].extend(new_entities) + + run = Run( + run_instance["timestamp"], + run_entities["model"], + run_entities["orchestrator"], + run_entities["ensemble"], + ) + the_runs.append(run) + + return the_runs + + +def load_manifest(file_path: str) -> t.Optional[RuntimeManifest]: + """Load a persisted manifest and return the content""" + manifest_dict: t.Optional[t.Dict[str, t.Any]] = None + try_count = 1 + + while manifest_dict is None and try_count < _MAX_MANIFEST_LOAD_ATTEMPTS: + source = pathlib.Path(file_path) + source = source.resolve() + + try: + if text := source.read_text(encoding="utf-8").strip(): + manifest_dict = json.loads(text) + except json.JSONDecodeError as ex: + print(f"Error loading manifest: {ex}") + # hack/fix: handle issues reading file before it is fully written + time.sleep(0.5 * try_count) + finally: + try_count += 1 + + if not manifest_dict: + return 
None + + exp = manifest_dict.get("experiment", None) + if not exp: + raise ValueError("Manifest missing required experiment") + + runs = manifest_dict.get("runs", None) + if runs is None: + raise ValueError("Manifest missing required runs") + + exp_dir = pathlib.Path(exp["path"]) + runs = hydrate_runs(runs, exp_dir) + + manifest = RuntimeManifest( + name=exp["name"], + path=exp_dir, + launcher=exp["launcher"], + runs=runs, + ) + return manifest + + +def track_event( + timestamp: int, + task_id: t.Union[int, str], + step_id: str, + etype: str, + action: _EventClass, + status_dir: pathlib.Path, + logger: logging.Logger, + detail: str = "", + return_code: t.Optional[int] = None, +) -> None: + """Persist a tracking event for an entity""" + tgt_path = status_dir / f"{action}.json" + tgt_path.parent.mkdir(parents=True, exist_ok=True) + + try: + task_id = int(task_id) + except ValueError: + pass + + entity_dict = { + "timestamp": timestamp, + "job_id": task_id, + "step_id": step_id, + "type": etype, + "action": action, + } + + if detail is not None: + entity_dict["detail"] = detail + + if return_code is not None: + entity_dict["return_code"] = return_code + + try: + if not tgt_path.exists(): + # Don't overwrite existing tracking files + bytes_written = tgt_path.write_text(json.dumps(entity_dict, indent=2)) + if bytes_written < 1: + logger.warning("event tracking failed to write tracking file.") + except Exception: + logger.error("Unable to write tracking file.", exc_info=True) + + +def faux_return_code(step_info: StepInfo) -> t.Optional[int]: + """Create a faux return code for a task run by the WLM. Must not be + called with non-terminal statuses or results may be confusing + """ + if step_info.status not in TERMINAL_STATUSES: + return None + + if step_info.status == STATUS_COMPLETED: + return os.EX_OK + + return 1 + + +class ManifestEventHandler(PatternMatchingEventHandler): + """The ManifestEventHandler monitors an experiment for changes and updates + a telemetry datastore as needed. + + It contains event handlers that are triggered by changes to a runtime experiment + manifest. The runtime manifest differs from a standard manifest. A runtime manifest + may contain multiple experiment executions in a `runs` collection. + + It also contains a long-polling loop that checks experiment entities for updates + at each timestep. + """ + + def __init__( + self, + pattern: str, + logger: logging.Logger, + ignore_patterns: t.Any = None, + ignore_directories: bool = True, + case_sensitive: bool = False, + ) -> None: + super().__init__( + [pattern], ignore_patterns, ignore_directories, case_sensitive + ) # type: ignore + self._logger = logger + self._tracked_runs: t.Dict[int, Run] = {} + self._tracked_jobs: t.Dict[_JobKey, JobEntity] = {} + self._completed_jobs: t.Dict[_JobKey, JobEntity] = {} + self._launcher: t.Optional[Launcher] = None + self.job_manager: JobManager = JobManager(threading.RLock()) + self._launcher_map: t.Dict[str, t.Type[Launcher]] = { + "slurm": SlurmLauncher, + "pbs": PBSLauncher, + "cobalt": CobaltLauncher, + "lsf": LSFLauncher, + "local": LocalLauncher, + } + + def init_launcher(self, launcher: str) -> Launcher: + """Initialize the controller with a specific type of launcher. + SmartSim currently supports slurm, pbs(pro), cobalt, lsf, + and local launching + + :param launcher: which launcher to initialize + :type launcher: str + :raises SSUnsupportedError: if a string is passed that is not + a supported launcher + :raises TypeError: if no launcher argument is provided. 
+ """ + if not launcher: + raise TypeError("Must provide a 'launcher' argument") + + if launcher_type := self._launcher_map.get(launcher.lower(), None): + return launcher_type() + + raise ValueError("Launcher type not supported: " + launcher) + + def set_launcher(self, launcher_type: str) -> None: + """Set the launcher for the experiment""" + self._launcher = self.init_launcher(launcher_type) + self.job_manager.set_launcher(self._launcher) + self.job_manager.start() + + def process_manifest(self, manifest_path: str) -> None: + """Read the runtime manifest for the experiment and track new entities + + :param manifest_path: The full path to the manifest file + :type manifest_path: str + """ + try: + manifest = load_manifest(manifest_path) + if not manifest: + return + except json.JSONDecodeError: + self._logger.error(f"Malformed manifest encountered: {manifest_path}") + return + except ValueError: + self._logger.error("Manifest content error", exc_info=True) + return + + if self._launcher is None: + self.set_launcher(manifest.launcher) + + if not self._launcher: + raise SmartSimError(f"Unable to set launcher from {manifest_path}") + + runs = [run for run in manifest.runs if run.timestamp not in self._tracked_runs] + + exp_dir = pathlib.Path(manifest_path).parent.parent.parent + + for run in runs: + for entity in run.flatten( + filter_fn=lambda e: e.key not in self._tracked_jobs and e.is_managed + ): + entity.path = str(exp_dir) + + self._tracked_jobs[entity.key] = entity + track_event( + run.timestamp, + entity.task_id, + entity.step_id, + entity.type, + "start", + pathlib.Path(entity.status_dir), + self._logger, + ) + + if entity.is_managed: + self.job_manager.add_job( + entity.name, + entity.task_id, + entity, + False, + ) + self._launcher.step_mapping.add( + entity.name, entity.step_id, entity.task_id, True + ) + self._tracked_runs[run.timestamp] = run + + def on_modified(self, event: FileModifiedEvent) -> None: + """Event handler for when a file or directory is modified. + + :param event: Event representing file/directory modification. + :type event: FileModifiedEvent + """ + super().on_modified(event) # type: ignore + self._logger.info(f"processing manifest modified @ {event.src_path}") + self.process_manifest(event.src_path) + + def on_created(self, event: FileCreatedEvent) -> None: + """Event handler for when a file or directory is created. + + :param event: Event representing file/directory creation. + :type event: FileCreatedEvent + """ + super().on_created(event) # type: ignore + self._logger.info(f"processing manifest created @ {event.src_path}") + self.process_manifest(event.src_path) + + def _to_completed( + self, + timestamp: int, + entity: JobEntity, + step_info: StepInfo, + ) -> None: + """Move a monitored entity from the active to completed collection to + stop monitoring for updates during timesteps. 
+ + :param timestamp: the current timestamp for event logging + :type timestamp: int + :param entity: the running SmartSim Job + :type entity: JobEntity + :param step_info: the StepInfo received when requesting a Job status update + :type step_info: StepInfo + """ + inactive_entity = self._tracked_jobs.pop(entity.key) + if entity.key not in self._completed_jobs: + self._completed_jobs[entity.key] = inactive_entity + + job = self.job_manager[entity.name] + self.job_manager.move_to_completed(job) + + status_clause = f"status: {step_info.status}" + error_clause = f", error: {step_info.error}" if step_info.error else "" + detail = f"{status_clause}{error_clause}" + + # bind a default before the hasattr check so `write_path` is always set + write_path = pathlib.Path(entity.status_dir) + if hasattr(job.entity, "status_dir"): + write_path = pathlib.Path(job.entity.status_dir) + + track_event( + timestamp, + entity.task_id, + entity.step_id, + entity.type, + "stop", + write_path, + self._logger, + detail=detail, + return_code=faux_return_code(step_info), + ) + + def on_timestep(self, timestamp: int) -> None: + """Called at polling frequency to request status updates on + monitored entities + + :param timestamp: the current timestamp for event logging + :type timestamp: int + """ + entity_map = self._tracked_jobs + + if not self._launcher: + return + + # consider not using name to avoid collisions + names = {entity.name: entity for entity in entity_map.values()} + + if not names: + return + + step_updates = self._launcher.get_step_update(list(names.keys())) + + for step_name, step_info in step_updates: + if step_info and step_info.status in TERMINAL_STATUSES: + completed_entity = names[step_name] + self._to_completed(timestamp, completed_entity, step_info) + + +def can_shutdown(action_handler: ManifestEventHandler, logger: logging.Logger) -> bool: + jobs = action_handler.job_manager.jobs + db_jobs = action_handler.job_manager.db_jobs + + has_jobs = bool(jobs) + has_dbs = bool(db_jobs) + has_running_jobs = has_jobs or has_dbs + + if has_jobs: + logger.debug(f"telemetry monitor is monitoring {len(jobs)} jobs") + if has_dbs: + logger.debug(f"telemetry monitor is monitoring {len(db_jobs)} dbs") + + return not has_running_jobs + + +def event_loop( + observer: BaseObserver, + action_handler: ManifestEventHandler, + frequency: t.Union[int, float], + logger: logging.Logger, + cooldown_duration: int, +) -> None: + """Executes all attached timestep handlers every `frequency` seconds + + :param observer: a preconfigured watchdog Observer to inject + :type observer: BaseObserver + :param action_handler: The manifest event processor instance + :type action_handler: ManifestEventHandler + :param frequency: frequency (in seconds) of update loop + :type frequency: t.Union[int, float] + :param logger: a preconfigured Logger instance + :type logger: logging.Logger + :param cooldown_duration: number of seconds the telemetry monitor should + poll for new jobs before attempting to shutdown + :type cooldown_duration: int + """ + elapsed: int = 0 + last_ts: int = get_ts() + + while observer.is_alive(): + timestamp = get_ts() + logger.debug(f"Telemetry timestep: {timestamp}") + action_handler.on_timestep(timestamp) + + elapsed += timestamp - last_ts + last_ts = timestamp + + if can_shutdown(action_handler, logger): + if elapsed >= cooldown_duration: + logger.info("beginning telemetry manager shutdown") + observer.stop() # type:
ignore + else: + # reset cooldown any time there are still jobs running + elapsed = 0 + + time.sleep(frequency) + + +def main( + frequency: t.Union[int, float], + experiment_dir: pathlib.Path, + logger: logging.Logger, + observer: t.Optional[BaseObserver] = None, + cooldown_duration: t.Optional[int] = 0, +) -> int: + """Set up the monitoring entities and start the timer-based loop that + will poll for telemetry data + + :param frequency: frequency (in seconds) of update loop + :type frequency: t.Union[int, float] + :param experiment_dir: the experiment directory to monitor for changes + :type experiment_dir: pathlib.Path + :param logger: a preconfigured Logger instance + :type logger: logging.Logger + :param observer: (optional) a preconfigured Observer to inject + :type observer: t.Optional[BaseObserver] + :param cooldown_duration: number of seconds the telemetry monitor should + poll for new jobs before attempting to shutdown + :type cooldown_duration: int + """ + manifest_relpath = pathlib.Path(TELMON_SUBDIR) / MANIFEST_FILENAME + manifest_path = experiment_dir / manifest_relpath + monitor_pattern = str(manifest_relpath) + + logger.info( + f"Executing telemetry monitor with frequency: {frequency}s" + f", on target directory: {experiment_dir}" + f" matching pattern: {monitor_pattern}" + ) + + cooldown_duration = cooldown_duration or CONFIG.telemetry_cooldown + log_handler = LoggingEventHandler(logger) # type: ignore + action_handler = ManifestEventHandler(monitor_pattern, logger) + + if observer is None: + observer = Observer() + + try: + if manifest_path.exists(): + # a manifest may not exist depending on startup timing + action_handler.process_manifest(str(manifest_path)) + + observer.schedule(log_handler, experiment_dir, recursive=True) # type:ignore + observer.schedule(action_handler, experiment_dir, recursive=True) # type:ignore + observer.start() # type: ignore + + event_loop(observer, action_handler, frequency, logger, cooldown_duration) + return os.EX_OK + except Exception as ex: + logger.error(ex) + finally: + if observer.is_alive(): + observer.stop() # type: ignore + observer.join() + + return os.EX_SOFTWARE + + +def handle_signal(signo: int, _frame: t.Optional[FrameType]) -> None: + """Helper function to ensure clean process termination""" + if not signo: + logger = logging.getLogger() + logger.warning("Received signal with no signo") + + +def register_signal_handlers() -> None: + """Register a signal handling function for all termination events""" + for sig in SIGNALS: + signal.signal(sig, handle_signal) + + +def get_parser() -> argparse.ArgumentParser: + """Instantiate a parser to process command line arguments""" + arg_parser = argparse.ArgumentParser(description="SmartSim Telemetry Monitor") + arg_parser.add_argument( + "-frequency", + type=int, + help="Frequency of telemetry updates (in seconds)", + required=True, + ) + arg_parser.add_argument( + "-exp_dir", + type=str, + help="Experiment root directory", + required=True, + ) + arg_parser.add_argument( + "-cooldown", + type=int, + help="Default lifetime of telemetry monitor (in seconds) before auto-shutdown", + default=CONFIG.telemetry_cooldown, + ) + return arg_parser + + +if __name__ == "__main__": + os.environ["PYTHONUNBUFFERED"] = "1" + + parser = get_parser() + args = parser.parse_args() + + log = logging.getLogger(f"{__name__}.TelemetryMonitor") + log.setLevel(logging.DEBUG) + log.propagate = False + + log_path = os.path.join(args.exp_dir, TELMON_SUBDIR, "telemetrymonitor.log") + fh = logging.FileHandler(log_path,
"a") + log.addHandler(fh) + + # Must register cleanup before the main loop is running + register_signal_handlers() + + try: + main( + int(args.frequency), + pathlib.Path(args.exp_dir), + log, + cooldown_duration=args.cooldown, + ) + sys.exit(0) + except Exception: + log.exception( + "Shutting down telemetry monitor due to unexpected error", exc_info=True + ) + + sys.exit(1) diff --git a/smartsim/_core/launcher/cobalt/cobaltLauncher.py b/smartsim/_core/launcher/cobalt/cobaltLauncher.py index ca0b88a3b..4c7206969 100644 --- a/smartsim/_core/launcher/cobalt/cobaltLauncher.py +++ b/smartsim/_core/launcher/cobalt/cobaltLauncher.py @@ -117,16 +117,13 @@ def run(self, step: Step) -> t.Optional[str]: # aprun doesn't direct output for us. out, err = step.get_output_files() - # LocalStep.run_command omits env, include it here - passed_env = step.env if isinstance(step, LocalStep) else None - # pylint: disable-next=consider-using-with output = open(out, "w+", encoding="utf-8") # pylint: disable-next=consider-using-with error = open(err, "w+", encoding="utf-8") task_id = self.task_manager.start_task( - cmd_list, step.cwd, passed_env, out=output.fileno(), err=error.fileno() + cmd_list, step.cwd, step.env, out=output.fileno(), err=error.fileno() ) # if batch submission did not successfully retrieve job ID diff --git a/smartsim/_core/launcher/launcher.py b/smartsim/_core/launcher/launcher.py index ec8bb0120..1441fe8b0 100644 --- a/smartsim/_core/launcher/launcher.py +++ b/smartsim/_core/launcher/launcher.py @@ -47,11 +47,6 @@ class Launcher(abc.ABC): # pragma: no cover step_mapping: StepMapping task_manager: TaskManager - @property - @abc.abstractmethod - def supported_rs(self) -> t.Dict[t.Type[SettingsBase], t.Type[Step]]: - raise NotImplementedError - @abc.abstractmethod def create_step(self, name: str, cwd: str, step_settings: SettingsBase) -> Step: raise NotImplementedError @@ -86,6 +81,11 @@ def __init__(self) -> None: self.task_manager = TaskManager() self.step_mapping = StepMapping() + @property + @abc.abstractmethod + def supported_rs(self) -> t.Dict[t.Type[SettingsBase], t.Type[Step]]: + raise NotImplementedError + # every launcher utilizing this interface must have a map # of supported RunSettings types (see slurmLauncher.py for ex) def create_step( @@ -176,6 +176,6 @@ def _get_unmanaged_step_update( # pylint: disable-next=no-self-use def _get_managed_step_update( self, - step_ids: t.List[str], # pylint: disable=unused-argument + step_ids: t.List[str], # pylint: disable=unused-argument ) -> t.List[StepInfo]: # pragma: no cover return [] diff --git a/smartsim/_core/launcher/local/local.py b/smartsim/_core/launcher/local/local.py index 7e5c56f7b..3f0f2d8d2 100644 --- a/smartsim/_core/launcher/local/local.py +++ b/smartsim/_core/launcher/local/local.py @@ -24,29 +24,24 @@ # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+import os +import sys import typing as t from ..launcher import Launcher from ....log import get_logger from ....settings import RunSettings, SettingsBase -from ..step import LocalStep -from ..step import Step +from ..step import LocalStep, Step from ..stepInfo import UnmanagedStepInfo, StepInfo from ..stepMapping import StepMapping from ..taskManager import TaskManager - -logger = get_logger(__name__) +from ...utils.helpers import encode_cmd +from ...config import CONFIG class LocalLauncher(Launcher): """Launcher used for spawning proceses on a localhost machine.""" - @property - def supported_rs(self) -> t.Dict[t.Type[SettingsBase], t.Type[Step]]: - return { - RunSettings: LocalStep, - } - def __init__(self) -> None: self.task_manager = TaskManager() self.step_mapping = StepMapping() @@ -60,16 +55,17 @@ def create_step(self, name: str, cwd: str, step_settings: SettingsBase) -> Step: raise TypeError( f"Local Launcher only supports entities with RunSettings, not {type(step_settings)}" ) - step = LocalStep(name, cwd, step_settings) - return step + return LocalStep(name, cwd, step_settings) - def get_step_update(self, step_names: t.List[str]) -> t.List[t.Tuple[str, t.Optional[StepInfo]]]: + def get_step_update( + self, step_names: t.List[str] + ) -> t.List[t.Tuple[str, t.Optional[StepInfo]]]: """Get status updates of each job step name provided :param step_names: list of step_names :type step_names: list[str] :return: list of tuples for update - :rtype: list[(str, UnmanagedStepInfo)] + :rtype: list[tuple[str, StepInfo | None]] """ # step ids are process ids of the tasks # as there is no WLM intermediary @@ -85,8 +81,12 @@ def get_step_update(self, step_names: t.List[str]) -> t.List[t.Tuple[str, t.Opti def get_step_nodes(self, step_names: t.List[str]) -> t.List[t.List[str]]: """Return the address of nodes assigned to the step + :param step_names: list of step_names + :type step_names: list[str] + :return: list of node addresses + :rtype: list[list[str]] + TODO: Use socket to find the actual Lo address? - :return: a list containing the local host address """ return [["127.0.0.1"] * len(step_names)] @@ -104,16 +104,17 @@ def run(self, step: Step) -> str: self.task_manager.start() out, err = step.get_output_files() - output = open(out, "w+") - error = open(err, "w+") cmd = step.get_launch_cmd() - # LocalStep.run_command omits env, include it here - passed_env = step.env if isinstance(step, LocalStep) else None + # pylint: disable-next=consider-using-with + output = open(out, "w+", encoding="utf-8") + # pylint: disable-next=consider-using-with + error = open(err, "w+", encoding="utf-8") task_id = self.task_manager.start_task( - cmd, step.cwd, env=passed_env, out=output.fileno(), err=error.fileno() + cmd, step.cwd, env=step.env, out=output.fileno(), err=error.fileno() ) + self.step_mapping.add(step.name, task_id=task_id, managed=False) return task_id @@ -127,7 +128,7 @@ def stop(self, step_name: str) -> UnmanagedStepInfo: """ # step_id is task_id for local. 
Naming for consistency step_id = self.step_mapping[step_name].task_id - + self.task_manager.remove_task(str(step_id)) _, rc, out, err = self.task_manager.get_task_update(str(step_id)) step_info = UnmanagedStepInfo("Cancelled", rc, out, err) diff --git a/smartsim/_core/launcher/lsf/lsfLauncher.py b/smartsim/_core/launcher/lsf/lsfLauncher.py index a8d0e27aa..13b3be9bb 100644 --- a/smartsim/_core/launcher/lsf/lsfLauncher.py +++ b/smartsim/_core/launcher/lsf/lsfLauncher.py @@ -42,13 +42,13 @@ from ...config import CONFIG from ..launcher import WLMLauncher from ..step import ( - Step, BsubBatchStep, JsrunStep, LocalStep, MpiexecStep, MpirunStep, OrterunStep, + Step, ) from ..stepInfo import LSFBatchStepInfo, LSFJsrunStepInfo, StepInfo from .lsfCommands import bjobs, bkill, jskill, jslist @@ -115,19 +115,16 @@ def run(self, step: Step) -> t.Optional[str]: time.sleep(1) step_id = self._get_lsf_step_id(step) logger.debug(f"Gleaned jsrun step id: {step_id} for {step.name}") - else: # isinstance(step, MpirunStep) or isinstance(step, LocalStep) + else: # mpirun and local launch don't direct output for us out, err = step.get_output_files() - # LocalStep.run_command omits env, include it here - passed_env = step.env if isinstance(step, LocalStep) else None - # pylint: disable-next=consider-using-with output = open(out, "w+", encoding="utf-8") # pylint: disable-next=consider-using-with error = open(err, "w+", encoding="utf-8") task_id = self.task_manager.start_task( - cmd_list, step.cwd, passed_env, out=output.fileno(), err=error.fileno() + cmd_list, step.cwd, step.env, out=output.fileno(), err=error.fileno() ) self.step_mapping.add(step.name, step_id, task_id, step.managed) diff --git a/smartsim/_core/launcher/pbs/pbsLauncher.py b/smartsim/_core/launcher/pbs/pbsLauncher.py index cbb85337c..f7d854a7b 100644 --- a/smartsim/_core/launcher/pbs/pbsLauncher.py +++ b/smartsim/_core/launcher/pbs/pbsLauncher.py @@ -111,15 +111,12 @@ def run(self, step: Step) -> t.Optional[str]: # aprun/local doesn't direct output for us. 
out, err = step.get_output_files() - # LocalStep.run_command omits env, include it here - passed_env = step.env if isinstance(step, LocalStep) else None - # pylint: disable-next=consider-using-with output = open(out, "w+", encoding="utf-8") # pylint: disable-next=consider-using-with error = open(err, "w+", encoding="utf-8") task_id = self.task_manager.start_task( - cmd_list, step.cwd, passed_env, out=output.fileno(), err=error.fileno() + cmd_list, step.cwd, step.env, out=output.fileno(), err=error.fileno() ) # if batch submission did not successfully retrieve job ID diff --git a/smartsim/_core/launcher/slurm/slurmLauncher.py b/smartsim/_core/launcher/slurm/slurmLauncher.py index 70bdab5a2..ae44ddc8e 100644 --- a/smartsim/_core/launcher/slurm/slurmLauncher.py +++ b/smartsim/_core/launcher/slurm/slurmLauncher.py @@ -155,15 +155,12 @@ def run(self, step: Step) -> t.Optional[str]: # MPI/local steps don't direct output like slurm steps out, err = step.get_output_files() - # LocalStep.run_command omits env, include it here - passed_env = step.env if isinstance(step, LocalStep) else None - # pylint: disable-next=consider-using-with output = open(out, "w+", encoding="utf-8") # pylint: disable-next=consider-using-with error = open(err, "w+", encoding="utf-8") task_id = self.task_manager.start_task( - cmd_list, step.cwd, passed_env, out=output.fileno(), err=error.fileno() + cmd_list, step.cwd, step.env, out=output.fileno(), err=error.fileno() ) if not step_id and step.managed: diff --git a/smartsim/_core/launcher/step/alpsStep.py b/smartsim/_core/launcher/step/alpsStep.py index 80e7e7658..6169df083 100644 --- a/smartsim/_core/launcher/step/alpsStep.py +++ b/smartsim/_core/launcher/step/alpsStep.py @@ -31,7 +31,7 @@ from ....error import AllocationError from ....log import get_logger -from .step import Step +from .step import Step, proxyable_launch_cmd from ....settings import AprunSettings, RunSettings, Singularity logger = get_logger(__name__) @@ -56,9 +56,11 @@ def __init__(self, name: str, cwd: str, run_settings: AprunSettings) -> None: def _get_mpmd(self) -> t.List[RunSettings]: """Temporary convenience function to return a typed list - of attached RunSettings""" + of attached RunSettings + """ return self.run_settings.mpmd + @proxyable_launch_cmd def get_launch_cmd(self) -> t.List[str]: """Get the command to launch this step diff --git a/smartsim/_core/launcher/step/localStep.py b/smartsim/_core/launcher/step/localStep.py index d15a48381..709137e5b 100644 --- a/smartsim/_core/launcher/step/localStep.py +++ b/smartsim/_core/launcher/step/localStep.py @@ -28,7 +28,7 @@ import shutil import typing as t -from .step import Step +from .step import Step, proxyable_launch_cmd from ....settings.base import RunSettings from ....settings import Singularity @@ -37,8 +37,13 @@ class LocalStep(Step): def __init__(self, name: str, cwd: str, run_settings: RunSettings): super().__init__(name, cwd, run_settings) self.run_settings = run_settings - self.env = self._set_env() + self._env = self._set_env() + @property + def env(self) -> t.Dict[str, str]: + return self._env + + @proxyable_launch_cmd def get_launch_cmd(self) -> t.List[str]: cmd = [] diff --git a/smartsim/_core/launcher/step/lsfStep.py b/smartsim/_core/launcher/step/lsfStep.py index ae6c3525b..a10827950 100644 --- a/smartsim/_core/launcher/step/lsfStep.py +++ b/smartsim/_core/launcher/step/lsfStep.py @@ -213,7 +213,8 @@ def _set_alloc(self) -> None: def _get_mpmd(self) -> t.List[RunSettings]: """Temporary convenience function to return a typed list - 
of attached RunSettings""" + of attached RunSettings + """ if isinstance(self.step_settings, JsrunSettings): return self.step_settings.mpmd return [] diff --git a/smartsim/_core/launcher/step/mpiStep.py b/smartsim/_core/launcher/step/mpiStep.py index 9a0796c0f..8ab6c0d47 100644 --- a/smartsim/_core/launcher/step/mpiStep.py +++ b/smartsim/_core/launcher/step/mpiStep.py @@ -26,12 +26,12 @@ import os import shutil -from shlex import split as sh_split import typing as t +from shlex import split as sh_split from ....error import AllocationError, SmartSimError from ....log import get_logger -from .step import Step +from .step import Step, proxyable_launch_cmd from ....settings import MpirunSettings, MpiexecSettings, OrterunSettings from ....settings.base import RunSettings @@ -59,6 +59,7 @@ def __init__(self, name: str, cwd: str, run_settings: RunSettings) -> None: _supported_launchers = ["PBS", "COBALT", "SLURM", "LSB"] + @proxyable_launch_cmd def get_launch_cmd(self) -> t.List[str]: """Get the command to launch this step @@ -118,7 +119,8 @@ def _set_alloc(self) -> None: def _get_mpmd(self) -> t.List[RunSettings]: """Temporary convenience function to return a typed list - of attached RunSettings""" + of attached RunSettings + """ if hasattr(self.run_settings, "mpmd") and self.run_settings.mpmd: rs_mpmd: t.List[RunSettings] = self.run_settings.mpmd return rs_mpmd diff --git a/smartsim/_core/launcher/step/slurmStep.py b/smartsim/_core/launcher/step/slurmStep.py index 18575e4e9..67353faa7 100644 --- a/smartsim/_core/launcher/step/slurmStep.py +++ b/smartsim/_core/launcher/step/slurmStep.py @@ -26,13 +26,13 @@ import os import shutil -from shlex import split as sh_split import typing as t +from shlex import split as sh_split from ....error import AllocationError from ....log import get_logger from .step import Step -from ....settings import SrunSettings, SbatchSettings, RunSettings, Singularity +from ....settings import RunSettings, SbatchSettings, Singularity, SrunSettings logger = get_logger(__name__) @@ -189,13 +189,15 @@ def _set_alloc(self) -> None: def _get_mpmd(self) -> t.List[RunSettings]: """Temporary convenience function to return a typed list - of attached RunSettings""" + of attached RunSettings + """ return self.run_settings.mpmd @staticmethod def _get_exe_args_list(run_setting: RunSettings) -> t.List[str]: """Convenience function to encapsulate checking the - runsettings.exe_args type to always return a list""" + runsettings.exe_args type to always return a list + """ exe_args = run_setting.exe_args args: t.List[str] = exe_args if isinstance(exe_args, list) else [exe_args] return args diff --git a/smartsim/_core/launcher/step/step.py b/smartsim/_core/launcher/step/step.py index 2aa995768..d77616cc2 100644 --- a/smartsim/_core/launcher/step/step.py +++ b/smartsim/_core/launcher/step/step.py @@ -26,17 +26,20 @@ from __future__ import annotations +import functools import os.path as osp +import sys import time import typing as t - from os import makedirs -from smartsim.error.errors import SmartSimError + +from smartsim.error.errors import SmartSimError, UnproxyableStepError +from smartsim._core.config import CONFIG from ....log import get_logger -from ...utils.helpers import get_base_36_repr +from ...utils.helpers import get_base_36_repr, encode_cmd from ..colocated import write_colocated_launch_script -from ....settings.base import SettingsBase, RunSettings +from ....settings.base import RunSettings, SettingsBase logger = get_logger(__name__) @@ -48,6 +51,12 @@ def __init__(self, name: 
str, cwd: str, step_settings: SettingsBase) -> None:
         self.cwd = cwd
         self.managed = False
         self.step_settings = step_settings
+        self.meta: t.Dict[str, str] = {}
+
+    @property
+    def env(self) -> t.Optional[t.Dict[str, str]]:
+        """Overridable, read-only property for a step to specify its environment"""
+        return None
 
     def get_launch_cmd(self) -> t.List[str]:
         raise NotImplementedError
@@ -68,7 +77,8 @@ def get_step_file(
     ) -> str:
         """Get the name for a file/script created by the step class
 
-        Used for Batch scripts, mpmd scripts, etc"""
+        Used for Batch scripts, mpmd scripts, etc.
+        """
         if script_name:
             script_name = script_name if "." in script_name else script_name + ending
             return osp.join(self.cwd, script_name)
@@ -107,3 +117,49 @@ def add_to_batch(self, step: Step) -> None:
         :type step: Step
         """
         raise SmartSimError("add_to_batch not implemented for this step type")
+
+
+_StepT = t.TypeVar("_StepT", bound=Step)
+
+
+def proxyable_launch_cmd(
+    fn: t.Callable[[_StepT], t.List[str]], /
+) -> t.Callable[[_StepT], t.List[str]]:
+    @functools.wraps(fn)
+    def _get_launch_cmd(self: _StepT) -> t.List[str]:
+        original_cmd_list = fn(self)
+
+        if not CONFIG.telemetry_enabled:
+            return original_cmd_list
+
+        if self.managed:
+            raise UnproxyableStepError(
+                f"Attempting to proxy managed step of type {type(self)} "
+                "through the unmanaged step proxy entry point"
+            )
+
+        proxy_module = "smartsim._core.entrypoints.indirect"
+        etype = self.meta["entity_type"]
+        status_dir = self.meta["status_dir"]
+        encoded_cmd = encode_cmd(original_cmd_list)
+
+        # NOTE: this is NOT safe. should either 1) sign cmd and verify OR 2)
+        #       serialize step and let the indirect entrypoint rebuild the
+        #       cmd... for now, test away...
+        return [
+            sys.executable,
+            "-m",
+            proxy_module,
+            "+name",
+            self.name,
+            "+command",
+            encoded_cmd,
+            "+entity_type",
+            etype,
+            "+telemetry_dir",
+            status_dir,
+            "+working_dir",
+            self.cwd,
+        ]
+
+    return _get_launch_cmd
diff --git a/smartsim/_core/utils/helpers.py b/smartsim/_core/utils/helpers.py
index e69cbdcce..d88faea1a 100644
--- a/smartsim/_core/utils/helpers.py
+++ b/smartsim/_core/utils/helpers.py
@@ -27,9 +27,11 @@
 """
 A file of helper functions for SmartSim
 """
+import base64
 import os
 import uuid
 import typing as t
+from datetime import datetime
 from functools import lru_cache
 from pathlib import Path
 from shutil import which
@@ -64,21 +66,20 @@ def unpack_colo_db_identifier(db_id: str) -> str:
     return "_" + db_id if db_id else ""
 
 
+def create_short_id_str() -> str:
+    return str(uuid.uuid4())[:7]
+
+
 def create_lockfile_name() -> str:
     """Generate a unique lock filename using UUID"""
-    lock_suffix = str(uuid.uuid4())[:7]
+    lock_suffix = create_short_id_str()
     return f"smartsim-{lock_suffix}.lock"
 
 
 @lru_cache(maxsize=20, typed=False)
 def check_dev_log_level() -> bool:
-    try:
-        lvl = os.environ["SMARTSIM_LOG_LEVEL"]
-        if lvl == "developer":
-            return True
-        return False
-    except KeyError:
-        return False
+    lvl = os.environ.get("SMARTSIM_LOG_LEVEL", "")
+    return lvl == "developer"
 
 
 def fmt_dict(value: t.Dict[str, t.Any]) -> str:
@@ -273,3 +274,31 @@ def installed_redisai_backends(
     }
 
     return {backend for backend in backends if _installed(base_path, backend)}
+
+
+def get_ts() -> int:
+    """Return the current timestamp (accurate to seconds) cast to an integer"""
+    return int(datetime.timestamp(datetime.now()))
+
+
+def encode_cmd(cmd: t.List[str]) -> str:
+    """Transform a standard command list into an encoded string safe for providing as an
+    argument to a proxy entrypoint
+    """
+    if not cmd:
+        raise 
ValueError("Invalid cmd supplied") + + ascii_cmd = "|".join(cmd).encode("ascii") + encoded_cmd = base64.b64encode(ascii_cmd).decode("ascii") + return encoded_cmd + + +def decode_cmd(encoded_cmd: str) -> t.List[str]: + """Decode an encoded command string to the original command list format""" + if not encoded_cmd.strip(): + raise ValueError("Invalid cmd supplied") + + decoded_cmd = base64.b64decode(encoded_cmd.encode("ascii")) + cleaned_cmd = decoded_cmd.decode("ascii").split("|") + + return cleaned_cmd diff --git a/smartsim/_core/utils/serialize.py b/smartsim/_core/utils/serialize.py new file mode 100644 index 000000000..5547a49f8 --- /dev/null +++ b/smartsim/_core/utils/serialize.py @@ -0,0 +1,246 @@ +# BSD 2-Clause License +# +# Copyright (c) 2021-2023, Hewlett Packard Enterprise +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
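[Editor's note] The `encode_cmd`/`decode_cmd` helpers added above are intended to be exact inverses. A standalone round-trip sketch of the same scheme (logic copied from the hunk above, no smartsim imports; note the scheme assumes no argument contains the `|` delimiter):

import base64
import typing as t


def encode_cmd(cmd: t.List[str]) -> str:
    # join on "|" and base64-encode; empty commands are rejected
    if not cmd:
        raise ValueError("Invalid cmd supplied")
    return base64.b64encode("|".join(cmd).encode("ascii")).decode("ascii")


def decode_cmd(encoded_cmd: str) -> t.List[str]:
    # reverse of encode_cmd: base64-decode, then split on "|"
    if not encoded_cmd.strip():
        raise ValueError("Invalid cmd supplied")
    return base64.b64decode(encoded_cmd.encode("ascii")).decode("ascii").split("|")


cmd = ["python", "-m", "smartsim._core.entrypoints.indirect", "--help"]
assert decode_cmd(encode_cmd(cmd)) == cmd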
+ +from __future__ import annotations + +import json +import time +import typing as t +from pathlib import Path + +import smartsim.log +import smartsim._core._cli.utils as _utils +from smartsim._core.config import CONFIG + +if t.TYPE_CHECKING: + from smartsim import Experiment + from smartsim._core.control.manifest import LaunchedManifest as _Manifest + from smartsim.database.orchestrator import Orchestrator + from smartsim.entity import DBNode, Ensemble, Model + from smartsim.entity.dbobject import DBModel, DBScript + from smartsim.settings.base import BatchSettings, RunSettings + + +TStepLaunchMetaData = t.Tuple[ + t.Optional[str], t.Optional[str], t.Optional[bool], str, str, Path +] +TELMON_SUBDIR: t.Final[str] = ".smartsim/telemetry" +MANIFEST_FILENAME: t.Final[str] = "manifest.json" + +_LOGGER = smartsim.log.get_logger(__name__) + + +def save_launch_manifest(manifest: _Manifest[TStepLaunchMetaData]) -> None: + if not CONFIG.telemetry_enabled: + return + + manifest.metadata.run_telemetry_subdirectory.mkdir(parents=True, exist_ok=True) + + new_run = { + "run_id": manifest.metadata.run_id, + "timestamp": int(time.time_ns()), + "model": [ + _dictify_model(model, *telemetry_metadata) + for model, telemetry_metadata in manifest.models + ], + "orchestrator": [ + _dictify_db(db, nodes_info) for db, nodes_info in manifest.databases + ], + "ensemble": [ + _dictify_ensemble(ens, member_info) + for ens, member_info in manifest.ensembles + ], + } + try: + with open(manifest.metadata.manifest_file_path, "r", encoding="utf-8") as file: + manifest_dict = json.load(file) + except (FileNotFoundError, json.JSONDecodeError): + manifest_dict = { + "schema info": { + "schema_name": "entity manifest", + "version": "0.0.1", + }, + "experiment": { + "name": manifest.metadata.exp_name, + "path": manifest.metadata.exp_path, + "launcher": manifest.metadata.launcher_name, + }, + "runs": [new_run], + } + else: + manifest_dict["runs"].append(new_run) + finally: + with open(manifest.metadata.manifest_file_path, "w", encoding="utf-8") as file: + json.dump(manifest_dict, file, indent=2) + + +def _dictify_model( + model: Model, + step_id: t.Optional[str], + task_id: t.Optional[str], + managed: t.Optional[bool], + out_file: str, + err_file: str, + telemetry_data_path: Path, +) -> t.Dict[str, t.Any]: + colo_settings = (model.run_settings.colocated_db_settings or {}).copy() + db_scripts = t.cast("t.List[DBScript]", colo_settings.pop("db_scripts", [])) + db_models = t.cast("t.List[DBModel]", colo_settings.pop("db_models", [])) + return { + "name": model.name, + "path": model.path, + "exe_args": model.run_settings.exe_args, + "run_settings": _dictify_run_settings(model.run_settings), + "batch_settings": _dictify_batch_settings(model.batch_settings) + if model.batch_settings + else {}, + "params": model.params, + "files": { + "Symlink": model.files.link, + "Configure": model.files.tagged, + "Copy": model.files.copy, + } + if model.files + else { + "Symlink": [], + "Configure": [], + "Copy": [], + }, + "colocated_db": { + "settings": colo_settings, + "scripts": [ + { + script.name: { + "backend": "TORCH", + "device": script.device, + } + } + for script in db_scripts + ], + "models": [ + { + model.name: { + "backend": model.backend, + "device": model.device, + } + } + for model in db_models + ], + } + if colo_settings + else {}, + "telemetry_metadata": { + "status_dir": str(telemetry_data_path), + "step_id": step_id, + "task_id": task_id, + "managed": managed, + }, + "out_file": out_file, + "err_file": err_file, + } + + +def 
_dictify_ensemble( + ens: Ensemble, + members: t.Sequence[t.Tuple[Model, TStepLaunchMetaData]], +) -> t.Dict[str, t.Any]: + return { + "name": ens.name, + "params": ens.params, + "batch_settings": _dictify_batch_settings(ens.batch_settings) + # FIXME: Typehint here is wrong, ``ens.batch_settings`` can + # also be an empty dict for no discernible reason... + if ens.batch_settings else {}, + "models": [ + _dictify_model(model, *launching_metadata) + for model, launching_metadata in members + ], + } + + +def _dictify_run_settings(run_settings: RunSettings) -> t.Dict[str, t.Any]: + # TODO: remove this downcast + if hasattr(run_settings, "mpmd") and run_settings.mpmd: + _LOGGER.warning( + "SmartSim currently cannot properly serialize all information in " + "MPMD run settings" + ) + return { + "exe": run_settings.exe, + # TODO: We should try to move this back + # "exe_args": run_settings.exe_args, + "run_command": run_settings.run_command, + "run_args": run_settings.run_args, + # TODO: We currently do not have a way to represent MPMD commands! + # Maybe add a ``"mpmd"`` key here that is a + # ``list[TDictifiedRunSettings]``? + } + + +def _dictify_batch_settings(batch_settings: BatchSettings) -> t.Dict[str, t.Any]: + return { + "batch_command": batch_settings.batch_cmd, + "batch_args": batch_settings.batch_args, + } + + +def _dictify_db( + db: Orchestrator, + nodes: t.Sequence[t.Tuple[DBNode, TStepLaunchMetaData]], +) -> t.Dict[str, t.Any]: + db_path = _utils.get_db_path() + if db_path: + db_type, _ = db_path.name.split("-", 1) + else: + db_type = "Unknown" + return { + "name": db.name, + "type": db_type, + "interface": db._interfaces, # pylint: disable=protected-access + "shards": [ + { + **shard.to_dict(), + "conf_file": shard.cluster_conf_file, + "out_file": out_file, + "err_file": err_file, + "telemetry_metadata": { + "status_dir": str(status_dir), + "step_id": step_id, + "task_id": task_id, + "managed": managed, + }, + } + for dbnode, ( + step_id, + task_id, + managed, + out_file, + err_file, + status_dir, + ) in nodes + for shard in dbnode.get_launched_shard_info() + ], + } diff --git a/smartsim/entity/dbnode.py b/smartsim/entity/dbnode.py index 92df73d6a..403984d16 100644 --- a/smartsim/entity/dbnode.py +++ b/smartsim/entity/dbnode.py @@ -76,11 +76,12 @@ def __init__( @property def num_shards(self) -> int: - try: - return len(self.run_settings.mpmd) + 1 # type: ignore[attr-defined] - except AttributeError: + if not hasattr(self.run_settings, "mpmd"): + # return default number of shards if mpmd is not set return 1 + return len(self.run_settings.mpmd) + 1 + @property def host(self) -> str: try: @@ -102,11 +103,12 @@ def clear_hosts(self) -> None: @property def is_mpmd(self) -> bool: - try: - return bool(self.run_settings.mpmd) # type: ignore[attr-defined] - except AttributeError: + if not hasattr(self.run_settings, "mpmd"): + # missing mpmd property guarantees this is not an mpmd run return False + return bool(self.run_settings.mpmd) + def set_hosts(self, hosts: t.List[str]) -> None: self._hosts = [str(host) for host in hosts] diff --git a/smartsim/error/errors.py b/smartsim/error/errors.py index ffa1cfb17..ad67ae88b 100644 --- a/smartsim/error/errors.py +++ b/smartsim/error/errors.py @@ -39,12 +39,14 @@ class SSUnsupportedError(Exception): class EntityExistsError(SmartSimError): """Raised when a user tries to create an entity or files/directories for - an entity and either the entity/files/directories already exist""" + an entity and either the entity/files/directories already exist + """ 
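[Editor's note] For orientation, `save_launch_manifest` in the new `serialize.py` above appends one entry per run to `manifest.json`. An abridged, hypothetical example of the resulting file shape (the values here are illustrative; only the keys come from the code above):

import json

example_manifest = {
    "schema info": {"schema_name": "entity manifest", "version": "0.0.1"},
    "experiment": {"name": "my-exp", "path": "/tmp/my-exp", "launcher": "local"},
    "runs": [
        {
            "run_id": "a1b2c3d",
            "timestamp": 1696282447000000000,  # time.time_ns() at launch
            "model": [],
            "orchestrator": [],
            "ensemble": [],
        }
    ],
}

print(json.dumps(example_manifest, indent=2))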
class UserStrategyError(SmartSimError): """Raised when there is an error with model creation inside an ensemble - that is from a user provided permutation strategy""" + that is from a user provided permutation strategy + """ def __init__(self, perm_strat: str) -> None: message = self.create_message(perm_strat) @@ -80,16 +82,15 @@ class SSReservedKeywordError(SmartSimError): class SSDBIDConflictError(SmartSimError): """Raised in the event that a database identifier - is not unique when multiple databases are created""" + is not unique when multiple databases are created + """ # Internal Exceptions class SSInternalError(Exception): - """ - SSInternalError is raised when an internal error is encountered. - """ + """SSInternalError is raised when an internal error is encountered""" class SSConfigError(SSInternalError): @@ -106,7 +107,8 @@ class AllocationError(LauncherError): class ShellError(LauncherError): """Raised when error arises from function within launcher.shell - Closely related to error from subprocess(Popen) commands""" + Closely related to error from subprocess(Popen) commands + """ def __init__( self, @@ -130,3 +132,17 @@ def create_message( if details: msg += f"\nError from shell: {details}" return msg + + +class TelemetryError(SSInternalError): + """Raised when SmartSim runs into trouble establishing or communicating + telemetry information + """ + +class UnproxyableStepError(TelemetryError): + """Raised when a user attempts to proxy a managed ``Step`` through the + unmanaged step proxy entry point + """ + +class SmartSimCLIActionCancelled(SmartSimError): + """Raised when a `smart` CLI command is terminated""" diff --git a/smartsim/experiment.py b/smartsim/experiment.py index 245293e27..ae9633d9d 100644 --- a/smartsim/experiment.py +++ b/smartsim/experiment.py @@ -24,6 +24,7 @@ # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +import os import os.path as osp import typing as t from os import getcwd @@ -194,6 +195,8 @@ def start( if summary: self._launch_summary(start_manifest) self._control.start( + exp_name=self.name, + exp_path=self.exp_path, manifest=start_manifest, block=block, kill_on_interrupt=kill_on_interrupt, @@ -878,3 +881,35 @@ def append_to_db_identifier_list(self, db_identifier: str) -> None: ) # Otherwise, add self.db_identifiers.add(db_identifier) + + def enable_telemetry(self) -> None: + """Experiments will start producing telemetry for all entities run + through ``Experiment.start`` + + .. warning:: + + This method is currently implemented so that ALL ``Experiment`` + instances will begin producing telemetry data. In the future it + is planned to have this method work on a "per instance" basis! + """ + self._set_telemetry(True) + + def disable_telemetry(self) -> None: + """Experiments will stop producing telemetry for all entities run + through ``Experiment.start`` + + .. warning:: + + This method is currently implemented so that ALL ``Experiment`` + instances will stop producing telemetry data. In the future it + is planned to have this method work on a "per instance" basis! 
+        """
+        self._set_telemetry(False)
+
+    @staticmethod
+    def _set_telemetry(switch: bool, /) -> None:
+        tm_key = "SMARTSIM_FLAG_TELEMETRY"
+        if switch:
+            os.environ[tm_key] = "1"
+        else:
+            os.environ[tm_key] = "0"
diff --git a/smartsim/log.py b/smartsim/log.py
index 9011b3d1b..72d5ad817 100644
--- a/smartsim/log.py
+++ b/smartsim/log.py
@@ -30,12 +30,16 @@
 import coloredlogs
 
-# constants for logging
-coloredlogs.DEFAULT_DATE_FORMAT = "%H:%M:%S"
-coloredlogs.DEFAULT_LOG_FORMAT = (
+# constants
+DEFAULT_DATE_FORMAT: t.Final[str] = "%H:%M:%S"
+DEFAULT_LOG_FORMAT: t.Final[str] = (
     "%(asctime)s %(hostname)s %(name)s[%(process)d] %(levelname)s %(message)s"
 )
 
+# configure colored logs
+coloredlogs.DEFAULT_DATE_FORMAT = DEFAULT_DATE_FORMAT
+coloredlogs.DEFAULT_LOG_FORMAT = DEFAULT_LOG_FORMAT
+
 
 def _get_log_level() -> str:
     """Get the logging level based on environment variable
diff --git a/smartsim/wlm/slurm.py b/smartsim/wlm/slurm.py
index 8fe12b3f9..ba46fb64c 100644
--- a/smartsim/wlm/slurm.py
+++ b/smartsim/wlm/slurm.py
@@ -237,7 +237,8 @@ def _get_alloc_cmd(
     options: t.Optional[t.Dict[str, str]] = None,
 ) -> t.List[str]:
     """Return the command to request an allocation from Slurm with
-    the class variables as the slurm options."""
+    the class variables as the slurm options.
+    """
 
     salloc_args = [
         "--no-shell",
diff --git a/tests/backends/test_dbmodel.py b/tests/backends/test_dbmodel.py
index ad99d0038..a1d948959 100644
--- a/tests/backends/test_dbmodel.py
+++ b/tests/backends/test_dbmodel.py
@@ -827,6 +827,7 @@ def test_colocated_db_model_errors(fileutils, test_dir, wlmutils, mlutils):
         with pytest.raises(SSUnsupportedError):
             colo_ensemble.add_model(colo_model)
 
+
 @pytest.mark.skipif(not should_run_tf, reason="Test needs TensorFlow to run")
 def test_inconsistent_params_db_model():
     """Test error when devices_per_node parameter>1 when devices is set to CPU in DBModel"""
diff --git a/tests/full_wlm/test_generic_batch_launch.py b/tests/full_wlm/test_generic_batch_launch.py
index 157641422..0384de307 100644
--- a/tests/full_wlm/test_generic_batch_launch.py
+++ b/tests/full_wlm/test_generic_batch_launch.py
@@ -50,8 +50,9 @@ def test_batch_model(fileutils, test_dir, wlmutils):
     """Test the launch of a manually construced batch model"""
     exp_name = "test-batch-model"
-    exp = Experiment(exp_name, launcher=wlmutils.get_test_launcher())
-
+    exp = Experiment(
+        exp_name, launcher=wlmutils.get_test_launcher(), exp_path=test_dir
+    )
     script = fileutils.get_test_conf_path("sleep.py")
     batch_settings = exp.create_batch_settings(nodes=1, time="00:01:00")
@@ -76,8 +77,9 @@ def test_batch_ensemble(fileutils, test_dir, wlmutils):
     """Test the launch of a manually constructed batch ensemble"""
 
     exp_name = "test-batch-ensemble"
-    exp = Experiment(exp_name, launcher=wlmutils.get_test_launcher())
-
+    exp = Experiment(
+        exp_name, launcher=wlmutils.get_test_launcher(), exp_path=test_dir
+    )
     script = fileutils.get_test_conf_path("sleep.py")
     settings = wlmutils.get_run_settings("python", f"{script} --time=5")
@@ -102,8 +104,9 @@ def test_batch_ensemble_replicas(fileutils, test_dir, wlmutils):
     exp_name = "test-batch-ensemble-replicas"
-    exp = Experiment(exp_name, launcher=wlmutils.get_test_launcher())
-
+    exp = Experiment(
+        exp_name, launcher=wlmutils.get_test_launcher(), exp_path=test_dir
+    )
     script = fileutils.get_test_conf_path("sleep.py")
     settings = wlmutils.get_run_settings("python", f"{script} --time=5")
diff --git a/tests/full_wlm/test_generic_orc_launch_batch.py 
b/tests/full_wlm/test_generic_orc_launch_batch.py index 3966d0382..5219cca99 100644 --- a/tests/full_wlm/test_generic_orc_launch_batch.py +++ b/tests/full_wlm/test_generic_orc_launch_batch.py @@ -44,8 +44,7 @@ def test_launch_orc_auto_batch(test_dir, wlmutils): launcher = wlmutils.get_test_launcher() exp_name = "test-launch-auto-orc-batch" - exp = Experiment(exp_name, launcher=launcher) - + exp = Experiment(exp_name, launcher=launcher, exp_path=test_dir) # batch = False to launch on existing allocation network_interface = wlmutils.get_test_interface() @@ -83,8 +82,7 @@ def test_launch_cluster_orc_batch_single(test_dir, wlmutils): launcher = wlmutils.get_test_launcher() exp_name = "test-launch-auto-cluster-orc-batch-single" - exp = Experiment(exp_name, launcher=launcher) - + exp = Experiment(exp_name, launcher=launcher, exp_path=test_dir) # batch = False to launch on existing allocation network_interface = wlmutils.get_test_interface() @@ -126,8 +124,7 @@ def test_launch_cluster_orc_batch_multi(test_dir, wlmutils): launcher = wlmutils.get_test_launcher() exp_name = "test-launch-auto-cluster-orc-batch-multi" - exp = Experiment(exp_name, launcher=launcher) - + exp = Experiment(exp_name, launcher=launcher, exp_path=test_dir) # batch = False to launch on existing allocation network_interface = wlmutils.get_test_interface() @@ -163,8 +160,7 @@ def test_launch_cluster_orc_reconnect(test_dir, wlmutils): """test reconnecting to clustered 3-node orchestrator""" launcher = wlmutils.get_test_launcher() exp_name = "test-launch-cluster-orc-batch-reconect" - exp = Experiment(exp_name, launcher=launcher) - + exp = Experiment(exp_name, launcher=launcher, exp_path=test_dir) # batch = False to launch on existing allocation network_interface = wlmutils.get_test_interface() diff --git a/tests/full_wlm/test_mpmd.py b/tests/full_wlm/test_mpmd.py index 14401351b..18e918cfd 100644 --- a/tests/full_wlm/test_mpmd.py +++ b/tests/full_wlm/test_mpmd.py @@ -61,7 +61,7 @@ def test_mpmd(fileutils, test_dir, wlmutils): "cobalt": ["mpirun"], } - exp = Experiment(exp_name, launcher=launcher) + exp = Experiment(exp_name, launcher=launcher, exp_path=test_dir) def prune_commands(launcher): available_commands = [] @@ -77,7 +77,6 @@ def prune_commands(launcher): f"MPMD on {launcher} only supported for run commands {by_launcher[launcher]}" ) - for run_command in run_commands: script = fileutils.get_test_conf_path("sleep.py") settings = exp.create_run_settings( diff --git a/tests/on_wlm/test_base_settings_on_wlm.py b/tests/on_wlm/test_base_settings_on_wlm.py index f324153c5..6e26a1f76 100644 --- a/tests/on_wlm/test_base_settings_on_wlm.py +++ b/tests/on_wlm/test_base_settings_on_wlm.py @@ -42,8 +42,9 @@ def test_model_on_wlm(fileutils, test_dir, wlmutils): exp_name = "test-base-settings-model-launch" - exp = Experiment(exp_name, launcher=wlmutils.get_test_launcher()) - + exp = Experiment( + exp_name, launcher=wlmutils.get_test_launcher(), exp_path=test_dir + ) script = fileutils.get_test_conf_path("sleep.py") settings1 = wlmutils.get_base_run_settings("python", f"{script} --time=5") @@ -60,8 +61,9 @@ def test_model_on_wlm(fileutils, test_dir, wlmutils): def test_model_stop_on_wlm(fileutils, test_dir, wlmutils): exp_name = "test-base-settings-model-stop" - exp = Experiment(exp_name, launcher=wlmutils.get_test_launcher()) - + exp = Experiment( + exp_name, launcher=wlmutils.get_test_launcher(), exp_path=test_dir + ) script = fileutils.get_test_conf_path("sleep.py") settings1 = wlmutils.get_base_run_settings("python", f"{script} 
--time=5") diff --git a/tests/on_wlm/test_colocated_model.py b/tests/on_wlm/test_colocated_model.py index 0f6d1fccd..a37f91137 100644 --- a/tests/on_wlm/test_colocated_model.py +++ b/tests/on_wlm/test_colocated_model.py @@ -50,7 +50,7 @@ def test_launch_colocated_model_defaults(fileutils, test_dir, coloutils, db_type db_args = { "debug": DEBUG_DB } - exp = Experiment(f"colocated_model_defaults_{db_type}", launcher=launcher, exp_path=test_dir) + exp = Experiment("colocated_model_defaults", launcher=launcher, exp_path=test_dir) colo_model = coloutils.setup_test_colo( fileutils, db_type, @@ -73,7 +73,11 @@ def test_launch_colocated_model_defaults(fileutils, test_dir, coloutils, db_type @pytest.mark.parametrize("db_type", supported_dbs) def test_colocated_model_disable_pinning(fileutils, test_dir, coloutils, db_type): - exp = Experiment(f"colocated_model_pinning_auto_1cpu_{db_type}", launcher=launcher, exp_path=test_dir) + exp = Experiment( + "colocated_model_pinning_auto_1cpu", + launcher=launcher, + exp_path=test_dir + ) db_args = { "db_cpus": 1, "custom_pinning": [], @@ -98,7 +102,11 @@ def test_colocated_model_disable_pinning(fileutils, test_dir, coloutils, db_type @pytest.mark.parametrize("db_type", supported_dbs) def test_colocated_model_pinning_auto_2cpu(fileutils, test_dir, coloutils, db_type): - exp = Experiment(f"colocated_model_pinning_auto_2cpu_{db_type}", launcher=launcher, exp_path=test_dir) + exp = Experiment( + "colocated_model_pinning_auto_2cpu", + launcher=launcher, + exp_path=test_dir, + ) db_args = { "db_cpus": 2, @@ -125,7 +133,11 @@ def test_colocated_model_pinning_range(fileutils, test_dir, coloutils, db_type): # Check to make sure that the CPU mask was correctly generated # Assume that there are at least 4 cpus on the node - exp = Experiment(f"colocated_model_pinning_manual_{db_type}", launcher=launcher, exp_path=test_dir) + exp = Experiment( + "colocated_model_pinning_manual", + launcher=launcher, + exp_path=test_dir, + ) db_args = { "db_cpus": 4, @@ -152,7 +164,11 @@ def test_colocated_model_pinning_list(fileutils, test_dir, coloutils, db_type): # Check to make sure that the CPU mask was correctly generated # note we presume that this has more than 2 CPUs on the supercomputer node - exp = Experiment(f"colocated_model_pinning_manual_{db_type}", launcher=launcher, exp_path=test_dir) + exp = Experiment( + "colocated_model_pinning_manual", + launcher=launcher, + exp_path=test_dir, + ) db_args = { "db_cpus": 2, @@ -178,7 +194,11 @@ def test_colocated_model_pinning_mixed(fileutils, test_dir, coloutils, db_type): # Check to make sure that the CPU mask was correctly generated # note we presume that this at least 4 CPUs on the supercomputer node - exp = Experiment(f"colocated_model_pinning_manual_{db_type}", launcher=launcher, exp_path=test_dir) + exp = Experiment( + "colocated_model_pinning_manual", + launcher=launcher, + exp_path=test_dir, + ) db_args = { "db_cpus": 2, diff --git a/tests/on_wlm/test_generic_orc_launch.py b/tests/on_wlm/test_generic_orc_launch.py index b5e2e4394..ab100d1a7 100644 --- a/tests/on_wlm/test_generic_orc_launch.py +++ b/tests/on_wlm/test_generic_orc_launch.py @@ -38,8 +38,7 @@ def test_launch_orc_auto(test_dir, wlmutils): launcher = wlmutils.get_test_launcher() exp_name = "test-launch-auto-orc" - exp = Experiment(exp_name, launcher=launcher) - + exp = Experiment(exp_name, launcher=launcher, exp_path=test_dir) # batch = False to launch on existing allocation network_interface = wlmutils.get_test_interface() @@ -71,8 +70,7 @@ def 
test_launch_cluster_orc_single(test_dir, wlmutils): launcher = wlmutils.get_test_launcher() exp_name = "test-launch-auto-cluster-orc-single" - exp = Experiment(exp_name, launcher=launcher) - + exp = Experiment(exp_name, launcher=launcher, exp_path=test_dir) # batch = False to launch on existing allocation network_interface = wlmutils.get_test_interface() @@ -105,8 +103,7 @@ def test_launch_cluster_orc_multi(test_dir, wlmutils): launcher = wlmutils.get_test_launcher() exp_name = "test-launch-auto-cluster-orc-multi" - exp = Experiment(exp_name, launcher=launcher) - + exp = Experiment(exp_name, launcher=launcher, exp_path=test_dir) # batch = False to launch on existing allocation network_interface = wlmutils.get_test_interface() diff --git a/tests/on_wlm/test_launch_errors.py b/tests/on_wlm/test_launch_errors.py index bba0b6b00..3da55ccf9 100644 --- a/tests/on_wlm/test_launch_errors.py +++ b/tests/on_wlm/test_launch_errors.py @@ -40,8 +40,9 @@ def test_failed_status(fileutils, test_dir, wlmutils): """Test when a failure occurs deep into model execution""" exp_name = "test-report-failure" - exp = Experiment(exp_name, launcher=wlmutils.get_test_launcher()) - + exp = Experiment(exp_name, + launcher=wlmutils.get_test_launcher(), + exp_path=test_dir) script = fileutils.get_test_conf_path("bad.py") settings = exp.create_run_settings( @@ -69,8 +70,7 @@ def test_bad_run_command_args(fileutils, test_dir, wlmutils): pytest.skip(f"Only fails with slurm. Launcher is {launcher}") exp_name = "test-bad-run-command-args" - exp = Experiment(exp_name, launcher=launcher) - + exp = Experiment(exp_name, launcher=launcher, exp_path=test_dir) script = fileutils.get_test_conf_path("bad.py") diff --git a/tests/on_wlm/test_launch_ompi_lsf.py b/tests/on_wlm/test_launch_ompi_lsf.py index d0bafe2eb..ed082d22e 100644 --- a/tests/on_wlm/test_launch_ompi_lsf.py +++ b/tests/on_wlm/test_launch_ompi_lsf.py @@ -39,8 +39,7 @@ def test_launch_openmpi_lsf(fileutils, test_dir, wlmutils): if launcher != "lsf": pytest.skip("Test only runs on systems with LSF as WLM") exp_name = "test-launch-openmpi-lsf" - exp = Experiment(exp_name, launcher=launcher) - + exp = Experiment(exp_name, launcher=launcher, exp_path=test_dir) script = fileutils.get_test_conf_path("sleep.py") settings = exp.create_run_settings("python", script, "mpirun") diff --git a/tests/on_wlm/test_restart.py b/tests/on_wlm/test_restart.py index 72c2d2311..ab74b1733 100644 --- a/tests/on_wlm/test_restart.py +++ b/tests/on_wlm/test_restart.py @@ -38,8 +38,9 @@ def test_restart(fileutils, test_dir, wlmutils): exp_name = "test-restart" - exp = Experiment(exp_name, launcher=wlmutils.get_test_launcher()) - + exp = Experiment(exp_name, + launcher=wlmutils.get_test_launcher(), + exp_path=test_dir) script = fileutils.get_test_conf_path("sleep.py") settings = exp.create_run_settings("python", f"{script} --time=5") diff --git a/tests/on_wlm/test_simple_base_settings_on_wlm.py b/tests/on_wlm/test_simple_base_settings_on_wlm.py index bfcac0d5e..45f1972f4 100644 --- a/tests/on_wlm/test_simple_base_settings_on_wlm.py +++ b/tests/on_wlm/test_simple_base_settings_on_wlm.py @@ -56,8 +56,9 @@ def test_simple_model_on_wlm(fileutils, test_dir, wlmutils): ) exp_name = "test-simplebase-settings-model-launch" - exp = Experiment(exp_name, launcher=wlmutils.get_test_launcher()) - + exp = Experiment( + exp_name, launcher=wlmutils.get_test_launcher(), exp_path=test_dir + ) script = fileutils.get_test_conf_path("sleep.py") settings = RunSettings("python", exe_args=f"{script} --time=5") @@ -77,8 +78,9 
@@ def test_simple_model_stop_on_wlm(fileutils, test_dir, wlmutils): ) exp_name = "test-simplebase-settings-model-stop" - exp = Experiment(exp_name, launcher=wlmutils.get_test_launcher()) - + exp = Experiment( + exp_name, launcher=wlmutils.get_test_launcher(), exp_path=test_dir + ) script = fileutils.get_test_conf_path("sleep.py") settings = RunSettings("python", exe_args=f"{script} --time=5") diff --git a/tests/on_wlm/test_simple_entity_launch.py b/tests/on_wlm/test_simple_entity_launch.py index c6146d517..707cb5876 100644 --- a/tests/on_wlm/test_simple_entity_launch.py +++ b/tests/on_wlm/test_simple_entity_launch.py @@ -48,8 +48,9 @@ def test_models(fileutils, test_dir, wlmutils): exp_name = "test-models-launch" - exp = Experiment(exp_name, launcher=wlmutils.get_test_launcher()) - + exp = Experiment( + exp_name, launcher=wlmutils.get_test_launcher(), exp_path=test_dir + ) script = fileutils.get_test_conf_path("sleep.py") settings = exp.create_run_settings("python", f"{script} --time=5") @@ -65,8 +66,9 @@ def test_models(fileutils, test_dir, wlmutils): def test_ensemble(fileutils, test_dir, wlmutils): exp_name = "test-ensemble-launch" - exp = Experiment(exp_name, launcher=wlmutils.get_test_launcher()) - + exp = Experiment( + exp_name, launcher=wlmutils.get_test_launcher(), exp_path=test_dir + ) script = fileutils.get_test_conf_path("sleep.py") settings = exp.create_run_settings("python", f"{script} --time=5") @@ -84,8 +86,9 @@ def test_summary(fileutils, test_dir, wlmutils): """Fairly rudimentary test of the summary dataframe""" exp_name = "test-launch-summary" - exp = Experiment(exp_name, launcher=wlmutils.get_test_launcher()) - + exp = Experiment( + exp_name, launcher=wlmutils.get_test_launcher(), exp_path=test_dir + ) sleep = fileutils.get_test_conf_path("sleep.py") bad = fileutils.get_test_conf_path("bad.py") diff --git a/tests/on_wlm/test_stop.py b/tests/on_wlm/test_stop.py index 36f8c5400..41f3bd54e 100644 --- a/tests/on_wlm/test_stop.py +++ b/tests/on_wlm/test_stop.py @@ -44,8 +44,9 @@ def test_stop_entity(fileutils, test_dir, wlmutils): exp_name = "test-launch-stop-model" - exp = Experiment(exp_name, launcher=wlmutils.get_test_launcher()) - + exp = Experiment( + exp_name, launcher=wlmutils.get_test_launcher(), exp_path=test_dir + ) script = fileutils.get_test_conf_path("sleep.py") settings = exp.create_run_settings("python", f"{script} --time=10") @@ -62,8 +63,9 @@ def test_stop_entity(fileutils, test_dir, wlmutils): def test_stop_entity_list(fileutils, test_dir, wlmutils): exp_name = "test-launch-stop-ensemble" - exp = Experiment(exp_name, launcher=wlmutils.get_test_launcher()) - + exp = Experiment( + exp_name, launcher=wlmutils.get_test_launcher(), exp_path=test_dir + ) script = fileutils.get_test_conf_path("sleep.py") settings = exp.create_run_settings("python", f"{script} --time=10") diff --git a/tests/test_cli.py b/tests/test_cli.py index 79471a355..31fce4cd0 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -26,12 +26,15 @@ import argparse from contextlib import contextmanager +import logging +import os +import pathlib import typing as t import pytest import smartsim -from smartsim._core._cli import build, cli +from smartsim._core._cli import build, cli, plugin from smartsim._core._cli.build import configure_parser as build_parser from smartsim._core._cli.build import execute as build_execute from smartsim._core._cli.clean import configure_parser as clean_parser @@ -48,6 +51,14 @@ # The tests in this file belong to the group_a group pytestmark = pytest.mark.group_a 
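[Editor's note] The `tests/test_cli.py` changes that follow (each mock handler gains an optional `_unparsed` argument, and the exec lambdas take two parameters) track a new CLI handler contract in which execute functions also receive the arguments argparse could not parse, e.g. so plugin commands can forward them. A minimal, hypothetical sketch of that contract:

import argparse
import typing as t


def execute(ns: argparse.Namespace, unparsed: t.Optional[t.List[str]] = None) -> int:
    # a handler consumes its parsed namespace and may forward leftover args
    print(ns.verbose, unparsed or [])
    return 0


parser = argparse.ArgumentParser()
parser.add_argument("--verbose", action="store_true")
ns, unparsed = parser.parse_known_args(["--verbose", "--plugin-only-flag"])
assert execute(ns, unparsed) == 0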
+_TEST_LOGGER = logging.getLogger(__name__)
+
+try:
+    import smartdashboard
+except ImportError:
+    test_dash_plugin = False
+else:
+    test_dash_plugin = True
 
 def mock_execute_custom(msg: str = None, good: bool = True) -> int:
     retval = 0 if good else 1
     return retval
 
 
-def mock_execute_good(_ns: argparse.Namespace) -> int:
+def mock_execute_good(_ns: argparse.Namespace, _unparsed: t.Optional[t.List[str]] = None) -> int:
     return mock_execute_custom("GOOD THINGS", good = True)
 
 
-def mock_execute_fail(_ns: argparse.Namespace) -> int:
+def mock_execute_fail(_ns: argparse.Namespace, _unparsed: t.Optional[t.List[str]] = None) -> int:
     return mock_execute_custom("BAD THINGS", good = False)
 
 
@@ -220,8 +231,8 @@ def test_cli_command_execution(capsys):
     exp_b_help = "this is my mock help text for build"
     exp_b_cmd = "build"
 
-    dbcli_exec = lambda x: mock_execute_custom(msg="Database", good=True)
-    build_exec = lambda x: mock_execute_custom(msg="Builder", good=True)
+    dbcli_exec = lambda x, y: mock_execute_custom(msg="Database", good=True)
+    build_exec = lambda x, y: mock_execute_custom(msg="Builder", good=True)
 
     menu = [cli.MenuItemConfig(exp_a_cmd,
                                exp_a_help,
@@ -269,7 +280,7 @@ def test_cli_default_cli(capsys):
     # show that `smart dbcli` calls the build parser and build execute function
     assert "usage: smart [-h] " in captured.out
     assert "Available commands" in captured.out
-    assert ret_val == 0
+    assert ret_val == os.EX_USAGE
 
     # execute with `build` argument, expect build-specific help text
     with pytest.raises(SystemExit) as e:
@@ -281,7 +292,7 @@ def test_cli_default_cli(capsys):
     assert "usage: smart build [-h]" in captured.out
     assert "Build SmartSim dependencies" in captured.out
     assert "optional arguments:" in captured.out or "options:" in captured.out
-    assert ret_val == 0
+    assert ret_val == os.EX_USAGE
 
     # execute with `clean` argument, expect clean-specific help text
     with pytest.raises(SystemExit) as e:
@@ -294,7 +305,7 @@ def test_cli_default_cli(capsys):
     assert "Remove previous ML runtime installation" in captured.out
     assert "optional arguments:" in captured.out or "options:" in captured.out
     assert "--clobber" in captured.out
-    assert ret_val == 0
+    assert ret_val == os.EX_USAGE
 
     # execute with `dbcli` argument, expect dbcli-specific help text
     with pytest.raises(SystemExit) as e:
@@ -306,7 +317,7 @@ def test_cli_default_cli(capsys):
     assert "usage: smart dbcli [-h]" in captured.out
     assert "Print the path to the redis-cli binary" in captured.out
     assert "optional arguments:" in captured.out or "options:" in captured.out
-    assert ret_val == 0
+    assert ret_val == os.EX_USAGE
 
     # execute with `site` argument, expect site-specific help text
     with pytest.raises(SystemExit) as e:
@@ -318,7 +329,7 @@ def test_cli_default_cli(capsys):
     assert "usage: smart site [-h]" in captured.out
     assert "Print the installation site of SmartSim" in captured.out
     assert "optional arguments:" in captured.out or "options:" in captured.out
-    assert ret_val == 0
+    assert ret_val == os.EX_USAGE
 
     # execute with `clobber` argument, expect clobber-specific help text
     with pytest.raises(SystemExit) as e:
@@ -331,8 +342,61 @@ def test_cli_default_cli(capsys):
     assert "Remove all previous dependency installations" in captured.out
     assert "optional arguments:" in captured.out or "options:" in captured.out
     # assert "--clobber" not in captured.out
-    assert ret_val == 0
+    assert ret_val == os.EX_USAGE
+
+
+@pytest.mark.skipif(not test_dash_plugin, reason="plugin not found")
+def 
test_cli_plugin_dashboard(capfd): + """Ensure expected dashboard CLI plugin commands are supported""" + smart_cli = cli.default_cli() + capfd.readouterr() # throw away existing output + + # execute with `dashboard` argument, expect dashboard-specific help text + build_args = ["smart", "dashboard", "-h"] + rc = smart_cli.execute(build_args) + + captured = capfd.readouterr() # capture new output + + assert "[-d DIRECTORY]" in captured.out + assert "[-p PORT]" in captured.out + + assert "optional arguments:" in captured.out + assert rc == 0 + + +def test_cli_plugin_invalid( + monkeypatch: pytest.MonkeyPatch, caplog: pytest.LogCaptureFixture +): + """Ensure unexpected CLI plugins are reported""" + import smartsim._core._cli.cli + import smartsim._core._cli.plugin + plugin_module = "notinstalled.Experiment_Overview" + bad_plugins = [ + lambda: MenuItemConfig( + "dashboard", + "Start the SmartSim dashboard", + plugin.dynamic_execute(plugin_module, "Dashboard!"), + is_plugin=True, + ) + ] + monkeypatch.setattr(smartsim._core._cli.cli, "plugins", bad_plugins) + # Coloredlogs doesn't play nice with caplog + monkeypatch.setattr( + smartsim._core._cli.plugin, + "_LOGGER", + _TEST_LOGGER, + ) + + smart_cli = cli.default_cli() + # execute with `dashboard` argument, expect failure to find dashboard plugin + build_args = ["smart", "dashboard", "-h"] + + rc = smart_cli.execute(build_args) + + assert plugin_module in caplog.text + assert "not found" in caplog.text + assert rc == os.EX_CONFIG @pytest.mark.parametrize( "command,mock_location,exp_output", @@ -348,7 +412,7 @@ def test_cli_default_cli(capsys): ) def test_cli_action(capsys, monkeypatch, command, mock_location, exp_output): """Ensure the default CLI executes the build action""" - def mock_execute(ns: argparse.Namespace): + def mock_execute(ns: argparse.Namespace, _unparsed: t.Optional[t.List[str]] = None): print(exp_output) return 0 @@ -400,7 +464,7 @@ def test_cli_optional_args(capsys, check_prop: str, exp_prop_val: t.Any): """Ensure the parser for a command handles expected optional arguments""" - def mock_execute(ns: argparse.Namespace): + def mock_execute(ns: argparse.Namespace, _unparsed: t.Optional[t.List[str]] = None): print(exp_output) return 0 @@ -418,9 +482,6 @@ def mock_execute(ns: argparse.Namespace): assert exp_output in captured.out # did the expected execution method occur? assert ret_val == 0 # is the retval is non-failure code? - - # is the value from the optional argument set in the parsed args? 
- assert smart_cli.args.__dict__[check_prop] == exp_prop_val else: with pytest.raises(SystemExit) as e: ret_val = smart_cli.execute(build_args) @@ -449,7 +510,7 @@ def test_cli_help_support(capsys, mock_output: str, exp_output: str): """Ensure the parser supports help optional for commands as expected""" - def mock_execute(ns: argparse.Namespace): + def mock_execute(ns: argparse.Namespace, unparsed: t.Optional[t.List[str]] = None): print(mock_output) return 0 @@ -487,7 +548,7 @@ def test_cli_invalid_optional_args(capsys, mock_location: str, exp_output: str): """Ensure the parser throws expected error for an invalid argument""" - def mock_execute(ns: argparse.Namespace): + def mock_execute(ns: argparse.Namespace, unparsed: t.Optional[t.List[str]] = None): print(exp_output) return 0 @@ -540,12 +601,12 @@ def test_cli_full_clean_execute(capsys, monkeypatch): exp_retval = 0 exp_output = "mocked-clean utility" - def mock_operation(*args, **kwargs) -> int: + # mock out the internal clean method so we don't actually delete anything + def mock_clean(core_path: pathlib.Path, _all: bool = False) -> int: print(exp_output) return exp_retval - - # mock out the internal clean method so we don't actually delete anything - monkeypatch.setattr(smartsim._core._cli.clean, "clean", mock_operation) + + monkeypatch.setattr(smartsim._core._cli.clean, "clean", mock_clean) command = "clean" cfg = MenuItemConfig(command, @@ -692,7 +753,7 @@ def mock_operation(*args, **kwargs) -> int: def _good_build(*args, **kwargs): - print("LGTM") + _TEST_LOGGER.info("LGTM") def _bad_build(*args, **kwargs): @@ -707,17 +768,17 @@ def _mock_temp_dir(*a, **kw): @pytest.mark.parametrize( "mock_verify_fn, expected_stdout, expected_retval", [ - pytest.param(_good_build, 'LGTM', 0, id="Configured Correctly"), + pytest.param(_good_build, 'LGTM', os.EX_OK, id="Configured Correctly"), pytest.param( _bad_build, "SmartSim failed to run a simple experiment", - 2, + os.EX_SOFTWARE, id="Configured Incorrectly", ) ], ) -def test_cli_build_test_execute( - capsys, +def test_cli_validation_test_execute( + caplog, monkeypatch, mock_verify_fn, expected_stdout, @@ -728,6 +789,7 @@ def test_cli_build_test_execute( checks that if at any point the test raises an exception an appropriate error code and error msg are returned. 
""" + caplog.set_level(logging.INFO) # Mock out the verification tests/avoid file system ops monkeypatch.setattr(smartsim._core._cli.validate, "test_install", mock_verify_fn) @@ -736,11 +798,11 @@ def test_cli_build_test_execute( "_VerificationTempDir", _mock_temp_dir, ) - # Coloredlogs doesn't play nice with capsys + # Coloredlogs doesn't play nice with caplog monkeypatch.setattr( - smartsim._core._cli.validate.logger, - "error", - print, + smartsim._core._cli.validate, + "logger", + _TEST_LOGGER, ) command = "validate" @@ -751,12 +813,8 @@ def test_cli_build_test_execute( menu = [cfg] smart_cli = cli.SmartCli(menu) - captured = capsys.readouterr() # throw away existing output - verify_args = ["smart", command] actual_retval = smart_cli.execute(verify_args) - captured = capsys.readouterr() # capture new output - - assert expected_stdout in captured.out + assert expected_stdout in caplog.text assert actual_retval == expected_retval diff --git a/tests/test_colo_model_local.py b/tests/test_colo_model_local.py index bfeacad8c..9550e9b87 100644 --- a/tests/test_colo_model_local.py +++ b/tests/test_colo_model_local.py @@ -45,11 +45,11 @@ @pytest.mark.skipif(not is_mac, reason="MacOS-only test") -def test_macosx_warning(fileutils, coloutils): +def test_macosx_warning(fileutils, test_dir, coloutils): db_args = {"custom_pinning": [1]} db_type = "uds" # Test is insensitive to choice of db - exp = Experiment("colocated_model_defaults", launcher="local") + exp = Experiment("colocated_model_defaults", launcher="local", exp_path=test_dir) with pytest.warns( RuntimeWarning, match="CPU pinning is not supported on MacOSX. Ignoring pinning specification.", @@ -63,11 +63,11 @@ def test_macosx_warning(fileutils, coloutils): ) -def test_unsupported_limit_app(fileutils, coloutils): +def test_unsupported_limit_app(fileutils, test_dir, coloutils): db_args = {"limit_app_cpus": True} db_type = "uds" # Test is insensitive to choice of db - exp = Experiment("colocated_model_defaults", launcher="local") + exp = Experiment("colocated_model_defaults", launcher="local", exp_path=test_dir) with pytest.raises(SSUnsupportedError): coloutils.setup_test_colo( fileutils, @@ -80,11 +80,11 @@ def test_unsupported_limit_app(fileutils, coloutils): @pytest.mark.skipif(is_mac, reason="Unsupported on MacOSX") @pytest.mark.parametrize("custom_pinning", [1, "10", "#", 1.0, ["a"], [1.0]]) -def test_unsupported_custom_pinning(fileutils, coloutils, custom_pinning): +def test_unsupported_custom_pinning(fileutils, test_dir, coloutils, custom_pinning): db_type = "uds" # Test is insensitive to choice of db db_args = {"custom_pinning": custom_pinning} - exp = Experiment("colocated_model_defaults", launcher="local") + exp = Experiment("colocated_model_defaults", launcher="local", exp_path=test_dir) with pytest.raises(TypeError): coloutils.setup_test_colo( fileutils, @@ -120,8 +120,7 @@ def test_launch_colocated_model_defaults( db_args = {} - - exp = Experiment(f"colocated_model_defaults_{db_type}", test_dir, launcher=launcher) + exp = Experiment("colocated_model_defaults", launcher=launcher, exp_path=test_dir) colo_model = coloutils.setup_test_colo( fileutils, db_type, @@ -193,9 +192,7 @@ def test_colocated_model_disable_pinning( fileutils, test_dir, coloutils, db_type, launcher="local" ): exp = Experiment( - f"colocated_model_pinning_auto_1cpu_{db_type}", - launcher=launcher, - exp_path=test_dir, + "colocated_model_pinning_auto_1cpu", launcher=launcher, exp_path=test_dir ) db_args = { "db_cpus": 1, @@ -221,9 +218,7 @@ def 
test_colocated_model_pinning_auto_2cpu( fileutils, test_dir, coloutils, db_type, launcher="local" ): exp = Experiment( - f"colocated_model_pinning_auto_2cpu_{db_type}", - launcher=launcher, - exp_path=test_dir, + "colocated_model_pinning_auto_2cpu", launcher=launcher, exp_path=test_dir ) db_args = { @@ -259,9 +254,7 @@ def test_colocated_model_pinning_range( # Check to make sure that the CPU mask was correctly generated exp = Experiment( - f"colocated_model_pinning_manual_{db_type}", - launcher=launcher, - exp_path=test_dir, + "colocated_model_pinning_manual", launcher=launcher, exp_path=test_dir ) db_args = {"db_cpus": 2, "custom_pinning": range(2)} @@ -288,9 +281,7 @@ def test_colocated_model_pinning_list( # Check to make sure that the CPU mask was correctly generated exp = Experiment( - f"colocated_model_pinning_manual_{db_type}", - launcher=launcher, - exp_path=test_dir, + "colocated_model_pinning_manual", launcher=launcher, exp_path=test_dir ) db_args = {"db_cpus": 1, "custom_pinning": [1]} diff --git a/tests/test_config.py b/tests/test_config.py index 762f5c9a7..1198871cc 100644 --- a/tests/test_config.py +++ b/tests/test_config.py @@ -194,3 +194,57 @@ def test_redis_cli(): with pytest.raises(SSConfigError): config.database_cli os.environ.pop("REDIS_CLI_PATH") + + +@pytest.mark.parametrize( + "value, exp_result", [ + pytest.param("0", False, id="letter zero"), + pytest.param("1", True, id="letter one"), + pytest.param("-1", False, id="letter negative one"), + pytest.param(None, False, id="not in env"), + ] +) +def test_telemetry_flag(monkeypatch: pytest.MonkeyPatch, + value: t.Optional[str], + exp_result: bool): + if value is not None: + monkeypatch.setenv("SMARTSIM_FLAG_TELEMETRY", value) + else: + monkeypatch.delenv("SMARTSIM_FLAG_TELEMETRY", raising=False) + config = Config() + assert config.telemetry_enabled == exp_result + +@pytest.mark.parametrize( + "value, exp_result", [ + pytest.param("1", 1, id="1"), + pytest.param("123", 123, id="123"), + pytest.param(None, 5, id="not in env"), + ] +) +def test_telemetry_frequency( + monkeypatch: pytest.MonkeyPatch, value: t.Optional[str], exp_result: int ): + if value is not None: + monkeypatch.setenv("SMARTSIM_TELEMETRY_FREQUENCY", value) + else: + monkeypatch.delenv("SMARTSIM_TELEMETRY_FREQUENCY", raising=False) + config = Config() + assert config.telemetry_frequency == exp_result + + +@pytest.mark.parametrize( + "value, exp_result", [ + pytest.param("30", 30, id="30"), + pytest.param("123", 123, id="123"), + pytest.param(None, 90, id="not in env"), + ] +) +def test_telemetry_cooldown( + monkeypatch: pytest.MonkeyPatch, value: t.Optional[str], exp_result: int ): + if value is not None: + monkeypatch.setenv("SMARTSIM_TELEMETRY_COOLDOWN", value) + else: + monkeypatch.delenv("SMARTSIM_TELEMETRY_COOLDOWN", raising=False) + config = Config() + assert config.telemetry_cooldown == exp_result
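# NOTE: a minimal sketch (not part of this patch) of the env-var-backed
# properties the three tests above exercise. The property names match the
# Config attributes asserted in the tests; the parsing shown here is an
# assumption inferred from the expected values (truthy "1", defaults 5 and 90),
# not SmartSim's actual implementation.
import os

class ConfigSketch:  # hypothetical stand-in for smartsim._core.config.Config
    @property
    def telemetry_enabled(self) -> bool:
        # only the literal string "1" enables telemetry; unset, "0", and "-1"
        # all read as disabled
        return os.environ.get("SMARTSIM_FLAG_TELEMETRY", "0") == "1"

    @property
    def telemetry_frequency(self) -> int:
        return int(os.environ.get("SMARTSIM_TELEMETRY_FREQUENCY", "5"))

    @property
    def telemetry_cooldown(self) -> int:
        return int(os.environ.get("SMARTSIM_TELEMETRY_COOLDOWN", "90"))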
diff --git a/tests/test_configs/echo.py b/tests/test_configs/echo.py new file mode 100644 index 000000000..8a34a0b6f --- /dev/null +++ b/tests/test_configs/echo.py @@ -0,0 +1,42 @@ +# BSD 2-Clause License +# +# Copyright (c) 2021-2023, Hewlett Packard Enterprise +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import argparse +import time + + +def echo(message: str, sleep_time: int): + if sleep_time > 0: + time.sleep(sleep_time) + print(f"Echoing: {message}") + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("--message", type=str, default="Lorem ipsum") + parser.add_argument("--sleep_time", type=int, default=0) + args = parser.parse_args() + echo(args.message, args.sleep_time) diff --git a/tests/test_configs/printing_model.py b/tests/test_configs/printing_model.py new file mode 100644 index 000000000..044b2a03b --- /dev/null +++ b/tests/test_configs/printing_model.py @@ -0,0 +1,18 @@ +import time +import sys + + +def main() -> int: + print(";START;") + time.sleep(20) + print(";MID;") + print("This is an error msg", file=sys.stderr) + time.sleep(20) + print(";END;") + + print("yay!!") + return 0 + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/tests/test_configs/telemetry/colocatedmodel.json b/tests/test_configs/telemetry/colocatedmodel.json new file mode 100644 index 000000000..f3e93ac76 --- /dev/null +++ b/tests/test_configs/telemetry/colocatedmodel.json @@ -0,0 +1,69 @@ +{ + "schema info": { + "schema_name": "entity manifest", + "version": "0.0.1" + }, + "experiment": { + "name": "my-exp", + "path": "/tmp/my-exp", + "launcher": "Slurm" + }, + "runs": [ + { + "run_id": "002816b", + "timestamp": 1699037041106269774, + "model": [ + { + "name": "colocated_model", + "path": "/tmp/my-exp/colocated_model", + "exe_args": [ + "/path/to/my/script.py" + ], + "run_settings": { + "exe": [ + "/path/to/my/python" + ], + "run_command": "/opt/slurm/20.11.5/bin/srun", + "run_args": {} + }, + "batch_settings": {}, + "params": {}, + "files": { + "Symlink": [], + "Configure": [], + "Copy": [] + }, + "colocated_db": { + "settings": { + "unix_socket": "/tmp/redis.socket", + "socket_permissions": 755, + "port": 0, + "cpus": 1, + "custom_pinning": "0", + "debug": false, + "db_identifier": "", + "rai_args": { + "threads_per_queue": null, + "inter_op_parallelism": null, + "intra_op_parallelism": null + }, + "extra_db_args": {} + }, + "scripts": [], + "models": [] + }, + "telemetry_metadata": { + "status_dir": "/tmp/my-exp/.smartsim/telemetry/telemetry_ensemble/002816b/model/colocated_model", + "step_id": "4139111.21", + "task_id": "21529", + "managed": true + }, + "out_file": "/tmp/my-exp/colocated_model/colocated_model.out", + "err_file": "/tmp/my-exp/colocated_model/colocated_model.err" + } + ], + "orchestrator": [], + "ensemble": [] + } + ] +} diff --git
a/tests/test_configs/telemetry/db_and_model.json b/tests/test_configs/telemetry/db_and_model.json new file mode 100644 index 000000000..58c1c841a --- /dev/null +++ b/tests/test_configs/telemetry/db_and_model.json @@ -0,0 +1,86 @@ +{ + "schema info": { + "schema_name": "entity manifest", + "version": "0.0.1" + }, + "experiment": { + "name": "my-exp", + "path": "/tmp/my-exp", + "launcher": "Slurm" + }, + "runs": [ + { + "run_id": "2ca19ad", + "timestamp": 1699038647234488933, + "model": [], + "orchestrator": [ + { + "name": "orchestrator", + "type": "redis", + "interface": [ + "ipogif0" + ], + "shards": [ + { + "name": "orchestrator_0", + "hostname": "10.128.0.4", + "port": 6780, + "cluster": false, + "conf_file": null, + "out_file": "/path/to/some/file.out", + "err_file": "/path/to/some/file.err", + "telemetry_metadata": { + "status_dir": "/tmp/my-exp/.smartsim/telemetry/telemetry_db_and_model/2ca19ad/database/orchestrator/orchestrator_0", + "step_id": "4139111.27", + "task_id": "1452", + "managed": true + } + } + ] + } + ], + "ensemble": [] + }, + { + "run_id": "4b5507a", + "timestamp": 1699038661491043211, + "model": [ + { + "name": "perroquet", + "path": "/tmp/my-exp/perroquet", + "exe_args": [ + "/path/to/my/script.py" + ], + "run_settings": { + "exe": [ + "/path/to/my/python" + ], + "run_command": "/opt/slurm/20.11.5/bin/srun", + "run_args": { + "nodes": 1, + "ntasks-per-node": 1 + } + }, + "batch_settings": {}, + "params": {}, + "files": { + "Symlink": [], + "Configure": [], + "Copy": [] + }, + "colocated_db": {}, + "telemetry_metadata": { + "status_dir": "/tmp/my-exp/.smartsim/telemetry/telemetry_db_and_model/4b5507a/model/perroquet", + "step_id": "4139111.28", + "task_id": "2929", + "managed": true + }, + "out_file": "/tmp/my-exp/perroquet/perroquet.out", + "err_file": "/tmp/my-exp/perroquet/perroquet.err" + } + ], + "orchestrator": [], + "ensemble": [] + } + ] +} diff --git a/tests/test_configs/telemetry/db_and_model_1run.json b/tests/test_configs/telemetry/db_and_model_1run.json new file mode 100644 index 000000000..44e32bfe4 --- /dev/null +++ b/tests/test_configs/telemetry/db_and_model_1run.json @@ -0,0 +1,79 @@ +{ + "schema info": { + "schema_name": "entity manifest", + "version": "0.0.1" + }, + "experiment": { + "name": "my-exp", + "path": "/tmp/my-exp", + "launcher": "Slurm" + }, + "runs": [ + { + "run_id": "4b5507a", + "timestamp": 1699038661491043211, + "model": [ + { + "name": "perroquet", + "path": "/tmp/my-exp/perroquet", + "exe_args": [ + "/path/to/my/script.py" + ], + "run_settings": { + "exe": [ + "/path/to/my/python" + ], + "run_command": "/opt/slurm/20.11.5/bin/srun", + "run_args": { + "nodes": 1, + "ntasks-per-node": 1 + } + }, + "batch_settings": {}, + "params": {}, + "files": { + "Symlink": [], + "Configure": [], + "Copy": [] + }, + "colocated_db": {}, + "telemetry_metadata": { + "status_dir": "/tmp/my-exp/.smartsim/telemetry/telemetry_db_and_model/4b5507a/model/perroquet", + "step_id": "4139111.28", + "task_id": "2929", + "managed": true + }, + "out_file": "/tmp/my-exp/perroquet/perroquet.out", + "err_file": "/tmp/my-exp/perroquet/perroquet.err" + } + ], + "orchestrator": [ + { + "name": "orchestrator", + "type": "redis", + "interface": [ + "ipogif0" + ], + "shards": [ + { + "name": "orchestrator_0", + "hostname": "10.128.0.4", + "port": 6780, + "cluster": false, + "conf_file": null, + "out_file": "/path/to/some/file.out", + "err_file": "/path/to/some/file.err", + "telemetry_metadata": { + "status_dir": 
"/tmp/my-exp/.smartsim/telemetry/telemetry_db_and_model/2ca19ad/database/orchestrator/orchestrator_0", + "step_id": "4139111.27", + "task_id": "1452", + "managed": true + } + } + ] + } + ], + "ensemble": [] + } + ] +} diff --git a/tests/test_configs/telemetry/ensembles.json b/tests/test_configs/telemetry/ensembles.json new file mode 100644 index 000000000..841324ec6 --- /dev/null +++ b/tests/test_configs/telemetry/ensembles.json @@ -0,0 +1,329 @@ +{ + "schema info": { + "schema_name": "entity manifest", + "version": "0.0.1" + }, + "experiment": { + "name": "my-exp", + "path": "/home/someuser/code/ss/my-exp", + "launcher": "Local" + }, + "runs": [ + { + "run_id": "d041b90", + "timestamp": 1698679830384608928, + "model": [], + "orchestrator": [], + "ensemble": [ + { + "name": "my-ens", + "params": { + "START": [ + "spam", + "foo" + ], + "MID": [ + "eggs", + "bar" + ], + "END": [ + "ham", + "baz" + ] + }, + "batch_settings": {}, + "models": [ + { + "name": "my-ens_0", + "path": "/home/someuser/code/ss", + "exe_args": [ + "yo.py" + ], + "run_settings": { + "exe": [ + "/home/someuser/.pyenv/versions/3.9.16/envs/ss/bin/python" + ], + "run_command": null, + "run_args": {} + }, + "batch_settings": {}, + "params": { + "START": "spam", + "MID": "eggs", + "END": "ham" + }, + "files": { + "Symlink": [], + "Configure": [ + "/home/someuser/code/ss/manifest/demo/yo.py" + ], + "Copy": [] + }, + "colocated_db": {}, + "telemetry_metadata": { + "status_dir": "/home/someuser/code/ss/my-exp/.smartsim/telemetry/my-exp/d041b90/ensemble/my-ens/my-ens_0", + "step_id": null, + "task_id": "88118", + "managed": false + }, + "out_file": "/home/someuser/code/ss/my-ens_0.out", + "err_file": "/home/someuser/code/ss/my-ens_0.err" + }, + { + "name": "my-ens_1", + "path": "/home/someuser/code/ss", + "exe_args": [ + "yo.py" + ], + "run_settings": { + "exe": [ + "/home/someuser/.pyenv/versions/3.9.16/envs/ss/bin/python" + ], + "run_command": null, + "run_args": {} + }, + "batch_settings": {}, + "params": { + "START": "spam", + "MID": "eggs", + "END": "baz" + }, + "files": { + "Symlink": [], + "Configure": [ + "/home/someuser/code/ss/manifest/demo/yo.py" + ], + "Copy": [] + }, + "colocated_db": {}, + "telemetry_metadata": { + "status_dir": "/home/someuser/code/ss/my-exp/.smartsim/telemetry/my-exp/d041b90/ensemble/my-ens/my-ens_1", + "step_id": null, + "task_id": "88131", + "managed": false + }, + "out_file": "/home/someuser/code/ss/my-ens_1.out", + "err_file": "/home/someuser/code/ss/my-ens_1.err" + }, + { + "name": "my-ens_2", + "path": "/home/someuser/code/ss", + "exe_args": [ + "yo.py" + ], + "run_settings": { + "exe": [ + "/home/someuser/.pyenv/versions/3.9.16/envs/ss/bin/python" + ], + "run_command": null, + "run_args": {} + }, + "batch_settings": {}, + "params": { + "START": "spam", + "MID": "bar", + "END": "ham" + }, + "files": { + "Symlink": [], + "Configure": [ + "/home/someuser/code/ss/manifest/demo/yo.py" + ], + "Copy": [] + }, + "colocated_db": {}, + "telemetry_metadata": { + "status_dir": "/home/someuser/code/ss/my-exp/.smartsim/telemetry/my-exp/d041b90/ensemble/my-ens/my-ens_2", + "step_id": null, + "task_id": "88146", + "managed": false + }, + "out_file": "/home/someuser/code/ss/my-ens_2.out", + "err_file": "/home/someuser/code/ss/my-ens_2.err" + }, + { + "name": "my-ens_3", + "path": "/home/someuser/code/ss", + "exe_args": [ + "yo.py" + ], + "run_settings": { + "exe": [ + "/home/someuser/.pyenv/versions/3.9.16/envs/ss/bin/python" + ], + "run_command": null, + "run_args": {} + }, + "batch_settings": {}, + 
"params": { + "START": "spam", + "MID": "bar", + "END": "baz" + }, + "files": { + "Symlink": [], + "Configure": [ + "/home/someuser/code/ss/manifest/demo/yo.py" + ], + "Copy": [] + }, + "colocated_db": {}, + "telemetry_metadata": { + "status_dir": "/home/someuser/code/ss/my-exp/.smartsim/telemetry/my-exp/d041b90/ensemble/my-ens/my-ens_3", + "step_id": null, + "task_id": "88170", + "managed": false + }, + "out_file": "/home/someuser/code/ss/my-ens_3.out", + "err_file": "/home/someuser/code/ss/my-ens_3.err" + }, + { + "name": "my-ens_4", + "path": "/home/someuser/code/ss", + "exe_args": [ + "yo.py" + ], + "run_settings": { + "exe": [ + "/home/someuser/.pyenv/versions/3.9.16/envs/ss/bin/python" + ], + "run_command": null, + "run_args": {} + }, + "batch_settings": {}, + "params": { + "START": "foo", + "MID": "eggs", + "END": "ham" + }, + "files": { + "Symlink": [], + "Configure": [ + "/home/someuser/code/ss/manifest/demo/yo.py" + ], + "Copy": [] + }, + "colocated_db": {}, + "telemetry_metadata": { + "status_dir": "/home/someuser/code/ss/my-exp/.smartsim/telemetry/my-exp/d041b90/ensemble/my-ens/my-ens_4", + "step_id": null, + "task_id": "88178", + "managed": false + }, + "out_file": "/home/someuser/code/ss/my-ens_4.out", + "err_file": "/home/someuser/code/ss/my-ens_4.err" + }, + { + "name": "my-ens_5", + "path": "/home/someuser/code/ss", + "exe_args": [ + "yo.py" + ], + "run_settings": { + "exe": [ + "/home/someuser/.pyenv/versions/3.9.16/envs/ss/bin/python" + ], + "run_command": null, + "run_args": {} + }, + "batch_settings": {}, + "params": { + "START": "foo", + "MID": "eggs", + "END": "baz" + }, + "files": { + "Symlink": [], + "Configure": [ + "/home/someuser/code/ss/manifest/demo/yo.py" + ], + "Copy": [] + }, + "colocated_db": {}, + "telemetry_metadata": { + "status_dir": "/home/someuser/code/ss/my-exp/.smartsim/telemetry/my-exp/d041b90/ensemble/my-ens/my-ens_5", + "step_id": null, + "task_id": "88193", + "managed": false + }, + "out_file": "/home/someuser/code/ss/my-ens_5.out", + "err_file": "/home/someuser/code/ss/my-ens_5.err" + }, + { + "name": "my-ens_6", + "path": "/home/someuser/code/ss", + "exe_args": [ + "yo.py" + ], + "run_settings": { + "exe": [ + "/home/someuser/.pyenv/versions/3.9.16/envs/ss/bin/python" + ], + "run_command": null, + "run_args": {} + }, + "batch_settings": {}, + "params": { + "START": "foo", + "MID": "bar", + "END": "ham" + }, + "files": { + "Symlink": [], + "Configure": [ + "/home/someuser/code/ss/manifest/demo/yo.py" + ], + "Copy": [] + }, + "colocated_db": {}, + "telemetry_metadata": { + "status_dir": "/home/someuser/code/ss/my-exp/.smartsim/telemetry/my-exp/d041b90/ensemble/my-ens/my-ens_6", + "step_id": null, + "task_id": "88221", + "managed": false + }, + "out_file": "/home/someuser/code/ss/my-ens_6.out", + "err_file": "/home/someuser/code/ss/my-ens_6.err" + }, + { + "name": "my-ens_7", + "path": "/home/someuser/code/ss", + "exe_args": [ + "yo.py" + ], + "run_settings": { + "exe": [ + "/home/someuser/.pyenv/versions/3.9.16/envs/ss/bin/python" + ], + "run_command": null, + "run_args": {} + }, + "batch_settings": {}, + "params": { + "START": "foo", + "MID": "bar", + "END": "baz" + }, + "files": { + "Symlink": [], + "Configure": [ + "/home/someuser/code/ss/manifest/demo/yo.py" + ], + "Copy": [] + }, + "colocated_db": {}, + "telemetry_metadata": { + "status_dir": "/home/someuser/code/ss/my-exp/.smartsim/telemetry/my-exp/d041b90/ensemble/my-ens/my-ens_7", + "step_id": null, + "task_id": "88241", + "managed": false + }, + "out_file": 
"/home/someuser/code/ss/my-ens_7.out", + "err_file": "/home/someuser/code/ss/my-ens_7.err" + } + ] + } + ] + } + ] + } diff --git a/tests/test_configs/telemetry/serialmodels.json b/tests/test_configs/telemetry/serialmodels.json new file mode 100644 index 000000000..40337eceb --- /dev/null +++ b/tests/test_configs/telemetry/serialmodels.json @@ -0,0 +1,186 @@ +{ + "schema info": { + "schema_name": "entity manifest", + "version": "0.0.1" + }, + "experiment": { + "name": "my-exp", + "path": "/tmp/my-exp", + "launcher": "Slurm" + }, + "runs": [ + { + "run_id": "8c0fbb1", + "timestamp": 1699037881502730708, + "model": [ + { + "name": "perroquet_0", + "path": "/tmp/my-exp/perroquet_0", + "exe_args": [ + "/tmp/echo.py" + ], + "run_settings": { + "exe": [ + "/path/to/some/python" + ], + "run_command": "/opt/slurm/20.11.5/bin/srun", + "run_args": { + "nodes": 1, + "ntasks-per-node": 1 + } + }, + "batch_settings": {}, + "params": {}, + "files": { + "Symlink": [], + "Configure": [], + "Copy": [] + }, + "colocated_db": {}, + "telemetry_metadata": { + "status_dir": "/tmp/my-exp/.smartsim/telemetry/telemetry_serial_models/8c0fbb1/model/perroquet_0", + "step_id": "4139111.22", + "task_id": "17966", + "managed": true + }, + "out_file": "/tmp/my-exp/perroquet_0/perroquet_0.out", + "err_file": "/tmp/my-exp/perroquet_0/perroquet_0.err" + }, + { + "name": "perroquet_1", + "path": "/tmp/my-exp/perroquet_1", + "exe_args": [ + "/tmp/echo.py" + ], + "run_settings": { + "exe": [ + "/path/to/some/python" + ], + "run_command": "/opt/slurm/20.11.5/bin/srun", + "run_args": { + "nodes": 1, + "ntasks-per-node": 1 + } + }, + "batch_settings": {}, + "params": {}, + "files": { + "Symlink": [], + "Configure": [], + "Copy": [] + }, + "colocated_db": {}, + "telemetry_metadata": { + "status_dir": "/tmp/my-exp/.smartsim/telemetry/telemetry_serial_models/8c0fbb1/model/perroquet_1", + "step_id": "4139111.23", + "task_id": "18100", + "managed": true + }, + "out_file": "/tmp/my-exp/perroquet_1/perroquet_1.out", + "err_file": "/tmp/my-exp/perroquet_1/perroquet_1.err" + }, + { + "name": "perroquet_2", + "path": "/tmp/my-exp/perroquet_2", + "exe_args": [ + "/tmp/echo.py" + ], + "run_settings": { + "exe": [ + "/path/to/some/python" + ], + "run_command": "/opt/slurm/20.11.5/bin/srun", + "run_args": { + "nodes": 1, + "ntasks-per-node": 1 + } + }, + "batch_settings": {}, + "params": {}, + "files": { + "Symlink": [], + "Configure": [], + "Copy": [] + }, + "colocated_db": {}, + "telemetry_metadata": { + "status_dir": "/tmp/my-exp/.smartsim/telemetry/telemetry_serial_models/8c0fbb1/model/perroquet_2", + "step_id": "4139111.24", + "task_id": "18159", + "managed": true + }, + "out_file": "/tmp/my-exp/perroquet_2/perroquet_2.out", + "err_file": "/tmp/my-exp/perroquet_2/perroquet_2.err" + }, + { + "name": "perroquet_3", + "path": "/tmp/my-exp/perroquet_3", + "exe_args": [ + "/tmp/echo.py" + ], + "run_settings": { + "exe": [ + "/path/to/some/python" + ], + "run_command": "/opt/slurm/20.11.5/bin/srun", + "run_args": { + "nodes": 1, + "ntasks-per-node": 1 + } + }, + "batch_settings": {}, + "params": {}, + "files": { + "Symlink": [], + "Configure": [], + "Copy": [] + }, + "colocated_db": {}, + "telemetry_metadata": { + "status_dir": "/tmp/my-exp/.smartsim/telemetry/telemetry_serial_models/8c0fbb1/model/perroquet_3", + "step_id": "4139111.25", + "task_id": "18499", + "managed": true + }, + "out_file": "/tmp/my-exp/perroquet_3/perroquet_3.out", + "err_file": "/tmp/my-exp/perroquet_3/perroquet_3.err" + }, + { + "name": "perroquet_4", + "path": 
"/tmp/my-exp/perroquet_4", + "exe_args": [ + "/tmp/echo.py" + ], + "run_settings": { + "exe": [ + "/path/to/some/python" + ], + "run_command": "/opt/slurm/20.11.5/bin/srun", + "run_args": { + "nodes": 1, + "ntasks-per-node": 1 + } + }, + "batch_settings": {}, + "params": {}, + "files": { + "Symlink": [], + "Configure": [], + "Copy": [] + }, + "colocated_db": {}, + "telemetry_metadata": { + "status_dir": "/tmp/my-exp/.smartsim/telemetry/telemetry_serial_models/8c0fbb1/model/perroquet_4", + "step_id": "4139111.26", + "task_id": "18832", + "managed": true + }, + "out_file": "/tmp/my-exp/perroquet_4/perroquet_4.out", + "err_file": "/tmp/my-exp/perroquet_4/perroquet_4.err" + } + ], + "orchestrator": [], + "ensemble": [] + } + ] +} diff --git a/tests/test_configs/telemetry/telemetry.json b/tests/test_configs/telemetry/telemetry.json new file mode 100644 index 000000000..a380bc5fb --- /dev/null +++ b/tests/test_configs/telemetry/telemetry.json @@ -0,0 +1,946 @@ +{ + "experiment": { + "name": "my-exp", + "path": "/path/to/my-exp", + "launcher": "Slurm" + }, + "runs": [ + { + "run_id": "d999ad89-020f-4e6a-b834-dbd88658ce84", + "timestamp": 1697824072792854287, + "model": [ + { + "name": "my-model", + "path": "/path/to/my-exp/my-model", + "exe_args": [ + "hello", + "world" + ], + "run_settings": { + "exe": [ + "/usr/bin/echo" + ], + "run_command": "/opt/slurm/20.11.5/bin/srun", + "run_args": { + "nodes": 1, + "ntasks": 1 + } + }, + "batch_settings": {}, + "params": {}, + "files": { + "Symlink": [], + "Configure": [], + "Copy": [] + }, + "colocated_db": { + "settings": { + "port": 5757, + "ifname": "lo", + "cpus": 1, + "custom_pinning": "0", + "debug": false, + "db_identifier": "COLO", + "rai_args": { + "threads_per_queue": null, + "inter_op_parallelism": null, + "intra_op_parallelism": null + }, + "extra_db_args": {} + }, + "scripts": [], + "models": [ + { + "cnn": { + "backend": "TORCH", + "device": "CPU" + } + } + ] + }, + "telemetry_metadata": { + "status_dir": "/path/to/my-exp/.smartsim/telemetry/my-exp/d999ad89-020f-4e6a-b834-dbd88658ce84/model/my-model", + "step_id": "4121050.30", + "task_id": "25230", + "managed": true + }, + "out_file": "/path/to/my-exp/my-model/my-model.out", + "err_file": "/path/to/my-exp/my-model/my-model.err" + } + ], + "orchestrator": [], + "ensemble": [] + }, + { + "run_id": "fd3cd1a8-cb8f-4f61-b847-73a8eb0881fa", + "timestamp": 1697824102122439975, + "model": [], + "orchestrator": [ + { + "name": "orchestrator", + "type": "redis", + "interface": [ + "ipogif0" + ], + "shards": [ + { + "name": "orchestrator_1", + "hostname": "10.128.0.70", + "port": 2424, + "cluster": true, + "conf_file": "nodes-orchestrator_1-2424.conf", + "out_file": "/path/to/my-exp/orchestrator/orchestrator.out", + "err_file": "/path/to/my-exp/orchestrator/orchestrator.err", + "telemetry_metadata": { + "status_dir": "/path/to/my-exp/.smartsim/telemetry/my-exp/fd3cd1a8-cb8f-4f61-b847-73a8eb0881fa/database/orchestrator/orchestrator", + "step_id": "4121050.31+2", + "task_id": "25241", + "managed": true + } + }, + { + "name": "orchestrator_2", + "hostname": "10.128.0.71", + "port": 2424, + "cluster": true, + "conf_file": "nodes-orchestrator_2-2424.conf", + "out_file": "/path/to/my-exp/orchestrator/orchestrator.out", + "err_file": "/path/to/my-exp/orchestrator/orchestrator.err", + "telemetry_metadata": { + "status_dir": "/path/to/my-exp/.smartsim/telemetry/my-exp/fd3cd1a8-cb8f-4f61-b847-73a8eb0881fa/database/orchestrator/orchestrator", + "step_id": "4121050.31+2", + "task_id": "25241", + "managed": true 
+ } + }, + { + "name": "orchestrator_0", + "hostname": "10.128.0.69", + "port": 2424, + "cluster": true, + "conf_file": "nodes-orchestrator_0-2424.conf", + "out_file": "/path/to/my-exp/orchestrator/orchestrator.out", + "err_file": "/path/to/my-exp/orchestrator/orchestrator.err", + "telemetry_metadata": { + "status_dir": "/path/to/my-exp/.smartsim/telemetry/my-exp/fd3cd1a8-cb8f-4f61-b847-73a8eb0881fa/database/orchestrator/orchestrator", + "step_id": "4121050.31+2", + "task_id": "25241", + "managed": true + } + } + ] + } + ], + "ensemble": [] + }, + { + "run_id": "d65ae1df-cb5e-45d9-ab09-6fa641755997", + "timestamp": 1697824127962219505, + "model": [], + "orchestrator": [], + "ensemble": [ + { + "name": "my-ens", + "params": { + "START": [ + "spam", + "foo" + ], + "MID": [ + "eggs", + "bar" + ], + "END": [ + "ham", + "baz" + ] + }, + "batch_settings": {}, + "models": [ + { + "name": "my-ens_0", + "path": "/path/to/my-exp/my-ens/my-ens_0", + "exe_args": [ + "yo.py" + ], + "run_settings": { + "exe": [ + "/path/to/my/python3" + ], + "run_command": "/opt/slurm/20.11.5/bin/srun", + "run_args": { + "nodes": 1, + "ntasks": 1 + } + }, + "batch_settings": {}, + "params": { + "START": "spam", + "MID": "eggs", + "END": "ham" + }, + "files": { + "Symlink": [], + "Configure": [ + "/path/to/yo.py" + ], + "Copy": [] + }, + "colocated_db": {}, + "telemetry_metadata": { + "status_dir": "/path/to/my-exp/.smartsim/telemetry/my-exp/d65ae1df-cb5e-45d9-ab09-6fa641755997/ensemble/my-ens/my-ens_0", + "step_id": "4121050.32", + "task_id": "25639", + "managed": true + }, + "out_file": "/path/to/my-exp/my-ens/my-ens_0/my-ens_0.out", + "err_file": "/path/to/my-exp/my-ens/my-ens_0/my-ens_0.err" + }, + { + "name": "my-ens_1", + "path": "/path/to/my-exp/my-ens/my-ens_1", + "exe_args": [ + "yo.py" + ], + "run_settings": { + "exe": [ + "/path/to/my/python3" + ], + "run_command": "/opt/slurm/20.11.5/bin/srun", + "run_args": { + "nodes": 1, + "ntasks": 1 + } + }, + "batch_settings": {}, + "params": { + "START": "spam", + "MID": "eggs", + "END": "baz" + }, + "files": { + "Symlink": [], + "Configure": [ + "/path/to/yo.py" + ], + "Copy": [] + }, + "colocated_db": {}, + "telemetry_metadata": { + "status_dir": "/path/to/my-exp/.smartsim/telemetry/my-exp/d65ae1df-cb5e-45d9-ab09-6fa641755997/ensemble/my-ens/my-ens_1", + "step_id": "4121050.33", + "task_id": "25768", + "managed": true + }, + "out_file": "/path/to/my-exp/my-ens/my-ens_1/my-ens_1.out", + "err_file": "/path/to/my-exp/my-ens/my-ens_1/my-ens_1.err" + }, + { + "name": "my-ens_2", + "path": "/path/to/my-exp/my-ens/my-ens_2", + "exe_args": [ + "yo.py" + ], + "run_settings": { + "exe": [ + "/path/to/my/python3" + ], + "run_command": "/opt/slurm/20.11.5/bin/srun", + "run_args": { + "nodes": 1, + "ntasks": 1 + } + }, + "batch_settings": {}, + "params": { + "START": "spam", + "MID": "bar", + "END": "ham" + }, + "files": { + "Symlink": [], + "Configure": [ + "/path/to/yo.py" + ], + "Copy": [] + }, + "colocated_db": {}, + "telemetry_metadata": { + "status_dir": "/path/to/my-exp/.smartsim/telemetry/my-exp/d65ae1df-cb5e-45d9-ab09-6fa641755997/ensemble/my-ens/my-ens_2", + "step_id": "4121050.34", + "task_id": "25817", + "managed": true + }, + "out_file": "/path/to/my-exp/my-ens/my-ens_2/my-ens_2.out", + "err_file": "/path/to/my-exp/my-ens/my-ens_2/my-ens_2.err" + }, + { + "name": "my-ens_3", + "path": "/path/to/my-exp/my-ens/my-ens_3", + "exe_args": [ + "yo.py" + ], + "run_settings": { + "exe": [ + "/path/to/my/python3" + ], + "run_command": "/opt/slurm/20.11.5/bin/srun", + 
"run_args": { + "nodes": 1, + "ntasks": 1 + } + }, + "batch_settings": {}, + "params": { + "START": "spam", + "MID": "bar", + "END": "baz" + }, + "files": { + "Symlink": [], + "Configure": [ + "/path/to/yo.py" + ], + "Copy": [] + }, + "colocated_db": {}, + "telemetry_metadata": { + "status_dir": "/path/to/my-exp/.smartsim/telemetry/my-exp/d65ae1df-cb5e-45d9-ab09-6fa641755997/ensemble/my-ens/my-ens_3", + "step_id": "4121050.35", + "task_id": "25837", + "managed": true + }, + "out_file": "/path/to/my-exp/my-ens/my-ens_3/my-ens_3.out", + "err_file": "/path/to/my-exp/my-ens/my-ens_3/my-ens_3.err" + }, + { + "name": "my-ens_4", + "path": "/path/to/my-exp/my-ens/my-ens_4", + "exe_args": [ + "yo.py" + ], + "run_settings": { + "exe": [ + "/path/to/my/python3" + ], + "run_command": "/opt/slurm/20.11.5/bin/srun", + "run_args": { + "nodes": 1, + "ntasks": 1 + } + }, + "batch_settings": {}, + "params": { + "START": "foo", + "MID": "eggs", + "END": "ham" + }, + "files": { + "Symlink": [], + "Configure": [ + "/path/to/yo.py" + ], + "Copy": [] + }, + "colocated_db": {}, + "telemetry_metadata": { + "status_dir": "/path/to/my-exp/.smartsim/telemetry/my-exp/d65ae1df-cb5e-45d9-ab09-6fa641755997/ensemble/my-ens/my-ens_4", + "step_id": "4121050.36", + "task_id": "25872", + "managed": true + }, + "out_file": "/path/to/my-exp/my-ens/my-ens_4/my-ens_4.out", + "err_file": "/path/to/my-exp/my-ens/my-ens_4/my-ens_4.err" + }, + { + "name": "my-ens_5", + "path": "/path/to/my-exp/my-ens/my-ens_5", + "exe_args": [ + "yo.py" + ], + "run_settings": { + "exe": [ + "/path/to/my/python3" + ], + "run_command": "/opt/slurm/20.11.5/bin/srun", + "run_args": { + "nodes": 1, + "ntasks": 1 + } + }, + "batch_settings": {}, + "params": { + "START": "foo", + "MID": "eggs", + "END": "baz" + }, + "files": { + "Symlink": [], + "Configure": [ + "/path/to/yo.py" + ], + "Copy": [] + }, + "colocated_db": {}, + "telemetry_metadata": { + "status_dir": "/path/to/my-exp/.smartsim/telemetry/my-exp/d65ae1df-cb5e-45d9-ab09-6fa641755997/ensemble/my-ens/my-ens_5", + "step_id": "4121050.37", + "task_id": "25930", + "managed": true + }, + "out_file": "/path/to/my-exp/my-ens/my-ens_5/my-ens_5.out", + "err_file": "/path/to/my-exp/my-ens/my-ens_5/my-ens_5.err" + }, + { + "name": "my-ens_6", + "path": "/path/to/my-exp/my-ens/my-ens_6", + "exe_args": [ + "yo.py" + ], + "run_settings": { + "exe": [ + "/path/to/my/python3" + ], + "run_command": "/opt/slurm/20.11.5/bin/srun", + "run_args": { + "nodes": 1, + "ntasks": 1 + } + }, + "batch_settings": {}, + "params": { + "START": "foo", + "MID": "bar", + "END": "ham" + }, + "files": { + "Symlink": [], + "Configure": [ + "/path/to/yo.py" + ], + "Copy": [] + }, + "colocated_db": {}, + "telemetry_metadata": { + "status_dir": "/path/to/my-exp/.smartsim/telemetry/my-exp/d65ae1df-cb5e-45d9-ab09-6fa641755997/ensemble/my-ens/my-ens_6", + "step_id": "4121050.38", + "task_id": "25945", + "managed": true + }, + "out_file": "/path/to/my-exp/my-ens/my-ens_6/my-ens_6.out", + "err_file": "/path/to/my-exp/my-ens/my-ens_6/my-ens_6.err" + }, + { + "name": "my-ens_7", + "path": "/path/to/my-exp/my-ens/my-ens_7", + "exe_args": [ + "yo.py" + ], + "run_settings": { + "exe": [ + "/path/to/my/python3" + ], + "run_command": "/opt/slurm/20.11.5/bin/srun", + "run_args": { + "nodes": 1, + "ntasks": 1 + } + }, + "batch_settings": {}, + "params": { + "START": "foo", + "MID": "bar", + "END": "baz" + }, + "files": { + "Symlink": [], + "Configure": [ + "/path/to/yo.py" + ], + "Copy": [] + }, + "colocated_db": {}, + "telemetry_metadata": { + 
"status_dir": "/path/to/my-exp/.smartsim/telemetry/my-exp/d65ae1df-cb5e-45d9-ab09-6fa641755997/ensemble/my-ens/my-ens_7", + "step_id": "4121050.39", + "task_id": "25967", + "managed": true + }, + "out_file": "/path/to/my-exp/my-ens/my-ens_7/my-ens_7.out", + "err_file": "/path/to/my-exp/my-ens/my-ens_7/my-ens_7.err" + } + ] + } + ] + }, + { + "run_id": "e41f8e17-c4b2-441d-adf9-707443ee2c72", + "timestamp": 1697835227560376025, + "model": [ + { + "name": "my-model", + "path": "/path/to/my-exp/my-model", + "exe_args": [ + "hello", + "world" + ], + "run_settings": { + "exe": [ + "/usr/bin/echo" + ], + "run_command": "/opt/slurm/20.11.5/bin/srun", + "run_args": { + "nodes": 1, + "ntasks": 1 + } + }, + "batch_settings": {}, + "params": {}, + "files": { + "Symlink": [], + "Configure": [], + "Copy": [] + }, + "colocated_db": { + "settings": { + "port": 5757, + "ifname": "lo", + "cpus": 1, + "custom_pinning": "0", + "debug": false, + "db_identifier": "COLO", + "rai_args": { + "threads_per_queue": null, + "inter_op_parallelism": null, + "intra_op_parallelism": null + }, + "extra_db_args": {} + }, + "scripts": [], + "models": [ + { + "cnn": { + "backend": "TORCH", + "device": "CPU" + } + } + ] + }, + "telemetry_metadata": { + "status_dir": "/path/to/my-exp/.smartsim/telemetry/my-exp/e41f8e17-c4b2-441d-adf9-707443ee2c72/model/my-model", + "step_id": "4121904.0", + "task_id": "28277", + "managed": true + }, + "out_file": "/path/to/my-exp/my-model/my-model.out", + "err_file": "/path/to/my-exp/my-model/my-model.err" + } + ], + "orchestrator": [], + "ensemble": [] + }, + { + "run_id": "b33a5d27-6822-4795-8e0e-cfea18551fa4", + "timestamp": 1697835261956135240, + "model": [], + "orchestrator": [ + { + "name": "orchestrator", + "type": "redis", + "interface": [ + "ipogif0" + ], + "shards": [ + { + "name": "orchestrator_0", + "hostname": "10.128.0.2", + "port": 2424, + "cluster": true, + "conf_file": "nodes-orchestrator_0-2424.conf", + "out_file": "/path/to/my-exp/orchestrator/orchestrator.out", + "err_file": "/path/to/my-exp/orchestrator/orchestrator.err", + "telemetry_metadata": { + "status_dir": "/path/to/my-exp/.smartsim/telemetry/my-exp/b33a5d27-6822-4795-8e0e-cfea18551fa4/database/orchestrator/orchestrator", + "step_id": "4121904.1+2", + "task_id": "28289", + "managed": true + } + }, + { + "name": "orchestrator_2", + "hostname": "10.128.0.4", + "port": 2424, + "cluster": true, + "conf_file": "nodes-orchestrator_2-2424.conf", + "out_file": "/path/to/my-exp/orchestrator/orchestrator.out", + "err_file": "/path/to/my-exp/orchestrator/orchestrator.err", + "telemetry_metadata": { + "status_dir": "/path/to/my-exp/.smartsim/telemetry/my-exp/b33a5d27-6822-4795-8e0e-cfea18551fa4/database/orchestrator/orchestrator", + "step_id": "4121904.1+2", + "task_id": "28289", + "managed": true + } + }, + { + "name": "orchestrator_1", + "hostname": "10.128.0.3", + "port": 2424, + "cluster": true, + "conf_file": "nodes-orchestrator_1-2424.conf", + "out_file": "/path/to/my-exp/orchestrator/orchestrator.out", + "err_file": "/path/to/my-exp/orchestrator/orchestrator.err", + "telemetry_metadata": { + "status_dir": "/path/to/my-exp/.smartsim/telemetry/my-exp/b33a5d27-6822-4795-8e0e-cfea18551fa4/database/orchestrator/orchestrator", + "step_id": "4121904.1+2", + "task_id": "28289", + "managed": true + } + } + ] + } + ], + "ensemble": [] + }, + { + "run_id": "45772df2-fd80-43fd-adf0-d5e319870182", + "timestamp": 1697835287798613875, + "model": [], + "orchestrator": [], + "ensemble": [ + { + "name": "my-ens", + "params": { + "START": [ 
+ "spam", + "foo" + ], + "MID": [ + "eggs", + "bar" + ], + "END": [ + "ham", + "baz" + ] + }, + "batch_settings": {}, + "models": [ + { + "name": "my-ens_0", + "path": "/path/to/my-exp/my-ens/my-ens_0", + "exe_args": [ + "yo.py" + ], + "run_settings": { + "exe": [ + "/path/to/my/python3" + ], + "run_command": "/opt/slurm/20.11.5/bin/srun", + "run_args": { + "nodes": 1, + "ntasks": 1 + } + }, + "batch_settings": {}, + "params": { + "START": "spam", + "MID": "eggs", + "END": "ham" + }, + "files": { + "Symlink": [], + "Configure": [ + "/path/to/yo.py" + ], + "Copy": [] + }, + "colocated_db": {}, + "telemetry_metadata": { + "status_dir": "/path/to/my-exp/.smartsim/telemetry/my-exp/45772df2-fd80-43fd-adf0-d5e319870182/ensemble/my-ens/my-ens_0", + "step_id": "4121904.2", + "task_id": "28333", + "managed": true + }, + "out_file": "/path/to/my-exp/my-ens/my-ens_0/my-ens_0.out", + "err_file": "/path/to/my-exp/my-ens/my-ens_0/my-ens_0.err" + }, + { + "name": "my-ens_1", + "path": "/path/to/my-exp/my-ens/my-ens_1", + "exe_args": [ + "yo.py" + ], + "run_settings": { + "exe": [ + "/path/to/my/python3" + ], + "run_command": "/opt/slurm/20.11.5/bin/srun", + "run_args": { + "nodes": 1, + "ntasks": 1 + } + }, + "batch_settings": {}, + "params": { + "START": "spam", + "MID": "eggs", + "END": "baz" + }, + "files": { + "Symlink": [], + "Configure": [ + "/path/to/yo.py" + ], + "Copy": [] + }, + "colocated_db": {}, + "telemetry_metadata": { + "status_dir": "/path/to/my-exp/.smartsim/telemetry/my-exp/45772df2-fd80-43fd-adf0-d5e319870182/ensemble/my-ens/my-ens_1", + "step_id": "4121904.3", + "task_id": "28342", + "managed": true + }, + "out_file": "/path/to/my-exp/my-ens/my-ens_1/my-ens_1.out", + "err_file": "/path/to/my-exp/my-ens/my-ens_1/my-ens_1.err" + }, + { + "name": "my-ens_2", + "path": "/path/to/my-exp/my-ens/my-ens_2", + "exe_args": [ + "yo.py" + ], + "run_settings": { + "exe": [ + "/path/to/my/python3" + ], + "run_command": "/opt/slurm/20.11.5/bin/srun", + "run_args": { + "nodes": 1, + "ntasks": 1 + } + }, + "batch_settings": {}, + "params": { + "START": "spam", + "MID": "bar", + "END": "ham" + }, + "files": { + "Symlink": [], + "Configure": [ + "/path/to/yo.py" + ], + "Copy": [] + }, + "colocated_db": {}, + "telemetry_metadata": { + "status_dir": "/path/to/my-exp/.smartsim/telemetry/my-exp/45772df2-fd80-43fd-adf0-d5e319870182/ensemble/my-ens/my-ens_2", + "step_id": "4121904.4", + "task_id": "28353", + "managed": true + }, + "out_file": "/path/to/my-exp/my-ens/my-ens_2/my-ens_2.out", + "err_file": "/path/to/my-exp/my-ens/my-ens_2/my-ens_2.err" + }, + { + "name": "my-ens_3", + "path": "/path/to/my-exp/my-ens/my-ens_3", + "exe_args": [ + "yo.py" + ], + "run_settings": { + "exe": [ + "/path/to/my/python3" + ], + "run_command": "/opt/slurm/20.11.5/bin/srun", + "run_args": { + "nodes": 1, + "ntasks": 1 + } + }, + "batch_settings": {}, + "params": { + "START": "spam", + "MID": "bar", + "END": "baz" + }, + "files": { + "Symlink": [], + "Configure": [ + "/path/to/yo.py" + ], + "Copy": [] + }, + "colocated_db": {}, + "telemetry_metadata": { + "status_dir": "/path/to/my-exp/.smartsim/telemetry/my-exp/45772df2-fd80-43fd-adf0-d5e319870182/ensemble/my-ens/my-ens_3", + "step_id": "4121904.5", + "task_id": "28362", + "managed": true + }, + "out_file": "/path/to/my-exp/my-ens/my-ens_3/my-ens_3.out", + "err_file": "/path/to/my-exp/my-ens/my-ens_3/my-ens_3.err" + }, + { + "name": "my-ens_4", + "path": "/path/to/my-exp/my-ens/my-ens_4", + "exe_args": [ + "yo.py" + ], + "run_settings": { + "exe": [ + "/path/to/my/python3" 
+ ], + "run_command": "/opt/slurm/20.11.5/bin/srun", + "run_args": { + "nodes": 1, + "ntasks": 1 + } + }, + "batch_settings": {}, + "params": { + "START": "foo", + "MID": "eggs", + "END": "ham" + }, + "files": { + "Symlink": [], + "Configure": [ + "/path/to/yo.py" + ], + "Copy": [] + }, + "colocated_db": {}, + "telemetry_metadata": { + "status_dir": "/path/to/my-exp/.smartsim/telemetry/my-exp/45772df2-fd80-43fd-adf0-d5e319870182/ensemble/my-ens/my-ens_4", + "step_id": "4121904.6", + "task_id": "28371", + "managed": true + }, + "out_file": "/path/to/my-exp/my-ens/my-ens_4/my-ens_4.out", + "err_file": "/path/to/my-exp/my-ens/my-ens_4/my-ens_4.err" + }, + { + "name": "my-ens_5", + "path": "/path/to/my-exp/my-ens/my-ens_5", + "exe_args": [ + "yo.py" + ], + "run_settings": { + "exe": [ + "/path/to/my/python3" + ], + "run_command": "/opt/slurm/20.11.5/bin/srun", + "run_args": { + "nodes": 1, + "ntasks": 1 + } + }, + "batch_settings": {}, + "params": { + "START": "foo", + "MID": "eggs", + "END": "baz" + }, + "files": { + "Symlink": [], + "Configure": [ + "/path/to/yo.py" + ], + "Copy": [] + }, + "colocated_db": {}, + "telemetry_metadata": { + "status_dir": "/path/to/my-exp/.smartsim/telemetry/my-exp/45772df2-fd80-43fd-adf0-d5e319870182/ensemble/my-ens/my-ens_5", + "step_id": "4121904.7", + "task_id": "28380", + "managed": true + }, + "out_file": "/path/to/my-exp/my-ens/my-ens_5/my-ens_5.out", + "err_file": "/path/to/my-exp/my-ens/my-ens_5/my-ens_5.err" + }, + { + "name": "my-ens_6", + "path": "/path/to/my-exp/my-ens/my-ens_6", + "exe_args": [ + "yo.py" + ], + "run_settings": { + "exe": [ + "/path/to/my/python3" + ], + "run_command": "/opt/slurm/20.11.5/bin/srun", + "run_args": { + "nodes": 1, + "ntasks": 1 + } + }, + "batch_settings": {}, + "params": { + "START": "foo", + "MID": "bar", + "END": "ham" + }, + "files": { + "Symlink": [], + "Configure": [ + "/path/to/yo.py" + ], + "Copy": [] + }, + "colocated_db": {}, + "telemetry_metadata": { + "status_dir": "/path/to/my-exp/.smartsim/telemetry/my-exp/45772df2-fd80-43fd-adf0-d5e319870182/ensemble/my-ens/my-ens_6", + "step_id": "4121904.8", + "task_id": "28389", + "managed": true + }, + "out_file": "/path/to/my-exp/my-ens/my-ens_6/my-ens_6.out", + "err_file": "/path/to/my-exp/my-ens/my-ens_6/my-ens_6.err" + }, + { + "name": "my-ens_7", + "path": "/path/to/my-exp/my-ens/my-ens_7", + "exe_args": [ + "yo.py" + ], + "run_settings": { + "exe": [ + "/path/to/my/python3" + ], + "run_command": "/opt/slurm/20.11.5/bin/srun", + "run_args": { + "nodes": 1, + "ntasks": 1 + } + }, + "batch_settings": {}, + "params": { + "START": "foo", + "MID": "bar", + "END": "baz" + }, + "files": { + "Symlink": [], + "Configure": [ + "/path/to/yo.py" + ], + "Copy": [] + }, + "colocated_db": {}, + "telemetry_metadata": { + "status_dir": "/path/to/my-exp/.smartsim/telemetry/my-exp/45772df2-fd80-43fd-adf0-d5e319870182/ensemble/my-ens/my-ens_7", + "step_id": "4121904.9", + "task_id": "28398", + "managed": true + }, + "out_file": "/path/to/my-exp/my-ens/my-ens_7/my-ens_7.out", + "err_file": "/path/to/my-exp/my-ens/my-ens_7/my-ens_7.err" + } + ] + } + ] + } + ] + } + diff --git a/tests/test_controller.py b/tests/test_controller.py new file mode 100644 index 000000000..c00adce91 --- /dev/null +++ b/tests/test_controller.py @@ -0,0 +1,68 @@ +# BSD 2-Clause License +# +# Copyright (c) 2021-2023, Hewlett Packard Enterprise +# All rights reserved. 
+# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import pytest + +import pathlib + +from smartsim._core.control.controller import Controller +from smartsim.settings.slurmSettings import SbatchSettings, SrunSettings +from smartsim._core.launcher.step import Step +from smartsim.entity.ensemble import Ensemble +from smartsim.database.orchestrator import Orchestrator + +controller = Controller() + +rs = SrunSettings('echo', ['spam', 'eggs']) +bs = SbatchSettings() + +ens = Ensemble("ens", params={}, run_settings=rs, batch_settings=bs, replicas=3) +orc = Orchestrator(db_nodes=3, batch=True, launcher="slurm", run_command="srun") + +class MockStep(Step): + @staticmethod + def _create_unique_name(name): + return name + + def add_to_batch(self, step): + ... 
+ + def get_launch_cmd(self): + return [] + +@pytest.mark.parametrize("collection", [ + pytest.param(ens, id="Ensemble"), + pytest.param(orc, id="Database"), +]) +def test_controller_batch_step_creation_preserves_entity_order(collection, monkeypatch): + monkeypatch.setattr(controller._launcher, "create_step", + lambda name, path, settings: MockStep(name, path, settings)) + entity_names = [x.name for x in collection.entities] + assert len(entity_names) == len(set(entity_names)) + _, steps = controller._create_batch_job_step(collection, pathlib.Path("mock/exp/path")) + assert entity_names == [step.name for step in steps] + + diff --git a/tests/test_controller_errors.py b/tests/test_controller_errors.py index 30d9870cf..a40ccdf66 100644 --- a/tests/test_controller_errors.py +++ b/tests/test_controller_errors.py @@ -100,7 +100,7 @@ def test_wrong_orchestrator(wlmutils): cont = Controller(launcher="local") manifest = Manifest(orc) with pytest.raises(SmartSimError): - cont._launch(manifest) + cont._launch("exp_name", "exp_path", manifest) def test_bad_orc_checkpoint(): diff --git a/tests/test_dbnode.py b/tests/test_dbnode.py index 0a656babe..62597b280 100644 --- a/tests/test_dbnode.py +++ b/tests/test_dbnode.py @@ -51,8 +51,7 @@ def test_parse_db_host_error(): def test_hosts(test_dir, wlmutils): exp_name = "test_hosts" - exp = Experiment(exp_name) - + exp = Experiment(exp_name, exp_path=test_dir) orc = Orchestrator(port=wlmutils.get_test_port(), interface="lo", launcher="local") orc.set_path(test_dir) diff --git a/tests/test_experiment.py b/tests/test_experiment.py index 369fef95a..8650425b4 100644 --- a/tests/test_experiment.py +++ b/tests/test_experiment.py @@ -25,10 +25,13 @@ # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. import pytest +import os + from smartsim import Experiment from smartsim.entity import Model from smartsim.error import SmartSimError from smartsim.settings import RunSettings +from smartsim._core.config import CONFIG from smartsim.status import STATUS_NEVER_STARTED # The tests in this file belong to the slow_tests group @@ -110,8 +113,7 @@ def test_bad_ensemble_init_no_rs_bs(): def test_stop_entity(test_dir): exp_name = "test_stop_entity" - exp = Experiment(exp_name) - + exp = Experiment(exp_name, exp_path=test_dir) m = exp.create_model("model", path=test_dir, run_settings=RunSettings("sleep", "5")) exp.start(m, block=False) assert exp.finished(m) == False @@ -122,8 +124,7 @@ def test_stop_entity(test_dir): def test_poll(test_dir): # Ensure that a SmartSimError is not raised exp_name = "test_exp_poll" - exp = Experiment(exp_name) - + exp = Experiment(exp_name, exp_path=test_dir) model = exp.create_model( "model", path=test_dir, run_settings=RunSettings("sleep", "5") ) @@ -134,8 +135,7 @@ def test_poll(test_dir): def test_summary(test_dir): exp_name = "test_exp_summary" - exp = Experiment(exp_name) - + exp = Experiment(exp_name, exp_path=test_dir) m = exp.create_model( "model", path=test_dir, run_settings=RunSettings("echo", "Hello") ) @@ -155,6 +155,7 @@ def test_summary(test_dir): assert 0 == int(row["RunID"]) assert 0 == int(row["Returncode"]) + def test_launcher_detection(wlmutils, monkeypatch): if wlmutils.get_test_launcher() == "pals": pytest.skip(reason="Launcher detection cannot currently detect pbs vs pals") @@ -164,3 +165,16 @@ def test_launcher_detection(wlmutils, monkeypatch): exp = Experiment("test-launcher-detection", launcher="auto") assert exp._launcher == wlmutils.get_test_launcher() + + +def test_enable_disable_telemetry(monkeypatch): + # TODO: Currently these are implemented by setting an environment variable + # so that ALL experiments instanced in a driver script will begin + # producing telemetry data. In the future it is planned to have this + # work on a "per-instance" basis + monkeypatch.setattr(os, "environ", {}) + exp = Experiment("my-exp") + exp.enable_telemetry() + assert CONFIG.telemetry_enabled + exp.disable_telemetry() + assert not CONFIG.telemetry_enabled
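# NOTE: a minimal sketch (not part of this patch) of the process-wide toggle
# described by the TODO above. enable_telemetry()/disable_telemetry()
# plausibly flip the same SMARTSIM_FLAG_TELEMETRY variable covered by the new
# tests in tests/test_config.py; the helper names below are illustrative
# stand-ins, not SmartSim API.
import os

def enable_telemetry_sketch() -> None:
    # every Experiment created in this driver script afterwards sees the flag
    os.environ["SMARTSIM_FLAG_TELEMETRY"] = "1"

def disable_telemetry_sketch() -> None:
    os.environ["SMARTSIM_FLAG_TELEMETRY"] = "0"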
diff --git a/tests/test_generator.py b/tests/test_generator.py index 256c3e3dc..6daef2ae3 100644 --- a/tests/test_generator.py +++ b/tests/test_generator.py @@ -264,10 +264,8 @@ def test_multiple_tags(fileutils, test_dir): exp.start(parameterized_model, block=True) with open(osp.join(parameterized_model.path, "multi-tags.out")) as f: - line = f.readline() - assert ( - line.strip() == "My two parameters are 6379 and unbreakable_password, OK?" - ) + log_content = f.read() + assert "My two parameters are 6379 and unbreakable_password, OK?" in log_content def test_generation_log(fileutils, test_dir): diff --git a/tests/test_helpers.py b/tests/test_helpers.py index 55dd7cbe3..ca145042e 100644 --- a/tests/test_helpers.py +++ b/tests/test_helpers.py @@ -27,6 +27,7 @@ import pytest from smartsim._core.utils.helpers import cat_arg_and_value +from smartsim._core.utils import helpers # The tests in this file belong to the group_a group pytestmark = pytest.mark.group_a @@ -47,3 +48,17 @@ def test_single_char_concat(): def test_fallthrough_concat(): result = cat_arg_and_value("xx", "FOO") # <-- no dashes, > 1 char assert result == "--xx=FOO" + +def test_encode_decode_cmd_round_trip(): + orig_cmd = ["this", "is", "a", "cmd"] + decoded_cmd = helpers.decode_cmd(helpers.encode_cmd(orig_cmd)) + assert orig_cmd == decoded_cmd + assert orig_cmd is not decoded_cmd + +def test_encode_raises_on_empty(): + with pytest.raises(ValueError): + helpers.encode_cmd([]) + +def test_decode_raises_on_empty(): + with pytest.raises(ValueError): + helpers.decode_cmd("") diff --git a/tests/test_indirect.py b/tests/test_indirect.py new file mode 100644 index 000000000..2f9eab3fa --- /dev/null +++ b/tests/test_indirect.py @@ -0,0 +1,192 @@ +# BSD 2-Clause License +# +# Copyright (c) 2021-2023, Hewlett Packard Enterprise +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED.
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + + +import pathlib +import psutil +import pytest +import sys +import uuid + +from smartsim._core.entrypoints.indirect import get_parser, cleanup, get_ts, main +from smartsim._core.utils.serialize import TELMON_SUBDIR, MANIFEST_FILENAME +from smartsim._core.utils.helpers import encode_cmd + +ALL_ARGS = {"+command", "+entity_type", "+telemetry_dir", "+output_file", "+error_file", "+working_dir"} + +# The tests in this file belong to the group_a group +pytestmark = pytest.mark.group_a + +@pytest.mark.parametrize( + ["cmd", "missing"], + [ + pytest.param("indirect.py", {"+name", "+command", "+entity_type", "+telemetry_dir", "+working_dir"}, id="no args"), + pytest.param("indirect.py -c echo +entity_type ttt +telemetry_dir ddd +output_file ooo +working_dir www +error_file eee", {"+command"}, id="cmd typo"), + pytest.param("indirect.py -t orchestrator +command ccc +telemetry_dir ddd +output_file ooo +working_dir www +error_file eee", {"+entity_type"}, id="etype typo"), + pytest.param("indirect.py -d /foo/bar +entity_type ttt +command ccc +output_file ooo +working_dir www +error_file eee", {"+telemetry_dir"}, id="dir typo"), + pytest.param("indirect.py +entity_type ttt +telemetry_dir ddd +output_file ooo +working_dir www +error_file eee", {"+command"}, id="no cmd"), + pytest.param("indirect.py +command ccc +telemetry_dir ddd +output_file ooo +working_dir www +error_file eee", {"+entity_type"}, id="no etype"), + pytest.param("indirect.py +command ccc +entity_type ttt +output_file ooo +working_dir www +error_file eee", {"+telemetry_dir"}, id="no dir"), + ] +) +def test_parser(capsys, cmd, missing): + """Test that the parser reports any missing required arguments""" + parser = get_parser() + + args = cmd.split() + + captured = capsys.readouterr() # throw away existing output + with pytest.raises(SystemExit) as ex: + ns = parser.parse_args(args) + + captured = capsys.readouterr() + assert "the following arguments are required" in captured.err + for arg in missing: + assert arg in captured.err + + expected = ALL_ARGS - missing + msg_tuple = captured.err.split("the following arguments are required: ") + if len(msg_tuple) < 2: + assert False, "error message indicates no missing arguments" + + actual_missing = msg_tuple[1].strip() + for exp in expected: + assert f"{exp}/" not in actual_missing + + +def test_cleanup(capsys, monkeypatch): + """Ensure cleanup attempts termination of the correct process""" + mock_pid = 123 + create_msg = "creating: {0}" + term_msg = "terminating: {0}" + + class MockProc: + def __init__(self, pid: int): + print(create_msg.format(pid)) + def terminate(self): + print(term_msg.format(mock_pid)) + + captured = capsys.readouterr() # throw away existing output + + with monkeypatch.context() as ctx: + ctx.setattr('psutil.pid_exists', lambda pid: True) + ctx.setattr('psutil.Process', MockProc) + ctx.setattr('smartsim._core.entrypoints.indirect.STEP_PID', mock_pid) + cleanup() + + captured = capsys.readouterr() + assert create_msg.format(mock_pid) in captured.out + assert term_msg.format(mock_pid) in captured.out
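# NOTE: a minimal sketch (not part of this patch) of the cleanup behavior the
# surrounding tests pin down: terminate the tracked child process if it still
# exists, and swallow psutil.NoSuchProcess when it has already exited.
# STEP_PID below is an illustrative placeholder for the module-level PID that
# the tests monkeypatch on smartsim._core.entrypoints.indirect.
import psutil

STEP_PID = 123  # placeholder; the entrypoint tracks its launched step here

def cleanup_sketch() -> None:
    try:
        if psutil.pid_exists(STEP_PID):
            psutil.Process(STEP_PID).terminate()
    except psutil.NoSuchProcess:
        pass  # the process exited on its own; nothing left to terminate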
assert term_msg.format(mock_pid) in captured.out + + +def test_cleanup_late(capsys, monkeypatch): + """Ensure cleanup exceptions are swallowed if a process is already terminated""" + mock_pid = 123 + create_msg = "creating: {0}" + term_msg = "terminating: {0}" + + class MockMissingProc: + def __init__(self, pid: int) -> None: + print(create_msg.format(mock_pid)) + raise psutil.NoSuchProcess(pid) + def terminate(self) -> None: + print(term_msg.format(mock_pid)) + + captured = capsys.readouterr() # throw away existing output + + with monkeypatch.context() as ctx: + ctx.setattr('psutil.pid_exists', lambda pid: True) + ctx.setattr('psutil.Process', MockMissingProc) + ctx.setattr('smartsim._core.entrypoints.indirect.STEP_PID', mock_pid) + cleanup() + + captured = capsys.readouterr() + assert create_msg.format(mock_pid) in captured.out + + +def test_ts(): + """Ensure expected output type""" + ts = get_ts() + assert isinstance(ts, int) + + +def test_indirect_main_dir_check(test_dir): + """Ensure that the proxy validates the test directory exists""" + exp_dir = pathlib.Path(test_dir) + + cmd = ["echo", "unit-test"] + encoded_cmd = encode_cmd(cmd) + + status_path = exp_dir / TELMON_SUBDIR + + # show that a missing status_path is created when missing + main(encoded_cmd, "application", exp_dir, status_path) + + assert status_path.exists() + + +def test_indirect_main_cmd_check(capsys, test_dir, monkeypatch): + """Ensure that the proxy validates the cmd is not empty or whitespace-only""" + exp_dir = pathlib.Path(test_dir) + + captured = capsys.readouterr() # throw away existing output + with monkeypatch.context() as ctx, pytest.raises(ValueError) as ex: + ctx.setattr('smartsim._core.entrypoints.indirect.logger.error', print) + _ = main("", "application", exp_dir, exp_dir / TELMON_SUBDIR) + + captured = capsys.readouterr() + assert "Invalid cmd supplied" in ex.value.args[0] + + # test with non-emptystring cmd + with monkeypatch.context() as ctx, pytest.raises(ValueError) as ex: + ctx.setattr('smartsim._core.entrypoints.indirect.logger.error', print) + _ = main(" \n \t ", "application", exp_dir, exp_dir / TELMON_SUBDIR) + + captured = capsys.readouterr() + assert "Invalid cmd supplied" in ex.value.args[0] + + +def test_complete_process(fileutils, test_dir): + """Ensure the happy-path completes and returns a success return code""" + script = fileutils.get_test_conf_path("sleep.py") + + exp_dir = pathlib.Path(test_dir) + + raw_cmd = f"{sys.executable} {script} --time=1" + cmd = encode_cmd(raw_cmd.split()) + + rc = main(cmd, "application", exp_dir, exp_dir / TELMON_SUBDIR) + assert rc == 0 + + assert exp_dir.exists() + + # NOTE: don't have a manifest so we're falling back to default event path + data_dir = exp_dir / TELMON_SUBDIR + start_events = list(data_dir.rglob("start.json")) + stop_events = list(data_dir.rglob("stop.json")) + + assert start_events + assert stop_events diff --git a/tests/test_launch_errors.py b/tests/test_launch_errors.py index eace188b9..51d8b60a6 100644 --- a/tests/test_launch_errors.py +++ b/tests/test_launch_errors.py @@ -48,8 +48,7 @@ def test_unsupported_run_settings(): def test_model_failure(fileutils, test_dir): exp_name = "test-model-failure" - exp = Experiment(exp_name, launcher="local") - + exp = Experiment(exp_name, launcher="local", exp_path=test_dir) script = fileutils.get_test_conf_path("bad.py") settings = RunSettings("python", f"{script} --time=3") @@ -64,8 +63,7 @@ def test_model_failure(fileutils, test_dir): def test_orchestrator_relaunch(test_dir, wlmutils): """Test 
when users try to launch second orchestrator""" exp_name = "test-orc-on-relaunch" - exp = Experiment(exp_name, launcher="local") - + exp = Experiment(exp_name, launcher="local", exp_path=test_dir) orc = Orchestrator(port=wlmutils.get_test_port()) orc.set_path(test_dir) diff --git a/tests/test_local_launch.py b/tests/test_local_launch.py index f34eaa7e7..b3d463ca3 100644 --- a/tests/test_local_launch.py +++ b/tests/test_local_launch.py @@ -38,8 +38,7 @@ def test_models(fileutils, test_dir): exp_name = "test-models-local-launch" - exp = Experiment(exp_name, launcher="local") - + exp = Experiment(exp_name, launcher="local", exp_path=test_dir) script = fileutils.get_test_conf_path("sleep.py") settings = exp.create_run_settings("python", f"{script} --time=3") @@ -54,8 +53,7 @@ def test_models(fileutils, test_dir): def test_ensemble(fileutils, test_dir): exp_name = "test-ensemble-launch" - exp = Experiment(exp_name, launcher="local") - + exp = Experiment(exp_name, launcher="local", exp_path=test_dir) script = fileutils.get_test_conf_path("sleep.py") settings = exp.create_run_settings("python", f"{script} --time=3") diff --git a/tests/test_local_multi_run.py b/tests/test_local_multi_run.py index 39d0dce52..e84fe2364 100644 --- a/tests/test_local_multi_run.py +++ b/tests/test_local_multi_run.py @@ -38,8 +38,7 @@ def test_models(fileutils, test_dir): exp_name = "test-models-local-launch" - exp = Experiment(exp_name, launcher="local") - + exp = Experiment(exp_name, launcher="local", exp_path=test_dir) script = fileutils.get_test_conf_path("sleep.py") settings = exp.create_run_settings("python", f"{script} --time=5") diff --git a/tests/test_local_restart.py b/tests/test_local_restart.py index e06d9cc64..782611606 100644 --- a/tests/test_local_restart.py +++ b/tests/test_local_restart.py @@ -39,8 +39,7 @@ def test_restart(fileutils, test_dir): exp_name = "test-models-local-restart" - exp = Experiment(exp_name, launcher="local") - + exp = Experiment(exp_name, launcher="local", exp_path=test_dir) script = fileutils.get_test_conf_path("sleep.py") settings = exp.create_run_settings("python", f"{script} --time=3") @@ -59,8 +58,7 @@ def test_restart(fileutils, test_dir): def test_ensemble(fileutils, test_dir): exp_name = "test-ensemble-restart" - exp = Experiment(exp_name, launcher="local") - + exp = Experiment(exp_name, launcher="local", exp_path=test_dir) script = fileutils.get_test_conf_path("sleep.py") settings = exp.create_run_settings("python", f"{script} --time=3") diff --git a/tests/test_manifest.py b/tests/test_manifest.py index f68219c73..5bb373fc1 100644 --- a/tests/test_manifest.py +++ b/tests/test_manifest.py @@ -26,11 +26,17 @@ from copy import deepcopy +import os.path import pytest from smartsim import Experiment -from smartsim._core.control import Manifest +from smartsim._core.control.manifest import ( + Manifest, + LaunchedManifest, + LaunchedManifestBuilder, + _LaunchedManifestMetadata as LaunchedManifestMetadata, +) from smartsim.database import Orchestrator from smartsim.error import SmartSimError from smartsim.settings import RunSettings @@ -48,7 +54,6 @@ model_2 = exp.create_model("model_1", run_settings=rs) ensemble = exp.create_ensemble("ensemble", run_settings=rs, replicas=1) - orc = Orchestrator() orc_1 = deepcopy(orc) orc_1.name = "orc2" @@ -97,3 +102,67 @@ class Person: p = Person() with pytest.raises(TypeError): _ = Manifest(p) + +def test_launched_manifest_transform_data(): + models = [(model, 1), (model_2, 2)] + ensembles = [(ensemble, [(m, i) for i, m in 
enumerate(ensemble.entities)])] + dbs = [(orc, [(n, i) for i, n in enumerate(orc.entities)])] + launched = LaunchedManifest( + metadata=LaunchedManifestMetadata("name", "path", "launcher", "run_id"), + models=models, + ensembles=ensembles, + databases=dbs, + ) + transformed = launched.map(lambda x: str(x)) + assert transformed.models == tuple((m, str(i)) for m, i in models) + assert transformed.ensembles[0][1] == tuple((m, str(i)) for m, i in ensembles[0][1]) + assert transformed.databases[0][1] == tuple((n, str(i)) for n, i in dbs[0][1]) + + +def test_launched_manifest_builder_correctly_maps_data(): + lmb = LaunchedManifestBuilder("name", "path", "launcher name") + lmb.add_model(model, 1) + lmb.add_model(model_2, 1) + lmb.add_ensemble(ensemble, [i for i in range(len(ensemble.entities))]) + lmb.add_database(orc, [i for i in range(len(orc.entities))]) + + manifest = lmb.finalize() + assert len(manifest.models) == 2 + assert len(manifest.ensembles) == 1 + assert len(manifest.databases) == 1 + + +def test_launched_manifest_builder_raises_if_lens_do_not_match(): + lmb = LaunchedManifestBuilder("name", "path", "launcher name") + with pytest.raises(ValueError): + lmb.add_ensemble(ensemble, list(range(123))) + with pytest.raises(ValueError): + lmb.add_database(orc, list(range(123))) + + +def test_launched_manifest_builder_raises_if_attaching_data_to_empty_collection( + monkeypatch +): + lmb = LaunchedManifestBuilder("name", "path", "launcher") + monkeypatch.setattr(ensemble, "entities", []) + with pytest.raises(ValueError): + lmb.add_ensemble(ensemble, []) + + +def test_lmb_and_launched_manifest_have_same_paths_for_launched_metadata(): + exp_path = "/path/to/some/exp" + lmb = LaunchedManifestBuilder("exp_name", exp_path, "launcher") + manifest = lmb.finalize() + assert lmb.exp_telemetry_subdirectory == manifest.metadata.exp_telemetry_subdirectory + assert lmb.run_telemetry_subdirectory == manifest.metadata.run_telemetry_subdirectory + assert os.path.commonprefix([ + manifest.metadata.run_telemetry_subdirectory, + manifest.metadata.exp_telemetry_subdirectory, + manifest.metadata.manifest_file_path, + exp_path, + ]) == exp_path + assert os.path.commonprefix([ + manifest.metadata.run_telemetry_subdirectory, + manifest.metadata.exp_telemetry_subdirectory, + manifest.metadata.manifest_file_path, + ]) == str(manifest.metadata.exp_telemetry_subdirectory) diff --git a/tests/test_model.py b/tests/test_model.py index 103e8a09c..76af50b54 100644 --- a/tests/test_model.py +++ b/tests/test_model.py @@ -28,6 +28,7 @@ from smartsim import Experiment from smartsim._core.launcher.step import SbatchStep, SrunStep +from smartsim._core.control.manifest import LaunchedManifestBuilder from smartsim.entity import Ensemble, Model from smartsim.error import EntityExistsError, SSUnsupportedError from smartsim.settings import RunSettings, SbatchSettings, SrunSettings @@ -88,8 +89,10 @@ def monkeypatch_exp_controller(monkeypatch): def _monkeypatch_exp_controller(exp): entity_steps = [] - def start_wo_job_manager(self, manifest, block=True, kill_on_interrupt=True): - self._launch(manifest) + def start_wo_job_manager(self, exp_name, exp_path, manifest, + block=True, kill_on_interrupt=True): + self._launch(exp_name, exp_path, manifest) + return LaunchedManifestBuilder("name", "path", "launcher").finalize() def launch_step_nop(self, step, entity): entity_steps.append((step, entity)) diff --git a/tests/test_multidb.py b/tests/test_multidb.py index 1bd33f88e..b2b10b0e7 100644 --- a/tests/test_multidb.py +++ b/tests/test_multidb.py
@@ -133,12 +133,20 @@ def test_db_identifier_colo_then_standard(fileutils, wlmutils, coloutils, db_typ test_launcher = wlmutils.get_test_launcher() test_interface = wlmutils.get_test_interface() test_port = wlmutils.get_test_port() - test_script = fileutils.get_test_conf_path("smartredis/dbid.py") # Create SmartSim Experiment exp = Experiment(exp_name, launcher=test_launcher, exp_path=test_dir) + # Create run settings + colo_settings = exp.create_run_settings("python", test_script) + colo_settings.set_nodes(1) + colo_settings.set_tasks_per_node(1) + + # Create the SmartSim Model + smartsim_model = exp.create_model("colocated_model", colo_settings) + smartsim_model.set_path(test_dir) + db_args = { "port": test_port, "db_cpus": 1, @@ -287,7 +295,18 @@ def test_multidb_colo_once(fileutils, test_dir, wlmutils, coloutils, db_type): test_script = fileutils.get_test_conf_path("smartredis/dbid.py") # start a new Experiment for this section - exp = Experiment("test_multidb_colo_once", launcher=test_launcher, exp_path=test_dir) + exp = Experiment("test_multidb_colo_once", + launcher=test_launcher, + exp_path=test_dir) + + # create run settings + run_settings = exp.create_run_settings("python", test_script) + run_settings.set_nodes(1) + run_settings.set_tasks_per_node(1) + + # Create the SmartSim Model + smartsim_model = exp.create_model("smartsim_model", run_settings) + smartsim_model.set_path(test_dir) db_args = { "port": test_port + 1, @@ -418,7 +437,6 @@ def test_launch_cluster_orc_single_dbid(test_dir, coloutils, fileutils, wlmutils launcher = wlmutils.get_test_launcher() test_port = wlmutils.get_test_port() test_script = fileutils.get_test_conf_path("smartredis/multidbid.py") - exp = Experiment(exp_name, launcher=launcher, exp_path=test_dir) # batch = False to launch on existing allocation diff --git a/tests/test_orchestrator.py b/tests/test_orchestrator.py index a75e35bca..e95cea7ed 100644 --- a/tests/test_orchestrator.py +++ b/tests/test_orchestrator.py @@ -70,8 +70,7 @@ def test_inactive_orc_get_address(): def test_orc_active_functions(test_dir, wlmutils): exp_name = "test_orc_active_functions" - exp = Experiment(exp_name, launcher="local") - + exp = Experiment(exp_name, launcher="local", exp_path=test_dir) db = Orchestrator(port=wlmutils.get_test_port()) db.set_path(test_dir) @@ -98,8 +97,7 @@ def test_orc_active_functions(test_dir, wlmutils): def test_multiple_interfaces(test_dir, wlmutils): exp_name = "test_multiple_interfaces" - exp = Experiment(exp_name, launcher="local") - + exp = Experiment(exp_name, launcher="local", exp_path=test_dir) net_if_addrs = psutil.net_if_addrs() net_if_addrs = [ diff --git a/tests/test_pals_settings.py b/tests/test_pals_settings.py index 603728110..5622c6280 100644 --- a/tests/test_pals_settings.py +++ b/tests/test_pals_settings.py @@ -30,6 +30,9 @@ import shutil import sys +import smartsim._core.config.config +from smartsim.error import SSUnsupportedError + from smartsim.settings import PalsMpiexecSettings from smartsim._core.launcher import PBSLauncher from smartsim._core.launcher.step.mpiStep import MpiexecStep @@ -41,6 +44,15 @@ default_exe = sys.executable default_kwargs = {"fail_if_missing_exec": False} + +@pytest.fixture(autouse=True) +def turn_off_telemetry_indirect(monkeypatch): + monkeypatch.setattr( + smartsim._core.config.config.Config, + "telemetry_enabled", False) + yield + + # Uncomment when # @pytest.mark.parametrize( # "function_name",[ @@ -56,6 +68,7 @@ # with pytest.raises(SSUnsupportedError): # func(None) + def test_affinity_script(): 
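"""Ensure that a GPU affinity script path and its trailing arguments can be set"""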
settings = PalsMpiexecSettings(default_exe, **default_kwargs) settings.set_gpu_affinity_script("/path/to/set_affinity_gpu.sh", 1, 2) diff --git a/tests/test_reconnect_orchestrator.py b/tests/test_reconnect_orchestrator.py index cc38583b5..87a014be5 100644 --- a/tests/test_reconnect_orchestrator.py +++ b/tests/test_reconnect_orchestrator.py @@ -45,8 +45,7 @@ def test_local_orchestrator(test_dir, wlmutils): """Test launching orchestrator locally""" global first_dir exp_name = "test-orc-launch-local" - exp = Experiment(exp_name, launcher="local") - + exp = Experiment(exp_name, launcher="local", exp_path=test_dir) first_dir = test_dir orc = Orchestrator(port=wlmutils.get_test_port()) @@ -61,12 +60,12 @@ def test_local_orchestrator(test_dir, wlmutils): exp._control._launcher.task_manager.actively_monitoring = False -def test_reconnect_local_orc(): +def test_reconnect_local_orc(test_dir): """Test reconnecting to orchestrator from first experiment""" global first_dir # start new experiment exp_name = "test-orc-local-reconnect-2nd" - exp_2 = Experiment(exp_name, launcher="local") + exp_2 = Experiment(exp_name, launcher="local", exp_path=test_dir) checkpoint = osp.join(first_dir, "smartsim_db.dat") reloaded_orc = exp_2.reconnect_orchestrator(checkpoint) diff --git a/tests/test_serialize.py b/tests/test_serialize.py new file mode 100644 index 000000000..228c8eb29 --- /dev/null +++ b/tests/test_serialize.py @@ -0,0 +1,171 @@ +# BSD 2-Clause License +# +# Copyright (c) 2021-2023, Hewlett Packard Enterprise +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +import pytest + +import logging +from pathlib import Path +import json + +from smartsim import Experiment +from smartsim.database.orchestrator import Orchestrator +from smartsim._core.utils import serialize +from smartsim._core._cli import utils +from smartsim._core.control.manifest import LaunchedManifestBuilder +import smartsim._core.config.config + +_REL_MANIFEST_PATH = f"{serialize.TELMON_SUBDIR}/{serialize.MANIFEST_FILENAME}" +_CFG_TM_ENABLED_ATTR = "telemetry_enabled" + +# The tests in this file belong to the group_b group +pytestmark = pytest.mark.group_b + +@pytest.fixture(autouse=True) +def turn_on_tm(monkeypatch): + monkeypatch.setattr( + smartsim._core.config.config.Config, + _CFG_TM_ENABLED_ATTR, + property(lambda self: True)) + yield + + +def test_serialize_creates_a_manifest_json_file_if_dne(test_dir): + lmb = LaunchedManifestBuilder("exp", test_dir, "launcher") + serialize.save_launch_manifest(lmb.finalize()) + manifest_json = Path(test_dir) / _REL_MANIFEST_PATH + + assert manifest_json.is_file() + with open(manifest_json, 'r') as f: + manifest = json.load(f) + assert manifest["experiment"]["name"] == "exp" + assert manifest["experiment"]["launcher"] == "launcher" + assert isinstance(manifest["runs"], list) + assert len(manifest["runs"]) == 1 + + +def test_serialize_does_not_write_manifest_json_if_telemetry_monitor_is_off( + test_dir, monkeypatch +): + monkeypatch.setattr( + smartsim._core.config.config.Config, + _CFG_TM_ENABLED_ATTR, + property(lambda self: False)) + lmb = LaunchedManifestBuilder("exp", test_dir, "launcher") + serialize.save_launch_manifest(lmb.finalize()) + manifest_json = Path(test_dir) / _REL_MANIFEST_PATH + assert not manifest_json.exists() + + +def test_serialize_appends_a_manifest_json_exists(test_dir): + manifest_json = Path(test_dir) / _REL_MANIFEST_PATH + serialize.save_launch_manifest( + LaunchedManifestBuilder("exp", test_dir, "launcher").finalize()) + serialize.save_launch_manifest( + LaunchedManifestBuilder("exp", test_dir, "launcher").finalize()) + serialize.save_launch_manifest( + LaunchedManifestBuilder("exp", test_dir, "launcher").finalize()) + + assert manifest_json.is_file() + with open(manifest_json, 'r') as f: + manifest = json.load(f) + assert isinstance(manifest["runs"], list) + assert len(manifest["runs"]) == 3 + assert len({run["run_id"] for run in manifest["runs"]}) == 3 + + +def test_serialize_overwites_file_if_not_json(test_dir): + manifest_json = Path(test_dir) / _REL_MANIFEST_PATH + manifest_json.parent.mkdir(parents=True, exist_ok=True) + with open(manifest_json, 'w') as f: + f.write("This is not a json\n") + + lmb = LaunchedManifestBuilder("exp", test_dir, "launcher") + serialize.save_launch_manifest(lmb.finalize()) + with open(manifest_json, 'r') as f: + assert isinstance(json.load(f), dict) + + +def test_started_entities_are_serialized(test_dir): + exp_name = "test-exp" + test_dir = Path(test_dir) / exp_name + test_dir.mkdir(parents=True) + exp = Experiment(exp_name, exp_path=str(test_dir), launcher="local") + + rs1 = exp.create_run_settings("echo", ["hello", "world"]) + rs2 = exp.create_run_settings("echo", ["spam", "eggs"]) + + hello_world_model = exp.create_model("echo-hello", run_settings=rs1) + spam_eggs_model = exp.create_model("echo-spam", run_settings=rs2) + hello_ensemble = exp.create_ensemble('echo-ensemble', run_settings=rs1, replicas=3) + + exp.generate(hello_world_model, spam_eggs_model, hello_ensemble) + exp.start(hello_world_model, spam_eggs_model, block=False) + exp.start(hello_ensemble, block=False) + + 
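# each call to exp.start appends a separate run to the manifest +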
manifest_json = Path(exp.exp_path) / _REL_MANIFEST_PATH + try: + with open(manifest_json, 'r') as f: + manifest = json.load(f) + assert len(manifest["runs"]) == 2 + assert len(manifest["runs"][0]["model"]) == 2 + assert len(manifest["runs"][0]["ensemble"]) == 0 + assert len(manifest["runs"][1]["model"]) == 0 + assert len(manifest["runs"][1]["ensemble"]) == 1 + assert len(manifest["runs"][1]["ensemble"][0]["models"]) == 3 + finally: + exp.stop(hello_world_model, spam_eggs_model, hello_ensemble) + + +def test_serialzed_database_does_not_break_if_using_a_non_standard_install( + monkeypatch +): + monkeypatch.setattr(utils, "get_db_path", lambda: None) + db = Orchestrator() + dict_ = serialize._dictify_db(db, []) + assert dict_["type"] == "Unknown" + + +def test_dictify_run_settings_warns_when_attepting_to_dictify_mpmd( + monkeypatch, caplog, test_dir +): + # TODO: Eventually this test should be removed and we should be able to + # handle MPMD run settings as part of the output dict + exp_name = "test-exp" + test_dir = Path(test_dir) / exp_name + test_dir.mkdir(parents=True) + exp = Experiment(exp_name, exp_path=str(test_dir), launcher="local") + + rs1 = exp.create_run_settings("echo", ["hello", "world"]) + rs2 = exp.create_run_settings("echo", ["spam", "eggs"]) + + # Make rs "MPMD" + monkeypatch.setattr(rs1, "mpmd", [rs2], raising=False) + # Make work with colored logs + monkeypatch.setattr(serialize, "_LOGGER", logging.getLogger()) + serialize._dictify_run_settings(rs1) + rec ,= caplog.records + assert rec.levelno == logging.WARNING + assert "MPMD run settings" in rec.msg diff --git a/tests/test_telemetry_monitor.py b/tests/test_telemetry_monitor.py new file mode 100644 index 000000000..6060f488c --- /dev/null +++ b/tests/test_telemetry_monitor.py @@ -0,0 +1,1121 @@ +# BSD 2-Clause License +# +# Copyright (c) 2021-2023, Hewlett Packard Enterprise +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ + +import logging +import pathlib +from random import sample +import pytest +import sys +import typing as t +import time +import uuid +from conftest import FileUtils, WLMUtils + +from smartsim._core.control.jobmanager import JobManager +from smartsim._core.control.job import Job, JobEntity +from smartsim._core.launcher.launcher import WLMLauncher +from smartsim._core.launcher.slurm.slurmLauncher import SlurmLauncher +from smartsim._core.launcher.step.step import Step, proxyable_launch_cmd +from smartsim._core.launcher.stepInfo import StepInfo +from smartsim.error.errors import UnproxyableStepError +from smartsim.settings.base import RunSettings +from smartsim.status import ( + STATUS_COMPLETED, + STATUS_CANCELLED, + STATUS_FAILED, + STATUS_NEW, + STATUS_PAUSED, + STATUS_RUNNING, + TERMINAL_STATUSES, +) +import smartsim._core.config.config as cfg + +from smartsim._core.entrypoints.telemetrymonitor import ( + can_shutdown, + event_loop, + faux_return_code, + get_parser, + get_ts, + track_event, + load_manifest, + hydrate_persistable, + ManifestEventHandler, +) +from smartsim._core.utils import serialize +from smartsim import Experiment + + +ALL_ARGS = {"-exp_dir", "-frequency"} +PROXY_ENTRY_POINT = "smartsim._core.entrypoints.indirect" +CFG_TM_ENABLED_ATTR = "telemetry_enabled" + + +for_all_wlm_launchers = pytest.mark.parametrize( + "wlm_launcher", + [pytest.param(cls(), id=cls.__name__) for cls in WLMLauncher.__subclasses__()], +) + +requires_wlm = pytest.mark.skipif( + pytest.test_launcher == "local", + reason="Test requires WLM" +) + + +logger = logging.getLogger() + +# The tests in this file belong to the slow_tests group +pytestmark = pytest.mark.slow_tests + + +@pytest.fixture(autouse=True) +def turn_on_tm(monkeypatch): + monkeypatch.setattr( + cfg.Config, + CFG_TM_ENABLED_ATTR, + property(lambda self: True)) + yield + + +def snooze_nonblocking(test_dir: str, max_delay: int = 20, post_data_delay: int = 2): + telmon_subdir = pathlib.Path(test_dir) / serialize.TELMON_SUBDIR + # let the non-blocking experiment complete. 
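+    # poll once per second (up to max_delay); once the subdir exists, sleep post_data_delay so in-flight events can be written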
+ for _ in range(max_delay): + time.sleep(1) + if telmon_subdir.exists(): + time.sleep(post_data_delay) + break + + +@pytest.mark.parametrize( + ["cmd", "missing"], + [ + pytest.param("", {"-exp_dir", "-frequency"}, id="no args"), + pytest.param("-exp_dir /foo/bar", {"-frequency"}, id="no freq"), + pytest.param("-frequency 123", {"-exp_dir"}, id="no dir"), + ], +) +def test_parser_reqd_args(capsys, cmd, missing): + """Test that the parser reports any missing required arguments""" + parser = get_parser() + + args = cmd.split() + + captured = capsys.readouterr() # throw away existing output + with pytest.raises(SystemExit) as ex: + ns = parser.parse_args(args) + + captured = capsys.readouterr() + assert "the following arguments are required" in captured.err + err_desc = captured.err.split("the following arguments are required:")[-1] + for arg in missing: + assert arg in err_desc + + expected = ALL_ARGS - missing + for exp in expected: + assert exp not in err_desc + + +def test_parser(): + """Test that the parser succeeds when receiving expected args""" + parser = get_parser() + + test_dir = "/foo/bar" + test_freq = 123 + + cmd = f"-exp_dir {test_dir} -frequency {test_freq}" + args = cmd.split() + + ns = parser.parse_args(args) + + assert ns.exp_dir == test_dir + assert ns.frequency == test_freq + + +def test_ts(): + """Ensure expected output type""" + ts = get_ts() + assert isinstance(ts, int) + + +@pytest.mark.parametrize( + ["etype", "task_id", "step_id", "timestamp", "evt_type"], + [ + pytest.param("ensemble", "", "123", get_ts(), "start", id="start event"), + pytest.param("ensemble", "", "123", get_ts(), "stop", id="stop event"), + ], +) +def test_track_event( + etype: str, + task_id: str, + step_id: str, + timestamp: int, + evt_type: str, + test_dir, +): + """Ensure that track event writes a file to the expected location""" + exp_path = pathlib.Path(test_dir) + track_event(timestamp, task_id, step_id, etype, evt_type, exp_path, logger) + + expected_output = exp_path / f"{evt_type}.json" + + assert expected_output.exists() + assert expected_output.is_file() + + +def test_load_manifest(fileutils: FileUtils, test_dir: str): + """Ensure that the runtime manifest loads correctly""" + sample_manifest_path = fileutils.get_test_conf_path("telemetry/telemetry.json") + sample_manifest = pathlib.Path(sample_manifest_path) + assert sample_manifest.exists() + + test_manifest_path = fileutils.make_test_file( + serialize.MANIFEST_FILENAME, + pathlib.Path(test_dir) / serialize.TELMON_SUBDIR, + sample_manifest.read_text(), + ) + test_manifest = pathlib.Path(test_manifest_path) + assert test_manifest.exists() + + manifest = load_manifest(test_manifest_path) + assert manifest.name == "my-exp" + assert str(manifest.path) == "/path/to/my-exp" + assert manifest.launcher == "Slurm" + assert len(manifest.runs) == 6 + + assert len(manifest.runs[0].models) == 1 + assert len(manifest.runs[2].models) == 8 # 8 models in ensemble + assert len(manifest.runs[0].orchestrators) == 0 + assert len(manifest.runs[1].orchestrators) == 3 # 3 shards in db + + +def test_load_manifest_colo_model(fileutils: FileUtils): + """Ensure that the runtime manifest loads correctly when containing a colocated model""" + # NOTE: for regeneration, this manifest can use `test_telemetry_colo` + sample_manifest_path = fileutils.get_test_conf_path("telemetry/colocatedmodel.json") + sample_manifest = pathlib.Path(sample_manifest_path) + assert sample_manifest.exists() + + manifest = load_manifest(sample_manifest_path) + assert manifest.name == 
"my-exp" + assert ( + str(manifest.path) + == "/tmp/my-exp" + ) + assert manifest.launcher == "Slurm" + assert len(manifest.runs) == 1 + + assert len(manifest.runs[0].models) == 1 + + +def test_load_manifest_serial_models(fileutils: FileUtils): + """Ensure that the runtime manifest loads correctly when containing multiple models""" + # NOTE: for regeneration, this manifest can use `test_telemetry_colo` + sample_manifest_path = fileutils.get_test_conf_path("telemetry/serialmodels.json") + sample_manifest = pathlib.Path(sample_manifest_path) + assert sample_manifest.exists() + + manifest = load_manifest(sample_manifest_path) + assert manifest.name == "my-exp" + assert str(manifest.path) == "/tmp/my-exp" + assert manifest.launcher == "Slurm" + assert len(manifest.runs) == 1 + + assert len(manifest.runs[0].models) == 5 + + +def test_load_manifest_db_and_models(fileutils: FileUtils): + """Ensure that the runtime manifest loads correctly when containing models & + orchestrator across 2 separate runs""" + # NOTE: for regeneration, this manifest can use `test_telemetry_colo` + sample_manifest_path = fileutils.get_test_conf_path("telemetry/db_and_model.json") + sample_manifest = pathlib.Path(sample_manifest_path) + assert sample_manifest.exists() + + manifest = load_manifest(sample_manifest_path) + assert manifest.name == "my-exp" + assert str(manifest.path) == "/tmp/my-exp" + assert manifest.launcher == "Slurm" + assert len(manifest.runs) == 2 + + assert len(manifest.runs[0].orchestrators) == 1 + assert len(manifest.runs[1].models) == 1 + + +def test_load_manifest_db_and_models_1run(fileutils: FileUtils): + """Ensure that the runtime manifest loads correctly when containing models & + orchestrator in a single run""" + # NOTE: for regeneration, this manifest can use `test_telemetry_colo` + sample_manifest_path = fileutils.get_test_conf_path( + "telemetry/db_and_model_1run.json" + ) + sample_manifest = pathlib.Path(sample_manifest_path) + assert sample_manifest.exists() + + manifest = load_manifest(sample_manifest_path) + assert manifest.name == "my-exp" + assert str(manifest.path) == "/tmp/my-exp" + assert manifest.launcher == "Slurm" + assert len(manifest.runs) == 1 + + assert len(manifest.runs[0].orchestrators) == 1 + assert len(manifest.runs[0].models) == 1 + + +@pytest.mark.parametrize( + ["task_id", "step_id", "etype", "exp_isorch", "exp_ismanaged"], + [ + pytest.param("123", "", "model", False, False, id="unmanaged, non-orch"), + pytest.param("456", "123", "ensemble", False, True, id="managed, non-orch"), + pytest.param("789", "987", "orchestrator", True, True, id="managed, orch"), + pytest.param("987", "", "orchestrator", True, False, id="unmanaged, orch"), + ], +) +def test_persistable_computed_properties( + task_id: str, step_id: str, etype: str, exp_isorch: bool, exp_ismanaged: bool +): + name = f"test-{etype}-{uuid.uuid4()}" + timestamp = get_ts() + exp_dir = pathlib.Path("/foo/bar") + stored = { + "name": name, + "run_id": timestamp, + "telemetry_metadata": { + "status_dir": str(exp_dir), + "task_id": task_id, + "step_id": step_id, + }, + } + persistables = hydrate_persistable(etype, stored, exp_dir) + persistable = persistables[0] if persistables else None + + assert persistable.is_managed == exp_ismanaged + assert persistable.is_db == exp_isorch + + +def test_deserialize_ensemble(fileutils: FileUtils): + """Ensure that the children of ensembles (models) are correctly + placed in the models collection""" + sample_manifest_path = fileutils.get_test_conf_path("telemetry/ensembles.json") 
+ sample_manifest = pathlib.Path(sample_manifest_path) + assert sample_manifest.exists() + + manifest = load_manifest(sample_manifest_path) + assert manifest + + assert len(manifest.runs) == 1 + + # NOTE: no longer returning ensembles, only children... + # assert len(manifest.runs[0].ensembles) == 1 + assert len(manifest.runs[0].models) == 8 + + +def test_shutdown_conditions(): + """Ensure conditions to shutdown telemetry monitor are correctly evaluated""" + job_entity1 = JobEntity() + job_entity1.name = "xyz" + job_entity1.step_id = "123" + job_entity1.task_id = "" + + logger = logging.getLogger() + + # show that an event handler w/no monitored jobs can shutdown + mani_handler = ManifestEventHandler("xyz", logger) + assert can_shutdown(mani_handler, logger) + + # show that an event handler w/a monitored job cannot shutdown + mani_handler = ManifestEventHandler("xyz", logger) + mani_handler.job_manager.add_job( + job_entity1.name, job_entity1.step_id, job_entity1, False + ) + assert not can_shutdown(mani_handler, logger) + assert not bool(mani_handler.job_manager.db_jobs) + assert bool(mani_handler.job_manager.jobs) + + # show that an event handler w/a monitored db cannot shutdown + mani_handler = ManifestEventHandler("xyz", logger) + job_entity1.type = "orchestrator" + mani_handler.job_manager.add_job( + job_entity1.name, job_entity1.step_id, job_entity1, False + ) + assert not can_shutdown(mani_handler, logger) + assert bool(mani_handler.job_manager.db_jobs) + assert not bool(mani_handler.job_manager.jobs) + + # show that an event handler w/a dbs & tasks cannot shutdown + job_entity2 = JobEntity() + job_entity2.name = "xyz" + job_entity2.step_id = "123" + job_entity2.task_id = "" + + mani_handler = ManifestEventHandler("xyz", logger) + job_entity1.type = "orchestrator" + mani_handler.job_manager.add_job( + job_entity1.name, job_entity1.step_id, job_entity1, False + ) + + mani_handler.job_manager.add_job( + job_entity2.name, job_entity2.step_id, job_entity2, False + ) + assert not can_shutdown(mani_handler, logger) + assert bool(mani_handler.job_manager.db_jobs) + assert bool(mani_handler.job_manager.jobs) + + # ... now, show that removing 1 of 2 jobs still doesn't shutdown + mani_handler.job_manager.db_jobs.popitem() + assert not can_shutdown(mani_handler, logger) + + # ... 
now, show that removing final job will allow shutdown + mani_handler.job_manager.jobs.popitem() + assert can_shutdown(mani_handler, logger) + + +def test_auto_shutdown(): + """Ensure that the cooldown timer is respected""" + + class FauxObserver: + def __init__(self): + self.stop_count = 0 + + def stop(self): + self.stop_count += 1 + + def is_alive(self) -> bool: + if self.stop_count > 0: + return False + + return True + + job_entity1 = JobEntity() + job_entity1.name = "xyz" + job_entity1.step_id = "123" + job_entity1.task_id = "" + + frequency = 1 + + # show that an event handler w/out a monitored task will automatically stop + mani_handler = ManifestEventHandler("xyz", logger) + observer = FauxObserver() + duration = 2 + + ts0 = get_ts() + event_loop(observer, mani_handler, frequency, logger, cooldown_duration=duration) + ts1 = get_ts() + + assert ts1 - ts0 >= duration + assert observer.stop_count == 1 + + # show that the new cooldown duration is respected + mani_handler = ManifestEventHandler("xyz", logger) + observer = FauxObserver() + duration = 5 + + ts0 = get_ts() + event_loop(observer, mani_handler, frequency, logger, cooldown_duration=duration) + ts1 = get_ts() + + assert ts1 - ts0 >= duration + assert observer.stop_count == 1 + + +def test_telemetry_single_model(fileutils, test_dir, wlmutils): + """Ensure expected telemetry events are written for a single + blocking model run""" + + # Set experiment name + exp_name = "telemetry_single_model" + + # Retrieve parameters from testing environment + test_launcher = wlmutils.get_test_launcher() + test_script = fileutils.get_test_conf_path("echo.py") + + # Create SmartSim Experiment + exp = Experiment(exp_name, launcher=test_launcher, exp_path=test_dir) + + # create run settings + app_settings = exp.create_run_settings("python", test_script) + app_settings.set_nodes(1) + app_settings.set_tasks_per_node(1) + + # Create the SmartSim Model + smartsim_model = exp.create_model("perroquet", app_settings) + exp.generate(smartsim_model) + exp.start(smartsim_model, block=True) + assert exp.get_status(smartsim_model)[0] == STATUS_COMPLETED + + telemetry_output_path = pathlib.Path(test_dir) / serialize.TELMON_SUBDIR + start_events = list(telemetry_output_path.rglob("start.json")) + stop_events = list(telemetry_output_path.rglob("stop.json")) + + assert len(start_events) == 1 + assert len(stop_events) == 1 + + +def test_telemetry_single_model_nonblocking(fileutils, test_dir, wlmutils, monkeypatch): + """Ensure that the telemetry monitor logs exist when the experiment + is non-blocking""" + with monkeypatch.context() as ctx: + ctx.setattr(cfg.Config, "telemetry_frequency", 1) + + # Set experiment name + exp_name = "test_telemetry_single_model_nonblocking" + + # Retrieve parameters from testing environment + test_launcher = wlmutils.get_test_launcher() + test_script = fileutils.get_test_conf_path("echo.py") + + # Create SmartSim Experiment + exp = Experiment(exp_name, launcher=test_launcher, exp_path=test_dir) + + # create run settings + app_settings = exp.create_run_settings("python", test_script) + app_settings.set_nodes(1) + app_settings.set_tasks_per_node(1) + + # Create the SmartSim Model + smartsim_model = exp.create_model("perroquet", app_settings) + exp.generate(smartsim_model) + exp.start(smartsim_model) + + snooze_nonblocking(test_dir, max_delay=60, post_data_delay=30) + + assert exp.get_status(smartsim_model)[0] == STATUS_COMPLETED + + telemetry_output_path = pathlib.Path(test_dir) / serialize.TELMON_SUBDIR + 
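# the run should produce exactly one start/stop event pair beneath the telemetry subdir +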
start_events = list(telemetry_output_path.rglob("start.json")) + stop_events = list(telemetry_output_path.rglob("stop.json")) + + assert len(start_events) == 1 + assert len(stop_events) == 1 + + +def test_telemetry_serial_models(fileutils, test_dir, wlmutils, monkeypatch): + """ + Test telemetry with models being run in serial (one after each other) + """ + with monkeypatch.context() as ctx: + ctx.setattr(cfg.Config, "telemetry_frequency", 1) + + # Set experiment name + exp_name = "telemetry_serial_models" + + # Retrieve parameters from testing environment + test_launcher = wlmutils.get_test_launcher() + test_script = fileutils.get_test_conf_path("echo.py") + + # Create SmartSim Experiment + exp = Experiment(exp_name, launcher=test_launcher, exp_path=test_dir) + + # create run settings + app_settings = exp.create_run_settings("python", test_script) + app_settings.set_nodes(1) + app_settings.set_tasks_per_node(1) + + # Create the SmartSim Model + smartsim_models = [ + exp.create_model(f"perroquet_{i}", app_settings) for i in range(5) + ] + exp.generate(*smartsim_models) + exp.start(*smartsim_models, block=True) + assert all( + [status == STATUS_COMPLETED for status in exp.get_status(*smartsim_models)] + ) + + telemetry_output_path = pathlib.Path(test_dir) / serialize.TELMON_SUBDIR + start_events = list(telemetry_output_path.rglob("start.json")) + stop_events = list(telemetry_output_path.rglob("stop.json")) + + assert len(start_events) == 5 + assert len(stop_events) == 5 + + +def test_telemetry_serial_models_nonblocking(fileutils, test_dir, wlmutils, monkeypatch): + """ + Test telemetry with models being run in serial (one after each other) + in a non-blocking experiment + """ + with monkeypatch.context() as ctx: + ctx.setattr(cfg.Config, "telemetry_frequency", 1) + + # Set experiment name + exp_name = "telemetry_serial_models" + + # Retrieve parameters from testing environment + test_launcher = wlmutils.get_test_launcher() + test_script = fileutils.get_test_conf_path("echo.py") + + # Create SmartSim Experiment + exp = Experiment(exp_name, launcher=test_launcher, exp_path=test_dir) + + # create run settings + app_settings = exp.create_run_settings("python", test_script) + app_settings.set_nodes(1) + app_settings.set_tasks_per_node(1) + + # Create the SmartSim Model + smartsim_models = [ + exp.create_model(f"perroquet_{i}", app_settings) for i in range(5) + ] + exp.generate(*smartsim_models) + exp.start(*smartsim_models) + + snooze_nonblocking(test_dir, max_delay=60, post_data_delay=10) + + assert all( + [status == STATUS_COMPLETED for status in exp.get_status(*smartsim_models)] + ) + + telemetry_output_path = pathlib.Path(test_dir) / serialize.TELMON_SUBDIR + start_events = list(telemetry_output_path.rglob("start.json")) + stop_events = list(telemetry_output_path.rglob("stop.json")) + + assert len(start_events) == 5 + assert len(stop_events) == 5 + + +def test_telemetry_db_only_with_generate(test_dir, wlmutils, monkeypatch): + """ + Test telemetry with only a database running + """ + with monkeypatch.context() as ctx: + ctx.setattr(cfg.Config, "telemetry_frequency", 1) + + # Set experiment name + exp_name = "telemetry_db_with_generate" + + # Retrieve parameters from testing environment + test_launcher = wlmutils.get_test_launcher() + test_interface = wlmutils.get_test_interface() + test_port = wlmutils.get_test_port() + + # Create SmartSim Experiment + exp = Experiment(exp_name, launcher=test_launcher, exp_path=test_dir) + + # create regular database + orc = 
exp.create_database(port=test_port, interface=test_interface) + exp.generate(orc) + try: + exp.start(orc, block=True) + + snooze_nonblocking(test_dir, max_delay=60, post_data_delay=10) + + telemetry_output_path = pathlib.Path(test_dir) / serialize.TELMON_SUBDIR + start_events = list(telemetry_output_path.rglob("start.json")) + stop_events = list(telemetry_output_path.rglob("stop.json")) + + assert len(start_events) == 1 + assert len(stop_events) <= 1 + finally: + exp.stop(orc) + snooze_nonblocking(test_dir, max_delay=60, post_data_delay=10) + + assert exp.get_status(orc)[0] == STATUS_CANCELLED + + stop_events = list(telemetry_output_path.rglob("stop.json")) + assert len(stop_events) == 1 + + +def test_telemetry_db_only_without_generate(test_dir, wlmutils, monkeypatch): + """ + Test telemetry with only a database running + """ + with monkeypatch.context() as ctx: + ctx.setattr(cfg.Config, "telemetry_frequency", 1) + + # Set experiment name + exp_name = "telemetry_db_only_without_generate" + + # Retrieve parameters from testing environment + test_launcher = wlmutils.get_test_launcher() + test_interface = wlmutils.get_test_interface() + test_port = wlmutils.get_test_port() + + # Create SmartSim Experiment + exp = Experiment(exp_name, launcher=test_launcher, exp_path=test_dir) + + # create regular database + orc = exp.create_database(port=test_port, interface=test_interface) + try: + exp.start(orc) + + snooze_nonblocking(test_dir, max_delay=60, post_data_delay=30) + + telemetry_output_path = pathlib.Path(test_dir) / serialize.TELMON_SUBDIR + start_events = list(telemetry_output_path.rglob("start.json")) + stop_events = list(telemetry_output_path.rglob("stop.json")) + + assert len(start_events) == 1 + assert len(stop_events) == 0 + finally: + exp.stop(orc) + + snooze_nonblocking(test_dir, max_delay=60, post_data_delay=10) + assert exp.get_status(orc)[0] == STATUS_CANCELLED + + stop_events = list(telemetry_output_path.rglob("stop.json")) + assert len(stop_events) == 1 + + +def test_telemetry_db_and_model(fileutils, test_dir, wlmutils, monkeypatch): + """ + Test telemetry with a database and a model running + """ + + with monkeypatch.context() as ctx: + ctx.setattr(cfg.Config, "telemetry_frequency", 1) + + # Set experiment name + exp_name = "telemetry_db_and_model" + + # Retrieve parameters from testing environment + test_launcher = wlmutils.get_test_launcher() + test_interface = wlmutils.get_test_interface() + test_port = wlmutils.get_test_port() + test_script = fileutils.get_test_conf_path("echo.py") + + # Create SmartSim Experiment + exp = Experiment(exp_name, launcher=test_launcher, exp_path=test_dir) + + # create regular database + orc = exp.create_database(port=test_port, interface=test_interface) + try: + exp.start(orc) + + # create run settings + app_settings = exp.create_run_settings("python", test_script) + app_settings.set_nodes(1) + app_settings.set_tasks_per_node(1) + + # Create the SmartSim Model + smartsim_model = exp.create_model("perroquet", app_settings) + exp.generate(smartsim_model) + exp.start(smartsim_model, block=True) + finally: + exp.stop(orc) + + snooze_nonblocking(test_dir, max_delay=60, post_data_delay=30) + + assert exp.get_status(orc)[0] == STATUS_CANCELLED + assert exp.get_status(smartsim_model)[0] == STATUS_COMPLETED + + telemetry_output_path = pathlib.Path(test_dir) / serialize.TELMON_SUBDIR + + start_events = list(telemetry_output_path.rglob("database/**/start.json")) + stop_events = list(telemetry_output_path.rglob("database/**/stop.json")) + + assert len(start_events) 
== 1 + assert len(stop_events) == 1 + + start_events = list(telemetry_output_path.rglob("model/**/start.json")) + stop_events = list(telemetry_output_path.rglob("model/**/stop.json")) + assert len(start_events) == 1 + assert len(stop_events) == 1 + + +def test_telemetry_ensemble(fileutils, test_dir, wlmutils, monkeypatch): + """ + Test telemetry with an ensemble of models running + """ + + with monkeypatch.context() as ctx: + ctx.setattr(cfg.Config, "telemetry_frequency", 1) + + # Set experiment name + exp_name = "telemetry_ensemble" + + # Retrieve parameters from testing environment + test_launcher = wlmutils.get_test_launcher() + test_script = fileutils.get_test_conf_path("echo.py") + + # Create SmartSim Experiment + exp = Experiment(exp_name, launcher=test_launcher, exp_path=test_dir) + + app_settings = exp.create_run_settings("python", test_script) + app_settings.set_nodes(1) + app_settings.set_tasks_per_node(1) + + ens = exp.create_ensemble("troupeau", run_settings=app_settings, replicas=5) + exp.generate(ens) + exp.start(ens, block=True) + assert all([status == STATUS_COMPLETED for status in exp.get_status(ens)]) + + snooze_nonblocking(test_dir, max_delay=60, post_data_delay=30) + telemetry_output_path = pathlib.Path(test_dir) / serialize.TELMON_SUBDIR + start_events = list(telemetry_output_path.rglob("start.json")) + stop_events = list(telemetry_output_path.rglob("stop.json")) + + assert len(start_events) == 5 + assert len(stop_events) == 5 + + +def test_telemetry_colo(fileutils, test_dir, wlmutils, coloutils, monkeypatch): + """ + Test telemetry with a colocated model running + """ + + with monkeypatch.context() as ctx: + ctx.setattr(cfg.Config, "telemetry_frequency", 1) + + # Set experiment name + exp_name = "telemetry_colo" + + # Retrieve parameters from testing environment + test_launcher = wlmutils.get_test_launcher() + + # Create SmartSim Experiment + exp = Experiment(exp_name, launcher=test_launcher, exp_path=test_dir) + + smartsim_model = coloutils.setup_test_colo( + fileutils, + "uds", + exp, + "echo.py", + {}, + ) + + exp.generate(smartsim_model) + exp.start(smartsim_model, block=True) + assert all( + [status == STATUS_COMPLETED for status in exp.get_status(smartsim_model)] + ) + + telemetry_output_path = pathlib.Path(test_dir) / serialize.TELMON_SUBDIR + start_events = list(telemetry_output_path.rglob("start.json")) + stop_events = list(telemetry_output_path.rglob("stop.json")) + + # the colodb does NOT show up as a unique entity in the telemetry + assert len(start_events) == 1 + assert len(stop_events) == 1 + + +@pytest.mark.parametrize( + "frequency, cooldown", + [ + pytest.param(1, 1, id="1s shutdown"), + pytest.param(1, 5, id="5s shutdown"), + pytest.param(1, 15, id="15s shutdown"), + ], +) +def test_telemetry_autoshutdown(test_dir, wlmutils, monkeypatch, frequency, cooldown): + """ + Ensure that the telemetry monitor process shuts down after the desired + cooldown period + """ + + with monkeypatch.context() as ctx: + ctx.setattr(cfg.Config, "telemetry_frequency", frequency) + ctx.setattr(cfg.Config, "telemetry_cooldown", cooldown) + + # Set experiment name + exp_name = "telemetry_autoshutdown" + + # Retrieve parameters from testing environment + test_launcher = wlmutils.get_test_launcher() + + # Create SmartSim Experiment + exp = Experiment(exp_name, launcher=test_launcher, exp_path=test_dir) + + start_time = get_ts() + stop_time = start_time + exp.start(block=False) + + telemetry_output_path = pathlib.Path(test_dir) / serialize.TELMON_SUBDIR + empty_mani = 
list(telemetry_output_path.rglob("manifest.json")) + assert len(empty_mani) == 1, "a manifest.json should be created" + + popen = exp._control._telemetry_monitor + assert popen.pid > 0 + assert popen.returncode is None + + # give some leeway during testing for the cooldown to get hit + for i in range(10): + if popen.poll() is not None: + stop_time = get_ts() + print(f"Completed polling for telemetry shutdown after {i} attempts") + break + time.sleep(3) + + assert popen.returncode is not None + assert stop_time >= (start_time + cooldown) + + +class MockStep(Step): + """Mock step implementing the abstract methods so that it can be + instantiated for test purposes + """ + + def get_launch_cmd(self): + return ["spam", "eggs"] + + +@pytest.fixture +def mock_step_meta_dict(test_dir): + telemetry_output_path = pathlib.Path(test_dir) / serialize.TELMON_SUBDIR + yield { + "entity_type": "mock", + "status_dir": telemetry_output_path, + } + + +@pytest.fixture +def mock_step(test_dir, mock_step_meta_dict): + rs = RunSettings("echo") + step = MockStep("mock-step", test_dir, rs) + step.meta = mock_step_meta_dict + yield step + + +def test_proxy_launch_cmd_decorator_reformats_cmds(mock_step, monkeypatch): + monkeypatch.setattr(cfg.Config, CFG_TM_ENABLED_ATTR, True) + get_launch_cmd = proxyable_launch_cmd(lambda step: ["some", "cmd", "list"]) + cmd = get_launch_cmd(mock_step) + assert cmd != ["some", "cmd", "list"] + assert sys.executable in cmd + assert PROXY_ENTRY_POINT in cmd + + +def test_proxy_launch_cmd_decorator_does_not_reformat_cmds_if_the_tm_is_off( + mock_step, monkeypatch +): + monkeypatch.setattr(cfg.Config, CFG_TM_ENABLED_ATTR, False) + get_launch_cmd = proxyable_launch_cmd(lambda step: ["some", "cmd", "list"]) + cmd = get_launch_cmd(mock_step) + assert cmd == ["some", "cmd", "list"] + + +def test_proxy_launch_cmd_decorator_errors_if_attempt_to_proxy_a_managed_step( + mock_step, monkeypatch +): + monkeypatch.setattr(cfg.Config, CFG_TM_ENABLED_ATTR, True) + mock_step.managed = True + get_launch_cmd = proxyable_launch_cmd(lambda step: ["some", "cmd", "list"]) + with pytest.raises(UnproxyableStepError): + get_launch_cmd(mock_step) + + +@for_all_wlm_launchers +def test_unmanaged_steps_are_proxied_through_indirect( + wlm_launcher, mock_step_meta_dict, test_dir, monkeypatch +): + monkeypatch.setattr(cfg.Config, CFG_TM_ENABLED_ATTR, True) + rs = RunSettings("echo", ["hello", "world"]) + step = wlm_launcher.create_step("test-step", test_dir, rs) + step.meta = mock_step_meta_dict + assert isinstance(step, Step) + assert not step.managed + cmd = step.get_launch_cmd() + assert sys.executable in cmd + assert PROXY_ENTRY_POINT in cmd + assert "hello" not in cmd + assert "world" not in cmd + + +@for_all_wlm_launchers +def test_unmanaged_steps_are_not_proxied_if_the_telemetry_monitor_is_disabled( + wlm_launcher, mock_step_meta_dict, test_dir, monkeypatch +): + monkeypatch.setattr(cfg.Config, CFG_TM_ENABLED_ATTR, False) + rs = RunSettings("echo", ["hello", "world"]) + step = wlm_launcher.create_step("test-step", test_dir, rs) + step.meta = mock_step_meta_dict + assert isinstance(step, Step) + assert not step.managed + cmd = step.get_launch_cmd() + assert PROXY_ENTRY_POINT not in cmd + assert "hello" in cmd + assert "world" in cmd + + +@requires_wlm +@pytest.mark.parametrize( + "run_command", + [ + pytest.param("", id="Unmanaged"), + pytest.param("auto", id="Managed"), + ], +) +def test_multistart_experiment( + wlmutils: WLMUtils, + fileutils: FileUtils, + test_dir: str, + monkeypatch: pytest.MonkeyPatch, + 
run_command: str, +): + """Run an experiment with multiple start calls to ensure that telemetry is + saved correctly for each run + """ + + exp_name = "my-exp" + exp = Experiment(exp_name, + launcher=wlmutils.get_test_launcher(), + exp_path=test_dir) + rs_e = exp.create_run_settings( + sys.executable, ["printing_model.py"], run_command=run_command + ) + rs_e.set_nodes(1) + rs_e.set_tasks(1) + ens = exp.create_ensemble( + "my-ens", + run_settings=rs_e, + perm_strategy="all_perm", + params={ + "START": ["spam"], + "MID": ["eggs"], + "END": ["sausage", "and spam"], + }, + ) + + test_script_path = fileutils.get_test_conf_path("printing_model.py") + ens.attach_generator_files(to_configure=[test_script_path]) + + rs_m = exp.create_run_settings("echo", ["hello", "world"], run_command=run_command) + rs_m.set_nodes(1) + rs_m.set_tasks(1) + model = exp.create_model("my-model", run_settings=rs_m) + + db = exp.create_database( + db_nodes=1, + port=wlmutils.get_test_port(), + interface=wlmutils.get_test_interface(), + ) + + exp.generate(db, ens, model, overwrite=True) + + with monkeypatch.context() as ctx: + ctx.setattr(cfg.Config, "telemetry_frequency", 1) + ctx.setattr(cfg.Config, "telemetry_cooldown", 45) + + exp.start(model, block=False) + + # track PID to see that telmon cooldown avoids restarting process + tm_pid = exp._control._telemetry_monitor.pid + + exp.start(db, block=False) + # check that same TM proc is active + assert tm_pid == exp._control._telemetry_monitor.pid + try: + exp.start(ens, block=True, summary=True) + finally: + exp.stop(db) + assert tm_pid == exp._control._telemetry_monitor.pid + time.sleep(3) # time for telmon to write db stop event + + telemetry_output_path = pathlib.Path(test_dir) / serialize.TELMON_SUBDIR + + db_start_events = list(telemetry_output_path.rglob("database/**/start.json")) + db_stop_events = list(telemetry_output_path.rglob("database/**/stop.json")) + assert len(db_start_events) == 1 + assert len(db_stop_events) == 1 + + m_start_events = list(telemetry_output_path.rglob("model/**/start.json")) + m_stop_events = list(telemetry_output_path.rglob("model/**/stop.json")) + assert len(m_start_events) == 1 + assert len(m_stop_events) == 1 + + e_start_events = list(telemetry_output_path.rglob("ensemble/**/start.json")) + e_stop_events = list(telemetry_output_path.rglob("ensemble/**/stop.json")) + assert len(e_start_events) == 2 + assert len(e_stop_events) == 2 + + +@pytest.mark.parametrize( + "status_in, expected_out", + [ + pytest.param(STATUS_CANCELLED, 1, id="failure on cancellation"), + pytest.param(STATUS_COMPLETED, 0, id="success on completion"), + pytest.param(STATUS_FAILED, 1, id="failure on failed"), + pytest.param(STATUS_NEW, None, id="failure on new"), + pytest.param(STATUS_PAUSED, None, id="failure on paused"), + pytest.param(STATUS_RUNNING, None, id="failure on running"), + ], +) +def test_faux_rc(status_in: str, expected_out: t.Optional[int]): + """Ensure faux response codes match expectations.""" + step_info = StepInfo(status=status_in) + + rc = faux_return_code(step_info) + assert rc == expected_out + + +@pytest.mark.parametrize( + "status_in, expected_out, expected_has_jobs", + [ + pytest.param(STATUS_CANCELLED, 1, False, id="failure on cancellation"), + pytest.param(STATUS_COMPLETED, 0, False, id="success on completion"), + pytest.param(STATUS_FAILED, 1, False, id="failure on failed"), + pytest.param(STATUS_NEW, None, True, id="failure on new"), + pytest.param(STATUS_PAUSED, None, True, id="failure on paused"), + pytest.param(STATUS_RUNNING, None, 
True, id="failure on running"), + ], +) +def test_wlm_completion_handling( + test_dir: str, + monkeypatch: pytest.MonkeyPatch, + status_in: str, + expected_out: t.Optional[int], + expected_has_jobs: bool, +): + + def get_faux_update(status: str) -> t.Callable: + def _faux_updates(_self: WLMLauncher, _names: t.List[str]) -> t.List[StepInfo]: + return [("faux-name", StepInfo(status=status))] + return _faux_updates + + ts = get_ts() + with monkeypatch.context() as ctx: + # don't actually start a job manager + ctx.setattr(JobManager, "start", lambda x: ...) + ctx.setattr(SlurmLauncher, "get_step_update", get_faux_update(status_in)) + + mani_handler = ManifestEventHandler("xyz", logger) + mani_handler.set_launcher("slurm") + + # prep a fake job to request updates for + job_entity = JobEntity() + job_entity.name = "faux-name" + job_entity.step_id = "faux-step-id" + job_entity.task_id = 1234 + job_entity.status_dir = test_dir + job_entity.type = "orchestrator" + + job = Job(job_entity.name, job_entity.step_id, job_entity, "slurm", True) + + # populate our tracking collections + mani_handler._tracked_jobs = {job_entity.key: job_entity} + mani_handler.job_manager.jobs[job.name] = job + + mani_handler.on_timestep(ts) + + # see that the job queue was properly manipulated + has_jobs = bool(mani_handler._tracked_jobs) + assert expected_has_jobs == has_jobs + + # see that the event was properly written + stop_event_path = pathlib.Path(test_dir) / "stop.json" + + # if a status wasn't terminal, no stop event should have been written + should_have_stop_event = expected_out is not None + assert should_have_stop_event == stop_event_path.exists() From 2e726047ce43f03a9d5f120147cb1170bf891162 Mon Sep 17 00:00:00 2001 From: Ale Rigazzi Date: Fri, 8 Dec 2023 12:21:01 -0600 Subject: [PATCH 56/64] Delete misleading comment --- tests/test_colo_model_local.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/tests/test_colo_model_local.py b/tests/test_colo_model_local.py index 9550e9b87..509f46014 100644 --- a/tests/test_colo_model_local.py +++ b/tests/test_colo_model_local.py @@ -301,8 +301,6 @@ def test_colocated_model_pinning_list( def test_colo_uds_verifies_socket_file_name(test_dir, launcher="local"): - # Check to make sure that the CPU mask was correctly generated - exp = Experiment(f"colo_uds_wrong_name", launcher=launcher, exp_path=test_dir) colo_settings = exp.create_run_settings(exe=sys.executable, exe_args=["--version"]) From 14c420d4e436576c1be3e51d663b036b3e1ebca3 Mon Sep 17 00:00:00 2001 From: Andrew Shao Date: Fri, 8 Dec 2023 17:51:41 -0600 Subject: [PATCH 57/64] Correct type hints and more robust resource validation We now validate several additional ways in which a user may specify resources.
These include:
- Validating, prior to assignment, the addition of a resource to the dictionary
- Validating that the types of keys and their values are str or int
---
 smartsim/settings/pbsSettings.py | 64 +++++++++++++++++++-------------
 tests/test_pbs_settings.py       | 34 ++++++++++++++++-
 2 files changed, 71 insertions(+), 27 deletions(-)

diff --git a/smartsim/settings/pbsSettings.py b/smartsim/settings/pbsSettings.py
index 585103793..47564d428 100644
--- a/smartsim/settings/pbsSettings.py
+++ b/smartsim/settings/pbsSettings.py
@@ -26,7 +26,6 @@

 import typing as t

-from .._core.utils import init_default
 from ..error import SSConfigError
 from ..log import get_logger
 from .base import BatchSettings
@@ -42,7 +41,7 @@ def __init__(
         time: t.Optional[str] = None,
         queue: t.Optional[str] = None,
         account: t.Optional[str] = None,
-        resources: t.Optional[t.Dict[str, t.Optional[str]]] = None,
+        resources: t.Optional[t.Dict[str, t.Union[str, int]]] = None,
         batch_args: t.Optional[t.Dict[str, t.Optional[str]]] = None,
         **kwargs: t.Any,
     ):
@@ -71,14 +70,15 @@ def __init__(
         """
         self._ncpus = ncpus

-        self._resources = resources or {}
+        self.resources: dict[str, t.Union[str,int]] = {}
+        self.resources = resources or {}

         resource_nodes = self.resources.get("nodes", None)

         if nodes and resource_nodes:
             raise ValueError(
-                "nodes was incorrectly specified as its own kwarg and also in the "
-                "resource kwarg."
+                "nodes was incorrectly specified as constructor parameter and also "
+                "in the as a key in the resource mapping"
             )

         # time, queue, nodes, and account set in parent class init
@@ -95,13 +95,13 @@ def __init__(
         self._hosts: t.List[str] = []

     @property
-    def resources(self):
+    def resources(self) -> t.Dict[str, t.Union[str,int]]:
         return self._resources.copy()

     @resources.setter
-    def resources(self, resources: dict[str, str | int]):
+    def resources(self, resources: dict[str, str | int]) -> None:
+        self._sanity_check_resources(resources)
         self._resources = resources.copy()
-        self._sanity_check_resources()

     def set_nodes(self, num_nodes: int) -> None:
@@ -119,7 +119,6 @@ def set_nodes(self, num_nodes: int) -> None:

         if num_nodes:
             self.set_resource("nodes", num_nodes)
-            self._sanity_check_resources()

     def set_hostlist(self, host_list: t.Union[str, t.List[str]]) -> None:
         """Specify the hostlist for this job
@@ -181,7 +180,7 @@ def set_account(self, account: str) -> None:
         if account:
             self.batch_args["A"] = str(account)

-    def set_resource(self, resource_name: str, value: str) -> None:
+    def set_resource(self, resource_name: str, value: str | int) -> None:
        """Set a resource value for the Qsub batch

         If a select statement is provided, the nodes and ncpus
@@ -194,12 +193,10 @@ def set_resource(self, resource_name: str, value: str) -> None:
         """
         # TODO add error checking here
         # TODO include option to overwrite place (warning for orchestrator?)
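+        # Validate an updated copy of the resources before assigning, so an
+        # invalid entry can never land in '_resources'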
-        self._resources[resource_name] = value
-        self._sanity_check_resources()
-        # Capture the case where someone is setting the number of nodes
-        # through 'select' or 'nodes'
-        if resource_name in ["select", "nodes"] and value:
-            self._nodes = int(value)
+        updated_dict = self.resources
+        updated_dict.update({resource_name:value})
+        self._sanity_check_resources(updated_dict)
+        self.resources = updated_dict

     def format_batch_args(self) -> t.List[str]:
         """Get the formatted batch arguments for a preview
@@ -216,15 +213,19 @@ def format_batch_args(self) -> t.List[str]:
                 opts += [" ".join((prefix + opt, str(value)))]
         return opts

-    def _sanity_check_resources(self) -> None:
+    def _sanity_check_resources(
+        self,
+        resources: t.Optional[t.Dict[str, t.Union[str,int]]] = None
+    ) -> None:
         """Check that only select or nodes was specified in resources

         Note: For PBS Pro, nodes is equivalent to 'select' and 'place' so
         they are not quite synonyms. Here we assume that
         """
+        checked_resources = resources if resources else self.resources

-        has_select = self.resources.get("select", None)
-        has_nodes = self.resources.get("nodes", None)
+        has_select = checked_resources.get("select", None)
+        has_nodes = checked_resources.get("nodes", None)

         if has_select and has_nodes:
             raise SSConfigError(
@@ -233,6 +234,24 @@
                 "'select' was set using 'set_resource'. Please only specify one."
             )

+        if has_select and not isinstance(has_select, int):
+            raise TypeError("The value for 'select' must be an integer")
+        if has_nodes and not isinstance(has_nodes, int):
+            raise TypeError("The value for 'nodes' must be an integer")
+
+        for key, value in checked_resources.items():
+            allowed_types = [int, str]
+            if not any(isinstance(key, type) for type in allowed_types):
+                raise TypeError(
+                    f"The type of {key=} is {type(key)}. Only int and str "
+                    "are allowed."
+                )
+            if not any(isinstance(value, type) for type in allowed_types):
+                raise TypeError(
+                    f"The value associated with {key=} is {type(value)}. Only int "
+                    "and str are allowed."
+                )
+
     def _create_resource_list(self) -> t.List[str]:
         self._sanity_check_resources()
@@ -257,13 +276,6 @@ def _create_resource_list(self) -> t.List[str]:
             select_command += f":{'+'.join(hosts)}"
         res += [select_command]

-        if place := resources.pop("place", None):
-            res += [f"-l place={place}"]
-
-        # get time from resources or kwargs
-        if walltime := resources.pop("walltime", None):
-            res += [f"-l walltime={walltime}"]
-
         # All other "standard" resource specs
         for resource, value in resources.items():
             res += [f"-l {resource}={str(value)}"]

diff --git a/tests/test_pbs_settings.py b/tests/test_pbs_settings.py
index 8a5776008..469163052 100644
--- a/tests/test_pbs_settings.py
+++ b/tests/test_pbs_settings.py
@@ -91,4 +91,36 @@ def test_select_nodes_error():

 def test_resources_is_a_copy():
     settings = QsubBatchSettings()
     resources = settings.resources
-    assert resources is not settings._resources
\ No newline at end of file
+    assert resources is not settings._resources
+
+def test_nodes_and_select_not_ints_rrror():
+    expected_error = TypeError
+    with pytest.raises(expected_error):
+        settings = QsubBatchSettings()
+        settings.set_nodes("10")
+    with pytest.raises(expected_error):
+        settings = QsubBatchSettings()
+        settings.set_resource("nodes", "10")
+    with pytest.raises(expected_error):
+        settings = QsubBatchSettings()
+        settings.set_resource("select", "10")
+    with pytest.raises(expected_error):
+        settings = QsubBatchSettings()
+        settings.resources = {"nodes": "10"}
+    with pytest.raises(expected_error):
+        settings = QsubBatchSettings()
+        settings.resources = {"select": "10"}
+
+def test_resources_not_set_on_error():
+    settings = QsubBatchSettings(nodes=10)
+    unaltered_resources = settings.resources
+    with pytest.raises(SSConfigError):
+        settings.resources = {"nodes": 10, "select": 10}
+
+    assert unaltered_resources == settings.resources
+
+def test_valid_types_in_resources():
+    settings = QsubBatchSettings(nodes=10)
+    with pytest.raises(TypeError):
+        settings.set_resource("foo", None)
+

From bd4345ebe654647cf2485ca753dd0d3cedff3b8b Mon Sep 17 00:00:00 2001
From: Andrew Shao
Date: Fri, 8 Dec 2023 18:15:14 -0600
Subject: [PATCH 58/64] Fix one use of | instead of t.Union

---
 smartsim/settings/pbsSettings.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/smartsim/settings/pbsSettings.py b/smartsim/settings/pbsSettings.py
index 47564d428..8362e3fc2 100644
--- a/smartsim/settings/pbsSettings.py
+++ b/smartsim/settings/pbsSettings.py
@@ -99,7 +99,7 @@ def resources(self) -> t.Dict[str, t.Union[str,int]]:
         return self._resources.copy()

     @resources.setter
-    def resources(self, resources: dict[str, str | int]) -> None:
+    def resources(self, resources: dict[str, t.Union[str,int]]) -> None:
         self._sanity_check_resources(resources)
         self._resources = resources.copy()

From f2123fab40c54834297e458d92ab42698cd55e41 Mon Sep 17 00:00:00 2001
From: Andrew Shao
Date: Fri, 8 Dec 2023 18:19:38 -0600
Subject: [PATCH 59/64] Fix an incorrect typehint

---
 smartsim/settings/pbsSettings.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/smartsim/settings/pbsSettings.py b/smartsim/settings/pbsSettings.py
index 8362e3fc2..fbb8d9727 100644
--- a/smartsim/settings/pbsSettings.py
+++ b/smartsim/settings/pbsSettings.py
@@ -99,7 +99,7 @@ def resources(self) -> t.Dict[str, t.Union[str,int]]:
         return self._resources.copy()

     @resources.setter
-    def resources(self, resources: dict[str, t.Union[str,int]]) -> None:
+    def resources(self, resources: t.Dict[str, t.Union[str,int]]) -> None:
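+        # typing aliases (t.Dict / t.Union) are used instead of built-in
+        # generics and PEP 604 unions so the annotations remain valid on
+        # interpreters older than Python 3.10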
         self._sanity_check_resources(resources)
         self._resources = resources.copy()

From 5ff7007df612eebd359e2c80e55b3b471cdde673 Mon Sep 17 00:00:00 2001
From: Andrew Shao
Date: Fri, 8 Dec 2023 18:26:49 -0600
Subject: [PATCH 60/64] Yet another | instead of t.Union

---
 smartsim/settings/pbsSettings.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/smartsim/settings/pbsSettings.py b/smartsim/settings/pbsSettings.py
index fbb8d9727..06285026c 100644
--- a/smartsim/settings/pbsSettings.py
+++ b/smartsim/settings/pbsSettings.py
@@ -180,7 +180,7 @@ def set_account(self, account: str) -> None:
         if account:
             self.batch_args["A"] = str(account)

-    def set_resource(self, resource_name: str, value: str | int) -> None:
+    def set_resource(self, resource_name: str, value: t.Union[str,int]) -> None:
         """Set a resource value for the Qsub batch

         If a select statement is provided, the nodes and ncpus

From f485ad1d1432546249774a65ad3a7b4440f11abc Mon Sep 17 00:00:00 2001
From: Andrew Shao
Date: Fri, 8 Dec 2023 18:35:28 -0600
Subject: [PATCH 61/64] Remove extraneous assignment and blackify

---
 smartsim/settings/pbsSettings.py | 15 ++++++---------
 1 file changed, 6 insertions(+), 9 deletions(-)

diff --git a/smartsim/settings/pbsSettings.py b/smartsim/settings/pbsSettings.py
index 06285026c..7ffbe076d 100644
--- a/smartsim/settings/pbsSettings.py
+++ b/smartsim/settings/pbsSettings.py
@@ -70,7 +70,6 @@ def __init__(
         """
         self._ncpus = ncpus

-        self.resources: dict[str, t.Union[str,int]] = {}
         self.resources = resources or {}

         resource_nodes = self.resources.get("nodes", None)
@@ -95,15 +94,14 @@ def __init__(
         self._hosts: t.List[str] = []

     @property
-    def resources(self) -> t.Dict[str, t.Union[str,int]]:
+    def resources(self) -> t.Dict[str, t.Union[str, int]]:
         return self._resources.copy()

     @resources.setter
-    def resources(self, resources: t.Dict[str, t.Union[str,int]]) -> None:
+    def resources(self, resources: t.Dict[str, t.Union[str, int]]) -> None:
         self._sanity_check_resources(resources)
         self._resources = resources.copy()

-
     def set_nodes(self, num_nodes: int) -> None:
         """Set the number of nodes for this batch job
@@ -180,7 +178,7 @@ def set_account(self, account: str) -> None:
         if account:
             self.batch_args["A"] = str(account)

-    def set_resource(self, resource_name: str, value: t.Union[str,int]) -> None:
+    def set_resource(self, resource_name: str, value: t.Union[str, int]) -> None:
         """Set a resource value for the Qsub batch

         If a select statement is provided, the nodes and ncpus
@@ -194,7 +192,7 @@ def set_resource(self, resource_name: str, value: t.Union[str, int]) -> None:
         """
         # TODO add error checking here
         # TODO include option to overwrite place (warning for orchestrator?)
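+        # Belt-and-braces validation: the explicit check below fails fast,
+        # and assigning through the 'resources' property re-runs the same check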
         updated_dict = self.resources
-        updated_dict.update({resource_name:value})
+        updated_dict.update({resource_name: value})
         self._sanity_check_resources(updated_dict)
         self.resources = updated_dict

@@ -214,8 +212,7 @@ def format_batch_args(self) -> t.List[str]:
         return opts

     def _sanity_check_resources(
-        self,
-        resources: t.Optional[t.Dict[str, t.Union[str,int]]] = None
+        self, resources: t.Optional[t.Dict[str, t.Union[str, int]]] = None
     ) -> None:
         """Check that only select or nodes was specified in resources

@@ -258,7 +255,7 @@ def _create_resource_list(self) -> t.List[str]:
         res = []

         # Pop off some specific keywords that need to be treated separately
-        resources = self.resources # Note this is a copy so not modifying original
+        resources = self.resources  # Note this is a copy so not modifying original

         # Construct the basic select/nodes statement
         if select := resources.pop("select", None):

From 2824d69571a286524cbab0d38f36c868696fadfc Mon Sep 17 00:00:00 2001
From: Andrew Shao
Date: Fri, 8 Dec 2023 18:46:25 -0600
Subject: [PATCH 62/64] Remove now invalid test and update type checking

---
 smartsim/settings/pbsSettings.py | 13 +++++++------
 tests/test_batch_settings.py     |  8 --------
 2 files changed, 6 insertions(+), 15 deletions(-)

diff --git a/smartsim/settings/pbsSettings.py b/smartsim/settings/pbsSettings.py
index 7ffbe076d..653c48e78 100644
--- a/smartsim/settings/pbsSettings.py
+++ b/smartsim/settings/pbsSettings.py
@@ -76,8 +76,8 @@ def __init__(

         if nodes and resource_nodes:
             raise ValueError(
-                "nodes was incorrectly specified as constructor parameter and also "
-                "in the as a key in the resource mapping"
+                "nodes was incorrectly specified as a constructor parameter and also "
+                "as a key in the resource mapping"
             )

@@ -237,13 +237,12 @@ def _sanity_check_resources(
             raise TypeError("The value for 'nodes' must be an integer")

         for key, value in checked_resources.items():
-            allowed_types = [int, str]
-            if not any(isinstance(key, type) for type in allowed_types):
+            if not isinstance(key, str):
                 raise TypeError(
-                    f"The type of {key=} is {type(key)}. Only int and str "
-                    "are allowed."
+                    f"The type of {key=} is {type(key)}. Only str keys are "
+                    "allowed."
                 )
-            if not any(isinstance(value, type) for type in allowed_types):
+            if not isinstance(value, (str, int)):
                 raise TypeError(
                     f"The value associated with {key=} is {type(value)}. Only int "
                     "and str are allowed."
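# Behaviour after this change, as an illustrative sketch (not part of the
# patch; a default-constructed QsubBatchSettings is assumed, as in the tests):
#
#   settings = QsubBatchSettings()
#   settings.resources = {"walltime": "01:00:00", "ncpus": 16}  # accepted
#   settings.resources = {"select": "2"}  # TypeError: 'select' must be an int
#   settings.resources = {1: "sausage"}   # TypeError: keys must be str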
@@ -275,6 +274,6 @@ def _create_resource_list(self) -> t.List[str]:

         # All other "standard" resource specs
         for resource, value in resources.items():
-            res += [f"-l {resource}={str(value)}"]
+            res += [f"-l {resource}={value}"]

         return res

diff --git a/tests/test_batch_settings.py b/tests/test_batch_settings.py
index 4d06726d0..95f37a51b 100644
--- a/tests/test_batch_settings.py
+++ b/tests/test_batch_settings.py
@@ -194,11 +194,3 @@ def test_preamble():
     bsub.add_preamble(["first line", "last line"])

     assert len(list(bsub.preamble)) == 4
-
-def test_qsub_batch_nodes():
-    """
-    Test specifying nodes in as kwarg and in resources
-    """
-    with pytest.raises(ValueError):
-        QsubBatchSettings(nodes=1, resources={"nodes":2})
-    QsubBatchSettings(nodes=1, resources={"nodes":1})

From 2358c48402f22904ae9254b0d05c4c626782d1e4 Mon Sep 17 00:00:00 2001
From: Andrew Shao
Date: Mon, 11 Dec 2023 11:57:52 -0600
Subject: [PATCH 63/64] Fix accidental collision with default value

---
 smartsim/settings/pbsSettings.py | 3 ++-
 tests/test_pbs_settings.py       | 2 +-
 2 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/smartsim/settings/pbsSettings.py b/smartsim/settings/pbsSettings.py
index 653c48e78..5d723bcfe 100644
--- a/smartsim/settings/pbsSettings.py
+++ b/smartsim/settings/pbsSettings.py
@@ -219,7 +219,8 @@ def _sanity_check_resources(
         Note: For PBS Pro, nodes is equivalent to 'select' and 'place' so
         they are not quite synonyms. Here we assume that
         """
-        checked_resources = resources if resources else self.resources
+        # Note: isinstance check here to avoid collision with default
+        checked_resources = resources if isinstance(resources, dict) else self.resources

diff --git a/tests/test_pbs_settings.py b/tests/test_pbs_settings.py
index 469163052..9d97a92b3 100644
--- a/tests/test_pbs_settings.py
+++ b/tests/test_pbs_settings.py
@@ -93,7 +93,7 @@ def test_resources_is_a_copy():
     resources = settings.resources
     assert resources is not settings._resources

-def test_nodes_and_select_not_ints_rrror():
+def test_nodes_and_select_not_ints_error():
     expected_error = TypeError
     with pytest.raises(expected_error):
         settings = QsubBatchSettings()

From 0663f5db2d0525cdbd98d99ad91e546f939eaae6 Mon Sep 17 00:00:00 2001
From: Andrew Shao
Date: Mon, 11 Dec 2023 12:12:26 -0600
Subject: [PATCH 64/64] Update behaviour for test_create_pbs_batch

---
 tests/test_batch_settings.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/test_batch_settings.py b/tests/test_batch_settings.py
index 95f37a51b..f7a5e0869 100644
--- a/tests/test_batch_settings.py
+++ b/tests/test_batch_settings.py
@@ -41,7 +41,7 @@ def test_create_pbs_batch():
     args = pbs_batch.format_batch_args()
     assert isinstance(pbs_batch, QsubBatchSettings)
     assert args == [
-        "-l select=1:ncpus=10",
+        "-l nodes=1:ncpus=10",
         "-l walltime=10:00:00",
         "-q default",
         "-A myproject",