From 1cf3345a0b4f8cc3be042ecc0f74bc23b0099569 Mon Sep 17 00:00:00 2001 From: Dalton Bohning Date: Wed, 19 Feb 2025 21:42:22 +0000 Subject: [PATCH] handle utils Test-tag: pr CPUUsage DAOSVersion DmgStorageScanSCMTest Ecodtruncate NetworkFailureTest POSIXStatTest SSDSocketTest SuperBlockVersioning soak_smoke MacsioTest daos_racer OSAOnlineExtend OSAOnlineParallelTest DdbTest ContainerListConsolidationTest performance,-manual NvmeEnospace Skip-unit-tests: true Skip-fault-injection-test: true Signed-off-by: Dalton Bohning --- src/tests/ftest/daos_racer/parallel.py | 6 +- src/tests/ftest/nvme/enospace.py | 2 +- .../recovery/container_list_consolidation.py | 7 +- src/tests/ftest/recovery/ddb.py | 18 ++- src/tests/ftest/util/agent_utils.py | 48 ++----- src/tests/ftest/util/daos_racer_utils.py | 19 +-- src/tests/ftest/util/ddb_utils.py | 17 ++- src/tests/ftest/util/general_utils.py | 8 +- src/tests/ftest/util/job_manager_utils.py | 118 +++++++----------- src/tests/ftest/util/macsio_util.py | 9 +- src/tests/ftest/util/performance_test_base.py | 7 +- src/tests/ftest/util/run_utils.py | 10 ++ src/tests/ftest/util/server_utils.py | 21 ++-- src/tests/ftest/util/soak_utils.py | 27 ++-- 14 files changed, 139 insertions(+), 178 deletions(-) diff --git a/src/tests/ftest/daos_racer/parallel.py b/src/tests/ftest/daos_racer/parallel.py index 85c1a39cce8..781e16dcd57 100755 --- a/src/tests/ftest/daos_racer/parallel.py +++ b/src/tests/ftest/daos_racer/parallel.py @@ -1,6 +1,7 @@ #!/usr/bin/python3 """ (C) Copyright 2021-2022 Intel Corporation. +(C) Copyright 2025 Hewlett Packard Enterprise Development LP SPDX-License-Identifier: BSD-2-Clause-Patent """ @@ -52,7 +53,8 @@ def test_daos_racer_parallel(self): job_manager.run() except CommandFailure as error: - self.log.error("DAOS Racer Failed: %s", str(error)) - self.fail("Test was expected to pass but it failed.\n") + msg = f"daos_racer failed: {error}" + self.log.error(msg) + self.fail(msg) self.log.info("Test passed!") diff --git a/src/tests/ftest/nvme/enospace.py b/src/tests/ftest/nvme/enospace.py index d8d5baf3c04..a8e0620e5e4 100644 --- a/src/tests/ftest/nvme/enospace.py +++ b/src/tests/ftest/nvme/enospace.py @@ -340,7 +340,7 @@ def err_to_str(err_no): return pydaos.DaosErrorCode(err_no).name logfile_glob = log_file + r".*[0-9]" - errors_count = get_errors_count(self.hostlist_clients, logfile_glob) + errors_count = get_errors_count(self.log, self.hostlist_clients, logfile_glob) for error in self.expected_errors: if error not in errors_count: errors_count[error] = 0 diff --git a/src/tests/ftest/recovery/container_list_consolidation.py b/src/tests/ftest/recovery/container_list_consolidation.py index 0c377e451b2..9911bb9b14c 100644 --- a/src/tests/ftest/recovery/container_list_consolidation.py +++ b/src/tests/ftest/recovery/container_list_consolidation.py @@ -1,5 +1,6 @@ """ (C) Copyright 2024 Intel Corporation. + (C) Copyright 2025 Hewlett Packard Enterprise Development LP SPDX-License-Identifier: BSD-2-Clause-Patent """ @@ -74,9 +75,8 @@ def test_orphan_container(self): server_host=NodeSet(self.hostlist_servers[0]), path=self.bin, mount_point=scm_mount, pool_uuid=pool.uuid, vos_file=vos_file) cmd_result = ddb_command.list_component() - ls_out = "\n".join(cmd_result[0]["stdout"]) uuid_regex = r"([0-f]{8}-[0-f]{4}-[0-f]{4}-[0-f]{4}-[0-f]{12})" - match = re.search(uuid_regex, ls_out) + match = re.search(uuid_regex, cmd_result.joined_stdout) if match is None: self.fail("Unexpected output from ddb command, unable to parse.") self.log.info("Container UUID from ddb ls = %s", match.group(1)) @@ -133,9 +133,8 @@ def test_orphan_container(self): "(PMEM only).") self.log_step(msg) cmd_result = ddb_command.list_component() - ls_out = "\n".join(cmd_result[0]["stdout"]) uuid_regex = r"([0-f]{8}-[0-f]{4}-[0-f]{4}-[0-f]{4}-[0-f]{12})" - match = re.search(uuid_regex, ls_out) + match = re.search(uuid_regex, cmd_result.joined_stdout) if match: errors.append("Container UUID is found in shard! Checker didn't remove it.") diff --git a/src/tests/ftest/recovery/ddb.py b/src/tests/ftest/recovery/ddb.py index 25e7223e0fa..777c97a53d7 100644 --- a/src/tests/ftest/recovery/ddb.py +++ b/src/tests/ftest/recovery/ddb.py @@ -1,5 +1,6 @@ """ (C) Copyright 2022-2024 Intel Corporation. + (C) Copyright 2025 Hewlett Packard Enterprise Development LP SPDX-License-Identifier: BSD-2-Clause-Patent """ @@ -176,10 +177,9 @@ def test_recovery_ddb_ls(self): # CONT: (/[0]) /3082b7d3-32f9-41ea-bcbf-5d6450c1b34f # stdout is a list which contains each line as separate element. Concatenate them # to single string so that we can apply regex. - ls_out = "\n".join(cmd_result[0]["stdout"]) # Matches the container uuid uuid_regex = r"([0-f]{8}-[0-f]{4}-[0-f]{4}-[0-f]{4}-[0-f]{12})" - match = re.search(uuid_regex, ls_out) + match = re.search(uuid_regex, cmd_result.joined_stdout) if match is None: self.fail("Unexpected output from ddb command, unable to parse.") self.log.info("Container UUID from ddb ls = %s", match.group(1)) @@ -198,10 +198,9 @@ def test_recovery_ddb_ls(self): # OBJ: (/[0]/[0]) /3082b7d3-32f9-41ea-bcbf-5d6450c1b34f/937030214649643008.1.0.1 # OBJ: (/[0]/[1]) /3082b7d3-32f9-41ea-bcbf-5d6450c1b34f/937030214649643009.1.0.1 # OBJ: (/[0]/[2]) /3082b7d3-32f9-41ea-bcbf-5d6450c1b34f/937030214649643016.1.0.1 - ls_out = "\n".join(cmd_result[0]["stdout"]) # Matches an object id. (4 digits separated by a period '.') object_id_regex = r"\d+\.\d+\.\d+\.\d+" - match = re.findall(object_id_regex, ls_out) + match = re.findall(object_id_regex, cmd_result.joined_stdout) self.log.info("List objects match = %s", match) actual_object_count = len(match) @@ -217,12 +216,11 @@ def test_recovery_ddb_ls(self): for obj_index in range(object_count): component_path = "[0]/[{}]".format(obj_index) cmd_result = ddb_command.list_component(component_path=component_path) - ls_out = "\n".join(cmd_result[0]["stdout"]) # Sample output. # /d4e0c836-17bd-4df3-b255-929732486bab/281479271677953.0.0/ # [0] 'Sample dkey 0 0' (15) # [1] 'Sample dkey 0 1' (15) - match = re.findall(dkey_regex, ls_out) + match = re.findall(dkey_regex, cmd_result.joined_stdout) actual_dkey_count += len(match) @@ -248,7 +246,7 @@ def test_recovery_ddb_ls(self): for dkey_index in range(dkey_count): component_path = "[0]/[{}]/[{}]".format(obj_index, dkey_index) cmd_result = ddb_command.list_component(component_path=component_path) - ls_out = "\n".join(cmd_result[0]["stdout"]) + ls_out = cmd_result.joined_stdout msg = "List akeys obj_index = {}, dkey_index = {}, stdout = {}".format( obj_index, dkey_index, ls_out) self.log.info(msg) @@ -356,7 +354,7 @@ def test_recovery_ddb_rm(self): # 4. Call ddb rm to remove the akey. cmd_result = ddb_command.remove_component(component_path="[0]/[0]/[0]/[0]") - self.log.info("rm akey stdout = %s", cmd_result[0]["stdout"]) + self.log.info("rm akey stdout = %s", cmd_result.joined_stdout) # 5. Restart the server to use the API. dmg_command.system_start() @@ -386,7 +384,7 @@ def test_recovery_ddb_rm(self): # 9. Call ddb rm to remove the dkey. cmd_result = ddb_command.remove_component(component_path="[0]/[0]/[0]") - self.log.info("rm dkey stdout = %s", cmd_result[0]["stdout"]) + self.log.info("rm dkey stdout = %s", cmd_result.joined_stdout) # 10. Restart the server to use the API. dmg_command.system_start() @@ -415,7 +413,7 @@ def test_recovery_ddb_rm(self): # 14. Call ddb rm to remove the object. cmd_result = ddb_command.remove_component(component_path="[0]/[0]") - self.log.info("rm object stdout = %s", cmd_result[0]["stdout"]) + self.log.info("rm object stdout = %s", cmd_result.joined_stdout) # 15. Restart the server to use daos command. dmg_command.system_start() diff --git a/src/tests/ftest/util/agent_utils.py b/src/tests/ftest/util/agent_utils.py index 74b79fb9796..40d52a4ee60 100644 --- a/src/tests/ftest/util/agent_utils.py +++ b/src/tests/ftest/util/agent_utils.py @@ -1,10 +1,10 @@ """ (C) Copyright 2019-2024 Intel Corporation. + (C) Copyright 2025 Hewlett Packard Enterprise Development LP SPDX-License-Identifier: BSD-2-Clause-Patent """ import os -import re import socket from agent_utils_params import DaosAgentTransportCredentials, DaosAgentYamlParameters @@ -13,7 +13,7 @@ from command_utils_base import (CommandWithParameters, CommonConfig, EnvironmentVariables, FormattedParameter) from exception_utils import CommandFailure -from general_utils import get_default_config_file, get_log_file, run_pcmd +from general_utils import get_default_config_file, get_log_file from run_utils import run_remote @@ -293,12 +293,17 @@ def start(self): super().start() def dump_attachinfo(self): - """Run dump-attachinfo on the daos_agent.""" + """Run dump-attachinfo on the daos_agent. + + Raises: + CommandFailure: if the daos_agent command fails. + + Returns: + CommandResult: groups of command results from the same hosts with the same return status + """ cmd = self.manager.job.copy() cmd.set_sub_command("dump-attachinfo") - self.attachinfo = run_pcmd(self.hosts, - str(self.manager.job))[0]["stdout"] - self.log.info("Agent attachinfo: %s", self.attachinfo) + return run_remote(self.log, self.hosts, cmd.with_exports) def support_collect_log(self, **kwargs): """Collect logs for debug purpose. @@ -309,6 +314,7 @@ def support_collect_log(self, **kwargs): archive (bool, optional): Archive the log/config files extra_logs_dir (str, optional): Collect the Logs from given custom directory target-host (str, optional): R sync all the logs to target system + Raises: CommandFailure: if the daos_agent command fails. @@ -324,36 +330,6 @@ def support_collect_log(self, **kwargs): self.log.info("Support collect-log on clients: %s", str(cmd)) return run_remote(self.log, self.hosts, cmd.with_exports) - def get_attachinfo_file(self): - """Run dump-attachinfo on the daos_agent. - - Returns: - str: the attach info file path - - """ - server_name = self.get_config_value("name") - - self.dump_attachinfo() - - attach_info = self.attachinfo - - # Filter log messages from attachinfo content - messages = [x for x in attach_info if re.match(r"^(name\s|size\s|all|\d+\s)", x)] - attach_info_contents = "\n".join(messages) - attach_info_filename = f"{server_name}.attach_info_tmp" - - if len(messages) < 4: - self.log.info("Malformed attachinfo file: %s", attach_info_contents) - return None - - # Write an attach_info_tmp file in this directory for cart_ctl to use - attachinfo_file_path = os.path.join(self.outputdir, attach_info_filename) - - with open(attachinfo_file_path, 'w', encoding='utf-8') as file_handle: - file_handle.write(attach_info_contents) - - return attachinfo_file_path - def stop(self): """Stop the agent through the job manager. diff --git a/src/tests/ftest/util/daos_racer_utils.py b/src/tests/ftest/util/daos_racer_utils.py index a1ebd6e92d6..ddea074a701 100644 --- a/src/tests/ftest/util/daos_racer_utils.py +++ b/src/tests/ftest/util/daos_racer_utils.py @@ -1,5 +1,6 @@ """ (C) Copyright 2020-2023 Intel Corporation. + (C) Copyright 2025 Hewlett Packard Enterprise Development LP SPDX-License-Identifier: BSD-2-Clause-Patent """ @@ -10,7 +11,8 @@ from command_utils_base import BasicParameter, FormattedParameter from env_modules import load_mpi from exception_utils import CommandFailure, MPILoadError -from general_utils import get_log_file, pcmd +from general_utils import get_log_file +from run_utils import run_remote class DaosRacerCommand(ExecutableCommand): @@ -109,15 +111,14 @@ def run(self, raise_exception=None): str(self), self.host, "no" if self.clush_timeout.value is None else "a {}s".format(self.clush_timeout.value)) - return_codes = pcmd(self.host, self.with_exports, True, self.clush_timeout.value) - if 0 not in return_codes or len(return_codes) > 1: - # Kill the daos_racer process if the remote command timed out - if 255 in return_codes: - self.log.info( - "Stopping timed out daos_racer process on %s", self.host) - pcmd(self.host, "pkill daos_racer", True) + result = run_remote( + self.log, self.host, self.with_exports, timeout=self.clush_timeout.value) + if not result.passed: + if result.timeout: + self.log.info("Stopping timed out daos_racer process on %s", result.timeout_hosts) + run_remote(self.log, result.timeout_hosts, "pkill daos_racer", True) if raise_exception: - raise CommandFailure("Error running '{}'".format(self._command)) + raise CommandFailure(f"Error running '{self._command}'") self.log.info("Test passed!") diff --git a/src/tests/ftest/util/ddb_utils.py b/src/tests/ftest/util/ddb_utils.py index cf994a51378..12b787eaf71 100644 --- a/src/tests/ftest/util/ddb_utils.py +++ b/src/tests/ftest/util/ddb_utils.py @@ -1,12 +1,13 @@ """ (C) Copyright 2022 Intel Corporation. + (C) Copyright 2025 Hewlett Packard Enterprise Development LP SPDX-License-Identifier: BSD-2-Clause-Patent """ import os from command_utils_base import BasicParameter, CommandWithParameters, FormattedParameter -from general_utils import run_pcmd +from run_utils import run_remote class DdbCommandBase(CommandWithParameters): @@ -18,9 +19,9 @@ def __init__(self, server_host, path, verbose=True, timeout=None, sudo=True): Args: server_host (NodeSet): Server host to run the command. path (str): path to the ddb command. - verbose (bool, optional): Display command output when run_pcmd is called. + verbose (bool, optional): Display command output in run. Defaults to True. - timeout (int, optional): Command timeout (sec) used in run_pcmd. Defaults to + timeout (int, optional): Command timeout (sec) used in run. Defaults to None. sudo (bool, optional): Whether to run ddb with sudo. Defaults to True. """ @@ -40,7 +41,7 @@ def __init__(self, server_host, path, verbose=True, timeout=None, sudo=True): # VOS file path. self.vos_path = BasicParameter(None, position=1) - # Members needed for run_pcmd(). + # Members needed for run(). self.verbose = verbose self.timeout = timeout @@ -60,13 +61,11 @@ def run(self): """Run the command. Returns: - list: A list of dictionaries with each entry containing output, exit status, - and interrupted status common to each group of hosts. + CommandResult: groups of command results from the same hosts with the same return status """ - return run_pcmd( - hosts=self.host, command=str(self), verbose=self.verbose, - timeout=self.timeout) + return run_remote( + self.log, self.host, command=str(self), verbose=self.verbose, timeout=self.timeout) class DdbCommand(DdbCommandBase): diff --git a/src/tests/ftest/util/general_utils.py b/src/tests/ftest/util/general_utils.py index 84e55601ff2..198c690594c 100644 --- a/src/tests/ftest/util/general_utils.py +++ b/src/tests/ftest/util/general_utils.py @@ -1,5 +1,6 @@ """ (C) Copyright 2018-2024 Intel Corporation. + (C) Copyright 2025 Hewlett Packard Enterprise Development LP SPDX-License-Identifier: BSD-2-Clause-Patent """ @@ -788,10 +789,11 @@ def get_remote_file_size(host, file_name): return int(result.stdout_text) -def get_errors_count(hostlist, file_glob): +def get_errors_count(log, hostlist, file_glob): """Count the number of errors found in log files. Args: + log (logger): logger for the messages produced by this method hostlist (list): System list to looks for an error. file_glob (str): Glob pattern of the log file to parse. @@ -803,9 +805,9 @@ def get_errors_count(hostlist, file_glob): cmd = "cat {} | sed -n -E -e ".format(get_log_file(file_glob)) cmd += r"'/^.+[[:space:]]ERR[[:space:]].+[[:space:]]DER_[^(]+\([^)]+\).+$/" cmd += r"s/^.+[[:space:]]DER_[^(]+\((-[[:digit:]]+)\).+$/\1/p'" - results = run_pcmd(hostlist, cmd, False, None, None) + result = run_remote(log, hostlist, cmd, verbose=False) errors_count = {} - for error_str in sum([result["stdout"] for result in results], []): + for error_str in sum([data.stdout for data in result.output], []): error = int(error_str) if error not in errors_count: errors_count[error] = 0 diff --git a/src/tests/ftest/util/job_manager_utils.py b/src/tests/ftest/util/job_manager_utils.py index 5f687900fbc..e6aaff0851b 100644 --- a/src/tests/ftest/util/job_manager_utils.py +++ b/src/tests/ftest/util/job_manager_utils.py @@ -1,5 +1,6 @@ """ (C) Copyright 2020-2024 Intel Corporation. + (C) Copyright 2025 Hewlett Packard Enterprise Development LP SPDX-License-Identifier: BSD-2-Clause-Patent """ @@ -14,8 +15,7 @@ from command_utils_base import BasicParameter, EnvironmentVariables, FormattedParameter from env_modules import load_mpi from exception_utils import CommandFailure, MPILoadError -from general_utils import (get_job_manager_class, get_journalctl_command, journalctl_time, pcmd, - run_pcmd) +from general_utils import get_job_manager_class, get_journalctl_command, journalctl_time from run_utils import run_remote, stop_processes from write_host_file import write_host_file @@ -276,14 +276,9 @@ def _get_remote_process_state(self, message=None): self.log.debug( "%s processes still running remotely%s:", self.command, " {}".format(message) if message else "") - self.log.debug("Running (on %s): %s", self._hosts, command) - results = pcmd(self._hosts, command, True, 10, None) - - # The pcmd method will return a dictionary with a single key, e.g. - # {1: }, if there are no remote processes running on any of the - # hosts. If this value is not returned, indicate there are remote - # processes running by returning a "R" state. - return "R" if 1 not in results or len(results) > 1 else None + result = run_remote(self.log, self._hosts, command, timeout=10) + # Return "R" if processes were found running on any hosts + return "R" if result.passed_hosts else None def run(self, raise_exception=None): """Run the command. @@ -711,7 +706,6 @@ def run(self, raise_exception=None): # Start the daos_server.service self.service_enable() result = self.service_start() - # result = self.service_status() # Determine if the command has launched correctly using its # check_subprocess_status() method. @@ -815,25 +809,21 @@ def _run_unit_command(self, command): CommandFailure: if there is an issue running the command Returns: - dict: a dictionary of return codes keys and accompanying NodeSet - values indicating which hosts yielded the return code. + CommandResult: groups of command results from the same hosts with the same return status """ self._systemctl.unit_command.value = command self.timestamps[command] = journalctl_time() - result = pcmd(self._hosts, str(self), self.verbose, self.timeout) - if 255 in result: + cmd = str(self) + result = run_remote( + self.log, self._hosts, cmd, verbose=self.verbose, timeout=self.timeout) + if result.timeout: raise CommandFailure( "Timeout detected running '{}' with a {}s timeout on {}".format( - str(self), self.timeout, NodeSet.fromlist(result[255]))) - - if 0 not in result or len(result) > 1: - failed = [] - for item, value in list(result.items()): - if item != 0: - failed.extend(value) + cmd, self.timeout, result.timeout_hosts)) + if not result.passed: raise CommandFailure( - "Error occurred running '{}' on {}".format(str(self), NodeSet.fromlist(failed))) + "Error occurred running '{}' on {}".format(cmd, result.failed_hosts)) return result def _report_unit_command(self, command): @@ -846,8 +836,7 @@ def _report_unit_command(self, command): CommandFailure: if there is an issue running the command Returns: - dict: a dictionary of return codes keys and accompanying NodeSet - values indicating which hosts yielded the return code. + CommandResult: groups of command results from the same hosts with the same return status """ try: @@ -866,8 +855,7 @@ def service_enable(self): CommandFailure: if unable to enable Returns: - dict: a dictionary of return codes keys and accompanying NodeSet - values indicating which hosts yielded the return code. + CommandResult: groups of command results from the same hosts with the same return status """ return self._report_unit_command("enable") @@ -879,8 +867,7 @@ def service_disable(self): CommandFailure: if unable to disable Returns: - dict: a dictionary of return codes keys and accompanying NodeSet - values indicating which hosts yielded the return code. + CommandResult: groups of command results from the same hosts with the same return status """ return self._report_unit_command("disable") @@ -892,8 +879,7 @@ def service_start(self): CommandFailure: if unable to start Returns: - dict: a dictionary of return codes keys and accompanying NodeSet - values indicating which hosts yielded the return code. + CommandResult: groups of command results from the same hosts with the same return status """ return self._report_unit_command("start") @@ -905,8 +891,7 @@ def service_stop(self): CommandFailure: if unable to stop Returns: - dict: a dictionary of return codes keys and accompanying NodeSet - values indicating which hosts yielded the return code. + CommandResult: groups of command results from the same hosts with the same return status """ return self._report_unit_command("stop") @@ -918,8 +903,7 @@ def service_status(self): CommandFailure: if unable to get the status Returns: - dict: a dictionary of return codes keys and accompanying NodeSet - values indicating which hosts yielded the return code. + CommandResult: groups of command results from the same hosts with the same return status """ return self._report_unit_command("status") @@ -940,17 +924,17 @@ def service_running(self): states = {} valid_states = ["active", "activating"] self._systemctl.unit_command.value = "is-active" - results = run_pcmd(self._hosts, str(self), False, self.timeout, None) - for result in results: - if result["interrupted"]: - states["timeout"] = result["hosts"] - status = False - else: - output = result["stdout"][-1] - if output not in states: - states[output] = NodeSet() - states[output].add(result["hosts"]) - status &= output in valid_states + result = run_remote(self.log, self._hosts, str(self), verbose=False, timeout=self.timeout) + if result.timeout: + states["timeout"] = result.timeout_hosts + for data in result.output: + if data.timeout: + continue + output = data.stdout[-1] + if output not in states: + states[output] = NodeSet() + states[output].add(data.hosts) + status &= output in valid_states data = ["=".join([key, str(states[key])]) for key in sorted(states)] self.log.info( " Detected %s states: %s", @@ -987,49 +971,41 @@ def get_log_data(self, hosts, command, timeout=60): self.log.info("Gathering log data on %s: %s", str(hosts), command) # Gather the log information per host - results = run_pcmd(hosts, command, False, timeout, None) + result = run_remote(self.log, hosts, command, verbose=False, timeout=timeout) # Determine if the command completed successfully without a timeout - status = True - for result in results: - if result["interrupted"]: - self.log.info(" Errors detected running \"%s\":", command) - self.log.info( - " %s: timeout detected after %s seconds", - str(result["hosts"]), timeout) - status = False - elif result["exit_status"] != 0: - self.log.info(" Errors detected running \"%s\":", command) - status = False - if not status: - break + if not result.passed: + self.log.info(' Errors detected running "%s":', command) + if result.timeout: + self.log.info( + " %s: timeout detected after %s seconds", str(result.timeout_hosts), timeout) # Display/return the command output log_data = [] - for result in results: - if result["exit_status"] == 0 and not result["interrupted"]: + for data in result.output: + if data.returncode == 0: # Add the successful output from each node to the dictionary log_data.append( - {"hosts": result["hosts"], "data": result["stdout"]}) + {"hosts": data.hosts, "data": data.stdout}) else: # Display all of the results in the case of an error - if len(result["stdout"]) > 1: + if len(data.stdout) > 1: self.log.info( " %s: rc=%s, output:", - str(result["hosts"]), result["exit_status"]) - for line in result["stdout"]: + str(data.hosts), data.returncode) + for line in data.stdout: self.log.info(" %s", line) else: self.log.info( " %s: rc=%s, output: %s", - str(result["hosts"]), result["exit_status"], - result["stdout"][0]) + str(data.hosts), data.returncode, + data.stdout[0]) # Report any errors through an exception - if not status: + if not result.passed: raise CommandFailure( - "Error(s) detected gathering {} log data on {}".format( - self._systemctl.service.value, NodeSet.fromlist(hosts))) + f"Error(s) detected gathering {self._systemctl.service.value} " + f"log data on {result.failed_hosts}") # Return the successful command output per set of hosts return log_data diff --git a/src/tests/ftest/util/macsio_util.py b/src/tests/ftest/util/macsio_util.py index cf64dbcb030..1d41e05b358 100644 --- a/src/tests/ftest/util/macsio_util.py +++ b/src/tests/ftest/util/macsio_util.py @@ -1,11 +1,13 @@ """ (C) Copyright 2020-2023 Intel Corporation. + (C) Copyright 2025 Hewlett Packard Enterprise Development LP SPDX-License-Identifier: BSD-2-Clause-Patent """ from command_utils import ExecutableCommand from command_utils_base import FormattedParameter -from general_utils import get_log_file, pcmd +from general_utils import get_log_file +from run_utils import run_remote class MacsioCommand(ExecutableCommand): @@ -467,7 +469,8 @@ def check_results(self, result, hosts): macsio_files = (self.log_file_name.value, self.timings_file_name.value) for macsio_file in macsio_files: if macsio_file: - self.log.info("Output from %s", macsio_file) - pcmd(hosts, "cat {}".format(macsio_file), timeout=30) + result = run_remote(self.log, hosts, f"cat {macsio_file}", timeout=30) + if not result.passed: + status = False return status diff --git a/src/tests/ftest/util/performance_test_base.py b/src/tests/ftest/util/performance_test_base.py index 7976a825a5d..9f1592c7d19 100644 --- a/src/tests/ftest/util/performance_test_base.py +++ b/src/tests/ftest/util/performance_test_base.py @@ -1,5 +1,6 @@ """ (C) Copyright 2018-2024 Intel Corporation. + (C) Copyright 2025 Hewlett Packard Enterprise Development LP SPDX-License-Identifier: BSD-2-Clause-Patent """ @@ -111,10 +112,10 @@ def _log_daos_metrics(self): os.makedirs(metrics_dir, exist_ok=True) per_engine_results = self.server_managers[0].get_daos_metrics() for engine_idx, engine_results in enumerate(per_engine_results): - for host_results in engine_results: - log_name = "{}_engine{}.csv".format(host_results["hosts"], engine_idx) + for hosts, stdout in engine_results.all_stdout.items(): + log_name = "{}_engine{}.csv".format(hosts, engine_idx) log_path = os.path.join(metrics_dir, log_name) - self.log_performance(host_results["stdout"], False, log_path) + self.log_performance(stdout, False, log_path) @property def unique_id(self): diff --git a/src/tests/ftest/util/run_utils.py b/src/tests/ftest/util/run_utils.py index 8e96e2228f0..a6129bd1d41 100644 --- a/src/tests/ftest/util/run_utils.py +++ b/src/tests/ftest/util/run_utils.py @@ -1,5 +1,6 @@ """ (C) Copyright 2022-2024 Intel Corporation. + (C) Copyright 2025 Hewlett Packard Enterprise Development LP SPDX-License-Identifier: BSD-2-Clause-Patent """ @@ -267,6 +268,15 @@ def failed_hosts(self): """ return NodeSet.fromlist(data.hosts for data in self.output if data.returncode != 0) + @property + def timeout_hosts(self): + """Get all timeout hosts. + + Returns: + NodeSet: all nodes where the command timed out + """ + return NodeSet.fromlist(data.hosts for data in self.output if data.timeout) + @property def all_stdout(self): """Get all of the stdout from the issued command from each host. diff --git a/src/tests/ftest/util/server_utils.py b/src/tests/ftest/util/server_utils.py index ec79f029c6e..69ed10f184d 100644 --- a/src/tests/ftest/util/server_utils.py +++ b/src/tests/ftest/util/server_utils.py @@ -1,5 +1,6 @@ """ (C) Copyright 2018-2024 Intel Corporation. + (C) Copyright 2025 Hewlett Packard Enterprise Development LP SPDX-License-Identifier: BSD-2-Clause-Patent """ @@ -16,10 +17,9 @@ from command_utils_base import BasicParameter, CommonConfig from dmg_utils import get_dmg_command from exception_utils import CommandFailure -from general_utils import (get_default_config_file, get_display_size, get_log_file, list_to_str, - pcmd, run_pcmd) +from general_utils import get_default_config_file, get_display_size, get_log_file, list_to_str from host_utils import get_local_host -from run_utils import run_remote, stop_processes +from run_utils import command_as_user, run_remote, stop_processes from server_utils_base import DaosServerCommand, DaosServerInformation, ServerFailed from server_utils_params import DaosServerTransportCredentials, DaosServerYamlParameters from user_utils import get_chown_command @@ -615,7 +615,7 @@ def set_scm_mount_ownership(self, user=None, verbose=False): ) if cmd_list: - pcmd(self._hosts, "; ".join(cmd_list), verbose) + run_remote(self.log, self._hosts, "; ".join(cmd_list), verbose=verbose) def restart(self, hosts, wait=False): """Restart the specified servers after a stop. @@ -1148,10 +1148,10 @@ def get_daos_metrics(self, verbose=False, timeout=60): timeout (int, optional): pass timeout to each execution ofrun_pcmd. Defaults to 60. Returns: - list: list of pcmd results for each host. See general_utils.run_pcmd for details. + list: list of CommandResult results for each host. See run_utils.run_remote for details. [ - general_utils.run_pcmd(), # engine 0 - general_utils.run_pcmd() # engine 1 + run_utils.run_remote(), # engine 0 + run_utils.run_remote() # engine 1 ] """ @@ -1159,8 +1159,7 @@ def get_daos_metrics(self, verbose=False, timeout=60): engines = [] daos_metrics_exe = os.path.join(self.manager.job.command_path, "daos_metrics") for engine in range(engines_per_host): - results = run_pcmd( - hosts=self._hosts, verbose=verbose, timeout=timeout, - command="sudo {} -S {} --csv".format(daos_metrics_exe, engine)) - engines.append(results) + command = command_as_user(f"{daos_metrics_exe} -S {engine} --csv", "root") + result = run_remote(self.log, self._hosts, command, verbose=verbose, timeout=timeout) + engines.append(result) return engines diff --git a/src/tests/ftest/util/soak_utils.py b/src/tests/ftest/util/soak_utils.py index 7b5cfdb2608..cc7a91327b7 100644 --- a/src/tests/ftest/util/soak_utils.py +++ b/src/tests/ftest/util/soak_utils.py @@ -1,5 +1,6 @@ """ (C) Copyright 2019-2024 Intel Corporation. +(C) Copyright 2025 Hewlett Packard Enterprise Development LP SPDX-License-Identifier: BSD-2-Clause-Patent """ @@ -28,8 +29,8 @@ from exception_utils import CommandFailure from fio_utils import FioCommand from general_utils import (DaosTestError, check_ping, check_ssh, get_journalctl, get_log_file, - get_random_bytes, get_random_string, list_to_str, pcmd, run_command, - run_pcmd, wait_for_result) + get_random_bytes, get_random_string, list_to_str, run_command, + wait_for_result) from ior_utils import IorCommand from job_manager_utils import Mpirun from macsio_util import MacsioCommand @@ -301,11 +302,9 @@ def run_monitor_check(self): self (obj): soak obj """ - monitor_cmds = self.params.get("monitor", "/run/*") - hosts = self.hostlist_servers - if monitor_cmds: - for cmd in monitor_cmds: - pcmd(hosts, cmd, timeout=30) + monitor_cmds = self.params.get("monitor", "/run/*") or [] + for cmd in monitor_cmds: + run_remote(self.log, self.hostlist_servers, cmd, timeout=30) def run_metrics_check(self, logging=True, prefix=None): @@ -326,17 +325,13 @@ def run_metrics_check(self, logging=True, prefix=None): name = prefix + f"_metrics_{engine}.csv" destination = self.outputsoak_dir daos_metrics = f"{self.sudo_cmd} daos_metrics -S {engine} --csv" - self.log.info("Running %s", daos_metrics) - results = run_pcmd(hosts=self.hostlist_servers, - command=daos_metrics, - verbose=(not logging), - timeout=60) + result = run_remote( + self.log, self.hostlist_servers, daos_metrics, verbose=(not logging), timeout=60) if logging: - for result in results: - hosts = result["hosts"] - log_name = name + "-" + str(hosts) + for data in result.output: + log_name = name + "-" + str(data.hosts) self.log.info("Logging %s output to %s", daos_metrics, log_name) - write_logfile(result["stdout"], log_name, destination) + write_logfile(data.stdout, log_name, destination) def get_harassers(harasser):