Skip to content

Commit

Permalink
Enable the "show-progress" runtime option
Browse files Browse the repository at this point in the history
Move the show-progress support from an MCA parameter to a per-job
runtime option. Fix the daemon counting on DVM startup, and ensure
that `prte` sees the runtime options on its command line.

Signed-off-by: Ralph Castain <[email protected]>
  • Loading branch information
rhc54 committed Aug 25, 2022
1 parent 6ef83cf commit 7dc0018
Show file tree
Hide file tree
Showing 14 changed files with 66 additions and 29 deletions.
1 change: 1 addition & 0 deletions src/mca/ess/hnp/ess_hnp_module.c
Original file line number Diff line number Diff line change
Expand Up @@ -353,6 +353,7 @@ static int rte_init(int argc, char **argv)
jdata->state = PRTE_JOB_STATE_RUNNING;
/* obviously, we have "reported" */
jdata->num_reported = 1;
jdata->num_daemons_reported = 1;

if (0 < prte_output_get_verbosity(prte_ess_base_framework.framework_output)) {
prte_output(0, "ALIASES FOR %s", node->name);
Expand Down
21 changes: 16 additions & 5 deletions src/mca/plm/base/plm_base_launch_support.c
Original file line number Diff line number Diff line change
Expand Up @@ -1360,7 +1360,7 @@ void prte_plm_base_daemon_callback(int status, pmix_proc_t *sender, pmix_data_bu
prte_topology_t *t, *mytopo;
hwloc_topology_t topo;
int i;
bool found;
bool found, *fptr;
prte_daemon_cmd_flag_t cmd;
char *myendian;
char *alias;
Expand Down Expand Up @@ -1436,6 +1436,7 @@ void prte_plm_base_daemon_callback(int status, pmix_proc_t *sender, pmix_data_bu
prted_failed_launch = true;
goto CLEANUP;
}
daemon->rml_uri = strdup(cnctinfo.data.string);
PMIX_VALUE_DESTRUCT(&cnctinfo);

/* unpack the node name */
Expand Down Expand Up @@ -1813,11 +1814,20 @@ void prte_plm_base_daemon_callback(int status, pmix_proc_t *sender, pmix_data_bu
return;
} else {
jdatorted->num_reported++;
jdatorted->num_daemons_reported++;
PRTE_OUTPUT_VERBOSE(
(5, prte_plm_base_framework.framework_output,
"%s plm:base:orted_report_launch job %s recvd %d of %d reported daemons",
PRTE_NAME_PRINT(PRTE_PROC_MY_NAME), PRTE_JOBID_PRINT(jdatorted->nspace),
jdatorted->num_reported, jdatorted->num_procs));
found = false;
fptr = &found;
prte_get_attribute(&jdatorted->attributes, PRTE_JOB_SHOW_PROGRESS, (void**)&fptr, PMIX_BOOL);
if (found &&
(0 == jdatorted->num_reported % 100 ||
jdatorted->num_reported == prte_process_info.num_daemons)) {
PRTE_ACTIVATE_JOB_STATE(jdatorted, PRTE_JOB_STATE_REPORT_PROGRESS);
}
if (jdatorted->num_procs == jdatorted->num_reported) {
bool dvm = true;
jdatorted->state = PRTE_JOB_STATE_DAEMONS_REPORTED;
Expand Down Expand Up @@ -2119,7 +2129,8 @@ int prte_plm_base_setup_virtual_machine(prte_job_t *jdata)
bool singleton = false;
bool multi_sim = false;

PRTE_OUTPUT_VERBOSE((5, prte_plm_base_framework.framework_output, "%s plm:base:setup_vm",
PRTE_OUTPUT_VERBOSE((5, prte_plm_base_framework.framework_output,
"%s plm:base:setup_vm",
PRTE_NAME_PRINT(PRTE_PROC_MY_NAME)));

if (NULL == (daemons = prte_get_job_data_object(PRTE_PROC_MY_NAME->nspace))) {
Expand Down Expand Up @@ -2679,9 +2690,9 @@ int prte_plm_base_setup_virtual_machine(prte_job_t *jdata)
/* if new daemons are being launched, mark that this job
* caused it to happen */
if (0 < map->num_new_daemons) {
if (PRTE_SUCCESS
!= (rc = prte_set_attribute(&jdata->attributes, PRTE_JOB_LAUNCHED_DAEMONS, true, NULL,
PMIX_BOOL))) {
rc = prte_set_attribute(&jdata->attributes, PRTE_JOB_LAUNCHED_DAEMONS, true,
NULL, PMIX_BOOL);
if (PRTE_SUCCESS != rc) {
PRTE_ERROR_LOG(rc);
return rc;
}
Expand Down
11 changes: 7 additions & 4 deletions src/mca/plm/base/plm_base_receive.c
Original file line number Diff line number Diff line change
Expand Up @@ -127,7 +127,7 @@ void prte_plm_base_recv(int status, pmix_proc_t *sender, pmix_data_buffer_t *buf
prte_app_context_t *app, *child_app;
pmix_proc_t name, *nptr;
pid_t pid;
bool debugging;
bool debugging, found, *fptr;
int i, room;
char **env;
char *prefix_dir, *tmp;
Expand Down Expand Up @@ -633,9 +633,12 @@ void prte_plm_base_recv(int status, pmix_proc_t *sender, pmix_data_buffer_t *buf
}
/* record that we heard back from a daemon during app launch */
jdata->num_daemons_reported++;
if (prte_report_launch_progress) {
if (0 == jdata->num_daemons_reported % 100
|| jdata->num_daemons_reported == prte_process_info.num_daemons) {
found = false;
fptr = &found;
prte_get_attribute(&jdata->attributes, PRTE_JOB_SHOW_PROGRESS, (void**)&fptr, PMIX_BOOL);
if (found) {
if (0 == jdata->num_daemons_reported % 100 ||
jdata->num_daemons_reported == prte_process_info.num_daemons) {
PRTE_ACTIVATE_JOB_STATE(jdata, PRTE_JOB_STATE_REPORT_PROGRESS);
}
}
Expand Down
4 changes: 4 additions & 0 deletions src/mca/rmaps/base/rmaps_base_frame.c
Original file line number Diff line number Diff line change
Expand Up @@ -732,6 +732,10 @@ int prte_rmaps_base_set_runtime_options(prte_job_t *jdata, char *spec)
prte_set_attribute(&djob->attributes, PRTE_JOB_DO_NOT_LAUNCH, PRTE_ATTR_GLOBAL,
&flag, PMIX_BOOL);
}
} else if (PRTE_CHECK_CLI_OPTION(options[n], PRTE_CLI_SHOW_PROGRESS)) {
flag = PRTE_CHECK_TRUE(ptr);
prte_set_attribute(&jdata->attributes, PRTE_JOB_SHOW_PROGRESS, PRTE_ATTR_GLOBAL,
&flag, PMIX_BOOL);
} else {
pmix_show_help("help-prte-rmaps-base.txt", "unrecognized-policy", true,
"runtime options", spec);
Expand Down
13 changes: 10 additions & 3 deletions src/mca/schizo/base/help-schizo-cli.txt
Original file line number Diff line number Diff line change
Expand Up @@ -474,12 +474,19 @@ a value. Thus, "--runtime-options abort-nonzero" is sufficient to set the

Supported values include:

- ABORT-NONZERO-STATUS[=(bool)] directs the runtime to not abort a running
job if a process exits with non-zero status if set to true.
- ABORT-NONZERO-STATUS[=(bool)] if set to false, this directs the runtime
to not abort a running job if a process exits with non-zero status. The
system default for this value is true.

- DONOTLAUNCH directs the runtime to map but not launch the specified
job. This is provided to help explore possible process placement patterns
before actually starting execution.
before actually starting execution. No value need be passed as this is
not an option that can be set by default in PRRTE.

- SHOW-PROGRESS=(bool) requests that the runtime provide progress reports
on its startup procedure - i.e., the launch of its daemons in support
of a job. This is typically used to debug DVM startup on large systems.


The runtime-options command line option has no qualifiers. Note that directives
are case-insensitive.
1 change: 1 addition & 0 deletions src/mca/schizo/base/schizo_base_frame.c
Original file line number Diff line number Diff line change
Expand Up @@ -399,6 +399,7 @@ int prte_schizo_base_sanity(pmix_cli_result_t *cmd_line)
char *rtos[] = {
PRTE_CLI_ABORT_NZ,
PRTE_CLI_NOLAUNCH,
PRTE_CLI_SHOW_PROGRESS,
NULL
};

Expand Down
2 changes: 2 additions & 0 deletions src/mca/schizo/prte/help-schizo-prte.txt
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,8 @@ option to the help request as "--help <option>".

/***** DVM Options *****/

--runtime-options <arg0> Comma-delimited list of runtime directives for the job (e.g., show
progress reports on DVM startup for large systems)
--default-hostfile <arg0> Provide a default hostfile
-H|--host <arg0> List of hosts to use for the DVM
--hostfile <arg0> Provide a hostfile
Expand Down
6 changes: 3 additions & 3 deletions src/mca/schizo/prte/schizo_prte.c
Original file line number Diff line number Diff line change
Expand Up @@ -114,11 +114,13 @@ static struct option prteoptions[] = {
// Launch options
PMIX_OPTION_DEFINE(PRTE_CLI_TIMEOUT, PMIX_ARG_REQD),
PMIX_OPTION_SHORT_DEFINE(PRTE_CLI_FWD_ENVAR, PMIX_ARG_REQD, 'x'),
PMIX_OPTION_DEFINE(PRTE_CLI_SHOW_PROGRESS, PMIX_ARG_REQD),
PMIX_OPTION_DEFINE(PRTE_CLI_HOSTFILE, PMIX_ARG_REQD),
PMIX_OPTION_SHORT_DEFINE(PRTE_CLI_HOST, PMIX_ARG_REQD, 'H'),
PMIX_OPTION_DEFINE(PRTE_CLI_EXEC_AGENT, PMIX_ARG_REQD),

// Runtime options
PMIX_OPTION_DEFINE(PRTE_CLI_RTOS, PMIX_ARG_REQD),

// output options
PMIX_OPTION_DEFINE(PRTE_CLI_STREAM_BUF, PMIX_ARG_REQD),

Expand Down Expand Up @@ -183,7 +185,6 @@ static struct option prterunoptions[] = {
PMIX_OPTION_DEFINE("wd", PMIX_ARG_REQD),
PMIX_OPTION_DEFINE(PRTE_CLI_SET_CWD_SESSION, PMIX_ARG_NONE),
PMIX_OPTION_DEFINE(PRTE_CLI_PATH, PMIX_ARG_REQD),
PMIX_OPTION_DEFINE(PRTE_CLI_SHOW_PROGRESS, PMIX_ARG_REQD),
PMIX_OPTION_DEFINE(PRTE_CLI_PSET, PMIX_ARG_REQD),
PMIX_OPTION_DEFINE(PRTE_CLI_HOSTFILE, PMIX_ARG_REQD),
PMIX_OPTION_DEFINE("machinefile", PMIX_ARG_REQD),
Expand Down Expand Up @@ -295,7 +296,6 @@ static struct option prunoptions[] = {
PMIX_OPTION_DEFINE("wd", PMIX_ARG_REQD),
PMIX_OPTION_DEFINE(PRTE_CLI_SET_CWD_SESSION, PMIX_ARG_NONE),
PMIX_OPTION_DEFINE(PRTE_CLI_PATH, PMIX_ARG_REQD),
PMIX_OPTION_DEFINE(PRTE_CLI_SHOW_PROGRESS, PMIX_ARG_REQD),
PMIX_OPTION_DEFINE(PRTE_CLI_PSET, PMIX_ARG_REQD),
PMIX_OPTION_DEFINE(PRTE_CLI_HOSTFILE, PMIX_ARG_REQD),
PMIX_OPTION_DEFINE("machinefile", PMIX_ARG_REQD),
Expand Down
9 changes: 6 additions & 3 deletions src/mca/state/base/state_base_fns.c
Original file line number Diff line number Diff line change
Expand Up @@ -359,11 +359,14 @@ void prte_state_base_local_launch_complete(int fd, short argc, void *cbdata)
{
prte_state_caddy_t *state = (prte_state_caddy_t *) cbdata;
prte_job_t *jdata = state->jdata;
bool found = false, *fptr;
PRTE_HIDE_UNUSED_PARAMS(fd, argc);

if (prte_report_launch_progress) {
if (0 == jdata->num_daemons_reported % 100
|| jdata->num_daemons_reported == prte_process_info.num_daemons) {
fptr = &found;
prte_get_attribute(&jdata->attributes, PRTE_JOB_SHOW_PROGRESS, (void**)&fptr, PMIX_BOOL);
if (found) {
if (0 == jdata->num_daemons_reported % 100 ||
jdata->num_daemons_reported == prte_process_info.num_daemons) {
PRTE_ACTIVATE_JOB_STATE(jdata, PRTE_JOB_STATE_REPORT_PROGRESS);
}
}
Expand Down
4 changes: 2 additions & 2 deletions src/prted/prte_app_parse.c
Original file line number Diff line number Diff line change
Expand Up @@ -160,7 +160,7 @@ static int create_app(prte_schizo_base_module_t *schizo, char **argv, pmix_list_
app->app.cwd = pmix_os_path(false, cwd, param, NULL);
}
PMIX_INFO_LIST_ADD(rc, app->info, PMIX_WDIR_USER_SPECIFIED, NULL, PMIX_BOOL);
} else if (pmix_cmd_line_is_taken(&results, "set-cwd-to-session-dir")) {
} else if (pmix_cmd_line_is_taken(&results, PRTE_CLI_SET_CWD_SESSION)) {
PMIX_INFO_LIST_ADD(rc, app->info, PMIX_SET_SESSION_CWD, NULL, PMIX_BOOL);
} else {
app->app.cwd = strdup(cwd);
Expand Down Expand Up @@ -217,7 +217,7 @@ static int create_app(prte_schizo_base_module_t *schizo, char **argv, pmix_list_
}

/* check for preload files */
opt = pmix_cmd_line_get_param(&results, "preload-files");
opt = pmix_cmd_line_get_param(&results, PRTE_CLI_PRELOAD_FILES);
if (NULL != opt) {
PMIX_INFO_LIST_ADD(rc, app->info, PMIX_PRELOAD_FILES, opt->values[0], PMIX_STRING);
}
Expand Down
8 changes: 0 additions & 8 deletions src/runtime/prte_mca_params.c
Original file line number Diff line number Diff line change
Expand Up @@ -494,14 +494,6 @@ int prte_register_params(void)
prte_leave_session_attached = true;
}

/* whether or not to report launch progress */
prte_report_launch_progress = false;
(void) prte_mca_base_var_register(
"prte", "prte", NULL, "report_launch_progress",
"Output a brief periodic report on launch progress [default: no]",
PRTE_MCA_BASE_VAR_TYPE_BOOL, NULL, 0, PRTE_MCA_BASE_VAR_FLAG_NONE, PRTE_INFO_LVL_9,
PRTE_MCA_BASE_VAR_SCOPE_READONLY, &prte_report_launch_progress);

/* tool communication controls */
prte_report_events_uri = NULL;
(void) prte_mca_base_var_register("prte", "prte", NULL, "report_events",
Expand Down
12 changes: 11 additions & 1 deletion src/tools/prte/prte.c
Original file line number Diff line number Diff line change
Expand Up @@ -93,7 +93,7 @@
#include "src/mca/ess/base/base.h"
#include "src/mca/odls/odls.h"
#include "src/mca/plm/plm.h"
#include "src/mca/rmaps/rmaps_types.h"
#include "src/mca/rmaps/base/base.h"
#include "src/rml/rml.h"
#include "src/mca/schizo/base/base.h"
#include "src/mca/state/base/base.h"
Expand Down Expand Up @@ -654,6 +654,16 @@ int main(int argc, char *argv[])
}
}

/* apply any provided runtime options */
opt = pmix_cmd_line_get_param(&results, PRTE_CLI_RTOS);
if (NULL != opt) {
rc = prte_rmaps_base_set_runtime_options(jdata, opt->values[0]);
if (PRTE_SUCCESS != rc) {
PRTE_UPDATE_EXIT_STATUS(PRTE_ERR_FATAL);
goto DONE;
}
}

/* setup to listen for commands sent specifically to me, even though I would probably
* be the one sending them! Unfortunately, since I am a participating daemon,
* there are times I need to send a command to "all daemons", and that means *I* have
Expand Down
2 changes: 2 additions & 0 deletions src/util/attr.c
Original file line number Diff line number Diff line change
Expand Up @@ -483,6 +483,8 @@ const char *prte_attr_key_to_str(prte_attribute_key_t key)
return "TERM IF NONZERO EXIT";
case PRTE_JOB_CONTROLS:
return "JOB CONTROLS";
case PRTE_JOB_SHOW_PROGRESS:
return "SHOW LAUNCH PROGRESS";

case PRTE_PROC_NOBARRIER:
return "PROC-NOBARRIER";
Expand Down
1 change: 1 addition & 0 deletions src/util/attr.h
Original file line number Diff line number Diff line change
Expand Up @@ -209,6 +209,7 @@ typedef uint16_t prte_job_flags_t;
#define PRTE_JOB_TAG_OUTPUT_FULLNAME (PRTE_JOB_START_KEY + 101) // bool - use full namespace in output stream tag
#define PRTE_JOB_TERM_NONZERO_EXIT (PRTE_JOB_START_KEY + 102) // bool - terminate job if a proc exits with non-zero status
#define PRTE_JOB_CONTROLS (PRTE_JOB_START_KEY + 103) // char* - Directives controlling job behavior
#define PRTE_JOB_SHOW_PROGRESS (PRTE_JOB_START_KEY + 104) // bool - show launch progress of this job

#define PRTE_JOB_MAX_KEY (PRTE_JOB_START_KEY + 200)

Expand Down

0 comments on commit 7dc0018

Please sign in to comment.