diff --git a/src/mca/ess/hnp/ess_hnp_module.c b/src/mca/ess/hnp/ess_hnp_module.c index c0dddc2145..6e7e938e11 100644 --- a/src/mca/ess/hnp/ess_hnp_module.c +++ b/src/mca/ess/hnp/ess_hnp_module.c @@ -353,6 +353,7 @@ static int rte_init(int argc, char **argv) jdata->state = PRTE_JOB_STATE_RUNNING; /* obviously, we have "reported" */ jdata->num_reported = 1; + jdata->num_daemons_reported = 1; if (0 < prte_output_get_verbosity(prte_ess_base_framework.framework_output)) { prte_output(0, "ALIASES FOR %s", node->name); diff --git a/src/mca/plm/base/plm_base_launch_support.c b/src/mca/plm/base/plm_base_launch_support.c index 883bdacc00..494cc79b7d 100644 --- a/src/mca/plm/base/plm_base_launch_support.c +++ b/src/mca/plm/base/plm_base_launch_support.c @@ -1360,7 +1360,7 @@ void prte_plm_base_daemon_callback(int status, pmix_proc_t *sender, pmix_data_bu prte_topology_t *t, *mytopo; hwloc_topology_t topo; int i; - bool found; + bool found, *fptr; prte_daemon_cmd_flag_t cmd; char *myendian; char *alias; @@ -1436,6 +1436,7 @@ void prte_plm_base_daemon_callback(int status, pmix_proc_t *sender, pmix_data_bu prted_failed_launch = true; goto CLEANUP; } + daemon->rml_uri = strdup(cnctinfo.data.string); PMIX_VALUE_DESTRUCT(&cnctinfo); /* unpack the node name */ @@ -1813,11 +1814,20 @@ void prte_plm_base_daemon_callback(int status, pmix_proc_t *sender, pmix_data_bu return; } else { jdatorted->num_reported++; + jdatorted->num_daemons_reported++; PRTE_OUTPUT_VERBOSE( (5, prte_plm_base_framework.framework_output, "%s plm:base:orted_report_launch job %s recvd %d of %d reported daemons", PRTE_NAME_PRINT(PRTE_PROC_MY_NAME), PRTE_JOBID_PRINT(jdatorted->nspace), jdatorted->num_reported, jdatorted->num_procs)); + found = false; + fptr = &found; + prte_get_attribute(&jdatorted->attributes, PRTE_JOB_SHOW_PROGRESS, (void**)&fptr, PMIX_BOOL); + if (found && + (0 == jdatorted->num_reported % 100 || + jdatorted->num_reported == prte_process_info.num_daemons)) { + PRTE_ACTIVATE_JOB_STATE(jdatorted, PRTE_JOB_STATE_REPORT_PROGRESS); + } if (jdatorted->num_procs == jdatorted->num_reported) { bool dvm = true; jdatorted->state = PRTE_JOB_STATE_DAEMONS_REPORTED; @@ -2119,7 +2129,8 @@ int prte_plm_base_setup_virtual_machine(prte_job_t *jdata) bool singleton = false; bool multi_sim = false; - PRTE_OUTPUT_VERBOSE((5, prte_plm_base_framework.framework_output, "%s plm:base:setup_vm", + PRTE_OUTPUT_VERBOSE((5, prte_plm_base_framework.framework_output, + "%s plm:base:setup_vm", PRTE_NAME_PRINT(PRTE_PROC_MY_NAME))); if (NULL == (daemons = prte_get_job_data_object(PRTE_PROC_MY_NAME->nspace))) { @@ -2679,9 +2690,9 @@ int prte_plm_base_setup_virtual_machine(prte_job_t *jdata) /* if new daemons are being launched, mark that this job * caused it to happen */ if (0 < map->num_new_daemons) { - if (PRTE_SUCCESS - != (rc = prte_set_attribute(&jdata->attributes, PRTE_JOB_LAUNCHED_DAEMONS, true, NULL, - PMIX_BOOL))) { + rc = prte_set_attribute(&jdata->attributes, PRTE_JOB_LAUNCHED_DAEMONS, true, + NULL, PMIX_BOOL); + if (PRTE_SUCCESS != rc) { PRTE_ERROR_LOG(rc); return rc; } diff --git a/src/mca/plm/base/plm_base_receive.c b/src/mca/plm/base/plm_base_receive.c index f59ded2025..6e2a4bfa0b 100644 --- a/src/mca/plm/base/plm_base_receive.c +++ b/src/mca/plm/base/plm_base_receive.c @@ -127,7 +127,7 @@ void prte_plm_base_recv(int status, pmix_proc_t *sender, pmix_data_buffer_t *buf prte_app_context_t *app, *child_app; pmix_proc_t name, *nptr; pid_t pid; - bool debugging; + bool debugging, found, *fptr; int i, room; char **env; char *prefix_dir, *tmp; @@ -633,9 +633,12 @@ void prte_plm_base_recv(int status, pmix_proc_t *sender, pmix_data_buffer_t *buf } /* record that we heard back from a daemon during app launch */ jdata->num_daemons_reported++; - if (prte_report_launch_progress) { - if (0 == jdata->num_daemons_reported % 100 - || jdata->num_daemons_reported == prte_process_info.num_daemons) { + found = false; + fptr = &found; + prte_get_attribute(&jdata->attributes, PRTE_JOB_SHOW_PROGRESS, (void**)&fptr, PMIX_BOOL); + if (found) { + if (0 == jdata->num_daemons_reported % 100 || + jdata->num_daemons_reported == prte_process_info.num_daemons) { PRTE_ACTIVATE_JOB_STATE(jdata, PRTE_JOB_STATE_REPORT_PROGRESS); } } diff --git a/src/mca/rmaps/base/rmaps_base_frame.c b/src/mca/rmaps/base/rmaps_base_frame.c index d25bfe0c8e..b04ac33a39 100644 --- a/src/mca/rmaps/base/rmaps_base_frame.c +++ b/src/mca/rmaps/base/rmaps_base_frame.c @@ -732,6 +732,10 @@ int prte_rmaps_base_set_runtime_options(prte_job_t *jdata, char *spec) prte_set_attribute(&djob->attributes, PRTE_JOB_DO_NOT_LAUNCH, PRTE_ATTR_GLOBAL, &flag, PMIX_BOOL); } + } else if (PRTE_CHECK_CLI_OPTION(options[n], PRTE_CLI_SHOW_PROGRESS)) { + flag = PRTE_CHECK_TRUE(ptr); + prte_set_attribute(&jdata->attributes, PRTE_JOB_SHOW_PROGRESS, PRTE_ATTR_GLOBAL, + &flag, PMIX_BOOL); } else { pmix_show_help("help-prte-rmaps-base.txt", "unrecognized-policy", true, "runtime options", spec); diff --git a/src/mca/schizo/base/help-schizo-cli.txt b/src/mca/schizo/base/help-schizo-cli.txt index a93394369a..681110089d 100644 --- a/src/mca/schizo/base/help-schizo-cli.txt +++ b/src/mca/schizo/base/help-schizo-cli.txt @@ -474,12 +474,19 @@ a value. Thus, "--runtime-options abort-nonzero" is sufficient to set the Supported values include: -- ABORT-NONZERO-STATUS[=(bool)] directs the runtime to not abort a running - job if a process exits with non-zero status if set to true. +- ABORT-NONZERO-STATUS[=(bool)] if set to false, this directs the runtime + to not abort a running job if a process exits with non-zero status. The + system default for this value is true. - DONOTLAUNCH directs the runtime to map but not launch the specified job. This is provided to help explore possible process placement patterns - before actually starting execution. + before actually starting execution. No value need be passed as this is + not an option that can be set by default in PRRTE. + +- SHOW-PROGRESS=(bool) requests that the runtime provide progress reports + on its startup procedure - i.e., the launch of its daemons in support + of a job. This is typically used to debug DVM startup on large systems. + The runtime-options command line option has no qualifiers. Note that directives are case-insensitive. diff --git a/src/mca/schizo/base/schizo_base_frame.c b/src/mca/schizo/base/schizo_base_frame.c index 00e81edb6c..d78f8fe82b 100644 --- a/src/mca/schizo/base/schizo_base_frame.c +++ b/src/mca/schizo/base/schizo_base_frame.c @@ -399,6 +399,7 @@ int prte_schizo_base_sanity(pmix_cli_result_t *cmd_line) char *rtos[] = { PRTE_CLI_ABORT_NZ, PRTE_CLI_NOLAUNCH, + PRTE_CLI_SHOW_PROGRESS, NULL }; diff --git a/src/mca/schizo/prte/help-schizo-prte.txt b/src/mca/schizo/prte/help-schizo-prte.txt index 411c90374d..3f7f047926 100644 --- a/src/mca/schizo/prte/help-schizo-prte.txt +++ b/src/mca/schizo/prte/help-schizo-prte.txt @@ -44,6 +44,8 @@ option to the help request as "--help