Skip to content

Commit

Permalink
Fix testing of suicide for daemons
Browse files Browse the repository at this point in the history
We don't support a command-line option for this, as it isn't
something a user should ever do. Instead, two MCA params
control it:

prte_daemon_fail <N> - specifies the daemon rank that
should commit suicide

prte_daemon_fail_delay <N> - time in seconds the target
rank should wait before dying. A value of zero (the
default) means no delay: the rank dies right after
calling init.

Signed-off-by: Ralph Castain <[email protected]>
(cherry picked from commit 618dd0a)
  • Loading branch information
rhc54 committed Feb 25, 2024
1 parent 2c2519b commit e2cff33
Show file tree
Hide file tree
Showing 8 changed files with 170 additions and 354 deletions.
5 changes: 1 addition & 4 deletions src/docs/show-help-files/help-prte.rst
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
.. -*- rst -*-
Copyright (c) 2021-2023 Nanook Consulting. All rights reserved.
Copyright (c) 2021-2024 Nanook Consulting All rights reserved.
Copyright (c) 2023 Jeffrey M. Squyres. All rights reserved.
$COPYRIGHT$
Expand Down Expand Up @@ -79,9 +79,6 @@ option to the help request as ``--help <option>``.
* - ``--leave-session-attached``
- Do not discard stdout/stderr of remote PRRTE daemons

* - ``--test-suicide <arg0>``
- Direct that the specified daemon suicide after delay

* - ``--display <arg0>``
- Comma-delimited list of options for displaying information

Expand Down
5 changes: 1 addition & 4 deletions src/docs/show-help-files/help-prterun.rst
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
.. -*- rst -*-
Copyright (c) 2021-2023 Nanook Consulting. All rights reserved.
Copyright (c) 2021-2024 Nanook Consulting All rights reserved.
Copyright (c) 2023 Jeffrey M. Squyres. All rights reserved.
$COPYRIGHT$
Expand Down Expand Up @@ -107,9 +107,6 @@ option to the help request as ``--help <option>``.
- Direct the specified processes to stop at an
application-controlled location

* - ``--test-suicide <arg0>``
- Direct that the specified daemon suicide after delay

* - ``--do-not-launch``
- Perform all necessary operations to prepare to launch the
application, but do not actually launch it (usually used to
Expand Down
4 changes: 1 addition & 3 deletions src/mca/schizo/prte/schizo_prte.c
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@
* Copyright (c) 2015 Research Organization for Information Science
* and Technology (RIST). All rights reserved.
* Copyright (c) 2018-2022 IBM Corporation. All rights reserved.
* Copyright (c) 2021-2023 Nanook Consulting All rights reserved.
* Copyright (c) 2021-2024 Nanook Consulting All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
Expand Down Expand Up @@ -98,7 +98,6 @@ static struct option prteoptions[] = {
PMIX_OPTION_DEFINE(PRTE_CLI_SET_SID, PMIX_ARG_NONE),
PMIX_OPTION_DEFINE(PRTE_CLI_REPORT_PID, PMIX_ARG_REQD),
PMIX_OPTION_DEFINE(PRTE_CLI_REPORT_URI, PMIX_ARG_REQD),
PMIX_OPTION_DEFINE(PRTE_CLI_TEST_SUICIDE, PMIX_ARG_NONE),
PMIX_OPTION_DEFINE(PRTE_CLI_DEFAULT_HOSTFILE, PMIX_ARG_REQD),
PMIX_OPTION_DEFINE(PRTE_CLI_SINGLETON, PMIX_ARG_REQD),
PMIX_OPTION_DEFINE(PRTE_CLI_KEEPALIVE, PMIX_ARG_REQD),
Expand Down Expand Up @@ -152,7 +151,6 @@ static struct option prterunoptions[] = {
PMIX_OPTION_DEFINE(PRTE_CLI_SET_SID, PMIX_ARG_NONE),
PMIX_OPTION_DEFINE(PRTE_CLI_REPORT_PID, PMIX_ARG_REQD),
PMIX_OPTION_DEFINE(PRTE_CLI_REPORT_URI, PMIX_ARG_REQD),
PMIX_OPTION_DEFINE(PRTE_CLI_TEST_SUICIDE, PMIX_ARG_NONE),
PMIX_OPTION_DEFINE(PRTE_CLI_DEFAULT_HOSTFILE, PMIX_ARG_REQD),
PMIX_OPTION_DEFINE(PRTE_CLI_KEEPALIVE, PMIX_ARG_REQD),
PMIX_OPTION_DEFINE(PRTE_CLI_LAUNCH_AGENT, PMIX_ARG_REQD),
Expand Down
2 changes: 1 addition & 1 deletion src/runtime/prte_globals.c
Original file line number Diff line number Diff line change
Expand Up @@ -86,7 +86,7 @@ bool prte_show_resolved_nodenames = false;
bool prte_do_not_resolve = false;
int prte_hostname_cutoff = 1000;

int prted_debug_failure = -1;
pmix_rank_t prted_debug_failure = PMIX_RANK_INVALID;
int prted_debug_failure_delay = -1;
bool prte_never_launched = false;
bool prte_devel_level_output = false;
Expand Down
2 changes: 1 addition & 1 deletion src/runtime/prte_globals.h
Original file line number Diff line number Diff line change
Expand Up @@ -521,7 +521,7 @@ PRTE_EXPORT extern int prte_hostname_cutoff;
PRTE_EXPORT extern bool prte_do_not_resolve;

/* debug flags */
PRTE_EXPORT extern int prted_debug_failure;
PRTE_EXPORT extern pmix_rank_t prted_debug_failure;
PRTE_EXPORT extern int prted_debug_failure_delay;

PRTE_EXPORT extern bool prte_never_launched;
Expand Down
56 changes: 56 additions & 0 deletions src/tools/prte/prte.c
Original file line number Diff line number Diff line change
Expand Up @@ -99,6 +99,7 @@
#include "src/mca/schizo/base/base.h"
#include "src/mca/state/base/base.h"
#include "src/runtime/prte_globals.h"
#include "src/runtime/prte_wait.h"
#include "src/runtime/runtime.h"

#include "include/prte.h"
Expand Down Expand Up @@ -228,6 +229,32 @@ static void setup_sighandler(int signal, prte_event_t *ev, prte_event_cbfunc_t c
prte_event_signal_add(ev, NULL);
}

/*
 * Event callback that deliberately terminates this process with a
 * non-zero exit status, used to test daemon failure handling.
 *
 * In this file it is armed by PRTE_TIMER_EVENT when the process matches
 * prted_debug_failure and prted_debug_failure_delay is positive.
 *
 * fd, flags - unused
 * arg       - treated as a prte_timer_t (presumably the timer that
 *             fired); released here if non-NULL
 *
 * Never returns: always ends in exit(PRTE_ERROR_DEFAULT_EXIT_CODE).
 */
static void shutdown_callback(int fd, short flags, void *arg)
{
prte_timer_t *tm = (prte_timer_t *) arg;
prte_job_t *jdata;
PRTE_HIDE_UNUSED_PARAMS(fd, flags);

if (NULL != tm) {
/* release the timer */
PMIX_RELEASE(tm);
}

/* announce the intentional failure; no condition is checked here -
 * reaching this callback always results in termination
 */
pmix_output(0, "%s is executing clean abnormal termination",
PRTE_NAME_PRINT(PRTE_PROC_MY_NAME));
/* do -not- call finalize as this will send a message to the HNP
 * indicating clean termination! Instead, just forcibly cleanup
 * the local session_dir tree and exit
 */
/* NULL presumably selects all local procs - confirm against odls */
prte_odls.kill_local_procs(NULL);
// mark that we are finalizing so the session directory will cleanup
prte_finalizing = true;
/* release our own job object; together with prte_finalizing this is
 * presumably what triggers the session directory cleanup.
 * NOTE(review): jdata is not checked for NULL before the release -
 * confirm the lookup of our own nspace cannot fail at this point
 */
jdata = prte_get_job_data_object(PRTE_PROC_MY_NAME->nspace);
PMIX_RELEASE(jdata);
exit(PRTE_ERROR_DEFAULT_EXIT_CODE);
}

int main(int argc, char *argv[])
{
int rc = 1, i;
Expand Down Expand Up @@ -848,6 +875,35 @@ int main(int argc, char *argv[])
goto DONE;
}

// see if we are to suicide
if (PMIX_RANK_INVALID != prted_debug_failure) {
/* are we the specified vpid? */
if (PRTE_PROC_MY_NAME->rank == prted_debug_failure ||
prted_debug_failure == PMIX_RANK_WILDCARD) {
/* if the user specified we delay, then setup a timer
* and have it kill us
*/
if (0 < prted_debug_failure_delay) {
PRTE_TIMER_EVENT(prted_debug_failure_delay, 0, shutdown_callback);

} else {
pmix_output(0, "%s is executing clean abnormal termination",
PRTE_NAME_PRINT(PRTE_PROC_MY_NAME));

/* do -not- call finalize as this will send a message to the HNP
* indicating clean termination! Instead, just forcibly cleanup
* the local session_dir tree and exit
*/
jdata = prte_get_job_data_object(PRTE_PROC_MY_NAME->nspace);
PMIX_RELEASE(jdata);

/* return with non-zero status */
ret = PRTE_ERROR_DEFAULT_EXIT_CODE;
goto DONE;
}
}
}

opt = pmix_cmd_line_get_param(&results, PRTE_CLI_REPORT_PID);
if (NULL != opt) {
/* if the string is a "-", then output to stdout */
Expand Down
60 changes: 18 additions & 42 deletions src/tools/prted/prted.c
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@
* Copyright (c) 2013-2020 Intel, Inc. All rights reserved.
* Copyright (c) 2015-2019 Research Organization for Information Science
* and Technology (RIST). All rights reserved.
* Copyright (c) 2021-2023 Nanook Consulting All rights reserved.
* Copyright (c) 2021-2024 Nanook Consulting. All rights reserved.
* Copyright (c) 2022 Triad National Security, LLC. All rights
* reserved.
* $COPYRIGHT$
Expand Down Expand Up @@ -125,7 +125,6 @@ static void report_prted(void);
static pmix_data_buffer_t *bucket, *mybucket = NULL;
static int ncollected = 0;
static bool node_regex_waiting = false;
static bool prted_abort = false;
static char *prte_parent_uri = NULL;
static pmix_cli_result_t results;

Expand Down Expand Up @@ -200,6 +199,7 @@ int main(int argc, char *argv[])
int pargc;
prte_schizo_base_module_t *schizo;
pmix_cli_item_t *opt;
prte_job_t *jdata;

char *umask_str = getenv("PRTE_DAEMON_UMASK_VALUE");
if (NULL != umask_str) {
Expand Down Expand Up @@ -322,6 +322,9 @@ int main(int argc, char *argv[])
if (pmix_cmd_line_is_taken(&results, PRTE_CLI_DEBUG_DAEMONS)) {
prte_debug_daemons_flag = true;
}
if (pmix_cmd_line_is_taken(&results, PRTE_CLI_DEBUG_DAEMONS_FILE)) {
prte_debug_daemons_file_flag = true;
}
if (pmix_cmd_line_is_taken(&results, PRTE_CLI_LEAVE_SESSION_ATTACHED)) {
prte_leave_session_attached = true;
}
Expand Down Expand Up @@ -414,40 +417,28 @@ int main(int argc, char *argv[])
}
}

if ((int) PMIX_RANK_INVALID != prted_debug_failure) {
prted_abort = false;
/* some vpid was ordered to fail. The value can be positive
* or negative, depending upon the desired method for failure,
* so need to check both here
*/
if (0 > prted_debug_failure) {
prted_debug_failure = -1 * prted_debug_failure;
prted_abort = true;
}
if (PMIX_RANK_INVALID != prted_debug_failure) {
/* are we the specified vpid? */
if ((int) PRTE_PROC_MY_NAME->rank == prted_debug_failure) {
if (PRTE_PROC_MY_NAME->rank == prted_debug_failure ||
prted_debug_failure == PMIX_RANK_WILDCARD) {
/* if the user specified we delay, then setup a timer
* and have it kill us
*/
if (0 < prted_debug_failure_delay) {
PRTE_TIMER_EVENT(prted_debug_failure_delay, 0, shutdown_callback);

} else {
pmix_output(0, "%s is executing clean %s", PRTE_NAME_PRINT(PRTE_PROC_MY_NAME),
prted_abort ? "abort" : "abnormal termination");
pmix_output(0, "%s is executing clean abnormal termination",
PRTE_NAME_PRINT(PRTE_PROC_MY_NAME));

/* do -not- call finalize as this will send a message to the HNP
* indicating clean termination! Instead, just forcibly cleanup
* the local session_dir tree and exit
*/
prte_session_dir_cleanup(PRTE_JOBID_WILDCARD);

/* if we were ordered to abort, do so */
if (prted_abort) {
abort();
}
jdata = prte_get_job_data_object(PRTE_PROC_MY_NAME->nspace);
PMIX_RELEASE(jdata);

/* otherwise, return with non-zero status */
/* return with non-zero status */
ret = PRTE_ERROR_DEFAULT_EXIT_CODE;
goto DONE;
}
Expand Down Expand Up @@ -808,7 +799,6 @@ int main(int argc, char *argv[])
/* cleanup and leave */
prte_finalize();

prte_session_dir_cleanup(PRTE_JOBID_WILDCARD);
/* cleanup the process info */
prte_proc_info_finalize();

Expand All @@ -821,7 +811,7 @@ int main(int argc, char *argv[])
static void shutdown_callback(int fd, short flags, void *arg)
{
prte_timer_t *tm = (prte_timer_t *) arg;
bool suicide = false;
prte_job_t *jdata;
PRTE_HIDE_UNUSED_PARAMS(fd, flags);

if (NULL != tm) {
Expand All @@ -830,31 +820,17 @@ static void shutdown_callback(int fd, short flags, void *arg)
}

/* if we were ordered to abort, do so */
if (prted_abort) {
if (pmix_cmd_line_is_taken(&results, PRTE_CLI_TEST_SUICIDE)) {
suicide = true;
}
pmix_output(0, "%s is executing %s abort", PRTE_NAME_PRINT(PRTE_PROC_MY_NAME),
suicide ? "suicide" : "clean");
/* do -not- call finalize as this will send a message to the HNP
* indicating clean termination! Instead, just kill our
* local procs, forcibly cleanup the local session_dir tree, and abort
*/
if (suicide) {
exit(1);
}
prte_odls.kill_local_procs(NULL);
prte_session_dir_cleanup(PRTE_JOBID_WILDCARD);
abort();
}
pmix_output(0, "%s is executing clean abnormal termination",
PRTE_NAME_PRINT(PRTE_PROC_MY_NAME));
/* do -not- call finalize as this will send a message to the HNP
* indicating clean termination! Instead, just forcibly cleanup
* the local session_dir tree and exit
*/
prte_odls.kill_local_procs(NULL);
prte_session_dir_cleanup(PRTE_JOBID_WILDCARD);
// mark that we are finalizing so the session directory will cleanup
prte_finalizing = true;
jdata = prte_get_job_data_object(PRTE_PROC_MY_NAME->nspace);
PMIX_RELEASE(jdata);
exit(PRTE_ERROR_DEFAULT_EXIT_CODE);
}

Expand Down
Loading

0 comments on commit e2cff33

Please sign in to comment.