Skip to content

Commit

Permalink
Enable per-job specification of the exec agent
Browse files Browse the repository at this point in the history
PRRTE allows the user to specify their own exec agent - i.e.,
the command to use when spawning an application process on
the compute node. In most cases, this is just NULL as we exec
the application process itself. However, in some cases users
want/need a wrapper command that will in turn exec the app.

We have always had an MCA parameter by which one could set the
exec agent, but we need to support it in DVM mode as well. So let
the MCA parameter set the default, and add a command-line option
(--exec-agent) to PRRTE so one can override the default (whether
NULL or some MCA parameter setting) for a specific job.

Signed-off-by: Ralph Castain <[email protected]>
  • Loading branch information
rhc54 committed Feb 5, 2022
1 parent 9d06d37 commit e824387
Show file tree
Hide file tree
Showing 12 changed files with 55 additions and 24 deletions.
24 changes: 20 additions & 4 deletions src/mca/odls/base/odls_base_default_fns.c
Original file line number Diff line number Diff line change
Expand Up @@ -1066,17 +1066,33 @@ void prte_odls_base_spawn_proc(int fd, short sd, void *cbdata)
cd->cmd = strdup(app->app);
cd->argv = prte_argv_copy(app->argv);
}
} else if (NULL != prte_fork_agent) {
} else if (prte_get_attribute(&jobdat->attributes, PRTE_JOB_EXEC_AGENT, (void**)&ptr, PMIX_STRING)) {
/* we were given a fork agent - use it */
cd->argv = prte_argv_copy(prte_fork_agent);
cd->argv = prte_argv_split(ptr, ' ');
/* add in the argv from the app */
for (i = 0; NULL != app->argv[i]; i++) {
prte_argv_append_nosize(&cd->argv, app->argv[i]);
}
cd->cmd = prte_path_findv(prte_fork_agent[0], X_OK, prte_launch_environ, NULL);
cd->cmd = prte_path_findv(cd->argv[0], X_OK, prte_launch_environ, NULL);
if (NULL == cd->cmd) {
prte_show_help("help-prte-odls-base.txt", "prte-odls-base:fork-agent-not-found", true,
prte_process_info.nodename, prte_fork_agent[0]);
prte_process_info.nodename, ptr);
state = PRTE_PROC_STATE_FAILED_TO_LAUNCH;
free(ptr);
goto errorout;
}
free(ptr);
} else if (NULL != prte_fork_agent_string) {
/* we were given a fork agent - use it */
cd->argv = prte_argv_split(prte_fork_agent_string, ' ');
/* add in the argv from the app */
for (i = 0; NULL != app->argv[i]; i++) {
prte_argv_append_nosize(&cd->argv, app->argv[i]);
}
cd->cmd = prte_path_findv(cd->argv[0], X_OK, prte_launch_environ, NULL);
if (NULL == cd->cmd) {
prte_show_help("help-prte-odls-base.txt", "prte-odls-base:fork-agent-not-found", true,
prte_process_info.nodename, cd->argv[0]);
state = PRTE_PROC_STATE_FAILED_TO_LAUNCH;
goto errorout;
}
Expand Down
1 change: 1 addition & 0 deletions src/mca/schizo/prte/schizo_prte.c
Original file line number Diff line number Diff line change
Expand Up @@ -116,6 +116,7 @@ static struct option prteoptions[] = {
PRTE_OPTION_DEFINE(PRTE_CLI_SHOW_PROGRESS, PRTE_ARG_REQD),
PRTE_OPTION_DEFINE(PRTE_CLI_HOSTFILE, PRTE_ARG_REQD),
PRTE_OPTION_SHORT_DEFINE(PRTE_CLI_HOST, PRTE_ARG_REQD, 'H'),
PRTE_OPTION_DEFINE(PRTE_CLI_EXEC_AGENT, PRTE_ARG_REQD),

// output options
PRTE_OPTION_DEFINE(PRTE_CLI_STREAM_BUF, PRTE_ARG_REQD),
Expand Down
5 changes: 5 additions & 0 deletions src/prted/pmix/pmix_server_dyn.c
Original file line number Diff line number Diff line change
Expand Up @@ -385,6 +385,11 @@ static void interim(int sd, short args, void *cbdata)
goto complete;
}

/*** EXEC AGENT ***/
} else if (PMIX_CHECK_KEY(info, PMIX_EXEC_AGENT)) {
prte_set_attribute(&jdata->attributes, PRTE_JOB_EXEC_AGENT, PRTE_ATTR_GLOBAL,
info->value.data.string, PMIX_STRING);

/*** CPUS/RANK ***/
} else if (PMIX_CHECK_KEY(info, PMIX_CPUS_PER_PROC)) {
u16 = info->value.data.uint32;
Expand Down
4 changes: 0 additions & 4 deletions src/runtime/prte_finalize.c
Original file line number Diff line number Diff line change
Expand Up @@ -146,10 +146,6 @@ int prte_finalize(void)
}
PRTE_RELEASE(prte_node_pool);

if (NULL != prte_fork_agent) {
prte_argv_free(prte_fork_agent);
}

free(prte_process_info.nodename);
prte_process_info.nodename = NULL;

Expand Down
2 changes: 1 addition & 1 deletion src/runtime/prte_globals.c
Original file line number Diff line number Diff line change
Expand Up @@ -102,7 +102,7 @@ bool prte_node_info_communicated = false;
/* launch agents */
char *prte_launch_agent = NULL;
char **prted_cmd_line = NULL;
char **prte_fork_agent = NULL;
char *prte_fork_agent_string = NULL;

/* exit flags */
int prte_exit_status = 0;
Expand Down
2 changes: 1 addition & 1 deletion src/runtime/prte_globals.h
Original file line number Diff line number Diff line change
Expand Up @@ -546,7 +546,7 @@ PRTE_EXPORT extern bool prte_node_info_communicated;
/* launch agents */
PRTE_EXPORT extern char *prte_launch_agent;
PRTE_EXPORT extern char **prted_cmd_line;
PRTE_EXPORT extern char **prte_fork_agent;
PRTE_EXPORT extern char *prte_fork_agent_string;

/* exit flags */
PRTE_EXPORT extern bool prte_abnormal_term_ordered;
Expand Down
5 changes: 0 additions & 5 deletions src/runtime/prte_mca_params.c
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,6 @@

static bool passed_thru = false;
static int prte_progress_thread_debug_level = -1;
static char *prte_fork_agent_string = NULL;
static char *prte_tmpdir_base = NULL;
static char *prte_local_tmpdir_base = NULL;
static char *prte_remote_tmpdir_base = NULL;
Expand Down Expand Up @@ -480,10 +479,6 @@ int prte_register_params(void)
PRTE_MCA_BASE_VAR_FLAG_NONE, PRTE_INFO_LVL_9,
PRTE_MCA_BASE_VAR_SCOPE_READONLY, &prte_fork_agent_string);

if (NULL != prte_fork_agent_string) {
prte_fork_agent = prte_argv_split(prte_fork_agent_string, ' ');
}

/* whether or not to require RM allocation */
prte_allocation_required = false;
(void) prte_mca_base_var_register(
Expand Down
7 changes: 7 additions & 0 deletions src/tools/prte/prte.c
Original file line number Diff line number Diff line change
Expand Up @@ -938,6 +938,13 @@ int main(int argc, char *argv[])
if (NULL != opt) {
PMIX_INFO_LIST_ADD(ret, jinfo, PMIX_BINDTO, opt->values[0], PMIX_STRING);
}

/* check for an exec agent */
opt = prte_cmd_line_get_param(&results, PRTE_CLI_EXEC_AGENT);
if (NULL != opt) {
PMIX_INFO_LIST_ADD(ret, jinfo, PMIX_EXEC_AGENT, opt->values[0], PMIX_STRING);
}

/* mark if recovery was enabled on the cmd line */
if (prte_cmd_line_is_taken(&results, PRTE_CLI_ENABLE_RECOVERY)) {
PMIX_INFO_LIST_ADD(ret, jinfo, PMIX_JOB_RECOVERABLE, NULL, PMIX_BOOL);
Expand Down
18 changes: 12 additions & 6 deletions src/tools/prun/prun.c
Original file line number Diff line number Diff line change
Expand Up @@ -762,12 +762,12 @@ int prun(int argc, char *argv[])
}

/* check what user wants us to do with stdin */
opt = prte_cmd_line_get_param(&results, "stdin");
opt = prte_cmd_line_get_param(&results, PRTE_CLI_STDIN);
if (NULL != opt) {
PMIX_INFO_LIST_ADD(ret, jinfo, PMIX_STDIN_TGT, opt->values[0], PMIX_STRING);
}

opt = prte_cmd_line_get_param(&results, "map-by");
opt = prte_cmd_line_get_param(&results, PRTE_CLI_MAPBY);
if (NULL != opt) {
PMIX_INFO_LIST_ADD(ret, jinfo, PMIX_MAPBY, opt->values[0], PMIX_STRING);
if (NULL != strcasestr(opt->values[0], "DONOTLAUNCH")) {
Expand All @@ -776,23 +776,29 @@ int prun(int argc, char *argv[])
}

/* if the user specified a ranking policy, then set it */
opt = prte_cmd_line_get_param(&results, "rank-by");
opt = prte_cmd_line_get_param(&results, PRTE_CLI_RANKBY);
if (NULL != opt) {
PMIX_INFO_LIST_ADD(ret, jinfo, PMIX_RANKBY, opt->values[0], PMIX_STRING);
}

/* if the user specified a binding policy, then set it */
opt = prte_cmd_line_get_param(&results, "bind-to");
opt = prte_cmd_line_get_param(&results, PRTE_CLI_BINDTO);
if (NULL != opt) {
PMIX_INFO_LIST_ADD(ret, jinfo, PMIX_BINDTO, opt->values[0], PMIX_STRING);
}

/* check for an exec agent */
opt = prte_cmd_line_get_param(&results, PRTE_CLI_EXEC_AGENT);
if (NULL != opt) {
PMIX_INFO_LIST_ADD(ret, jinfo, PMIX_EXEC_AGENT, opt->values[0], PMIX_STRING);
}

/* mark if recovery was enabled on the cmd line */
if (prte_cmd_line_is_taken(&results, "enable-recovery")) {
if (prte_cmd_line_is_taken(&results, PRTE_CLI_ENABLE_RECOVERY)) {
PMIX_INFO_LIST_ADD(ret, jinfo, PMIX_JOB_RECOVERABLE, &flag, PMIX_BOOL);
}
/* record the max restarts */
opt = prte_cmd_line_get_param(&results, "max-restarts");
opt = prte_cmd_line_get_param(&results, PRTE_CLI_MAX_RESTARTS);
if (NULL != opt) {
ui32 = strtol(opt->values[0], NULL, 10);
PRTE_LIST_FOREACH(app, &apps, prte_pmix_app_t)
Expand Down
4 changes: 3 additions & 1 deletion src/util/attr.c
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
* Copyright (c) 2014-2017 Research Organization for Information Science
* and Technology (RIST). All rights reserved.
* Copyright (c) 2018-2020 Cisco Systems, Inc. All rights reserved
* Copyright (c) 2021 Nanook Consulting. All rights reserved.
* Copyright (c) 2021-2022 Nanook Consulting. All rights reserved.
* Copyright (c) 2021 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
Expand Down Expand Up @@ -466,6 +466,8 @@ const char *prte_attr_key_to_str(prte_attribute_key_t key)
return "SPAWN-TIMEOUT";
case PRTE_JOB_RAW_OUTPUT:
return "DO-NOT-BUFFER-OUTPUT";
case PRTE_JOB_EXEC_AGENT:
return "EXEC-AGENT";

case PRTE_PROC_NOBARRIER:
return "PROC-NOBARRIER";
Expand Down
6 changes: 4 additions & 2 deletions src/util/attr.h
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
* Copyright (c) 2016 Research Organization for Information Science
* and Technology (RIST). All rights reserved.
* Copyright (c) 2020 Cisco Systems, Inc. All rights reserved
* Copyright (c) 2021 Nanook Consulting. All rights reserved.
* Copyright (c) 2021-2022 Nanook Consulting. All rights reserved.
* Copyright (c) 2021 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
Expand Down Expand Up @@ -158,7 +158,7 @@ typedef uint16_t prte_job_flags_t;
#define PRTE_JOB_INFO_CACHE (PRTE_JOB_START_KEY + 52) // prte_list_t - list of prte_value_t to be included in job_info
#define PRTE_JOB_FULLY_DESCRIBED (PRTE_JOB_START_KEY + 53) // bool - job is fully described in launch msg
#define PRTE_JOB_SILENT_TERMINATION (PRTE_JOB_START_KEY + 54) // bool - do not generate an event notification when job
// normally terminates
// normally terminates
#define PRTE_JOB_SET_ENVAR (PRTE_JOB_START_KEY + 55) // prte_envar_t - set the given envar to the specified value
#define PRTE_JOB_UNSET_ENVAR (PRTE_JOB_START_KEY + 56) // string - name of envar to unset, if present
#define PRTE_JOB_PREPEND_ENVAR (PRTE_JOB_START_KEY + 57) // prte_envar_t - prepend the specified value to the given envar
Expand Down Expand Up @@ -199,6 +199,8 @@ typedef uint16_t prte_job_flags_t;
#define PRTE_JOB_RANK_OUTPUT (PRTE_JOB_START_KEY + 92) // bool - tag stdout/stderr with rank
#define PRTE_SPAWN_TIMEOUT (PRTE_JOB_START_KEY + 93) // int32 - number of seconds to spawn before terminating it as timed out
#define PRTE_JOB_RAW_OUTPUT (PRTE_JOB_START_KEY + 94) // bool - do not buffer output
#define PRTE_JOB_EXEC_AGENT (PRTE_JOB_START_KEY + 95) // char* - string specifying the cmd to use when exec'ing the local proc


#define PRTE_JOB_MAX_KEY 300

Expand Down
1 change: 1 addition & 0 deletions src/util/cmd_line.h
Original file line number Diff line number Diff line change
Expand Up @@ -170,6 +170,7 @@ PRTE_CLASS_DECLARATION(prte_cli_result_t);
#define PRTE_CLI_MAX_RESTARTS "max-restarts" // required
#define PRTE_CLI_DISABLE_RECOVERY "disable-recovery" // none
#define PRTE_CLI_CONTINUOUS "continuous" // none
#define PRTE_CLI_EXEC_AGENT "exec-agent" // required

// Placement options
#define PRTE_CLI_MAPBY "map-by" // required
Expand Down

0 comments on commit e824387

Please sign in to comment.