Skip to content

Commit

Permalink
Merge pull request #1194 from rhc54/topic/rfa
Browse files Browse the repository at this point in the history
Enable per-job specification of the exec agent
  • Loading branch information
rhc54 authored Feb 5, 2022
2 parents 37070b4 + e824387 commit d68a61a
Show file tree
Hide file tree
Showing 12 changed files with 55 additions and 24 deletions.
24 changes: 20 additions & 4 deletions src/mca/odls/base/odls_base_default_fns.c
Original file line number Diff line number Diff line change
Expand Up @@ -1066,17 +1066,33 @@ void prte_odls_base_spawn_proc(int fd, short sd, void *cbdata)
cd->cmd = strdup(app->app);
cd->argv = prte_argv_copy(app->argv);
}
} else if (NULL != prte_fork_agent) {
} else if (prte_get_attribute(&jobdat->attributes, PRTE_JOB_EXEC_AGENT, (void**)&ptr, PMIX_STRING)) {
/* we were given a fork agent - use it */
cd->argv = prte_argv_copy(prte_fork_agent);
cd->argv = prte_argv_split(ptr, ' ');
/* add in the argv from the app */
for (i = 0; NULL != app->argv[i]; i++) {
prte_argv_append_nosize(&cd->argv, app->argv[i]);
}
cd->cmd = prte_path_findv(prte_fork_agent[0], X_OK, prte_launch_environ, NULL);
cd->cmd = prte_path_findv(cd->argv[0], X_OK, prte_launch_environ, NULL);
if (NULL == cd->cmd) {
prte_show_help("help-prte-odls-base.txt", "prte-odls-base:fork-agent-not-found", true,
prte_process_info.nodename, prte_fork_agent[0]);
prte_process_info.nodename, ptr);
state = PRTE_PROC_STATE_FAILED_TO_LAUNCH;
free(ptr);
goto errorout;
}
free(ptr);
} else if (NULL != prte_fork_agent_string) {
/* we were given a fork agent - use it */
cd->argv = prte_argv_split(prte_fork_agent_string, ' ');
/* add in the argv from the app */
for (i = 0; NULL != app->argv[i]; i++) {
prte_argv_append_nosize(&cd->argv, app->argv[i]);
}
cd->cmd = prte_path_findv(cd->argv[0], X_OK, prte_launch_environ, NULL);
if (NULL == cd->cmd) {
prte_show_help("help-prte-odls-base.txt", "prte-odls-base:fork-agent-not-found", true,
prte_process_info.nodename, cd->argv[0]);
state = PRTE_PROC_STATE_FAILED_TO_LAUNCH;
goto errorout;
}
Expand Down
1 change: 1 addition & 0 deletions src/mca/schizo/prte/schizo_prte.c
Original file line number Diff line number Diff line change
Expand Up @@ -116,6 +116,7 @@ static struct option prteoptions[] = {
PRTE_OPTION_DEFINE(PRTE_CLI_SHOW_PROGRESS, PRTE_ARG_REQD),
PRTE_OPTION_DEFINE(PRTE_CLI_HOSTFILE, PRTE_ARG_REQD),
PRTE_OPTION_SHORT_DEFINE(PRTE_CLI_HOST, PRTE_ARG_REQD, 'H'),
PRTE_OPTION_DEFINE(PRTE_CLI_EXEC_AGENT, PRTE_ARG_REQD),

// output options
PRTE_OPTION_DEFINE(PRTE_CLI_STREAM_BUF, PRTE_ARG_REQD),
Expand Down
5 changes: 5 additions & 0 deletions src/prted/pmix/pmix_server_dyn.c
Original file line number Diff line number Diff line change
Expand Up @@ -385,6 +385,11 @@ static void interim(int sd, short args, void *cbdata)
goto complete;
}

/*** EXEC AGENT ***/
} else if (PMIX_CHECK_KEY(info, PMIX_EXEC_AGENT)) {
prte_set_attribute(&jdata->attributes, PRTE_JOB_EXEC_AGENT, PRTE_ATTR_GLOBAL,
info->value.data.string, PMIX_STRING);

/*** CPUS/RANK ***/
} else if (PMIX_CHECK_KEY(info, PMIX_CPUS_PER_PROC)) {
u16 = info->value.data.uint32;
Expand Down
4 changes: 0 additions & 4 deletions src/runtime/prte_finalize.c
Original file line number Diff line number Diff line change
Expand Up @@ -146,10 +146,6 @@ int prte_finalize(void)
}
PRTE_RELEASE(prte_node_pool);

if (NULL != prte_fork_agent) {
prte_argv_free(prte_fork_agent);
}

free(prte_process_info.nodename);
prte_process_info.nodename = NULL;

Expand Down
2 changes: 1 addition & 1 deletion src/runtime/prte_globals.c
Original file line number Diff line number Diff line change
Expand Up @@ -102,7 +102,7 @@ bool prte_node_info_communicated = false;
/* launch agents */
char *prte_launch_agent = NULL;
char **prted_cmd_line = NULL;
char **prte_fork_agent = NULL;
char *prte_fork_agent_string = NULL;

/* exit flags */
int prte_exit_status = 0;
Expand Down
2 changes: 1 addition & 1 deletion src/runtime/prte_globals.h
Original file line number Diff line number Diff line change
Expand Up @@ -546,7 +546,7 @@ PRTE_EXPORT extern bool prte_node_info_communicated;
/* launch agents */
PRTE_EXPORT extern char *prte_launch_agent;
PRTE_EXPORT extern char **prted_cmd_line;
PRTE_EXPORT extern char **prte_fork_agent;
PRTE_EXPORT extern char *prte_fork_agent_string;

/* exit flags */
PRTE_EXPORT extern bool prte_abnormal_term_ordered;
Expand Down
5 changes: 0 additions & 5 deletions src/runtime/prte_mca_params.c
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,6 @@

static bool passed_thru = false;
static int prte_progress_thread_debug_level = -1;
static char *prte_fork_agent_string = NULL;
static char *prte_tmpdir_base = NULL;
static char *prte_local_tmpdir_base = NULL;
static char *prte_remote_tmpdir_base = NULL;
Expand Down Expand Up @@ -480,10 +479,6 @@ int prte_register_params(void)
PRTE_MCA_BASE_VAR_FLAG_NONE, PRTE_INFO_LVL_9,
PRTE_MCA_BASE_VAR_SCOPE_READONLY, &prte_fork_agent_string);

if (NULL != prte_fork_agent_string) {
prte_fork_agent = prte_argv_split(prte_fork_agent_string, ' ');
}

/* whether or not to require RM allocation */
prte_allocation_required = false;
(void) prte_mca_base_var_register(
Expand Down
7 changes: 7 additions & 0 deletions src/tools/prte/prte.c
Original file line number Diff line number Diff line change
Expand Up @@ -938,6 +938,13 @@ int main(int argc, char *argv[])
if (NULL != opt) {
PMIX_INFO_LIST_ADD(ret, jinfo, PMIX_BINDTO, opt->values[0], PMIX_STRING);
}

/* check for an exec agent */
opt = prte_cmd_line_get_param(&results, PRTE_CLI_EXEC_AGENT);
if (NULL != opt) {
PMIX_INFO_LIST_ADD(ret, jinfo, PMIX_EXEC_AGENT, opt->values[0], PMIX_STRING);
}

/* mark if recovery was enabled on the cmd line */
if (prte_cmd_line_is_taken(&results, PRTE_CLI_ENABLE_RECOVERY)) {
PMIX_INFO_LIST_ADD(ret, jinfo, PMIX_JOB_RECOVERABLE, NULL, PMIX_BOOL);
Expand Down
18 changes: 12 additions & 6 deletions src/tools/prun/prun.c
Original file line number Diff line number Diff line change
Expand Up @@ -762,12 +762,12 @@ int prun(int argc, char *argv[])
}

/* check what user wants us to do with stdin */
opt = prte_cmd_line_get_param(&results, "stdin");
opt = prte_cmd_line_get_param(&results, PRTE_CLI_STDIN);
if (NULL != opt) {
PMIX_INFO_LIST_ADD(ret, jinfo, PMIX_STDIN_TGT, opt->values[0], PMIX_STRING);
}

opt = prte_cmd_line_get_param(&results, "map-by");
opt = prte_cmd_line_get_param(&results, PRTE_CLI_MAPBY);
if (NULL != opt) {
PMIX_INFO_LIST_ADD(ret, jinfo, PMIX_MAPBY, opt->values[0], PMIX_STRING);
if (NULL != strcasestr(opt->values[0], "DONOTLAUNCH")) {
Expand All @@ -776,23 +776,29 @@ int prun(int argc, char *argv[])
}

/* if the user specified a ranking policy, then set it */
opt = prte_cmd_line_get_param(&results, "rank-by");
opt = prte_cmd_line_get_param(&results, PRTE_CLI_RANKBY);
if (NULL != opt) {
PMIX_INFO_LIST_ADD(ret, jinfo, PMIX_RANKBY, opt->values[0], PMIX_STRING);
}

/* if the user specified a binding policy, then set it */
opt = prte_cmd_line_get_param(&results, "bind-to");
opt = prte_cmd_line_get_param(&results, PRTE_CLI_BINDTO);
if (NULL != opt) {
PMIX_INFO_LIST_ADD(ret, jinfo, PMIX_BINDTO, opt->values[0], PMIX_STRING);
}

/* check for an exec agent */
opt = prte_cmd_line_get_param(&results, PRTE_CLI_EXEC_AGENT);
if (NULL != opt) {
PMIX_INFO_LIST_ADD(ret, jinfo, PMIX_EXEC_AGENT, opt->values[0], PMIX_STRING);
}

/* mark if recovery was enabled on the cmd line */
if (prte_cmd_line_is_taken(&results, "enable-recovery")) {
if (prte_cmd_line_is_taken(&results, PRTE_CLI_ENABLE_RECOVERY)) {
PMIX_INFO_LIST_ADD(ret, jinfo, PMIX_JOB_RECOVERABLE, &flag, PMIX_BOOL);
}
/* record the max restarts */
opt = prte_cmd_line_get_param(&results, "max-restarts");
opt = prte_cmd_line_get_param(&results, PRTE_CLI_MAX_RESTARTS);
if (NULL != opt) {
ui32 = strtol(opt->values[0], NULL, 10);
PRTE_LIST_FOREACH(app, &apps, prte_pmix_app_t)
Expand Down
4 changes: 3 additions & 1 deletion src/util/attr.c
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
* Copyright (c) 2014-2017 Research Organization for Information Science
* and Technology (RIST). All rights reserved.
* Copyright (c) 2018-2020 Cisco Systems, Inc. All rights reserved
* Copyright (c) 2021 Nanook Consulting. All rights reserved.
* Copyright (c) 2021-2022 Nanook Consulting. All rights reserved.
* Copyright (c) 2021 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
Expand Down Expand Up @@ -466,6 +466,8 @@ const char *prte_attr_key_to_str(prte_attribute_key_t key)
return "SPAWN-TIMEOUT";
case PRTE_JOB_RAW_OUTPUT:
return "DO-NOT-BUFFER-OUTPUT";
case PRTE_JOB_EXEC_AGENT:
return "EXEC-AGENT";

case PRTE_PROC_NOBARRIER:
return "PROC-NOBARRIER";
Expand Down
6 changes: 4 additions & 2 deletions src/util/attr.h
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
* Copyright (c) 2016 Research Organization for Information Science
* and Technology (RIST). All rights reserved.
* Copyright (c) 2020 Cisco Systems, Inc. All rights reserved
* Copyright (c) 2021 Nanook Consulting. All rights reserved.
* Copyright (c) 2021-2022 Nanook Consulting. All rights reserved.
* Copyright (c) 2021 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
Expand Down Expand Up @@ -158,7 +158,7 @@ typedef uint16_t prte_job_flags_t;
#define PRTE_JOB_INFO_CACHE (PRTE_JOB_START_KEY + 52) // prte_list_t - list of prte_value_t to be included in job_info
#define PRTE_JOB_FULLY_DESCRIBED (PRTE_JOB_START_KEY + 53) // bool - job is fully described in launch msg
#define PRTE_JOB_SILENT_TERMINATION (PRTE_JOB_START_KEY + 54) // bool - do not generate an event notification when job
// normally terminates
// normally terminates
#define PRTE_JOB_SET_ENVAR (PRTE_JOB_START_KEY + 55) // prte_envar_t - set the given envar to the specified value
#define PRTE_JOB_UNSET_ENVAR (PRTE_JOB_START_KEY + 56) // string - name of envar to unset, if present
#define PRTE_JOB_PREPEND_ENVAR (PRTE_JOB_START_KEY + 57) // prte_envar_t - prepend the specified value to the given envar
Expand Down Expand Up @@ -199,6 +199,8 @@ typedef uint16_t prte_job_flags_t;
#define PRTE_JOB_RANK_OUTPUT (PRTE_JOB_START_KEY + 92) // bool - tag stdout/stderr with rank
#define PRTE_SPAWN_TIMEOUT (PRTE_JOB_START_KEY + 93) // int32 - number of seconds allowed for the spawn before it is terminated as timed out
#define PRTE_JOB_RAW_OUTPUT (PRTE_JOB_START_KEY + 94) // bool - do not buffer output
#define PRTE_JOB_EXEC_AGENT (PRTE_JOB_START_KEY + 95) // char* - space-delimited cmd line of the agent used to exec the job's local procs; the app's argv is appended to it


#define PRTE_JOB_MAX_KEY 300

Expand Down
1 change: 1 addition & 0 deletions src/util/cmd_line.h
Original file line number Diff line number Diff line change
Expand Up @@ -170,6 +170,7 @@ PRTE_CLASS_DECLARATION(prte_cli_result_t);
#define PRTE_CLI_MAX_RESTARTS "max-restarts" // required
#define PRTE_CLI_DISABLE_RECOVERY "disable-recovery" // none
#define PRTE_CLI_CONTINUOUS "continuous" // none
#define PRTE_CLI_EXEC_AGENT "exec-agent" // required - agent cmd used to exec the job's procs (passed to the server as PMIX_EXEC_AGENT)

// Placement options
#define PRTE_CLI_MAPBY "map-by" // required
Expand Down

0 comments on commit d68a61a

Please sign in to comment.