From 4c1160e257c945d095c1985671b8a478fc45ca6a Mon Sep 17 00:00:00 2001 From: Joshua Hursey Date: Thu, 29 Aug 2019 16:26:43 -0400 Subject: [PATCH] Fix tree spawn routed component issue * Fix #6618 - See comments on Issue #6618 for finer details. * The `plm/rsh` component uses the highest priority `routed` component to construct the launch tree. The remote orteds will activate all available `routed` components when updating routes. This allows the opportunity for the parent vpid on the remote `orted` to not match that which was expected in the tree launch. The result is that the remote orted tries to contact their parent with the wrong contact information and orted wireup will fail. * This fix forces the orteds to use the same `routed` component as the HNP used when constructing the tree, if tree launch is enabled. Signed-off-by: Joshua Hursey --- orte/mca/plm/rsh/plm_rsh_module.c | 29 ++++++++++++++++++++++++++++- 1 file changed, 28 insertions(+), 1 deletion(-) diff --git a/orte/mca/plm/rsh/plm_rsh_module.c b/orte/mca/plm/rsh/plm_rsh_module.c index 7e34de9ecfe..d022fe51c29 100644 --- a/orte/mca/plm/rsh/plm_rsh_module.c +++ b/orte/mca/plm/rsh/plm_rsh_module.c @@ -343,11 +343,12 @@ static int setup_launch(int *argcptr, char ***argvptr, char *orted_cmd, *orted_prefix, *final_cmd; int orted_index; int rc; - int i, j; + int i, j, cnt; bool found; char *lib_base=NULL, *bin_base=NULL; char *opal_prefix = getenv("OPAL_PREFIX"); char* full_orted_cmd = NULL; + char * rtmod; /* Figure out the basenames for the libdir and bindir. This requires some explanation: @@ -609,6 +610,18 @@ static int setup_launch(int *argcptr, char ***argvptr, (mca_plm_rsh_component.using_llspawn && mca_plm_rsh_component.daemonize_llspawn))) { } + if (!mca_plm_rsh_component.no_tree_spawn) { + // Remove problematic and/or conflicting command line arguments that + // should not be passed on to our children. 
+ cnt = opal_argv_count(orted_cmd_line); + for (i=0; i < cnt; i+=3) { + if (0 == strcmp(orted_cmd_line[i+1], "routed")) { + opal_argv_delete(&cnt, &orted_cmd_line, i, 3); + break; + } + } + } + /* * Add the basic arguments to the orted command line, including * all debug options @@ -627,6 +640,16 @@ static int setup_launch(int *argcptr, char ***argvptr, if (!mca_plm_rsh_component.no_tree_spawn) { opal_argv_append(&argc, &argv, "--tree-spawn"); orte_oob_base_get_addr(&param); + + // When tree-spawn'ing we need to force the remote daemons to use + // the routing component that was used to setup the launch tree. + // Otherwise the orte_parent_uri will not match the orted they + // expect to find in the routing tree. + rtmod = orte_rml.get_routed(orte_coll_conduit); + opal_argv_append(&argc, &argv, "-"OPAL_MCA_CMD_LINE_ID); + opal_argv_append(&argc, &argv, "routed"); + opal_argv_append(&argc, &argv, rtmod); + opal_argv_append(&argc, &argv, "-"OPAL_MCA_CMD_LINE_ID); opal_argv_append(&argc, &argv, "orte_parent_uri"); opal_argv_append(&argc, &argv, param); @@ -1187,6 +1210,10 @@ static void launch_daemons(int fd, short args, void *cbdata) OBJ_CONSTRUCT(&coll, opal_list_t); rtmod = orte_rml.get_routed(orte_coll_conduit); orte_routed.get_routing_list(rtmod, &coll); + + OPAL_OUTPUT_VERBOSE((1, orte_plm_base_framework.framework_output, + "%s plm:rsh:launch Tree Launch using routed/%s", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), rtmod)); } /* setup the launch */