Skip to content

Commit

Permalink
common/ucx: Fix string MCA handling
Browse files Browse the repository at this point in the history
 * Model the MCA string variable handling after the common/ofi
   component that uses a second degree of indirection to keep
   the pointer valid even when the library closes (i.e., in ompi_info).

Signed-off-by: Joshua Hursey <[email protected]>
  • Loading branch information
jjhursey committed May 11, 2022
1 parent 2cf8549 commit d79d5e8
Show file tree
Hide file tree
Showing 2 changed files with 29 additions and 16 deletions.
40 changes: 26 additions & 14 deletions opal/mca/common/ucx/common_ucx.c
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
* Copyright (c) 2021 Triad National Security, LLC. All rights
* reserved.
* Copyright (c) 2022 Google, LLC. All rights reserved.
*
* Copyright (c) 2022 IBM Corporation. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
Expand Down Expand Up @@ -37,6 +37,8 @@ opal_common_ucx_module_t opal_common_ucx =
{
.progress_iterations = 100,
.opal_mem_hooks = 1,
.tls = NULL,
.devices = NULL,
};

static opal_mutex_t opal_common_ucx_mutex = OPAL_MUTEX_STATIC_INIT;
Expand Down Expand Up @@ -80,7 +82,12 @@ OPAL_DECLSPEC void opal_common_ucx_mca_var_register(const mca_base_component_t *
&opal_common_ucx.opal_mem_hooks);

if (NULL == opal_common_ucx.tls) {
opal_common_ucx.tls = default_tls;
// Extra level of string indirection needed to make ompi_info
// happy since it will unload this library before the MCA base
// cleans up the MCA vars. This will cause the string to go
// out of scope unless we place the pointer to it on the heap.
opal_common_ucx.tls = (char **) malloc(sizeof(char *));
*opal_common_ucx.tls = strdup(default_tls);
}

tls_index = mca_base_var_register(
Expand All @@ -90,18 +97,23 @@ OPAL_DECLSPEC void opal_common_ucx_mca_var_register(const mca_base_component_t *
"A '^' prefix negates the list. "
"For example, in order to exclude on shared memory and TCP transports, "
"please set to '^posix,sysv,self,tcp,cma,knem,xpmem'.",
MCA_BASE_VAR_TYPE_STRING, NULL, 0, 0, OPAL_INFO_LVL_3, MCA_BASE_VAR_SCOPE_LOCAL,
&opal_common_ucx.tls);
MCA_BASE_VAR_TYPE_STRING, NULL, 0,
MCA_BASE_VAR_FLAG_SETTABLE, OPAL_INFO_LVL_3,
MCA_BASE_VAR_SCOPE_LOCAL,
opal_common_ucx.tls);

if (NULL == opal_common_ucx.devices) {
opal_common_ucx.devices = default_devices;
opal_common_ucx.devices = (char **) malloc(sizeof(char *));
*opal_common_ucx.devices = strdup(default_devices);
}
devices_index = mca_base_var_register(
"opal", "opal_common", "ucx", "devices",
"List of device driver pattern names, which, if supported by UCX, will "
"bump its priority above ob1. Special values: any (any available)",
MCA_BASE_VAR_TYPE_STRING, NULL, 0, 0, OPAL_INFO_LVL_3, MCA_BASE_VAR_SCOPE_LOCAL,
&opal_common_ucx.devices);
MCA_BASE_VAR_TYPE_STRING, NULL, 0,
MCA_BASE_VAR_FLAG_SETTABLE, OPAL_INFO_LVL_3,
MCA_BASE_VAR_SCOPE_LOCAL,
opal_common_ucx.devices);

if (component) {
mca_base_var_register_synonym(verbose_index, component->mca_project_name,
Expand Down Expand Up @@ -238,8 +250,8 @@ opal_common_ucx_support_level(ucp_context_h context)
int ret;
#endif

is_any_tl = !strcmp(opal_common_ucx.tls, "any");
is_any_device = !strcmp(opal_common_ucx.devices, "any");
is_any_tl = !strcmp(*opal_common_ucx.tls, "any");
is_any_device = !strcmp(*opal_common_ucx.devices, "any");

/* Check for special value "any" */
if (is_any_tl && is_any_device) {
Expand All @@ -250,19 +262,19 @@ opal_common_ucx_support_level(ucp_context_h context)

#if HAVE_DECL_OPEN_MEMSTREAM
/* Split transports list */
negate = ('^' == (opal_common_ucx.tls)[0]);
tl_list = opal_argv_split(opal_common_ucx.tls + (negate ? 1 : 0), ',');
negate = ('^' == (*opal_common_ucx.tls)[0]);
tl_list = opal_argv_split(*opal_common_ucx.tls + (negate ? 1 : 0), ',');
if (tl_list == NULL) {
MCA_COMMON_UCX_VERBOSE(1, "failed to split tl list '%s', ucx is disabled",
opal_common_ucx.tls);
*opal_common_ucx.tls);
goto out;
}

/* Split devices list */
device_list = opal_argv_split(opal_common_ucx.devices, ',');
device_list = opal_argv_split(*opal_common_ucx.devices, ',');
if (device_list == NULL) {
MCA_COMMON_UCX_VERBOSE(1, "failed to split devices list '%s', ucx is disabled",
opal_common_ucx.devices);
*opal_common_ucx.devices);
goto out_free_tl_list;
}

Expand Down
5 changes: 3 additions & 2 deletions opal/mca/common/ucx/common_ucx.h
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
/*
* Copyright (c) 2018 Mellanox Technologies. All rights reserved.
* All rights reserved.
* Copyright (c) 2022 IBM Corporation. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
Expand Down Expand Up @@ -88,8 +89,8 @@ typedef struct opal_common_ucx_module {
int progress_iterations;
int registered;
bool opal_mem_hooks;
char *tls;
char *devices;
char **tls;
char **devices;
} opal_common_ucx_module_t;

typedef struct opal_common_ucx_del_proc {
Expand Down

0 comments on commit d79d5e8

Please sign in to comment.