From d79d5e84b8b2cb7ef872c971e94323bdfb218fda Mon Sep 17 00:00:00 2001 From: Joshua Hursey Date: Wed, 11 May 2022 14:15:19 -0400 Subject: [PATCH] common/ucx: Fix string MCA handling * Model the MCA string variable handling after the common/ofi component that uses a second degree of indirection to keep the pointer valid even when the library closes (i.e., in ompi_info). Signed-off-by: Joshua Hursey --- opal/mca/common/ucx/common_ucx.c | 40 +++++++++++++++++++++----------- opal/mca/common/ucx/common_ucx.h | 5 ++-- 2 files changed, 29 insertions(+), 16 deletions(-) diff --git a/opal/mca/common/ucx/common_ucx.c b/opal/mca/common/ucx/common_ucx.c index fd1432fb364..1b88d869bb8 100644 --- a/opal/mca/common/ucx/common_ucx.c +++ b/opal/mca/common/ucx/common_ucx.c @@ -7,7 +7,7 @@ * Copyright (c) 2021 Triad National Security, LLC. All rights * reserved. * Copyright (c) 2022 Google, LLC. All rights reserved. - * + * Copyright (c) 2022 IBM Corporation. All rights reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -37,6 +37,8 @@ opal_common_ucx_module_t opal_common_ucx = { .progress_iterations = 100, .opal_mem_hooks = 1, + .tls = NULL, + .devices = NULL, }; static opal_mutex_t opal_common_ucx_mutex = OPAL_MUTEX_STATIC_INIT; @@ -80,7 +82,12 @@ OPAL_DECLSPEC void opal_common_ucx_mca_var_register(const mca_base_component_t * &opal_common_ucx.opal_mem_hooks); if (NULL == opal_common_ucx.tls) { - opal_common_ucx.tls = default_tls; + // Extra level of string indirection needed to make ompi_info + // happy since it will unload this library before the MCA base + // cleans up the MCA vars. This will cause the string to go + // out of scope unless we place the pointer to it on the heap. + opal_common_ucx.tls = (char **) malloc(sizeof(char *)); + *opal_common_ucx.tls = strdup(default_tls); } tls_index = mca_base_var_register( @@ -90,18 +97,23 @@ OPAL_DECLSPEC void opal_common_ucx_mca_var_register(const mca_base_component_t * "A '^' prefix negates the list. " "For example, in order to exclude on shared memory and TCP transports, " "please set to '^posix,sysv,self,tcp,cma,knem,xpmem'.", - MCA_BASE_VAR_TYPE_STRING, NULL, 0, 0, OPAL_INFO_LVL_3, MCA_BASE_VAR_SCOPE_LOCAL, - &opal_common_ucx.tls); + MCA_BASE_VAR_TYPE_STRING, NULL, 0, + MCA_BASE_VAR_FLAG_SETTABLE, OPAL_INFO_LVL_3, + MCA_BASE_VAR_SCOPE_LOCAL, + opal_common_ucx.tls); if (NULL == opal_common_ucx.devices) { - opal_common_ucx.devices = default_devices; + opal_common_ucx.devices = (char **) malloc(sizeof(char *)); + *opal_common_ucx.devices = strdup(default_devices); } devices_index = mca_base_var_register( "opal", "opal_common", "ucx", "devices", "List of device driver pattern names, which, if supported by UCX, will " "bump its priority above ob1. Special values: any (any available)", - MCA_BASE_VAR_TYPE_STRING, NULL, 0, 0, OPAL_INFO_LVL_3, MCA_BASE_VAR_SCOPE_LOCAL, - &opal_common_ucx.devices); + MCA_BASE_VAR_TYPE_STRING, NULL, 0, + MCA_BASE_VAR_FLAG_SETTABLE, OPAL_INFO_LVL_3, + MCA_BASE_VAR_SCOPE_LOCAL, + opal_common_ucx.devices); if (component) { mca_base_var_register_synonym(verbose_index, component->mca_project_name, @@ -238,8 +250,8 @@ opal_common_ucx_support_level(ucp_context_h context) int ret; #endif - is_any_tl = !strcmp(opal_common_ucx.tls, "any"); - is_any_device = !strcmp(opal_common_ucx.devices, "any"); + is_any_tl = !strcmp(*opal_common_ucx.tls, "any"); + is_any_device = !strcmp(*opal_common_ucx.devices, "any"); /* Check for special value "any" */ if (is_any_tl && is_any_device) { @@ -250,19 +262,19 @@ opal_common_ucx_support_level(ucp_context_h context) #if HAVE_DECL_OPEN_MEMSTREAM /* Split transports list */ - negate = ('^' == (opal_common_ucx.tls)[0]); - tl_list = opal_argv_split(opal_common_ucx.tls + (negate ? 1 : 0), ','); + negate = ('^' == (*opal_common_ucx.tls)[0]); + tl_list = opal_argv_split(*opal_common_ucx.tls + (negate ? 1 : 0), ','); if (tl_list == NULL) { MCA_COMMON_UCX_VERBOSE(1, "failed to split tl list '%s', ucx is disabled", - opal_common_ucx.tls); + *opal_common_ucx.tls); goto out; } /* Split devices list */ - device_list = opal_argv_split(opal_common_ucx.devices, ','); + device_list = opal_argv_split(*opal_common_ucx.devices, ','); if (device_list == NULL) { MCA_COMMON_UCX_VERBOSE(1, "failed to split devices list '%s', ucx is disabled", - opal_common_ucx.devices); + *opal_common_ucx.devices); goto out_free_tl_list; } diff --git a/opal/mca/common/ucx/common_ucx.h b/opal/mca/common/ucx/common_ucx.h index 1d5bc674a12..4b78bc66587 100644 --- a/opal/mca/common/ucx/common_ucx.h +++ b/opal/mca/common/ucx/common_ucx.h @@ -1,6 +1,7 @@ /* * Copyright (c) 2018 Mellanox Technologies. All rights reserved. * All rights reserved. + * Copyright (c) 2022 IBM Corporation. All rights reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -88,8 +89,8 @@ typedef struct opal_common_ucx_module { int progress_iterations; int registered; bool opal_mem_hooks; - char *tls; - char *devices; + char **tls; + char **devices; } opal_common_ucx_module_t; typedef struct opal_common_ucx_del_proc {