diff --git a/src/hwloc/hwloc-internal.h b/src/hwloc/hwloc-internal.h index ee8b304e25..20d7ef23f8 100644 --- a/src/hwloc/hwloc-internal.h +++ b/src/hwloc/hwloc-internal.h @@ -469,7 +469,6 @@ PRRTE_EXPORT int prrte_hwloc_base_topology_export_xmlbuffer(hwloc_topology_t top PRRTE_EXPORT int prrte_hwloc_base_topology_set_flags (hwloc_topology_t topology, unsigned long flags, bool io); - PRRTE_EXPORT int prrte_hwloc_base_open(void); PRRTE_EXPORT void prrte_hwloc_base_close(void); PRRTE_EXPORT int prrte_hwloc_base_register(void); diff --git a/src/hwloc/hwloc_base_util.c b/src/hwloc/hwloc_base_util.c index 96f177e998..e03663d902 100644 --- a/src/hwloc/hwloc_base_util.c +++ b/src/hwloc/hwloc_base_util.c @@ -19,6 +19,7 @@ * Copyright (C) 2018 Mellanox Technologies, Ltd. * All rights reserved. * Copyright (c) 2018 Amazon.com, Inc. or its affiliates. All Rights reserved. + * Copyright (c) 2019 IBM Corporation. All rights reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -154,7 +155,7 @@ int prrte_hwloc_base_filter_cpus(hwloc_topology_t topo) avail = hwloc_bitmap_alloc(); hwloc_bitmap_and(avail, root->online_cpuset, root->allowed_cpuset); #else - avail = hwloc_bitmap_dup(root->cpuset); + avail = hwloc_bitmap_dup(hwloc_topology_get_allowed_cpuset(topo)); #endif PRRTE_OUTPUT_VERBOSE((5, prrte_hwloc_base_output, "hwloc:base: no cpus specified - using root available cpuset")); @@ -177,8 +178,7 @@ int prrte_hwloc_base_filter_cpus(hwloc_topology_t topo) #if HWLOC_API_VERSION < 0x20000 hwloc_bitmap_and(pucpus, pu->online_cpuset, pu->allowed_cpuset); #else - hwloc_bitmap_free(pucpus); - pucpus = hwloc_bitmap_dup(pu->cpuset); + hwloc_bitmap_and(pucpus, pu->cpuset, hwloc_topology_get_allowed_cpuset(topo)); #endif hwloc_bitmap_or(res, avail, pucpus); hwloc_bitmap_copy(avail, res); @@ -199,8 +199,7 @@ int prrte_hwloc_base_filter_cpus(hwloc_topology_t topo) #if HWLOC_API_VERSION < 0x20000 hwloc_bitmap_and(pucpus, pu->online_cpuset, pu->allowed_cpuset); #else -
hwloc_bitmap_free(pucpus); - pucpus = hwloc_bitmap_dup(pu->cpuset); + hwloc_bitmap_and(pucpus, pu->cpuset, hwloc_topology_get_allowed_cpuset(topo)); #endif hwloc_bitmap_or(res, avail, pucpus); hwloc_bitmap_copy(avail, res); @@ -274,13 +273,6 @@ static void fill_cache_line_size(void) int prrte_hwloc_base_get_topology(void) { int rc; - prrte_process_name_t wildcard_rank; - char *val = NULL; -#if HWLOC_API_VERSION >= 0x20000 - int rc2, rc3, fd; - uint64_t addr, *aptr, size, *sptr; - char *shmemfile; -#endif prrte_output_verbose(2, prrte_hwloc_base_output, "hwloc:base:get_topology"); @@ -289,114 +281,12 @@ int prrte_hwloc_base_get_topology(void) if (NULL != prrte_hwloc_topology) { return PRRTE_SUCCESS; } - wildcard_rank.jobid = PRRTE_PROC_MY_NAME->jobid; - wildcard_rank.vpid = PRRTE_VPID_WILDCARD; - -#if HWLOC_API_VERSION >= 0x20000 - prrte_output_verbose(2, prrte_hwloc_base_output, - "hwloc:base: looking for topology in shared memory"); - - /* first try to get the shmem link, if available */ - aptr = &addr; - sptr = &size; - PRRTE_MODEX_RECV_VALUE_OPTIONAL(rc, PMIX_HWLOC_SHMEM_FILE, - &wildcard_rank, (void**)&shmemfile, PRRTE_STRING); - PRRTE_MODEX_RECV_VALUE_OPTIONAL(rc2, PMIX_HWLOC_SHMEM_ADDR, - &wildcard_rank, (void**)&aptr, PRRTE_SIZE); - PRRTE_MODEX_RECV_VALUE_OPTIONAL(rc3, PMIX_HWLOC_SHMEM_SIZE, - &wildcard_rank, (void**)&sptr, PRRTE_SIZE); - if (PRRTE_SUCCESS == rc && PRRTE_SUCCESS == rc2 && PRRTE_SUCCESS == rc3) { - if (0 > (fd = open(shmemfile, O_RDONLY))) { - free(shmemfile); - PRRTE_ERROR_LOG(PRRTE_ERR_FILE_OPEN_FAILURE) - return PRRTE_ERR_FILE_OPEN_FAILURE; - } - free(shmemfile); - if (0 != hwloc_shmem_topology_adopt(&prrte_hwloc_topology, fd, - 0, (void*)addr, size, 0)) { - if (4 < prrte_output_get_verbosity(prrte_hwloc_base_output)) { - FILE *file = fopen("/proc/self/maps", "r"); - if (file) { - char line[256]; - prrte_output(0, "Dumping /proc/self/maps"); - - while (fgets(line, sizeof(line), file) != NULL) { - char *end = strchr(line, '\n'); - if (end) 
{ - *end = '\0'; - } - prrte_output(0, "%s", line); - } - fclose(file); - } - } - /* failed to adopt from shmem, fallback to other ways to get the topology */ - } else { - prrte_output_verbose(2, prrte_hwloc_base_output, - "hwloc:base: topology in shared memory"); - topo_in_shmem = true; - return PRRTE_SUCCESS; - } - } -#endif - /* if that isn't available, then try to retrieve - * the xml representation from the PMIx data store */ - prrte_output_verbose(1, prrte_hwloc_base_output, - "hwloc:base[%s:%d] getting topology XML string", - __FILE__, __LINE__); -#if HWLOC_API_VERSION >= 0x20000 - PRRTE_MODEX_RECV_VALUE_IMMEDIATE(rc, PMIX_HWLOC_XML_V2, - &wildcard_rank, &val, PRRTE_STRING); -#else - PRRTE_MODEX_RECV_VALUE_IMMEDIATE(rc, PMIX_HWLOC_XML_V1, - &wildcard_rank, &val, PRRTE_STRING); -#endif - if (rc != PRRTE_SUCCESS) { - /* check the old topo key to keep compatibility with older RMs */ - PRRTE_MODEX_RECV_VALUE_OPTIONAL(rc, PMIX_LOCAL_TOPO, - &wildcard_rank, &val, PRRTE_STRING); - } - if (PRRTE_SUCCESS == rc && NULL != val) { - prrte_output_verbose(1, prrte_hwloc_base_output, - "hwloc:base loading topology from XML"); - /* load the topology */ - if (0 != hwloc_topology_init(&prrte_hwloc_topology)) { - free(val); - return PRRTE_ERROR; - } - if (0 != hwloc_topology_set_xmlbuffer(prrte_hwloc_topology, val, strlen(val))) { - free(val); - hwloc_topology_destroy(prrte_hwloc_topology); - return PRRTE_ERROR; - } - /* since we are loading this from an external source, we have to - * explicitly set a flag so hwloc sets things up correctly - */ - if (0 != prrte_hwloc_base_topology_set_flags(prrte_hwloc_topology, - HWLOC_TOPOLOGY_FLAG_IS_THISSYSTEM, - true)) { - hwloc_topology_destroy(prrte_hwloc_topology); - free(val); - return PRRTE_ERROR; - } - /* now load the topology */ - if (0 != hwloc_topology_load(prrte_hwloc_topology)) { - hwloc_topology_destroy(prrte_hwloc_topology); - free(val); - return PRRTE_ERROR; - } - free(val); - /* filter the cpus thru any default cpu set */ 
- if (PRRTE_SUCCESS != (rc = prrte_hwloc_base_filter_cpus(prrte_hwloc_topology))) { - hwloc_topology_destroy(prrte_hwloc_topology); - return rc; - } - } else if (NULL == prrte_hwloc_base_topo_file) { + if (NULL == prrte_hwloc_base_topo_file) { prrte_output_verbose(1, prrte_hwloc_base_output, "hwloc:base discovering topology"); if (0 != hwloc_topology_init(&prrte_hwloc_topology) || - 0 != prrte_hwloc_base_topology_set_flags(prrte_hwloc_topology, 0, true) || + 0 != prrte_hwloc_base_topology_set_flags(prrte_hwloc_topology, HWLOC_TOPOLOGY_FLAG_WHOLE_SYSTEM, true) || 0 != hwloc_topology_load(prrte_hwloc_topology)) { PRRTE_ERROR_LOG(PRRTE_ERR_NOT_SUPPORTED); return PRRTE_ERR_NOT_SUPPORTED; @@ -450,7 +340,7 @@ int prrte_hwloc_base_set_topology(char *topofile) * explicitly set a flag so hwloc sets things up correctly */ if (0 != prrte_hwloc_base_topology_set_flags(prrte_hwloc_topology, - HWLOC_TOPOLOGY_FLAG_IS_THISSYSTEM, + HWLOC_TOPOLOGY_FLAG_WHOLE_SYSTEM | HWLOC_TOPOLOGY_FLAG_IS_THISSYSTEM, true)) { hwloc_topology_destroy(prrte_hwloc_topology); return PRRTE_ERR_NOT_SUPPORTED; @@ -523,7 +413,9 @@ void prrte_hwloc_base_free_topology(hwloc_topology_t topo) void prrte_hwloc_base_get_local_cpuset(void) { +#if HWLOC_API_VERSION < 0x20000 hwloc_obj_t root; +#endif if (NULL != prrte_hwloc_topology) { if (NULL == prrte_hwloc_my_cpuset) { @@ -535,8 +427,12 @@ void prrte_hwloc_base_get_local_cpuset(void) prrte_hwloc_my_cpuset, HWLOC_CPUBIND_PROCESS) < 0) { /* we are not bound - use the root's available cpuset */ - root = hwloc_get_root_obj(prrte_hwloc_topology); - hwloc_bitmap_copy(prrte_hwloc_my_cpuset, root->cpuset); + #if HWLOC_API_VERSION < 0x20000 + root = hwloc_get_root_obj(prrte_hwloc_topology); + hwloc_bitmap_and(prrte_hwloc_my_cpuset, root->online_cpuset, root->allowed_cpuset); + #else + hwloc_bitmap_copy(prrte_hwloc_my_cpuset, hwloc_topology_get_allowed_cpuset(prrte_hwloc_topology)); + #endif } } } @@ -1743,82 +1639,251 @@ int prrte_hwloc_base_cset2str(char *str, int len, 
return PRRTE_SUCCESS; } + +/* given an input obj somewhere in the hwloc tree, look for a + * numa object that contains it + */ +static hwloc_obj_t find_my_numa(hwloc_obj_t obj) +{ + hwloc_obj_t p, numa; + size_t i; + + p = obj; + while (NULL != p && 0 == p->memory_arity) { + p = p->parent; + } + // p should have either found a level that contains numas or reached NULL + if (NULL == p) { + return NULL; + } + for (i=0; i < p->memory_arity; ++i) { + numa = &(p->memory_first_child[i]); + + if (hwloc_bitmap_isincluded(obj->cpuset, numa->cpuset)) { + return numa; + } + } + return NULL; +} + +/* which level from the set {socket, core, pu} has + * the first descendent underneath the lowest numa level. + * returns MACHINE if there is no numa level + * + * Eg if an hwloc tree had numas containing sockets like this + * <[../..][../..]><[../..][../..]> + * the tree would be + * mach +memory_children: n n + * s s s s + * c c c c c c c c + * pppppppppppppppp + * so this should return SOCKET + */ +static hwloc_obj_type_t +first_type_under_a_numa(hwloc_topology_t topo) +{ + hwloc_obj_t p; + hwloc_obj_type_t type; + + p = hwloc_get_obj_by_type(topo, HWLOC_OBJ_PU, 0); + /* climb the ladder */ + while (NULL != p && 0 == p->memory_arity) { + if (HWLOC_OBJ_PU == p->type|| + HWLOC_OBJ_CORE == p->type || + HWLOC_OBJ_SOCKET == p->type) { + type = p->type; + } + p = p->parent; + } + if (NULL != p && 0 < p->memory_arity) { + return type; + } + + return HWLOC_OBJ_MACHINE; +} + /* - * Make a prettyprint string for a cset in a map format. + * Make a prettyprint string for a cset in a map format with NUMA markers. * Example: [B./..] * Key: [] - signifies socket + * <> - signifies numa * / - divider between cores * . 
- signifies PU a process not bound to * B - signifies PU a process is bound to + * ~ - signifies PU that is disallowed, eg not in our cgroup: */ -int prrte_hwloc_base_cset2mapstr(char *str, int len, + int prrte_hwloc_base_cset2mapstr(char *str, int len, hwloc_topology_t topo, hwloc_cpuset_t cpuset) { char tmp[BUFSIZ]; int core_index, pu_index; - const int stmp = sizeof(tmp) - 1; hwloc_obj_t socket, core, pu; hwloc_obj_t root; prrte_hwloc_topo_data_t *sum; - - str[0] = tmp[stmp] = '\0'; - - /* if the cpuset is all zero, then not bound */ + bool fake_on_first_socket; + bool fake_on_first_core; + hwloc_cpuset_t cpuset_for_socket; + hwloc_cpuset_t cpuset_for_core; + hwloc_obj_t prev_numa = NULL; + hwloc_obj_t cur_numa = NULL; + hwloc_obj_type_t type_under_numa; + bool a_numa_marker_is_open; + + /* if the cpuset is all zero, then not bound */ if (hwloc_bitmap_iszero(cpuset)) { return PRRTE_ERR_NOT_BOUND; } - /* if the cpuset includes all available cpus, then we are unbound */ + str[0] = '\0'; + memset(tmp, 0, BUFSIZ); + + /* if the cpuset includes all available cpus, then we are unbound */ root = hwloc_get_root_obj(topo); - if (NULL != root->userdata) { - sum = (prrte_hwloc_topo_data_t*)root->userdata; - if (NULL == sum->available) { - return PRRTE_ERROR; - } - if (0 != hwloc_bitmap_isincluded(sum->available, cpuset)) { - return PRRTE_ERR_NOT_BOUND; - } + if (NULL == root->userdata) { + /* this should never happen */ + return PRRTE_ERROR; + } + sum = (prrte_hwloc_topo_data_t*)root->userdata; + if (NULL == sum->available) { + /* again, should never happen */ + return PRRTE_ERROR; + } + if (0 != hwloc_bitmap_isincluded(sum->available, cpuset)) { + return PRRTE_ERR_NOT_BOUND; } + /* hwloc trees aren't required to have sockets and cores, + * just a MACHINE at the top and PU at the bottom. The 'fake_*' vars make + * the loops always iterate at least once, even if the initial socket = ... + * etc lookup is NULL. 
So we have to take a little extra care here in + * case we are in a no-socket or no-core scenario. Thankfully, everyone + * still has NUMA regions! */ + type_under_numa = first_type_under_a_numa(topo); + /* Iterate over all existing sockets */ - for (socket = hwloc_get_obj_by_type(topo, HWLOC_OBJ_SOCKET, 0); - NULL != socket; - socket = socket->next_cousin) { + fake_on_first_socket = true; + do { + socket = hwloc_get_obj_by_type(topo, HWLOC_OBJ_SOCKET, 0); + fake_on_first_socket = false; strncat(str, "[", len - strlen(str) - 1); + // if numas contain sockets, example output <[../..][../..]><[../..][../..]> + if (HWLOC_OBJ_SOCKET == type_under_numa) { + prev_numa = cur_numa; + cur_numa = find_my_numa(socket); + if (cur_numa && cur_numa != prev_numa) { + if (a_numa_marker_is_open) { + strncat(str, ">", len - strlen(str) - 1); + } + strncat(str, "<", len - strlen(str) - 1); + a_numa_marker_is_open = true; + } + } + if (NULL != socket) { + strncat(str, "[", len - strlen(str) - 1); + cpuset_for_socket = socket->cpuset; + } else { + cpuset_for_socket = root->cpuset; + } + /* Iterate over all existing cores in this socket */ + fake_on_first_core = true; core_index = 0; - for (core = hwloc_get_obj_inside_cpuset_by_type(topo, - socket->cpuset, - HWLOC_OBJ_CORE, core_index); - NULL != core; - core = hwloc_get_obj_inside_cpuset_by_type(topo, - socket->cpuset, - HWLOC_OBJ_CORE, ++core_index)) { - if (core_index > 0) { + core = hwloc_get_obj_inside_cpuset_by_type(topo, + cpuset_for_socket, + HWLOC_OBJ_CORE, core_index); + while (NULL != core || fake_on_first_core) { + fake_on_first_core = false; + + /* if numas contain cores and are contained by sockets, + * example output [<../..><../..>][<../../../..>] + */ + if (HWLOC_OBJ_CORE == type_under_numa) { + prev_numa = cur_numa; + cur_numa = find_my_numa(core); + if (cur_numa && cur_numa != prev_numa) { + if (a_numa_marker_is_open) { + strncat(str, ">", len - strlen(str) - 1); + } + strncat(str, "<", len - strlen(str) - 1); + 
a_numa_marker_is_open = true; + } + } + + + if (0 < core_index) { strncat(str, "/", len - strlen(str) - 1); } + if (NULL != core) { + cpuset_for_core = core->cpuset; + } else { + cpuset_for_core = cpuset_for_socket; + } + /* Iterate over all existing PUs in this core */ pu_index = 0; - for (pu = hwloc_get_obj_inside_cpuset_by_type(topo, - core->cpuset, - HWLOC_OBJ_PU, pu_index); - NULL != pu; - pu = hwloc_get_obj_inside_cpuset_by_type(topo, - core->cpuset, - HWLOC_OBJ_PU, ++pu_index)) { + pu = hwloc_get_obj_inside_cpuset_by_type(topo, + cpuset_for_core, + HWLOC_OBJ_PU, pu_index); + while (NULL != pu) { + /* if numas contain PU and are contained by cores (seems unlikely) + * example output [<..../....>/<..../....>/<..../....>/<..../....>] + */ + if (HWLOC_OBJ_PU == type_under_numa) { + prev_numa = cur_numa; + cur_numa = find_my_numa(pu); + if (cur_numa && cur_numa != prev_numa) { + if (a_numa_marker_is_open) { + strncat(str, ">", len - strlen(str) - 1); + } + strncat(str, "<", len - strlen(str) - 1); + a_numa_marker_is_open = true; + } + } /* Is this PU in the cpuset? 
*/ if (hwloc_bitmap_isset(cpuset, pu->os_index)) { strncat(str, "B", len - strlen(str) - 1); } else { - strncat(str, ".", len - strlen(str) - 1); + if (hwloc_bitmap_isset(sum->available, pu->os_index)) { + strncat(str, ".", len - strlen(str) - 1); + } else { + strncat(str, "~", len - strlen(str) - 1); + } + } + pu = hwloc_get_obj_inside_cpuset_by_type(topo, + cpuset_for_core, + HWLOC_OBJ_PU, ++pu_index); + } /* end while pu */ + if (HWLOC_OBJ_PU == type_under_numa) { + if (a_numa_marker_is_open) { + strncat(str, ">", len - strlen(str) - 1); + a_numa_marker_is_open = false; } } + core = hwloc_get_obj_inside_cpuset_by_type(topo, + cpuset_for_socket, + HWLOC_OBJ_CORE, ++core_index); + } /* end while core */ + if (HWLOC_OBJ_CORE == type_under_numa) { + if (a_numa_marker_is_open) { + strncat(str, ">", len - strlen(str) - 1); + a_numa_marker_is_open = false; + } + } + if (NULL != socket) { + strncat(str, "]", len - strlen(str) - 1); + socket = socket->next_cousin; + } + } while (NULL != socket || fake_on_first_socket); + + if (HWLOC_OBJ_SOCKET == type_under_numa) { + if (a_numa_marker_is_open) { + strncat(str, ">", len - strlen(str) - 1); + a_numa_marker_is_open = false; } - strncat(str, "]", len - strlen(str) - 1); } return PRRTE_SUCCESS; @@ -2024,7 +2089,7 @@ int prrte_hwloc_get_sorted_numa_list(hwloc_topology_t topo, char* device_name, p char* prrte_hwloc_base_get_topo_signature(hwloc_topology_t topo) { int nnuma, nsocket, nl3, nl2, nl1, ncore, nhwt; - char *sig=NULL, *arch = NULL, *endian; + char *sig=NULL, *arch = NULL, *endian, *pus; hwloc_obj_t obj; unsigned i; @@ -2058,8 +2123,14 @@ char* prrte_hwloc_base_get_topo_signature(hwloc_topology_t topo) endian = "unknown"; #endif - prrte_asprintf(&sig, "%dN:%dS:%dL3:%dL2:%dL1:%dC:%dH:%s:%s", - nnuma, nsocket, nl3, nl2, nl1, ncore, nhwt, arch, endian); + /* print the cpu bitmap itself so we can detect mismatches in the available + * cores across the nodes */ + if (0 != hwloc_bitmap_list_asprintf(&pus, 
hwloc_topology_get_topology_cpuset(topo))) { + pus = strdup("unknown"); + } + prrte_asprintf(&sig, "%dN:%dS:%dL3:%dL2:%dL1:%dC:%dH:%s:%s:%s", + nnuma, nsocket, nl3, nl2, nl1, ncore, nhwt, pus, arch, endian); + free(pus); return sig; } diff --git a/src/mca/plm/base/plm_base_launch_support.c b/src/mca/plm/base/plm_base_launch_support.c index b9648e31cb..c81b9b0ecb 100644 --- a/src/mca/plm/base/plm_base_launch_support.c +++ b/src/mca/plm/base/plm_base_launch_support.c @@ -718,6 +718,8 @@ void prrte_plm_base_daemon_topology(int status, prrte_process_name_t* sender, prrte_rml_tag_t tag, void *cbdata) { hwloc_topology_t topo; + hwloc_obj_t root; + prrte_hwloc_topo_data_t *sum; int rc, idx; char *sig, *coprocessors, **sns; prrte_proc_t *daemon=NULL; @@ -825,6 +827,17 @@ void prrte_plm_base_daemon_topology(int status, prrte_process_name_t* sender, } /* record the final topology */ t->topo = topo; + /* setup the summary data for this topology as we will need + * it when we go to map/bind procs to it */ + root = hwloc_get_root_obj(topo); + root->userdata = (void*)PRRTE_NEW(prrte_hwloc_topo_data_t); + sum = (prrte_hwloc_topo_data_t*)root->userdata; + #if HWLOC_API_VERSION < 0x20000 + sum->available = hwloc_bitmap_alloc(); + hwloc_bitmap_and(sum->available, root->online_cpuset, root->allowed_cpuset); + #else + sum->available = hwloc_bitmap_dup(hwloc_topology_get_allowed_cpuset(topo)); + #endif /* unpack any coprocessors */ idx=1; @@ -1160,6 +1173,7 @@ void prrte_plm_base_daemon_callback(int status, prrte_process_name_t* sender, } else { data = buffer; } + /* unpack the available topology information */ idx=1; if (PRRTE_SUCCESS != (rc = prrte_dss.unpack(data, &topo, &idx, PRRTE_HWLOC_TOPO))) { PRRTE_ERROR_LOG(rc);