From 223314630156b0b417821f0f0131ec8ddc28a977 Mon Sep 17 00:00:00 2001 From: Mark Allen Date: Thu, 18 Jul 2019 15:38:50 -0400 Subject: [PATCH] restoring more hwloc --cpu-set behavior I made an older checkin bdd92a7a643f40e76dfe7a332f4ae98b54702f9c that partially restored --cpu-set behavior using OMPI 3.x code, and I was about to restore more code from 3.x, but I'm taking a different approach here and reverting the previous addition of 3.x code too. For the (partial) revert: commit: bdd92a7a643f40e76dfe7a332f4ae98b54702f9c. title: "-cpu-set as a constraint rather than as a binding" I'm reverting the functional changes to how the hwloc tree is iterated over, but keeping the error-detection changes modeled after 3.x. For the new approach to --cpu-set: The --cpu-set option is logically similar to running under a cgroup just without the OS-level enforcement that comes with a cgroup. For cgroups the new code loads the topology without the WHOLE_SYSTEM flag so the tree only contains what's in the cgroup. We can do the same thing with a hwloc_restrict_topology() call to constrain the topology to whatever --cpu-set the user enters. Then all the 3.x code that skips over unavailable tree entries becomes unnecessary. Here are the cases that were fixed in bdd92a7a643f40e76dfe7a332f4ae98b54702f9c, and which are confirmed to still work with the new approach: hardware: [..../..../..../....] numbered sequentially 0-15 % mpirun -np 2 --report-bindings --bind-to hwthread \ --map-by hwthread --cpu-set 6,7 hostname > MCW rank 0 [..../..B./..../....] > MCW rank 1 [..../...B/..../....] % mpirun -np 2 --report-bindings --bind-to hwthread \ --map-by ppr:2:node:pe=2 --cpu-set 6,7,12,13 hostname > MCW rank 0 [..../..BB/..../....] > MCW rank 1 [..../..../..../BB..] The additional case I was in progress to fix here and which is also handled by the new approach is % mpirun -np 2 --report-bindings --bind-to hwthread \ --map-by ppr:2:node:pe=3 --cpu-set 4,5,9,11,14,15 ./x > MCW rank 0 [..../BB../.B../....] > MCW rank 1 [..../..../...B/..BB] Signed-off-by: Mark Allen [ This checkin also includes a squashed commit from Ralph Castain to fix code style violations. ] --- opal/mca/hwloc/base/hwloc_base_util.c | 49 ++++++++++-------------- orte/mca/rmaps/base/rmaps_base_binding.c | 11 ------ 2 files changed, 21 insertions(+), 39 deletions(-) diff --git a/opal/mca/hwloc/base/hwloc_base_util.c b/opal/mca/hwloc/base/hwloc_base_util.c index 3614ce761c8..bf47ce05fca 100644 --- a/opal/mca/hwloc/base/hwloc_base_util.c +++ b/opal/mca/hwloc/base/hwloc_base_util.c @@ -229,6 +229,10 @@ int opal_hwloc_base_filter_cpus(hwloc_topology_t topo) /* cache this info */ sum->available = avail; + /* Treat --cpu-set analogously to how a cgroup is treated, eg make + * the loaded topolog only contain what is available. */ + hwloc_topology_restrict(topo, avail, 0); + return OPAL_SUCCESS; } @@ -318,7 +322,7 @@ int opal_hwloc_base_get_topology(void) 0, (void*)addr, size, 0)) { if (4 < opal_output_get_verbosity(opal_hwloc_base_framework.framework_output)) { FILE *file = fopen("/proc/self/maps", "r"); - if (file) { + if (NULL != file) { char line[256]; opal_output(0, "Dumping /proc/self/maps"); @@ -727,17 +731,20 @@ static hwloc_obj_t df_search(hwloc_topology_t topo, #if HWLOC_API_VERSION >= 0x20000 return NULL; #else - if (cache_level != HWLOC_OBJ_CACHE) + if (cache_level != HWLOC_OBJ_CACHE) { return NULL; + } search_depth = hwloc_get_cache_type_depth(topo, cache_level, (hwloc_obj_cache_type_t) -1); #endif } - if (HWLOC_TYPE_DEPTH_UNKNOWN == search_depth) + if (HWLOC_TYPE_DEPTH_UNKNOWN == search_depth) { return NULL; + } if (OPAL_HWLOC_LOGICAL == rtype) { - if (num_objs) + if (NULL != num_objs) { *num_objs = hwloc_get_nbobjs_by_depth(topo, search_depth); + } return hwloc_get_obj_by_depth(topo, search_depth, nobj); } if (OPAL_HWLOC_PHYSICAL == rtype) { @@ -754,45 +761,31 @@ static hwloc_obj_t df_search(hwloc_topology_t topo, */ hwloc_obj_t found = NULL; obj = NULL; - if (num_objs) + if (NULL != num_objs) { *num_objs = 0; + } while ((obj = hwloc_get_next_obj_by_depth(topo, search_depth, obj)) != NULL) { - if (num_objs && obj->os_index > *num_objs) + if (num_objs && obj->os_index > *num_objs) { *num_objs = obj->os_index; - if (obj->os_index == nobj) + } + if (obj->os_index == nobj) { found = obj; + } } return found; } if (OPAL_HWLOC_AVAILABLE == rtype) { -// The previous (3.x) code included a check for -// available = opal_hwloc_base_get_available_cpus(topo, start) -// and skipped objs that had hwloc_bitmap_iszero(available) - hwloc_obj_t root; - opal_hwloc_topo_data_t *rdata; - root = hwloc_get_root_obj(topo); - rdata = (opal_hwloc_topo_data_t*)root->userdata; - hwloc_cpuset_t constrained_cpuset; - - constrained_cpuset = hwloc_bitmap_alloc(); - if (rdata && rdata->available) { - hwloc_bitmap_and(constrained_cpuset, start->cpuset, rdata->available); - } else { - hwloc_bitmap_copy(constrained_cpuset, start->cpuset); - } - unsigned idx = 0; - if (num_objs) - *num_objs = hwloc_get_nbobjs_inside_cpuset_by_depth(topo, constrained_cpuset, search_depth); + if (NULL != num_objs) { + *num_objs = hwloc_get_nbobjs_inside_cpuset_by_depth(topo, start->cpuset, search_depth); + } obj = NULL; - while ((obj = hwloc_get_next_obj_inside_cpuset_by_depth(topo, constrained_cpuset, search_depth, obj)) != NULL) { + while ((obj = hwloc_get_next_obj_inside_cpuset_by_depth(topo, start->cpuset, search_depth, obj)) != NULL) { if (idx == nobj) { - hwloc_bitmap_free(constrained_cpuset); return obj; } idx++; } - hwloc_bitmap_free(constrained_cpuset); return NULL; } return NULL; diff --git a/orte/mca/rmaps/base/rmaps_base_binding.c b/orte/mca/rmaps/base/rmaps_base_binding.c index 1174dcf9482..52c060d930c 100644 --- a/orte/mca/rmaps/base/rmaps_base_binding.c +++ b/orte/mca/rmaps/base/rmaps_base_binding.c @@ -169,20 +169,9 @@ static int bind_generic(orte_job_t *jdata, trg_obj = NULL; min_bound = UINT_MAX; while (NULL != (tmp_obj = hwloc_get_next_obj_by_depth(node->topology->topo, target_depth, tmp_obj))) { - hwloc_obj_t root; - opal_hwloc_topo_data_t *rdata; - root = hwloc_get_root_obj(node->topology->topo); - rdata = (opal_hwloc_topo_data_t*)root->userdata; - if (!hwloc_bitmap_intersects(locale->cpuset, tmp_obj->cpuset)) { continue; } -// From the old 3.x code trg_obj was picked via a call to -// opal_hwloc_base_find_min_bound_target_under_obj() which -// skiped over unavailable objects (via opal_hwloc_base_get_npus). - if (rdata && rdata->available && !hwloc_bitmap_intersects(rdata->available, tmp_obj->cpuset)) - continue; - data = (opal_hwloc_obj_data_t*)tmp_obj->userdata; if (NULL == data) { data = OBJ_NEW(opal_hwloc_obj_data_t);