Skip to content

Commit

Permalink
restoring more hwloc --cpu-set behavior
Browse files Browse the repository at this point in the history
I made an older checkin bdd92a7
that partially restored --cpu-set behavior using OMPI 3.x code,
and I was about to restore more code from 3.x, but I'm taking a
different approach here and reverting the previous addition of
3.x code too.

For the (partial) revert:
  commit: bdd92a7.
  title: "-cpu-set as a constraint rather than as a binding"
I'm reverting the functional changes to how the hwloc tree is
iterated over, but keeping the error-detection changes modeled
after 3.x.

For the new approach to --cpu-set:

The --cpu-set option is logically similar to running under a cgroup
just without the OS-level enforcement that comes with a cgroup. For
cgroups the new code loads the topology without the WHOLE_SYSTEM
flag so the tree only contains what's in the cgroup. We can do the
same thing with a hwloc_restrict_topology() call to constrain the
topology to whatever --cpu-set the user enters.  Then all the 3.x
code that skips over unavailable tree entries becomes unnecessary.

Here are the cases that were fixed in bdd92a7,
and which are confirmed to still work with the new approach:

hardware: [..../..../..../....] numbered sequentially 0-15

% mpirun -np 2 --report-bindings --bind-to hwthread \
  --map-by hwthread --cpu-set 6,7 hostname
> MCW rank 0 [..../..B./..../....]
> MCW rank 1 [..../...B/..../....]

% mpirun -np 2 --report-bindings --bind-to hwthread \
  --map-by ppr:2:node:pe=2 --cpu-set 6,7,12,13 hostname
> MCW rank 0 [..../..BB/..../....]
> MCW rank 1 [..../..../..../BB..]

The additional case I was in progress to fix here and which
is also handled by the new approach is
% mpirun -np 2 --report-bindings --bind-to hwthread \
  --map-by ppr:2:node:pe=3 --cpu-set 4,5,9,11,14,15 ./x
> MCW rank 0 [..../BB../.B../....]
> MCW rank 1 [..../..../...B/..BB]

Signed-off-by: Mark Allen <[email protected]>

[ This checkin also includes a squashed commit from Ralph Castain
to fix code style violations. ]
  • Loading branch information
markalle committed Sep 10, 2019
1 parent 66b9140 commit 2233146
Show file tree
Hide file tree
Showing 2 changed files with 21 additions and 39 deletions.
49 changes: 21 additions & 28 deletions opal/mca/hwloc/base/hwloc_base_util.c
Original file line number Diff line number Diff line change
Expand Up @@ -229,6 +229,10 @@ int opal_hwloc_base_filter_cpus(hwloc_topology_t topo)
/* cache this info */
sum->available = avail;

/* Treat --cpu-set analogously to how a cgroup is treated, eg make
* the loaded topolog only contain what is available. */
hwloc_topology_restrict(topo, avail, 0);

return OPAL_SUCCESS;
}

Expand Down Expand Up @@ -318,7 +322,7 @@ int opal_hwloc_base_get_topology(void)
0, (void*)addr, size, 0)) {
if (4 < opal_output_get_verbosity(opal_hwloc_base_framework.framework_output)) {
FILE *file = fopen("/proc/self/maps", "r");
if (file) {
if (NULL != file) {
char line[256];
opal_output(0, "Dumping /proc/self/maps");

Expand Down Expand Up @@ -727,17 +731,20 @@ static hwloc_obj_t df_search(hwloc_topology_t topo,
#if HWLOC_API_VERSION >= 0x20000
return NULL;
#else
if (cache_level != HWLOC_OBJ_CACHE)
if (cache_level != HWLOC_OBJ_CACHE) {
return NULL;
}
search_depth = hwloc_get_cache_type_depth(topo, cache_level, (hwloc_obj_cache_type_t) -1);
#endif
}
if (HWLOC_TYPE_DEPTH_UNKNOWN == search_depth)
if (HWLOC_TYPE_DEPTH_UNKNOWN == search_depth) {
return NULL;
}

if (OPAL_HWLOC_LOGICAL == rtype) {
if (num_objs)
if (NULL != num_objs) {
*num_objs = hwloc_get_nbobjs_by_depth(topo, search_depth);
}
return hwloc_get_obj_by_depth(topo, search_depth, nobj);
}
if (OPAL_HWLOC_PHYSICAL == rtype) {
Expand All @@ -754,45 +761,31 @@ static hwloc_obj_t df_search(hwloc_topology_t topo,
*/
hwloc_obj_t found = NULL;
obj = NULL;
if (num_objs)
if (NULL != num_objs) {
*num_objs = 0;
}
while ((obj = hwloc_get_next_obj_by_depth(topo, search_depth, obj)) != NULL) {
if (num_objs && obj->os_index > *num_objs)
if (num_objs && obj->os_index > *num_objs) {
*num_objs = obj->os_index;
if (obj->os_index == nobj)
}
if (obj->os_index == nobj) {
found = obj;
}
}
return found;
}
if (OPAL_HWLOC_AVAILABLE == rtype) {
// The previous (3.x) code included a check for
// available = opal_hwloc_base_get_available_cpus(topo, start)
// and skipped objs that had hwloc_bitmap_iszero(available)
hwloc_obj_t root;
opal_hwloc_topo_data_t *rdata;
root = hwloc_get_root_obj(topo);
rdata = (opal_hwloc_topo_data_t*)root->userdata;
hwloc_cpuset_t constrained_cpuset;

constrained_cpuset = hwloc_bitmap_alloc();
if (rdata && rdata->available) {
hwloc_bitmap_and(constrained_cpuset, start->cpuset, rdata->available);
} else {
hwloc_bitmap_copy(constrained_cpuset, start->cpuset);
}

unsigned idx = 0;
if (num_objs)
*num_objs = hwloc_get_nbobjs_inside_cpuset_by_depth(topo, constrained_cpuset, search_depth);
if (NULL != num_objs) {
*num_objs = hwloc_get_nbobjs_inside_cpuset_by_depth(topo, start->cpuset, search_depth);
}
obj = NULL;
while ((obj = hwloc_get_next_obj_inside_cpuset_by_depth(topo, constrained_cpuset, search_depth, obj)) != NULL) {
while ((obj = hwloc_get_next_obj_inside_cpuset_by_depth(topo, start->cpuset, search_depth, obj)) != NULL) {
if (idx == nobj) {
hwloc_bitmap_free(constrained_cpuset);
return obj;
}
idx++;
}
hwloc_bitmap_free(constrained_cpuset);
return NULL;
}
return NULL;
Expand Down
11 changes: 0 additions & 11 deletions orte/mca/rmaps/base/rmaps_base_binding.c
Original file line number Diff line number Diff line change
Expand Up @@ -169,20 +169,9 @@ static int bind_generic(orte_job_t *jdata,
trg_obj = NULL;
min_bound = UINT_MAX;
while (NULL != (tmp_obj = hwloc_get_next_obj_by_depth(node->topology->topo, target_depth, tmp_obj))) {
hwloc_obj_t root;
opal_hwloc_topo_data_t *rdata;
root = hwloc_get_root_obj(node->topology->topo);
rdata = (opal_hwloc_topo_data_t*)root->userdata;

if (!hwloc_bitmap_intersects(locale->cpuset, tmp_obj->cpuset)) {
continue;
}
// From the old 3.x code trg_obj was picked via a call to
// opal_hwloc_base_find_min_bound_target_under_obj() which
// skiped over unavailable objects (via opal_hwloc_base_get_npus).
if (rdata && rdata->available && !hwloc_bitmap_intersects(rdata->available, tmp_obj->cpuset))
continue;

data = (opal_hwloc_obj_data_t*)tmp_obj->userdata;
if (NULL == data) {
data = OBJ_NEW(opal_hwloc_obj_data_t);
Expand Down

0 comments on commit 2233146

Please sign in to comment.