From 07427c6d89adebfb8682f065c4f7752049f17452 Mon Sep 17 00:00:00 2001 From: Ralph Castain Date: Mon, 11 Dec 2017 13:50:24 -0800 Subject: [PATCH 1/2] Update to PMIx v3.0 PR for cleanup registration If available, have apps use registration capability to cleanup their session directories. Setup capability for vader to register its shared memory file location - let someone familiar with that code do so. Final cleanup to track uid/gid, update the opal/pmix API to pass flags for ignore and leave top directory alone Signed-off-by: Ralph Castain --- opal/mca/btl/vader/btl_vader_component.c | 2 +- opal/mca/pmix/base/base.h | 1 + opal/mca/pmix/pmix.h | 4 + opal/mca/pmix/pmix3x/pmix/VERSION | 4 +- .../pmix/pmix3x/pmix/include/pmix_common.h.in | 11 + .../pmix/src/atomics/sys/powerpc/atomic.h | 4 +- .../pmix3x/pmix/src/include/pmix_globals.c | 262 ++++++++++++++++++ .../pmix3x/pmix/src/include/pmix_globals.h | 46 ++- .../pmix3x/pmix/src/mca/gds/hash/gds_hash.c | 10 +- .../pmix/src/mca/ptl/tcp/ptl_tcp_component.c | 11 + .../src/mca/ptl/usock/ptl_usock_component.c | 5 + .../pmix3x/pmix/src/server/pmix_server_get.c | 1 + .../pmix3x/pmix/src/server/pmix_server_ops.c | 190 +++++++++++++ opal/mca/pmix/pmix3x/pmix/src/util/error.c | 2 + opal/mca/pmix/pmix3x/pmix3x.c | 78 ++++++ opal/mca/pmix/pmix3x/pmix3x.h | 19 +- opal/mca/pmix/pmix3x/pmix3x_client.c | 35 ++- opal/mca/pmix/pmix_types.h | 12 + opal/util/output.c | 58 ++-- orte/mca/ess/base/ess_base_std_app.c | 21 +- 20 files changed, 705 insertions(+), 71 deletions(-) diff --git a/opal/mca/btl/vader/btl_vader_component.c b/opal/mca/btl/vader/btl_vader_component.c index 38cc5fb987a..83246da2009 100644 --- a/opal/mca/btl/vader/btl_vader_component.c +++ b/opal/mca/btl/vader/btl_vader_component.c @@ -15,7 +15,7 @@ * Copyright (c) 2010-2015 Los Alamos National Security, LLC. * All rights reserved. * Copyright (c) 2011 NVIDIA Corporation. All rights reserved. - * Copyright (c) 2014-2015 Intel, Inc. All rights reserved. + * Copyright (c) 2014-2017 Intel, Inc. All rights reserved. * Copyright (c) 2014-2016 Research Organization for Information Science * and Technology (RIST). All rights reserved. * $COPYRIGHT$ diff --git a/opal/mca/pmix/base/base.h b/opal/mca/pmix/base/base.h index d1eeb68e109..e533e026720 100644 --- a/opal/mca/pmix/base/base.h +++ b/opal/mca/pmix/base/base.h @@ -65,6 +65,7 @@ typedef struct { opal_mutex_t mutex; opal_pmix_condition_t cond; volatile bool active; + int status; } opal_pmix_lock_t; diff --git a/opal/mca/pmix/pmix.h b/opal/mca/pmix/pmix.h index 53e04571ab5..4e650cf30bf 100644 --- a/opal/mca/pmix/pmix.h +++ b/opal/mca/pmix/pmix.h @@ -867,6 +867,9 @@ typedef int (*opal_pmix_base_process_monitor_fn_t)(opal_list_t *monitor, opal_list_t *directives, opal_pmix_info_cbfunc_t cbfunc, void *cbdata); +/* register cleanup */ +typedef int (*opal_pmix_base_register_cleanup_fn_t)(char *path, bool ignore, bool jobscope); + /* * the standard public API data structure */ @@ -901,6 +904,7 @@ typedef struct { opal_pmix_base_alloc_fn_t allocate; opal_pmix_base_job_control_fn_t job_control; opal_pmix_base_process_monitor_fn_t monitor; + opal_pmix_base_register_cleanup_fn_t register_cleanup; /* server APIs */ opal_pmix_base_module_server_init_fn_t server_init; opal_pmix_base_module_server_finalize_fn_t server_finalize; diff --git a/opal/mca/pmix/pmix3x/pmix/VERSION b/opal/mca/pmix/pmix3x/pmix/VERSION index 3b0f60b307a..93b4afb0c98 100644 --- a/opal/mca/pmix/pmix3x/pmix/VERSION +++ b/opal/mca/pmix/pmix3x/pmix/VERSION @@ -30,7 +30,7 @@ greek= # command, or with the date (if "git describe" fails) in the form of # "date". -repo_rev=gitf56d30e +repo_rev=git5c0b64b # If tarball_version is not empty, it is used as the version string in # the tarball filename, regardless of all other versions listed in @@ -44,7 +44,7 @@ tarball_version= # The date when this release was created -date="Nov 11, 2017" +date="Dec 11, 2017" # The shared library version of each of PMIx's public libraries. # These versions are maintained in accordance with the "Library diff --git a/opal/mca/pmix/pmix3x/pmix/include/pmix_common.h.in b/opal/mca/pmix/pmix3x/pmix/include/pmix_common.h.in index 897c5f43a3e..de699b2fcd3 100644 --- a/opal/mca/pmix/pmix3x/pmix/include/pmix_common.h.in +++ b/opal/mca/pmix/pmix3x/pmix/include/pmix_common.h.in @@ -462,6 +462,16 @@ typedef uint32_t pmix_rank_t; #define PMIX_JOB_CTRL_PROVISION_IMAGE "pmix.jctrl.pvnimg" // (char*) name of the image that is to be provisioned #define PMIX_JOB_CTRL_PREEMPTIBLE "pmix.jctrl.preempt" // (bool) job can be pre-empted #define PMIX_JOB_CTRL_TERMINATE "pmix.jctrl.term" // (bool) politely terminate the specified procs +#define PMIX_REGISTER_CLEANUP "pmix.reg.cleanup" // (char*) comma-delimited list of files/directories to + // be removed upon process termination +#define PMIX_CLEANUP_RECURSIVE "pmix.clnup.recurse" // (bool) recursively cleanup all subdirectories under the + // specified one(s) +#define PMIX_CLEANUP_EMPTY "pmix.clnup.empty" // (bool) only remove empty subdirectories +#define PMIX_CLEANUP_IGNORE "pmix.clnup.ignore" // (char*) comma-delimited list of filenames that are not + // to be removed +#define PMIX_CLEANUP_LEAVE_TOPDIR "pmix.clnup.lvtop" // (bool) when recursively cleaning subdirs, do not remove + // the top-level directory (the one given in the + // cleanup request) /* monitoring attributes */ #define PMIX_MONITOR_ID "pmix.monitor.id" // (char*) provide a string identifier for this request @@ -584,6 +594,7 @@ typedef int pmix_status_t; #define PMIX_ERR_NOT_IMPLEMENTED -48 #define PMIX_ERR_COMM_FAILURE -49 #define PMIX_ERR_UNPACK_READ_PAST_END_OF_BUFFER -50 // internal-only +#define PMIX_ERR_CONFLICTING_CLEANUP_DIRECTIVES -51 /* define a starting point for v2.x error values */ #define PMIX_ERR_V2X_BASE -100 diff --git a/opal/mca/pmix/pmix3x/pmix/src/atomics/sys/powerpc/atomic.h b/opal/mca/pmix/pmix3x/pmix/src/atomics/sys/powerpc/atomic.h index 9682b9e62af..4e39a43ee33 100644 --- a/opal/mca/pmix/pmix3x/pmix/src/atomics/sys/powerpc/atomic.h +++ b/opal/mca/pmix/pmix3x/pmix/src/atomics/sys/powerpc/atomic.h @@ -84,7 +84,7 @@ void pmix_atomic_rmb(void) static inline void pmix_atomic_wmb(void) { - PMIXRMB(); + PMIXWMB(); } static inline @@ -110,7 +110,7 @@ void pmix_atomic_isync(void) #pragma mc_func pmix_atomic_rmb { "7c2004ac" } /* lwsync */ #pragma reg_killed_by pmix_atomic_rmb /* none */ -#pragma mc_func pmix_atomic_wmb { "7c0006ac" } /* eieio */ +#pragma mc_func pmix_atomic_wmb { "7c2004ac" } /* lwsync */ #pragma reg_killed_by pmix_atomic_wmb /* none */ #endif diff --git a/opal/mca/pmix/pmix3x/pmix/src/include/pmix_globals.c b/opal/mca/pmix/pmix3x/pmix/src/include/pmix_globals.c index 15d56e6268b..9f5487f14f2 100644 --- a/opal/mca/pmix/pmix3x/pmix/src/include/pmix_globals.c +++ b/opal/mca/pmix/pmix3x/pmix/src/include/pmix_globals.c @@ -36,11 +36,27 @@ #endif #include #include PMIX_EVENT_HEADER +#if HAVE_SYS_STAT_H +#include +#endif /* HAVE_SYS_STAT_H */ +#ifdef HAVE_DIRENT_H +#include +#endif /* HAVE_DIRENT_H */ + +#include #include "src/mca/bfrops/bfrops_types.h" #include "src/class/pmix_hash_table.h" #include "src/class/pmix_list.h" #include "src/threads/threads.h" +#include "src/util/argv.h" +#include "src/util/error.h" +#include "src/util/os_path.h" + +static void cleanup(pmix_epilog_t *epi); +static void dirpath_destroy(char *path, pmix_cleanup_dir_t *cd, + pmix_epilog_t *epi); +static bool dirpath_is_empty(const char *path); PMIX_EXPORT pmix_lock_t pmix_global_lock = { .mutex = PMIX_MUTEX_STATIC_INIT, @@ -52,6 +68,36 @@ PMIX_EXPORT PMIX_CLASS_INSTANCE(pmix_namelist_t, pmix_list_item_t, NULL, NULL); +static void cfcon(pmix_cleanup_file_t *p) +{ + p->path = NULL; +} +static void cfdes(pmix_cleanup_file_t *p) +{ + if (NULL != p->path) { + free(p->path); + } +} +PMIX_EXPORT PMIX_CLASS_INSTANCE(pmix_cleanup_file_t, + pmix_list_item_t, + cfcon, cfdes); + +static void cdcon(pmix_cleanup_dir_t *p) +{ + p->path = NULL; + p->recurse = false; + p->leave_topdir = false; +} +static void cddes(pmix_cleanup_dir_t *p) +{ + if (NULL != p->path) { + free(p->path); + } +} +PMIX_EXPORT PMIX_CLASS_INSTANCE(pmix_cleanup_dir_t, + pmix_list_item_t, + cdcon, cddes); + static void nscon(pmix_nspace_t *p) { p->nspace = NULL; @@ -61,6 +107,9 @@ static void nscon(pmix_nspace_t *p) p->ndelivered = 0; PMIX_CONSTRUCT(&p->ranks, pmix_list_t); memset(&p->compat, 0, sizeof(p->compat)); + PMIX_CONSTRUCT(&p->epilog.cleanup_dirs, pmix_list_t); + PMIX_CONSTRUCT(&p->epilog.cleanup_files, pmix_list_t); + PMIX_CONSTRUCT(&p->epilog.ignores, pmix_list_t); } static void nsdes(pmix_nspace_t *p) { @@ -71,6 +120,12 @@ static void nsdes(pmix_nspace_t *p) PMIX_RELEASE(p->jobbkt); } PMIX_LIST_DESTRUCT(&p->ranks); + /* perform any epilog */ + cleanup(&p->epilog); + /* cleanup the epilog */ + PMIX_LIST_DESTRUCT(&p->epilog.cleanup_dirs); + PMIX_LIST_DESTRUCT(&p->epilog.cleanup_files); + PMIX_LIST_DESTRUCT(&p->epilog.ignores); } PMIX_EXPORT PMIX_CLASS_INSTANCE(pmix_nspace_t, pmix_list_item_t, @@ -124,7 +179,11 @@ static void pcon(pmix_peer_t *p) PMIX_CONSTRUCT(&p->send_queue, pmix_list_t); p->send_msg = NULL; p->recv_msg = NULL; + PMIX_CONSTRUCT(&p->epilog.cleanup_dirs, pmix_list_t); + PMIX_CONSTRUCT(&p->epilog.cleanup_files, pmix_list_t); + PMIX_CONSTRUCT(&p->epilog.ignores, pmix_list_t); } + static void pdes(pmix_peer_t *p) { if (0 <= p->sd) { @@ -148,6 +207,12 @@ static void pdes(pmix_peer_t *p) if (NULL != p->recv_msg) { PMIX_RELEASE(p->recv_msg); } + /* perform any epilog */ + cleanup(&p->epilog); + /* cleanup the epilog */ + PMIX_LIST_DESTRUCT(&p->epilog.cleanup_dirs); + PMIX_LIST_DESTRUCT(&p->epilog.cleanup_files); + PMIX_LIST_DESTRUCT(&p->epilog.ignores); } PMIX_EXPORT PMIX_CLASS_INSTANCE(pmix_peer_t, pmix_object_t, @@ -252,3 +317,200 @@ static void qdes(pmix_query_caddy_t *p) PMIX_EXPORT PMIX_CLASS_INSTANCE(pmix_query_caddy_t, pmix_object_t, qcon, qdes); + +static void cleanup(pmix_epilog_t *epi) +{ + pmix_cleanup_file_t *cf; + pmix_cleanup_dir_t *cd; + struct stat statbuf; + int rc; + + /* start with any specified files */ + PMIX_LIST_FOREACH(cf, &epi->cleanup_files, pmix_cleanup_file_t) { + /* check the effective uid/gid of the file and ensure it + * matches that of the peer - we do this to provide at least + * some minimum level of protection */ + rc = stat(cf->path, &statbuf); + if (0 != rc) { + pmix_output_verbose(10, pmix_globals.debug_output, + "File %s failed to stat: %s", cf->path, strerror(rc)); + continue; + } + if (statbuf.st_uid != epi->uid || + statbuf.st_gid != epi->gid) { + pmix_output_verbose(10, pmix_globals.debug_output, + "File %s uid/gid doesn't match: uid %lu(%lu) gid %lu(%lu)", + cf->path, + (unsigned long)statbuf.st_uid, (unsigned long)epi->uid, + (unsigned long)statbuf.st_gid, (unsigned long)epi->gid); + continue; + } + rc = unlink(cf->path); + if (0 != rc) { + pmix_output_verbose(10, pmix_globals.debug_output, + "File %s failed to unlink: %s", cf->path, strerror(rc)); + } + } + + /* now cleanup the directories */ + PMIX_LIST_FOREACH(cd, &epi->cleanup_dirs, pmix_cleanup_dir_t) { + /* check the effective uid/gid of the file and ensure it + * matches that of the peer - we do this to provide at least + * some minimum level of protection */ + rc = stat(cd->path, &statbuf); + if (0 != rc) { + pmix_output_verbose(10, pmix_globals.debug_output, + "Directory %s failed to stat: %s", cd->path, strerror(rc)); + continue; + } + if (statbuf.st_uid != epi->uid || + statbuf.st_gid != epi->gid) { + pmix_output_verbose(10, pmix_globals.debug_output, + "Directory %s uid/gid doesn't match: uid %lu(%lu) gid %lu(%lu)", + cd->path, + (unsigned long)statbuf.st_uid, (unsigned long)epi->uid, + (unsigned long)statbuf.st_gid, (unsigned long)epi->gid); + continue; + } + if ((statbuf.st_mode & S_IRWXU) == S_IRWXU) { + dirpath_destroy(cd->path, cd, epi); + } else { + pmix_output_verbose(10, pmix_globals.debug_output, + "Directory %s lacks permissions", cd->path); + } + } +} + +static void dirpath_destroy(char *path, pmix_cleanup_dir_t *cd, pmix_epilog_t *epi) +{ + int rc; + bool is_dir = false, ignore; + DIR *dp; + struct dirent *ep; + char *filenm; + struct stat buf; + size_t n; + pmix_cleanup_file_t *cf; + + if (NULL == path) { /* protect against error */ + return; + } + + /* if this path is it to be ignored, then do so */ + PMIX_LIST_FOREACH(cf, &epi->ignores, pmix_cleanup_file_t) { + if (0 == strcmp(cf->path, path)) { + return; + } + } + + /* Open up the directory */ + dp = opendir(path); + if (NULL == dp) { + return; + } + + while (NULL != (ep = readdir(dp))) { + /* skip: + * - . and .. + */ + if ((0 == strcmp(ep->d_name, ".")) || + (0 == strcmp(ep->d_name, ".."))) { + continue; + } + + /* Create a pathname. This is not always needed, but it makes + * for cleaner code just to create it here. Note that we are + * allocating memory here, so we need to free it later on. + */ + filenm = pmix_os_path(false, path, ep->d_name, NULL); + + /* if this path is it to be ignored, then do so */ + PMIX_LIST_FOREACH(cf, &epi->ignores, pmix_cleanup_file_t) { + if (0 == strcmp(cf->path, filenm)) { + free(filenm); + continue; + } + } + + /* Check to see if it is a directory */ + is_dir = false; + + rc = stat(filenm, &buf); + if (0 > rc) { + /* Handle a race condition. filenm might have been deleted by an + * other process running on the same node. That typically occurs + * when one task is removing the job_session_dir and an other task + * is still removing its proc_session_dir. + */ + free(filenm); + continue; + } + /* if the uid/gid don't match, then leave it alone */ + if (buf.st_uid != epi->uid || + buf.st_gid != epi->gid) { + free(filenm); + continue; + } + + if (S_ISDIR(buf.st_mode)) { + is_dir = true; + } + + /* + * If not recursively decending, then if we find a directory then fail + * since we were not told to remove it. + */ + if (is_dir && !cd->recurse) { + /* continue removing files */ + free(filenm); + continue; + } + + /* Directories are recursively destroyed */ + if (is_dir && cd->recurse && ((buf.st_mode & S_IRWXU) == S_IRWXU)) { + dirpath_destroy(filenm, cd, epi); + free(filenm); + } else { + /* Files are removed right here */ + unlink(filenm); + free(filenm); + } + } + + /* Done with this directory */ + closedir(dp); + + cleanup: + /* If the directory is empty, then remove it unless we + * were told to leave it */ + if (0 == strcmp(path, cd->path) && cd->leave_topdir) { + return; + } + if (dirpath_is_empty(path)) { + rmdir(path); + } +} + +static bool dirpath_is_empty(const char *path ) +{ + DIR *dp; + struct dirent *ep; + + if (NULL != path) { /* protect against error */ + dp = opendir(path); + if (NULL != dp) { + while ((ep = readdir(dp))) { + if ((0 != strcmp(ep->d_name, ".")) && + (0 != strcmp(ep->d_name, ".."))) { + closedir(dp); + return false; + } + } + closedir(dp); + return true; + } + return false; + } + + return true; +} diff --git a/opal/mca/pmix/pmix3x/pmix/src/include/pmix_globals.h b/opal/mca/pmix/pmix3x/pmix/src/include/pmix_globals.h index 34f12a5dfeb..ab43db2cba8 100644 --- a/opal/mca/pmix/pmix3x/pmix/src/include/pmix_globals.h +++ b/opal/mca/pmix/pmix3x/pmix/src/include/pmix_globals.h @@ -119,6 +119,29 @@ typedef struct pmix_personality_t { pmix_gds_base_module_t *gds; } pmix_personality_t; +/* define a set of structs for tracking post-termination cleanup */ +typedef struct pmix_epilog_t { + uid_t uid; + gid_t gid; + pmix_list_t cleanup_dirs; + pmix_list_t cleanup_files; + pmix_list_t ignores; +} pmix_epilog_t; + +typedef struct { + pmix_list_item_t super; + char *path; +} pmix_cleanup_file_t; +PMIX_CLASS_DECLARATION(pmix_cleanup_file_t); + +typedef struct { + pmix_list_item_t super; + char *path; + bool recurse; + bool leave_topdir; +} pmix_cleanup_dir_t; +PMIX_CLASS_DECLARATION(pmix_cleanup_dir_t); + /* objects used by servers for tracking active nspaces */ typedef struct { pmix_list_item_t super; @@ -133,6 +156,8 @@ typedef struct { * Since servers may support clients from multiple nspaces, * track their respective compatibility modules here */ pmix_personality_t compat; + pmix_epilog_t epilog; // things to do upon termination of all local clients + // from this nspace } pmix_nspace_t; PMIX_CLASS_DECLARATION(pmix_nspace_t); @@ -156,6 +181,17 @@ typedef struct pmix_rank_info_t { } pmix_rank_info_t; PMIX_CLASS_DECLARATION(pmix_rank_info_t); + +/* define a very simple caddy for dealing with pmix_info_t + * objects when transferring portions of arrays */ +typedef struct { + pmix_list_item_t super; + pmix_info_t *info; + size_t ninfo; +} pmix_info_caddy_t; +PMIX_CLASS_DECLARATION(pmix_info_caddy_t); + + /* object for tracking peers - each peer can have multiple * connections. This can occur if the initial app executes * a fork/exec, and the child initiates its own connection @@ -177,6 +213,8 @@ typedef struct pmix_peer_t { pmix_list_t send_queue; /**< list of messages to send */ pmix_ptl_send_t *send_msg; /**< current send in progress */ pmix_ptl_recv_t *recv_msg; /**< current recv in progress */ + pmix_epilog_t epilog; /**< things to be performed upon + termination of this peer */ } pmix_peer_t; PMIX_CLASS_DECLARATION(pmix_peer_t); @@ -305,14 +343,6 @@ typedef struct { } pmix_cb_t; PMIX_CLASS_DECLARATION(pmix_cb_t); -/* define a very simple caddy for dealing with pmix_info_t - * objects when transferring portions of arrays */ -typedef struct { - pmix_list_item_t super; - pmix_info_t *info; -} pmix_info_caddy_t; -PMIX_CLASS_DECLARATION(pmix_info_caddy_t); - #define PMIX_THREADSHIFT(r, c) \ do { \ pmix_event_assign(&((r)->ev), pmix_globals.evbase, \ diff --git a/opal/mca/pmix/pmix3x/pmix/src/mca/gds/hash/gds_hash.c b/opal/mca/pmix/pmix3x/pmix/src/mca/gds/hash/gds_hash.c index 1f60b49dbcf..4d7a2b8549e 100644 --- a/opal/mca/pmix/pmix3x/pmix/src/mca/gds/hash/gds_hash.c +++ b/opal/mca/pmix/pmix3x/pmix/src/mca/gds/hash/gds_hash.c @@ -426,6 +426,7 @@ pmix_status_t hash_cache_job_info(struct pmix_nspace_t *ns, /* an array of data pertaining to a specific proc */ if (PMIX_DATA_ARRAY != info[n].value.type) { PMIX_ERROR_LOG(PMIX_ERR_BAD_PARAM); + rc = PMIX_ERR_TYPE_MISMATCH; goto release; } size = info[n].value.data.darray->size; @@ -433,6 +434,7 @@ pmix_status_t hash_cache_job_info(struct pmix_nspace_t *ns, /* first element of the array must be the rank */ if (0 != strcmp(iptr[0].key, PMIX_RANK) || PMIX_PROC_RANK != iptr[0].value.type) { + rc = PMIX_ERR_TYPE_MISMATCH; PMIX_ERROR_LOG(PMIX_ERR_BAD_PARAM); goto release; } @@ -458,7 +460,7 @@ pmix_status_t hash_cache_job_info(struct pmix_nspace_t *ns, if (NULL == tmp) { PMIX_ERROR_LOG(PMIX_ERR_NOMEM); rc = PMIX_ERR_NOMEM; - return rc; + goto release; } kp2->value->type = PMIX_COMPRESSED_STRING; free(kp2->value->data.string); @@ -493,10 +495,10 @@ pmix_status_t hash_cache_job_info(struct pmix_nspace_t *ns, if (PMIX_STRING_SIZE_CHECK(kp2->value)) { if (pmix_util_compress_string(kp2->value->data.string, &tmp, &len)) { if (NULL == tmp) { - PMIX_ERROR_LOG(PMIX_ERR_NOMEM); - PMIX_RELEASE(kp2); rc = PMIX_ERR_NOMEM; - return rc; + PMIX_ERROR_LOG(rc); + PMIX_RELEASE(kp2); + goto release; } kp2->value->type = PMIX_COMPRESSED_STRING; free(kp2->value->data.string); diff --git a/opal/mca/pmix/pmix3x/pmix/src/mca/ptl/tcp/ptl_tcp_component.c b/opal/mca/pmix/pmix3x/pmix/src/mca/ptl/tcp/ptl_tcp_component.c index 69ae60e55dd..05bb12ef298 100644 --- a/opal/mca/pmix/pmix3x/pmix/src/mca/ptl/tcp/ptl_tcp_component.c +++ b/opal/mca/pmix/pmix3x/pmix/src/mca/ptl/tcp/ptl_tcp_component.c @@ -1161,6 +1161,12 @@ static void connection_handler(int sd, short args, void *cbdata) peer->nptr = nptr; PMIX_RETAIN(info); peer->info = info; + /* update the epilog fields */ + peer->epilog.uid = info->uid; + peer->epilog.gid = info->gid; + /* ensure the nspace epilog is updated too */ + nptr->epilog.uid = info->uid; + nptr->epilog.gid = info->gid; info->proc_cnt++; /* increase number of processes on this rank */ peer->sd = pnd->sd; if (0 > (peer->index = pmix_pointer_array_add(&pmix_server_globals.clients, peer))) { @@ -1399,6 +1405,11 @@ static void process_cbfunc(int sd, short args, void *cbdata) peer->nptr = nptr; PMIX_RETAIN(info); peer->info = info; + /* save the uid/gid */ + peer->epilog.uid = info->uid; + peer->epilog.gid = info->gid; + nptr->epilog.uid = info->uid; + nptr->epilog.gid = info->gid; peer->proc_cnt = 1; peer->sd = pnd->sd; diff --git a/opal/mca/pmix/pmix3x/pmix/src/mca/ptl/usock/ptl_usock_component.c b/opal/mca/pmix/pmix3x/pmix/src/mca/ptl/usock/ptl_usock_component.c index f25d66eba7b..f3c63b9b4ca 100644 --- a/opal/mca/pmix/pmix3x/pmix/src/mca/ptl/usock/ptl_usock_component.c +++ b/opal/mca/pmix/pmix3x/pmix/src/mca/ptl/usock/ptl_usock_component.c @@ -601,6 +601,11 @@ static void connection_handler(int sd, short args, void *cbdata) psave->nptr = nptr; PMIX_RETAIN(info); psave->info = info; + /* save the epilog info */ + psave->epilog.uid = info->uid; + psave->epilog.gid = info->gid; + nptr->epilog.uid = info->uid; + nptr->epilog.gid = info->gid; info->proc_cnt++; /* increase number of processes on this rank */ psave->sd = pnd->sd; if (0 > (psave->index = pmix_pointer_array_add(&pmix_server_globals.clients, psave))) { diff --git a/opal/mca/pmix/pmix3x/pmix/src/server/pmix_server_get.c b/opal/mca/pmix/pmix3x/pmix/src/server/pmix_server_get.c index b50c7ae743f..73c993c8df0 100644 --- a/opal/mca/pmix/pmix3x/pmix/src/server/pmix_server_get.c +++ b/opal/mca/pmix/pmix3x/pmix/src/server/pmix_server_get.c @@ -382,6 +382,7 @@ pmix_status_t pmix_server_get(pmix_buffer_t *buf, } if (PMIX_ERR_NOT_FOUND != rc || NULL == lcd) { /* we have a problem - e.g., out of memory */ + cbfunc(PMIX_ERR_NOT_FOUND, NULL, 0, cbdata, NULL, NULL); PMIX_INFO_FREE(info, ninfo); return rc; } diff --git a/opal/mca/pmix/pmix3x/pmix/src/server/pmix_server_ops.c b/opal/mca/pmix/pmix3x/pmix/src/server/pmix_server_ops.c index 0f3d8f2f41b..eab1b4137a8 100644 --- a/opal/mca/pmix/pmix3x/pmix/src/server/pmix_server_ops.c +++ b/opal/mca/pmix/pmix3x/pmix/src/server/pmix_server_ops.c @@ -2015,6 +2015,13 @@ pmix_status_t pmix_server_job_ctrl(pmix_peer_t *peer, pmix_status_t rc; pmix_query_caddy_t *cd; pmix_proc_t proc; + size_t n; + bool recurse, leave_topdir, duplicate; + pmix_list_t cachedirs, cachefiles; + pmix_epilog_t *epi; + pmix_cleanup_file_t *cf, *cf2; + pmix_cleanup_dir_t *cdir, *cdir2; + struct stat statbuf; pmix_output_verbose(2, pmix_server_globals.base_output, "recvd job control request from client"); @@ -2045,6 +2052,22 @@ pmix_status_t pmix_server_job_ctrl(pmix_peer_t *peer, goto exit; } } + + /* check targets to find proper place to put any epilog requests */ + if (NULL == cd->targets) { + epi = &peer->nptr->epilog; + } else if (1 == cd->ntargets) { + if (0 == strncmp(cd->targets[0].nspace, peer->info->pname.nspace, PMIX_MAX_NSLEN)) { + if (PMIX_RANK_WILDCARD == cd->targets[0].rank) { + epi = &peer->nptr->epilog; + } else { + epi = &peer->epilog; + } + } + } else { + epi = NULL; // do not allow epilog requests + } + /* unpack the number of info objects */ cnt = 1; PMIX_BFROPS_UNPACK(rc, peer, buf, &cd->ninfo, &cnt, PMIX_SIZE); @@ -2063,6 +2086,173 @@ pmix_status_t pmix_server_job_ctrl(pmix_peer_t *peer, } } + /* if this includes a request for post-termination cleanup, we handle + * that request ourselves */ + PMIX_CONSTRUCT(&cachedirs, pmix_list_t); + PMIX_CONSTRUCT(&cachefiles, pmix_list_t); + cnt = 0; // track how many infos are cleanup related + for (n=0; n < cd->ninfo; n++) { + if (0 == strncmp(cd->info[n].key, PMIX_REGISTER_CLEANUP, PMIX_MAX_KEYLEN)) { + ++cnt; + /* see if we allow epilog requests */ + if (NULL == epi) { + /* return an error */ + rc = PMIX_ERR_BAD_PARAM; + goto exit; + } + if (PMIX_STRING != cd->info[n].value.type || + NULL == cd->info[n].value.data.string) { + /* return an error */ + rc = PMIX_ERR_BAD_PARAM; + goto exit; + } + if (0 != stat(cd->info[n].value.data.string, &statbuf)) { + /* return an error */ + rc = PMIX_ERR_BAD_PARAM; + goto exit; + } + if (S_ISDIR(statbuf.st_mode)) { + cdir = PMIX_NEW(pmix_cleanup_dir_t); + if (NULL == cdir) { + /* return an error */ + rc = PMIX_ERR_NOMEM; + goto exit; + } + cdir->path = strdup(cd->info[n].value.data.string); + pmix_list_append(&cachedirs, &cdir->super); + } else { + cf = PMIX_NEW(pmix_cleanup_file_t); + if (NULL == cf) { + /* return an error */ + rc = PMIX_ERR_NOMEM; + goto exit; + } + cf->path = strdup(cd->info[n].value.data.string); + pmix_list_append(&cachefiles, &cf->super); + } + } else if (0 == strncmp(cd->info[n].key, PMIX_CLEANUP_RECURSIVE, PMIX_MAX_KEYLEN)) { + /* see if we allow epilog requests */ + if (NULL == epi) { + /* return an error */ + rc = PMIX_ERR_BAD_PARAM; + goto exit; + } + recurse = PMIX_INFO_TRUE(&cd->info[n]); + ++cnt; + } else if (0 == strncmp(cd->info[n].key, PMIX_CLEANUP_IGNORE, PMIX_MAX_KEYLEN)) { + if (PMIX_STRING != cd->info[n].value.type || + NULL == cd->info[n].value.data.string) { + /* return an error */ + rc = PMIX_ERR_BAD_PARAM; + goto exit; + } + /* see if we allow epilog requests */ + if (NULL == epi) { + /* return an error */ + rc = PMIX_ERR_BAD_PARAM; + goto exit; + } + /* scan the list of ignores for any duplicate */ + duplicate = false; + PMIX_LIST_FOREACH(cf, &epi->ignores, pmix_cleanup_file_t) { + if (0 == strcmp(cf->path, cd->info[n].value.data.string)) { + /* we can drop this request */ + duplicate = true; + break; + } + } + if (!duplicate) { + cf = PMIX_NEW(pmix_cleanup_file_t); + if (NULL == cf) { + /* return an error */ + rc = PMIX_ERR_NOMEM; + goto exit; + } + cf->path = strdup(cd->info[n].value.data.string); + pmix_list_append(&epi->ignores, &cf->super); + } + ++cnt; + } else if (0 == strncmp(cd->info[n].key, PMIX_CLEANUP_LEAVE_TOPDIR, PMIX_MAX_KEYLEN)) { + /* see if we allow epilog requests */ + if (NULL == epi) { + /* return an error */ + rc = PMIX_ERR_BAD_PARAM; + goto exit; + } + leave_topdir = PMIX_INFO_TRUE(&cd->info[n]); + ++cnt; + } + } + if (0 < cnt) { + while (NULL != (cdir = (pmix_cleanup_dir_t*)pmix_list_remove_first(&cachedirs))) { + /* scan the existing list of directories for any duplicate */ + PMIX_LIST_FOREACH(cdir2, &epi->cleanup_dirs, pmix_cleanup_dir_t) { + if (0 == strcmp(cdir2->path, cdir->path)) { + /* duplicate - check for difference in flags per RFC + * precedence rules */ + if (!cdir->recurse && recurse) { + cdir->recurse = recurse; + } + if (!cdir->leave_topdir && leave_topdir) { + cdir->leave_topdir = leave_topdir; + } + PMIX_RELEASE(cdir); + cdir = NULL; + break; + } + } + if (NULL != cdir) { + /* check for conflict with ignore */ + PMIX_LIST_FOREACH(cf, &epi->ignores, pmix_cleanup_file_t) { + if (0 == strcmp(cf->path, cdir->path)) { + /* return an error */ + rc = PMIX_ERR_CONFLICTING_CLEANUP_DIRECTIVES; + PMIX_LIST_DESTRUCT(&cachedirs); + PMIX_LIST_DESTRUCT(&cachefiles); + goto exit; + } + } + cdir->recurse = recurse; + cdir->leave_topdir = leave_topdir; + /* just append it to the end of the list */ + pmix_list_append(&epi->cleanup_dirs, &cdir->super); + } + } + PMIX_DESTRUCT(&cachedirs); + while (NULL != (cf = (pmix_cleanup_file_t*)pmix_list_remove_first(&cachefiles))) { + /* scan the existing list of files for any duplicate */ + PMIX_LIST_FOREACH(cf2, &epi->cleanup_files, pmix_cleanup_file_t) { + if (0 == strcmp(cf2->path, cf->path)) { + PMIX_RELEASE(cf); + cf = NULL; + break; + } + } + if (NULL != cf) { + /* check for conflict with ignore */ + PMIX_LIST_FOREACH(cf2, &epi->ignores, pmix_cleanup_file_t) { + if (0 == strcmp(cf->path, cf2->path)) { + /* return an error */ + rc = PMIX_ERR_CONFLICTING_CLEANUP_DIRECTIVES; + PMIX_LIST_DESTRUCT(&cachedirs); + PMIX_LIST_DESTRUCT(&cachefiles); + goto exit; + } + } + /* just append it to the end of the list */ + pmix_list_append(&epi->cleanup_files, &cf->super); + } + } + PMIX_DESTRUCT(&cachefiles); + if (cnt == cd->ninfo) { + /* nothing more to do */ + if (NULL != cbfunc) { + cbfunc(PMIX_SUCCESS, NULL, 0, cd, NULL, NULL); + } + return PMIX_SUCCESS; + } + } + /* setup the requesting peer name */ (void)strncpy(proc.nspace, peer->info->pname.nspace, PMIX_MAX_NSLEN); proc.rank = peer->info->pname.rank; diff --git a/opal/mca/pmix/pmix3x/pmix/src/util/error.c b/opal/mca/pmix/pmix3x/pmix/src/util/error.c index ae3851da051..ed2e230a387 100644 --- a/opal/mca/pmix/pmix3x/pmix/src/util/error.c +++ b/opal/mca/pmix/pmix3x/pmix/src/util/error.c @@ -171,6 +171,8 @@ PMIX_EXPORT const char* PMIx_Error_string(pmix_status_t errnum) return "PMIX MODEL DECLARED"; case PMIX_ERR_TEMP_UNAVAILABLE: return "PMIX TEMPORARILY UNAVAILABLE"; + case PMIX_ERR_CONFLICTING_CLEANUP_DIRECTIVES: + return "PMIX CONFLICTING CLEANUP DIRECTIVES"; case PMIX_SUCCESS: return "SUCCESS"; default: diff --git a/opal/mca/pmix/pmix3x/pmix3x.c b/opal/mca/pmix/pmix3x/pmix3x.c index 5499d18d0ab..1127be4a29c 100644 --- a/opal/mca/pmix/pmix3x/pmix3x.c +++ b/opal/mca/pmix/pmix3x/pmix3x.c @@ -25,6 +25,9 @@ #ifdef HAVE_UNISTD_H #include #endif +#ifdef HAVE_SYS_STAT_H +#include +#endif #include "opal/dss/dss.h" #include "opal/mca/event/event.h" @@ -71,6 +74,8 @@ static void pmix3x_query(opal_list_t *queries, static void pmix3x_log(opal_list_t *info, opal_pmix_op_cbfunc_t cbfunc, void *cbdata); +static int pmix3x_register_cleanup(char *path, bool ignore, bool jobscope); + const opal_pmix_base_module_t opal_pmix_pmix3x_module = { /* client APIs */ .init = pmix3x_client_init, @@ -101,6 +106,7 @@ const opal_pmix_base_module_t opal_pmix_pmix3x_module = { .log = pmix3x_log, .allocate = pmix3x_allocate, .job_control = pmix3x_job_control, + .register_cleanup = pmix3x_register_cleanup, /* server APIs */ .server_init = pmix3x_server_init, .server_finalize = pmix3x_server_finalize, @@ -333,6 +339,78 @@ void pmix3x_event_hdlr(size_t evhdlr_registration_id, return; } +static void cleanup_cbfunc(pmix_status_t status, + pmix_info_t *info, size_t ninfo, + void *cbdata, + pmix_release_cbfunc_t release_fn, + void *release_cbdata) +{ + opal_pmix_lock_t *lk = (opal_pmix_lock_t*)cbdata; + + OPAL_POST_OBJECT(lk); + + /* let the library release the data and cleanup from + * the operation */ + if (NULL != release_fn) { + release_fn(release_cbdata); + } + + /* release the block */ + lk->status = pmix3x_convert_rc(status); + OPAL_PMIX_WAKEUP_THREAD(lk); +} + +static int pmix3x_register_cleanup(char *path, bool ignore, bool jobscope) +{ + opal_pmix_lock_t lk; + pmix_info_t pinfo[3]; + size_t n, ninfo=0; + pmix_status_t rc; + int ret; + struct stat statbuf; + + OPAL_PMIX_CONSTRUCT_LOCK(&lk); + + if (ignore) { + /* they want this path ignored */ + PMIX_INFO_LOAD(&pinfo[ninfo], PMIX_CLEANUP_IGNORE, path, PMIX_STRING); + ++ninfo; + } else { + /* order cleanup of the provided path */ + PMIX_INFO_LOAD(&pinfo[ninfo], PMIX_REGISTER_CLEANUP, path, PMIX_STRING); + ++ninfo; + /* if the path is a directory, then we need to tell the server + * to recursively clean up */ + if (stat(path, &statbuf) != 0) { + return OPAL_ERR_NOT_FOUND; + } + if (S_ISDIR(statbuf.st_mode)) { + /* recursively cleanup directories */ + PMIX_INFO_LOAD(&pinfo[ninfo], PMIX_CLEANUP_RECURSIVE, NULL, PMIX_BOOL); + ++ninfo; + } + } + + /* if they want this applied to the job, then indicate so */ + if (jobscope) { + rc = PMIx_Job_control_nb(NULL, 0, pinfo, ninfo, cleanup_cbfunc, (void*)&lk); + } else { + /* only applies to us */ + rc = PMIx_Job_control_nb(&mca_pmix_pmix3x_component.myproc, 1, pinfo, ninfo, cleanup_cbfunc, (void*)&lk); + } + if (PMIX_SUCCESS != rc) { + ret = pmix3x_convert_rc(rc); + } else { + OPAL_PMIX_WAIT_THREAD(&lk); + ret = lk.status; + } + OPAL_PMIX_DESTRUCT_LOCK(&lk); + for (n=0; n < ninfo; n++) { + PMIX_INFO_DESTRUCT(&pinfo[n]); + } + return ret; +} + opal_vpid_t pmix3x_convert_rank(pmix_rank_t rank) { switch(rank) { diff --git a/opal/mca/pmix/pmix3x/pmix3x.h b/opal/mca/pmix/pmix3x/pmix3x.h index a5b67f146ed..9227da1d769 100644 --- a/opal/mca/pmix/pmix3x/pmix3x.h +++ b/opal/mca/pmix/pmix3x/pmix3x.h @@ -38,15 +38,16 @@ BEGIN_C_DECLS typedef struct { - opal_pmix_base_component_t super; - opal_list_t jobids; - bool native_launch; - size_t evindex; - opal_list_t events; - int cache_size; - opal_list_t cache; - opal_list_t dmdx; - bool silence_warning; + opal_pmix_base_component_t super; + pmix_proc_t myproc; + opal_list_t jobids; + bool native_launch; + size_t evindex; + opal_list_t events; + int cache_size; + opal_list_t cache; + opal_list_t dmdx; + bool silence_warning; } mca_pmix_pmix3x_component_t; OPAL_DECLSPEC extern mca_pmix_pmix3x_component_t mca_pmix_pmix3x_component; diff --git a/opal/mca/pmix/pmix3x/pmix3x_client.c b/opal/mca/pmix/pmix3x/pmix3x_client.c index 58a7e0b39dd..97343c07143 100644 --- a/opal/mca/pmix/pmix3x/pmix3x_client.c +++ b/opal/mca/pmix/pmix3x/pmix3x_client.c @@ -5,7 +5,7 @@ * and Technology (RIST). All rights reserved. * Copyright (c) 2014-2017 Mellanox Technologies, Inc. * All rights reserved. - * Copyright (c) 2016 Cisco Systems, Inc. All rights reserved. + * Copyright (c) 2016 Cisco Systems, Inc. All rights reserved. * Copyright (c) 2016 Los Alamos National Security, LLC. All rights * reserved. * $COPYRIGHT$ @@ -38,7 +38,6 @@ #include "pmix.h" #include "pmix_tool.h" -static pmix_proc_t my_proc; static char *dbgvalue=NULL; static void errreg_cbfunc (pmix_status_t status, @@ -105,7 +104,7 @@ int pmix3x_client_init(opal_list_t *ilist) } OPAL_PMIX_RELEASE_THREAD(&opal_pmix_base.lock); - rc = PMIx_Init(&my_proc, pinfo, ninfo); + rc = PMIx_Init(&mca_pmix_pmix3x_component.myproc, pinfo, ninfo); if (NULL != pinfo) { PMIX_INFO_FREE(pinfo, ninfo); } @@ -127,20 +126,20 @@ int pmix3x_client_init(opal_list_t *ilist) /* if we were launched by the OMPI RTE, then * the jobid is in a special format - so get it */ mca_pmix_pmix3x_component.native_launch = true; - opal_convert_string_to_jobid(&pname.jobid, my_proc.nspace); + opal_convert_string_to_jobid(&pname.jobid, mca_pmix_pmix3x_component.myproc.nspace); } else { /* we were launched by someone else, so make the * jobid just be the hash of the nspace */ - OPAL_HASH_JOBID(my_proc.nspace, pname.jobid); + OPAL_HASH_JOBID(mca_pmix_pmix3x_component.myproc.nspace, pname.jobid); } /* insert this into our list of jobids - it will be the * first, and so we'll check it first */ job = OBJ_NEW(opal_pmix3x_jobid_trkr_t); - (void)strncpy(job->nspace, my_proc.nspace, PMIX_MAX_NSLEN); + (void)strncpy(job->nspace, mca_pmix_pmix3x_component.myproc.nspace, PMIX_MAX_NSLEN); job->jobid = pname.jobid; opal_list_append(&mca_pmix_pmix3x_component.jobids, &job->super); - pname.vpid = pmix3x_convert_rank(my_proc.rank); + pname.vpid = pmix3x_convert_rank(mca_pmix_pmix3x_component.myproc.rank); opal_proc_set_name(&pname); /* release the thread in case the event handler fires when @@ -221,10 +220,10 @@ int pmix3x_tool_init(opal_list_t *info) /* check to see if our name is being given from above */ if (0 == strcmp(val->key, OPAL_PMIX_TOOL_NSPACE)) { opal_convert_string_to_jobid(&pname.jobid, val->data.string); - (void)strncpy(my_proc.nspace, val->data.string, PMIX_MAX_NSLEN); + (void)strncpy(mca_pmix_pmix3x_component.myproc.nspace, val->data.string, PMIX_MAX_NSLEN); } else if (0 == strcmp(val->key, OPAL_PMIX_TOOL_RANK)) { pname.vpid = val->data.name.vpid; - my_proc.rank = pname.vpid; + mca_pmix_pmix3x_component.myproc.rank = pname.vpid; } } } else { @@ -236,7 +235,7 @@ int pmix3x_tool_init(opal_list_t *info) mca_pmix_pmix3x_component.native_launch = true; OPAL_PMIX_RELEASE_THREAD(&opal_pmix_base.lock); - rc = PMIx_tool_init(&my_proc, pinfo, ninfo); + rc = PMIx_tool_init(&mca_pmix_pmix3x_component.myproc, pinfo, ninfo); if (NULL != pinfo) { PMIX_INFO_FREE(pinfo, ninfo); } @@ -254,13 +253,13 @@ int pmix3x_tool_init(opal_list_t *info) } /* store our jobid and rank */ - opal_convert_string_to_jobid(&pname.jobid, my_proc.nspace); - pname.vpid = pmix3x_convert_rank(my_proc.rank); + opal_convert_string_to_jobid(&pname.jobid, mca_pmix_pmix3x_component.myproc.nspace); + pname.vpid = pmix3x_convert_rank(mca_pmix_pmix3x_component.myproc.rank); /* insert this into our list of jobids - it will be the * first, and so we'll check it first */ job = OBJ_NEW(opal_pmix3x_jobid_trkr_t); - (void)strncpy(job->nspace, my_proc.nspace, PMIX_MAX_NSLEN); + (void)strncpy(job->nspace, mca_pmix_pmix3x_component.myproc.nspace, PMIX_MAX_NSLEN); job->jobid = pname.jobid; opal_list_append(&mca_pmix_pmix3x_component.jobids, &job->super); @@ -399,7 +398,7 @@ int pmix3x_store_local(const opal_process_name_t *proc, opal_value_t *val) p.rank = pmix3x_convert_opalrank(proc->vpid); } else { /* use our name */ - (void)strncpy(p.nspace, my_proc.nspace, PMIX_MAX_NSLEN); + (void)strncpy(p.nspace, mca_pmix_pmix3x_component.myproc.nspace, PMIX_MAX_NSLEN); p.rank = pmix3x_convert_opalrank(OPAL_PROC_MY_NAME.vpid); } @@ -614,7 +613,7 @@ int pmix3x_get(const opal_process_name_t *proc, const char *key, if (0 == strcmp(key, OPAL_PMIX_RANK)) { (*val) = OBJ_NEW(opal_value_t); (*val)->type = OPAL_INT; - (*val)->data.integer = pmix3x_convert_rank(my_proc.rank); + (*val)->data.integer = pmix3x_convert_rank(mca_pmix_pmix3x_component.myproc.rank); OPAL_PMIX_RELEASE_THREAD(&opal_pmix_base.lock); return OPAL_SUCCESS; } @@ -622,7 +621,7 @@ int pmix3x_get(const opal_process_name_t *proc, const char *key, *val = NULL; if (NULL == proc) { - (void)strncpy(p.nspace, my_proc.nspace, PMIX_MAX_NSLEN); + (void)strncpy(p.nspace, mca_pmix_pmix3x_component.myproc.nspace, PMIX_MAX_NSLEN); p.rank = pmix3x_convert_rank(PMIX_RANK_WILDCARD); } else { if (NULL == (nsptr = pmix3x_convert_jobid(proc->jobid))) { @@ -719,7 +718,7 @@ int pmix3x_getnb(const opal_process_name_t *proc, const char *key, if (NULL != cbfunc) { val = OBJ_NEW(opal_value_t); val->type = OPAL_INT; - val->data.integer = pmix3x_convert_rank(my_proc.rank); + val->data.integer = pmix3x_convert_rank(mca_pmix_pmix3x_component.myproc.rank); cbfunc(OPAL_SUCCESS, val, cbdata); } OPAL_PMIX_RELEASE_THREAD(&opal_pmix_base.lock); @@ -733,7 +732,7 @@ int pmix3x_getnb(const opal_process_name_t *proc, const char *key, op->cbdata = cbdata; if (NULL == proc) { - (void)strncpy(op->p.nspace, my_proc.nspace, PMIX_MAX_NSLEN); + (void)strncpy(op->p.nspace, mca_pmix_pmix3x_component.myproc.nspace, PMIX_MAX_NSLEN); op->p.rank = pmix3x_convert_rank(PMIX_RANK_WILDCARD); } else { if (NULL == (nsptr = pmix3x_convert_jobid(proc->jobid))) { diff --git a/opal/mca/pmix/pmix_types.h b/opal/mca/pmix/pmix_types.h index f9c58e7d735..4c18ba7eb48 100644 --- a/opal/mca/pmix/pmix_types.h +++ b/opal/mca/pmix/pmix_types.h @@ -118,6 +118,7 @@ BEGIN_C_DECLS /* information about relative ranks as assigned by the RM */ +#define OPAL_PMIX_CLUSTER_ID "pmix.clid" // (char*) a string name for the cluster this proc is executing on #define OPAL_PMIX_PROCID "pmix.procid" // (opal_process_name_t) process identifier #define OPAL_PMIX_NSPACE "pmix.nspace" // (char*) nspace of a job #define OPAL_PMIX_JOBID "pmix.jobid" // (uint32_t) jobid assigned by scheduler @@ -189,6 +190,7 @@ BEGIN_C_DECLS #define OPAL_PMIX_NOTIFY_COMPLETION "pmix.notecomp" // (bool) notify parent process upon termination of child job #define OPAL_PMIX_RANGE "pmix.range" // (int) opal_pmix_data_range_t value for calls to publish/lookup/unpublish #define OPAL_PMIX_PERSISTENCE "pmix.persist" // (int) opal_pmix_persistence_t value for calls to publish +#define OPAL_PMIX_DATA_SCOPE "pmix.scope" // (pmix_scope_t) scope of the data to be found in a PMIx_Get call #define OPAL_PMIX_OPTIONAL "pmix.optional" // (bool) look only in the immediate data store for the requested value - do // not request data from the server if not found #define OPAL_PMIX_EMBED_BARRIER "pmix.embed.barrier" // (bool) execute a blocking fence operation before executing the @@ -364,6 +366,16 @@ BEGIN_C_DECLS #define OPAL_PMIX_JOB_CTRL_PROVISION_IMAGE "pmix.jctrl.pvnimg" // (char*) name of the image that is to be provisioned #define OPAL_PMIX_JOB_CTRL_PREEMPTIBLE "pmix.jctrl.preempt" // (bool) job can be pre-empted #define OPAL_PMIX_JOB_CTRL_TERMINATE "pmix.jctrl.term" // (bool) politely terminate the specified procs +#define OPAL_PMIX_REGISTER_CLEANUP "pmix.reg.cleanup" // (char*) comma-delimited list of files/directories to + // be removed upon process termination +#define OPAL_PMIX_CLEANUP_RECURSIVE "pmix.clnup.recurse" // (bool) recursively cleanup all subdirectories under the + // specified one(s) +#define OPAL_PMIX_CLEANUP_EMPTY "pmix.clnup.empty" // (bool) only remove empty subdirectories +#define OPAL_PMIX_CLEANUP_IGNORE "pmix.clnup.ignore" // (char*) comma-delimited list of filenames that are not + // to be removed +#define OPAL_PMIX_CLEANUP_LEAVE_TOPDIR "pmix.clnup.lvtop" // (bool) when recursively cleaning subdirs, do not remove + // the top-level directory (the one given in the + // cleanup request) /* monitoring attributes */ diff --git a/opal/util/output.c b/opal/util/output.c index f096a88b478..633901ee593 100644 --- a/opal/util/output.c +++ b/opal/util/output.c @@ -16,6 +16,7 @@ * Copyright (c) 2015 Research Organization for Information Science * and Technology (RIST). All rights reserved. * Copyright (c) 2017 IBM Corporation. All rights reserved. + * Copyright (c) 2017 Intel, Inc. All rights reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -44,6 +45,7 @@ #include "opal/util/output.h" #include "opal/threads/mutex.h" #include "opal/constants.h" +#include "opal/mca/pmix/pmix.h" /* * Private data @@ -505,10 +507,10 @@ void opal_output_finalize(void) output_dir = NULL; if(NULL != temp_str) { - free(temp_str); - temp_str = NULL; - temp_str_len = 0; - } + free(temp_str); + temp_str = NULL; + temp_str_len = 0; + } OBJ_DESTRUCT(&verbose); OBJ_DESTRUCT(&mutex); } @@ -785,18 +787,24 @@ static int open_file(int i) /* Actually open the file */ info[i].ldi_fd = open(filename, flags, 0644); - free(filename); /* release the filename in all cases */ if (-1 == info[i].ldi_fd) { info[i].ldi_used = false; + free(filename); /* release the filename in all cases */ return OPAL_ERR_IN_ERRNO; } /* Make the file be close-on-exec to prevent child inheritance * problems */ if (-1 == fcntl(info[i].ldi_fd, F_SETFD, 1)) { - return OPAL_ERR_IN_ERRNO; + free(filename); /* release the filename in all cases */ + return OPAL_ERR_IN_ERRNO; } + /* register it to be ignored */ + if (NULL != opal_pmix.register_cleanup) { + opal_pmix.register_cleanup(filename, true, false); + } + free(filename); /* release the filename in all cases */ } /* Return successfully even if the session dir did not exist yet; @@ -814,20 +822,20 @@ static void free_descriptor(int output_id) output_desc_t *ldi; if (output_id >= 0 && output_id < OPAL_OUTPUT_MAX_STREAMS && - info[output_id].ldi_used && info[output_id].ldi_enabled) { - ldi = &info[output_id]; + info[output_id].ldi_used && info[output_id].ldi_enabled) { + ldi = &info[output_id]; - if (-1 != ldi->ldi_fd) { - close(ldi->ldi_fd); - } - ldi->ldi_used = false; + if (-1 != ldi->ldi_fd) { + close(ldi->ldi_fd); + } + ldi->ldi_used = false; - /* If we strduped a prefix, suffix, or syslog ident, free it */ + /* If we strduped a prefix, suffix, or syslog ident, free it */ - if (NULL != ldi->ldi_prefix) { - free(ldi->ldi_prefix); - } - ldi->ldi_prefix = NULL; + if (NULL != ldi->ldi_prefix) { + free(ldi->ldi_prefix); + } + ldi->ldi_prefix = NULL; if (NULL != ldi->ldi_suffix) { free(ldi->ldi_suffix); @@ -835,14 +843,14 @@ static void free_descriptor(int output_id) ldi->ldi_suffix = NULL; if (NULL != ldi->ldi_file_suffix) { - free(ldi->ldi_file_suffix); - } - ldi->ldi_file_suffix = NULL; - - if (NULL != ldi->ldi_syslog_ident) { - free(ldi->ldi_syslog_ident); - } - ldi->ldi_syslog_ident = NULL; + free(ldi->ldi_file_suffix); + } + ldi->ldi_file_suffix = NULL; + + if (NULL != ldi->ldi_syslog_ident) { + free(ldi->ldi_syslog_ident); + } + ldi->ldi_syslog_ident = NULL; } } diff --git a/orte/mca/ess/base/ess_base_std_app.c b/orte/mca/ess/base/ess_base_std_app.c index 475304a8e23..22817cbcd9c 100644 --- a/orte/mca/ess/base/ess_base_std_app.c +++ b/orte/mca/ess/base/ess_base_std_app.c @@ -129,7 +129,6 @@ int orte_ess_base_app_setup(bool db_restrict_local) error = "orte_errmgr_base_open"; goto error; } - /* setup my session directory */ if (orte_create_session_dirs) { OPAL_OUTPUT_VERBOSE((2, orte_ess_base_framework.framework_output, @@ -147,6 +146,22 @@ int orte_ess_base_app_setup(bool db_restrict_local) proc-specific session directory. */ opal_output_set_output_file_info(orte_process_info.proc_session_dir, "output-", NULL, NULL); + /* register the directory for cleanup */ + if (NULL != opal_pmix.register_cleanup) { + if (orte_standalone_operation) { + if (OPAL_SUCCESS != (ret = opal_pmix.register_cleanup(orte_process_info.top_session_dir, false, true))) { + ORTE_ERROR_LOG(ret); + error = "register cleanup"; + goto error; + } + } else { + if (OPAL_SUCCESS != (ret = opal_pmix.register_cleanup(orte_process_info.jobfam_session_dir, false, false))) { + ORTE_ERROR_LOG(ret); + error = "register cleanup"; + goto error; + } + } + } } /* Setup the communication infrastructure */ /* Routed system */ @@ -357,7 +372,9 @@ int orte_ess_base_app_finalize(void) (void) mca_base_framework_close(&orte_oob_base_framework); (void) mca_base_framework_close(&orte_state_base_framework); - orte_session_dir_finalize(ORTE_PROC_MY_NAME); + if (NULL == opal_pmix.register_cleanup) { + orte_session_dir_finalize(ORTE_PROC_MY_NAME); + } /* cleanup the process info */ orte_proc_info_finalize(); From 47fd2313abc1cd78f42488ac04384c16ad5703bc Mon Sep 17 00:00:00 2001 From: Nathan Hjelm Date: Wed, 13 Dec 2017 11:10:18 -0700 Subject: [PATCH 2/2] btl/vader: move backing files into /dev/shm on Linux This commit moves the backing files to /dev/shm to avoid limitations that may be set on /tmp. The files are registered with pmix to ensure they are cleaned up after an erroneous exit. Signed-off-by: Nathan Hjelm (cherry picked from commit 48101278160672317ade352365592f56ef3b8977) --- opal/mca/btl/vader/btl_vader.h | 4 +++- opal/mca/btl/vader/btl_vader_component.c | 21 +++++++++++++++++++-- 2 files changed, 22 insertions(+), 3 deletions(-) diff --git a/opal/mca/btl/vader/btl_vader.h b/opal/mca/btl/vader/btl_vader.h index 5290a7faa78..f0e8ef678f5 100644 --- a/opal/mca/btl/vader/btl_vader.h +++ b/opal/mca/btl/vader/btl_vader.h @@ -12,7 +12,7 @@ * All rights reserved. * Copyright (c) 2006-2007 Voltaire. All rights reserved. * Copyright (c) 2009-2010 Cisco Systems, Inc. All rights reserved. - * Copyright (c) 2010-2015 Los Alamos National Security, LLC. All rights + * Copyright (c) 2010-2017 Los Alamos National Security, LLC. All rights * reserved. * Copyright (c) 2015 Mellanox Technologies. All rights reserved. * @@ -136,6 +136,8 @@ struct mca_btl_vader_component_t { opal_list_t pending_endpoints; /**< list of endpoints with pending fragments */ opal_list_t pending_fragments; /**< fragments pending remote completion */ + char *backing_directory; /**< directory to place shared memory backing files */ + /* knem stuff */ #if OPAL_BTL_VADER_HAVE_KNEM unsigned int knem_dma_min; /**< minimum size to enable DMA for knem transfers (0 disables) */ diff --git a/opal/mca/btl/vader/btl_vader_component.c b/opal/mca/btl/vader/btl_vader_component.c index 83246da2009..ccbc0aa4647 100644 --- a/opal/mca/btl/vader/btl_vader_component.c +++ b/opal/mca/btl/vader/btl_vader_component.c @@ -12,7 +12,7 @@ * All rights reserved. * Copyright (c) 2006-2007 Voltaire. All rights reserved. * Copyright (c) 2009-2010 Cisco Systems, Inc. All rights reserved. - * Copyright (c) 2010-2015 Los Alamos National Security, LLC. + * Copyright (c) 2010-2017 Los Alamos National Security, LLC. * All rights reserved. * Copyright (c) 2011 NVIDIA Corporation. All rights reserved. * Copyright (c) 2014-2017 Intel, Inc. All rights reserved. @@ -211,6 +211,19 @@ static int mca_btl_vader_component_register (void) OPAL_INFO_LVL_3, MCA_BASE_VAR_SCOPE_GROUP, &mca_btl_vader_component.single_copy_mechanism); OBJ_RELEASE(new_enum); + if (0 == access ("/dev/shm", W_OK)) { + mca_btl_vader_component.backing_directory = "/dev/shm"; + } else { + mca_btl_vader_component.backing_directory = opal_process_info.proc_session_dir; + } + (void) mca_base_component_var_register (&mca_btl_vader_component.super.btl_version, "backing_directory", + "Directory to place backing files for shared memory communication. " + "This directory should be on a local filesystem such as /tmp or " + "/dev/shm (default: (linux) /dev/shm, (others) session directory)", + MCA_BASE_VAR_TYPE_STRING, NULL, 0, 0, OPAL_INFO_LVL_3, + MCA_BASE_VAR_SCOPE_READONLY, &mca_btl_vader_component.backing_directory); + + #if OPAL_BTL_VADER_HAVE_KNEM /* Currently disabling DMA mode by default; it's not clear that this is useful in all applications and architectures. */ mca_btl_vader_component.knem_dma_min = 0; @@ -491,13 +504,17 @@ static mca_btl_base_module_t **mca_btl_vader_component_init (int *num_btls, if (MCA_BTL_VADER_XPMEM != mca_btl_vader_component.single_copy_mechanism) { char *sm_file; - rc = asprintf(&sm_file, "%s" OPAL_PATH_SEP "vader_segment.%s.%d", opal_process_info.proc_session_dir, + rc = asprintf(&sm_file, "%s" OPAL_PATH_SEP "vader_segment.%s.%d", mca_btl_vader_component.backing_directory, opal_process_info.nodename, MCA_BTL_VADER_LOCAL_RANK); if (0 > rc) { free (btls); return NULL; } + if (NULL != opal_pmix.register_cleanup) { + opal_pmix.register_cleanup (sm_file, false, false); + } + rc = opal_shmem_segment_create (&component->seg_ds, sm_file, component->segment_size); free (sm_file); if (OPAL_SUCCESS != rc) {