diff --git a/src/mca/state/base/state_base_fns.c b/src/mca/state/base/state_base_fns.c index 5d06c299d5..3b16bc737f 100644 --- a/src/mca/state/base/state_base_fns.c +++ b/src/mca/state/base/state_base_fns.c @@ -5,7 +5,7 @@ * and Technology (RIST). All rights reserved. * Copyright (c) 2020 IBM Corporation. All rights reserved. * Copyright (c) 2020 Cisco Systems, Inc. All rights reserved - * Copyright (c) 2021-2024 Nanook Consulting. All rights reserved. + * Copyright (c) 2021-2025 Nanook Consulting All rights reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -598,6 +598,7 @@ void prte_state_base_check_all_complete(int fd, short args, void *cbdata) int32_t i32, *i32ptr; prte_pmix_lock_t lock; prte_app_context_t *app; + pmix_server_pset_t *pst, *pst2; PRTE_HIDE_UNUSED_PARAMS(fd, args); PMIX_ACQUIRE_OBJECT(caddy); @@ -750,6 +751,13 @@ void prte_state_base_check_all_complete(int fd, short args, void *cbdata) PMIX_RELEASE(map); jdata->map = NULL; } + // if this job has apps that named a pset, then remove them + PMIX_LIST_FOREACH_SAFE(pst, pst2, &prte_pmix_server_globals.psets, pmix_server_pset_t) { + if (pst->jdata == jdata) { + pmix_list_remove_item(&prte_pmix_server_globals.psets, &pst->super); + PMIX_RELEASE(pst); + } + } CHECK_ALIVE: /* now check to see if all jobs are done - trigger notification of this jdata diff --git a/src/mca/state/dvm/state_dvm.c b/src/mca/state/dvm/state_dvm.c index 7da013bf46..b0d16ad85c 100644 --- a/src/mca/state/dvm/state_dvm.c +++ b/src/mca/state/dvm/state_dvm.c @@ -4,7 +4,7 @@ * and Technology (RIST). All rights reserved. * Copyright (c) 2020 IBM Corporation. All rights reserved. * Copyright (c) 2020 Cisco Systems, Inc. All rights reserved - * Copyright (c) 2021-2024 Nanook Consulting. All rights reserved. + * Copyright (c) 2021-2025 Nanook Consulting All rights reserved. 
* $COPYRIGHT$ * * Additional copyrights may follow @@ -516,6 +516,7 @@ static void check_complete(int fd, short args, void *cbdata) hwloc_obj_type_t type; hwloc_cpuset_t boundcpus, tgt; bool takeall, sep, *sepptr = &sep; + pmix_server_pset_t *pst, *pst2; PRTE_HIDE_UNUSED_PARAMS(fd, args); PMIX_ACQUIRE_OBJECT(caddy); @@ -812,6 +813,13 @@ static void check_complete(int fd, short args, void *cbdata) PMIX_RELEASE(map); jdata->map = NULL; } + // if this job has apps that named a pset, then remove them + PMIX_LIST_FOREACH_SAFE(pst, pst2, &prte_pmix_server_globals.psets, pmix_server_pset_t) { + if (pst->jdata == jdata) { + pmix_list_remove_item(&prte_pmix_server_globals.psets, &pst->super); + PMIX_RELEASE(pst); + } + } /* if requested, check fd status for leaks */ if (prte_state_base.run_fdcheck) { diff --git a/src/prted/pmix/pmix_server.c b/src/prted/pmix/pmix_server.c index 8735d57198..e7b78efac3 100644 --- a/src/prted/pmix/pmix_server.c +++ b/src/prted/pmix/pmix_server.c @@ -18,7 +18,7 @@ * All rights reserved. * Copyright (c) 2014-2019 Research Organization for Information Science * and Technology (RIST). All rights reserved. - * Copyright (c) 2021-2024 Nanook Consulting. All rights reserved. + * Copyright (c) 2021-2025 Nanook Consulting All rights reserved. * Copyright (c) 2023 Triad National Security, LLC. All rights reserved. 
* $COPYRIGHT$ * @@ -2066,6 +2066,7 @@ PMIX_CLASS_INSTANCE(pmix_server_req_t, static void pscon(pmix_server_pset_t *p) { p->name = NULL; + p->jdata = NULL; p->members = NULL; p->num_members = 0; } @@ -2074,6 +2075,9 @@ static void psdes(pmix_server_pset_t *p) if (NULL != p->name) { free(p->name); } + if (NULL != p->jdata) { + PMIX_RELEASE(p->jdata); + } if (NULL != p->members) { free(p->members); } diff --git a/src/prted/pmix/pmix_server_internal.h b/src/prted/pmix/pmix_server_internal.h index f78e5adba7..012f61687b 100644 --- a/src/prted/pmix/pmix_server_internal.h +++ b/src/prted/pmix/pmix_server_internal.h @@ -18,7 +18,7 @@ * Copyright (c) 2014 Research Organization for Information Science * and Technology (RIST). All rights reserved. * Copyright (c) 2020 IBM Corporation. All rights reserved. - * Copyright (c) 2021-2024 Nanook Consulting. All rights reserved. + * Copyright (c) 2021-2025 Nanook Consulting All rights reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -354,6 +354,7 @@ pmix_server_session_ctrl_fn(const pmix_proc_t *requestor, typedef struct { pmix_list_item_t super; char *name; + prte_job_t *jdata; pmix_proc_t *members; size_t num_members; } pmix_server_pset_t; diff --git a/src/prted/pmix/pmix_server_queries.c b/src/prted/pmix/pmix_server_queries.c index bbbd44ee9f..39b84ea2f7 100644 --- a/src/prted/pmix/pmix_server_queries.c +++ b/src/prted/pmix/pmix_server_queries.c @@ -19,7 +19,7 @@ * Copyright (c) 2014-2019 Research Organization for Information Science * and Technology (RIST). All rights reserved. * Copyright (c) 2020 IBM Corporation. All rights reserved. - * Copyright (c) 2021-2024 Nanook Consulting All rights reserved. + * Copyright (c) 2021-2025 Nanook Consulting All rights reserved. * Copyright (c) 2024 Triad National Security, LLC. All rights * reserved. 
* $COPYRIGHT$ @@ -319,7 +319,10 @@ static void _query(int sd, short args, void *cbdata) } /* add our findings to the results */ PMIX_INFO_LIST_CONVERT(rc, cache, &dry); - if (PMIX_SUCCESS != rc) { + if (PMIX_SUCCESS != rc && PMIX_ERR_EMPTY != rc) { + // if the array is empty, then there is nothing wrong - we + // simply didn't find any running jobs + // otherwise, report the error and abort PMIX_ERROR_LOG(rc); PMIX_INFO_LIST_RELEASE(cache); goto done; @@ -587,18 +590,19 @@ static void _query(int sd, short args, void *cbdata) PMIX_ARGV_APPEND_NOSIZE_COMPAT(&ans, ps->name); } if (NULL == ans) { - ret = PMIX_ERR_NOT_FOUND; - goto done; + tmp = NULL;; } else { tmp = PMIX_ARGV_JOIN_COMPAT(ans, ','); PMIX_ARGV_FREE_COMPAT(ans); ans = NULL; - PMIX_INFO_LIST_ADD(rc, results, PMIX_QUERY_PSET_NAMES, tmp, PMIX_STRING); + } + PMIX_INFO_LIST_ADD(rc, results, PMIX_QUERY_PSET_NAMES, tmp, PMIX_STRING); + if (NULL != tmp) { free(tmp); - if (PMIX_SUCCESS != rc) { - PMIX_ERROR_LOG(rc); - goto done; - } + } + if (PMIX_SUCCESS != rc) { + PMIX_ERROR_LOG(rc); + goto done; } } else if (0 == strcmp(q->keys[n], PMIX_QUERY_PSET_MEMBERSHIP)) { diff --git a/src/prted/pmix/pmix_server_register_fns.c b/src/prted/pmix/pmix_server_register_fns.c index 58b8b296e1..f9061696d6 100644 --- a/src/prted/pmix/pmix_server_register_fns.c +++ b/src/prted/pmix/pmix_server_register_fns.c @@ -19,7 +19,7 @@ * Copyright (c) 2014-2019 Research Organization for Information Science * and Technology (RIST). All rights reserved. * Copyright (c) 2017-2020 IBM Corporation. All rights reserved. - * Copyright (c) 2021-2024 Nanook Consulting. All rights reserved. + * Copyright (c) 2021-2025 Nanook Consulting All rights reserved. * Copyright (c) 2024 Triad National Security, LLC. All rights * reserved. 
* $COPYRIGHT$ @@ -402,6 +402,8 @@ int prte_pmix_server_register_nspace(prte_job_t *jdata) /* register it */ pset = PMIX_NEW(pmix_server_pset_t); pset->name = strdup(tmp); + PMIX_RETAIN(jdata); + pset->jdata = jdata; pmix_list_append(&prte_pmix_server_globals.psets, &pset->super); free(tmp); /* and its membership */