From d7686854a2b694824cb7f8d94479d24350887728 Mon Sep 17 00:00:00 2001 From: Kaiming Ouyang Date: Sat, 31 Oct 2020 19:01:59 -0500 Subject: [PATCH 1/3] ch4/posix: decrease shm_limit_counter when freeing comm obj shm_limit_counter is increased when allocating shm memory for coll calls, but it is not properly decreased when freeing comm obj. This commit fixes this bug. --- .../shm/posix/release_gather/release_gather.c | 29 +++++++++++++++++-- .../release_gather/release_gather_types.h | 5 +++- 2 files changed, 31 insertions(+), 3 deletions(-) diff --git a/src/mpid/ch4/shm/posix/release_gather/release_gather.c b/src/mpid/ch4/shm/posix/release_gather/release_gather.c index bcb9df02560..2973e590ea7 100644 --- a/src/mpid/ch4/shm/posix/release_gather/release_gather.c +++ b/src/mpid/ch4/shm/posix/release_gather/release_gather.c @@ -265,8 +265,6 @@ int MPIDI_POSIX_mpi_release_gather_comm_init(MPIR_Comm * comm_ptr, MPIR_Bcast_impl(&fallback, 1, MPI_INT, 0, comm_ptr, &errflag); MPIR_ERR_SETANDJUMP(mpi_errno_ret, MPI_ERR_NO_MEM, "**nomem"); } else { - /* More shm can be created, update the shared counter */ - MPL_atomic_fetch_add_uint64(MPIDI_POSIX_shm_limit_counter, memory_to_be_allocated); fallback = 0; mpi_errno = MPIR_Bcast_impl(&fallback, 1, MPI_INT, 0, comm_ptr, &errflag); if (mpi_errno) { @@ -376,6 +374,10 @@ int MPIDI_POSIX_mpi_release_gather_comm_init(MPIR_Comm * comm_ptr, release_gather_info_ptr->reduce_buf_addr = NULL; release_gather_info_ptr->child_reduce_buf_addr = NULL; + RELEASE_GATHER_FIELD(comm_ptr, flags_shm_size) = flags_shm_size; + /* update the shared counter */ + if (rank == 0) + MPL_atomic_fetch_add_uint64(MPIDI_POSIX_shm_limit_counter, flags_shm_size); mpi_errno = MPIDU_shm_alloc(comm_ptr, flags_shm_size, (void **) &(release_gather_info_ptr->flags_addr), &mapfail_flag); @@ -411,6 +413,12 @@ int MPIDI_POSIX_mpi_release_gather_comm_init(MPIR_Comm * comm_ptr, } if (initialize_bcast_buf) { + RELEASE_GATHER_FIELD(comm_ptr, bcast_shm_size) = + MPIR_CVAR_BCAST_INTRANODE_BUFFER_TOTAL_SIZE; + if (rank == 0) + MPL_atomic_fetch_add_uint64(MPIDI_POSIX_shm_limit_counter, + RELEASE_GATHER_FIELD(comm_ptr, bcast_shm_size)); + /* Allocate the shared memory for bcast buffer */ mpi_errno = MPIDU_shm_alloc(comm_ptr, MPIR_CVAR_BCAST_INTRANODE_BUFFER_TOTAL_SIZE, @@ -432,6 +440,12 @@ int MPIDI_POSIX_mpi_release_gather_comm_init(MPIR_Comm * comm_ptr, RELEASE_GATHER_FIELD(comm_ptr, child_reduce_buf_addr) = MPL_malloc(num_ranks * sizeof(void *), MPL_MEM_COLL); + RELEASE_GATHER_FIELD(comm_ptr, reduce_shm_size) = + MPIR_CVAR_BCAST_INTRANODE_BUFFER_TOTAL_SIZE; + if (rank == 0) + MPL_atomic_fetch_add_uint64(MPIDI_POSIX_shm_limit_counter, + RELEASE_GATHER_FIELD(comm_ptr, reduce_shm_size)); + mpi_errno = MPIDU_shm_alloc(comm_ptr, num_ranks * MPIR_CVAR_REDUCE_INTRANODE_BUFFER_TOTAL_SIZE, (void **) &(RELEASE_GATHER_FIELD(comm_ptr, reduce_buf_addr)), @@ -480,6 +494,11 @@ int MPIDI_POSIX_mpi_release_gather_comm_free(MPIR_Comm * comm_ptr) goto fn_exit; } + /* decrease shm memory limit counter */ + if (comm_ptr->rank == 0) + MPL_atomic_fetch_sub_uint64(MPIDI_POSIX_shm_limit_counter, + RELEASE_GATHER_FIELD(comm_ptr, flags_shm_size)); + /* destroy and detach shared memory used for flags */ mpi_errno = MPIDU_shm_free(RELEASE_GATHER_FIELD(comm_ptr, flags_addr)); if (mpi_errno) { @@ -492,6 +511,9 @@ int MPIDI_POSIX_mpi_release_gather_comm_free(MPIR_Comm * comm_ptr) } if (RELEASE_GATHER_FIELD(comm_ptr, bcast_buf_addr) != NULL) { + if (comm_ptr->rank == 0) + MPL_atomic_fetch_sub_uint64(MPIDI_POSIX_shm_limit_counter, + RELEASE_GATHER_FIELD(comm_ptr, bcast_shm_size)); /* destroy and detach shared memory used for bcast buffer */ mpi_errno = MPIDU_shm_free(RELEASE_GATHER_FIELD(comm_ptr, bcast_buf_addr)); if (mpi_errno) { @@ -505,6 +527,9 @@ int MPIDI_POSIX_mpi_release_gather_comm_free(MPIR_Comm * comm_ptr) } if (RELEASE_GATHER_FIELD(comm_ptr, reduce_buf_addr) != NULL) { + if (comm_ptr->rank == 0) + MPL_atomic_fetch_sub_uint64(MPIDI_POSIX_shm_limit_counter, + RELEASE_GATHER_FIELD(comm_ptr, reduce_shm_size)); /* destroy and detach shared memory used for reduce buffers */ mpi_errno = MPIDU_shm_free(RELEASE_GATHER_FIELD(comm_ptr, reduce_buf_addr)); if (mpi_errno) { diff --git a/src/mpid/ch4/shm/posix/release_gather/release_gather_types.h b/src/mpid/ch4/shm/posix/release_gather/release_gather_types.h index 7e026933dee..4d84baa36bd 100644 --- a/src/mpid/ch4/shm/posix/release_gather/release_gather_types.h +++ b/src/mpid/ch4/shm/posix/release_gather/release_gather_types.h @@ -26,7 +26,10 @@ typedef struct MPIDI_POSIX_release_gather_comm_t { int num_collective_calls; MPIR_Treealgo_tree_t bcast_tree, reduce_tree; - int flags_shm_size; + /* shm mem allocated to this comm */ + uint64_t flags_shm_size; + uint64_t bcast_shm_size; + uint64_t reduce_shm_size; uint64_t gather_state, release_state; void *flags_addr, *bcast_buf_addr, *reduce_buf_addr; From a0b46e9da2fd13de467edef7a64d2275b8cdf54e Mon Sep 17 00:00:00 2001 From: Kaiming Ouyang Date: Thu, 5 Nov 2020 09:40:55 -0600 Subject: [PATCH 2/3] ch4/posix: set coll fallback as silent when shm mem exceeds predefined threshold, coll calls need to fallback to send/recv based implementation. Current main branch sets fallback action as error which aborts program. This behavior will cause improper exit, and this commit change fallback action to silent to solve this issue. --- src/mpi/coll/src/coll_impl.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/mpi/coll/src/coll_impl.c b/src/mpi/coll/src/coll_impl.c index 0af50c61749..e9d9e9b29c0 100644 --- a/src/mpi/coll/src/coll_impl.c +++ b/src/mpi/coll/src/coll_impl.c @@ -27,7 +27,7 @@ - name : MPIR_CVAR_COLLECTIVE_FALLBACK category : COLLECTIVE type : enum - default : error + default : silent class : none verbosity : MPI_T_VERBOSITY_USER_BASIC scope : MPI_T_SCOPE_ALL_EQ From b40f2b819281c6c490fb110f6970464443008884 Mon Sep 17 00:00:00 2001 From: Kaiming Ouyang Date: Fri, 6 Nov 2020 15:48:05 -0600 Subject: [PATCH 3/3] ch4/posix: set MPIDI_POSIX_shm_limit_counter as dummy counter in coll finalize MPIDI_POSIX_global.shm_ptr is freed but will be referenced during builtin comm free; here we set MPIDI_POSIX_shm_limit_counter as dummy counter to avoid segmentation fault --- src/mpid/ch4/shm/posix/posix_init.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/src/mpid/ch4/shm/posix/posix_init.c b/src/mpid/ch4/shm/posix/posix_init.c index 91e98d85cf3..f25c891262e 100644 --- a/src/mpid/ch4/shm/posix/posix_init.c +++ b/src/mpid/ch4/shm/posix/posix_init.c @@ -257,6 +257,7 @@ int MPIDI_POSIX_coll_init(int rank, int size) int MPIDI_POSIX_coll_finalize(void) { int mpi_errno = MPI_SUCCESS; + static MPL_atomic_uint64_t MPIDI_POSIX_dummy_shm_limit_counter; MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_MPIDI_POSIX_COLL_FINALIZE); MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_MPIDI_POSIX_COLL_FINALIZE); @@ -265,6 +266,11 @@ int MPIDI_POSIX_coll_finalize(void) * per node for intra-node collectives */ mpi_errno = MPIDU_Init_shm_free(MPIDI_POSIX_global.shm_ptr); + /* MPIDI_POSIX_global.shm_ptr is freed but will be referenced during builtin + * comm free; here we set MPIDI_POSIX_shm_limit_counter as dummy counter to + * avoid segmentation fault */ + MPIDI_POSIX_shm_limit_counter = &MPIDI_POSIX_dummy_shm_limit_counter; + if (MPIDI_global.shm.posix.csel_root) { mpi_errno = MPIR_Csel_free(MPIDI_global.shm.posix.csel_root); MPIR_ERR_CHECK(mpi_errno);