From 0ebea598a5023200bcae8a647478e2f297bc1a41 Mon Sep 17 00:00:00 2001 From: Luke Robison Date: Wed, 14 Feb 2024 21:14:29 +0000 Subject: [PATCH] btl/smcuda: Add atomic_wmb() before sm_fifo_write This change fixes https://github.com/open-mpi/ompi/issues/12270 Testing on c7g instance type (arm64) confirms this change eliminates hangs and crashes that were previously observed in 1 in 30 runs of IMB alltoall benchmark. Tested with over 300 runs and no failures. The write memory barrier prevents other CPUs from observing the fifo get updated before they observe the updated contents of the header itself. Without the barrier, uninitialized header contents caused the crashes and invalid data. Signed-off-by: Luke Robison (cherry picked from commit 71f378d28cb89dd80379dbad570849b297594cde) --- opal/mca/btl/smcuda/btl_smcuda_fifo.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/opal/mca/btl/smcuda/btl_smcuda_fifo.h b/opal/mca/btl/smcuda/btl_smcuda_fifo.h index ca1257b5c56..56369dba9b3 100644 --- a/opal/mca/btl/smcuda/btl_smcuda_fifo.h +++ b/opal/mca/btl/smcuda/btl_smcuda_fifo.h @@ -85,6 +85,8 @@ static void add_pending(struct mca_btl_base_endpoint_t *ep, void *data, bool res #define MCA_BTL_SMCUDA_FIFO_WRITE(endpoint_peer, my_smp_rank, peer_smp_rank, hdr, resend, \ retry_pending_sends, rc) \ do { \ + /* memory barrier: ensure writes to the hdr have completed */ \ + opal_atomic_wmb(); \ sm_fifo_t *fifo = &(mca_btl_smcuda_component.fifo[peer_smp_rank][FIFO_MAP(my_smp_rank)]); \ \ if (retry_pending_sends) { \