From 4a4fad091dc80a4ec05a2458ffb88454cc702073 Mon Sep 17 00:00:00 2001 From: George Bosilca Date: Sat, 30 Sep 2023 19:19:36 +0000 Subject: [PATCH] Prevent race condition in accelerator copies management Prevent the owner device from repurposing a GPU copy while another GPU is planning to use it as a candidate for a d2d transfer. Signed-off-by: George Bosilca --- parsec/mca/device/cuda/device_cuda_module.c | 132 ++++++++++++-------- 1 file changed, 77 insertions(+), 55 deletions(-) diff --git a/parsec/mca/device/cuda/device_cuda_module.c b/parsec/mca/device/cuda/device_cuda_module.c index c3822e130..d5862634e 100644 --- a/parsec/mca/device/cuda/device_cuda_module.c +++ b/parsec/mca/device/cuda/device_cuda_module.c @@ -875,7 +875,7 @@ parsec_gpu_data_reserve_device_space( parsec_device_cuda_module_t* cuda_device, parsec_data_t* master, *oldmaster; const parsec_flow_t *flow; int i, j, data_avail_epoch = 0; - parsec_gpu_data_copy_t *gpu_mem_lru_cycling; + parsec_gpu_data_copy_t *gpu_mem_lru_cycling = NULL; parsec_device_gpu_module_t *gpu_device = &cuda_device->super; #if defined(PARSEC_DEBUG_NOISIER) @@ -888,7 +888,6 @@ parsec_gpu_data_reserve_device_space( parsec_device_cuda_module_t* cuda_device, * corresponding data on the GPU available. */ for( i = 0; i < this_task->task_class->nb_flows; i++ ) { - gpu_mem_lru_cycling = NULL; flow = gpu_task->flow[i]; assert( flow && (flow->flow_index == i) ); @@ -902,7 +901,6 @@ parsec_gpu_data_reserve_device_space( parsec_device_cuda_module_t* cuda_device, if (this_task->data[i].data_in == NULL) continue; - /* BEWARE: here we are using the CPU copy as an input */ master = this_task->data[i].data_in->original; parsec_atomic_lock(&master->lock); gpu_elem = PARSEC_DATA_GET_COPY(master, gpu_device->super.device_index); @@ -916,10 +914,10 @@ parsec_gpu_data_reserve_device_space( parsec_device_cuda_module_t* cuda_device, flow->name, i, gpu_elem, gpu_elem->data_transfer_status == PARSEC_DATA_STATUS_UNDER_TRANSFER ? " [in transfer]" : ""); if ( gpu_elem->data_transfer_status == PARSEC_DATA_STATUS_UNDER_TRANSFER ) { - /* We might want to do something special if the data is under transfer, but in the current - * version we don't need to because an event is always generated for the push_in of each - * task on the unique push_in stream. - */ + /* We might want to do something special if the data is under transfer, but in the current + * version we don't need to because an event is always generated for the push_in of each + * task on the unique push_in stream. + */ } parsec_atomic_unlock(&master->lock); continue; @@ -932,27 +930,28 @@ parsec_gpu_data_reserve_device_space( parsec_device_cuda_module_t* cuda_device, gpu_device->super.name, task_name, gpu_elem, gpu_task->flow_nb_elts[i], gpu_elem->super.super.obj_reference_count, master); gpu_elem->flags = PARSEC_DATA_FLAG_PARSEC_OWNED | PARSEC_DATA_FLAG_PARSEC_MANAGED; - malloc_data: + malloc_data: assert(0 != (gpu_elem->flags & PARSEC_DATA_FLAG_PARSEC_OWNED) ); gpu_elem->device_private = zone_malloc(gpu_device->memory, gpu_task->flow_nb_elts[i]); if( NULL == gpu_elem->device_private ) { #endif - find_another_data: + find_another_data: + temp_loc[i] = NULL; /* Look for a data_copy to free */ lru_gpu_elem = (parsec_gpu_data_copy_t*)parsec_list_pop_front(&gpu_device->gpu_mem_lru); if( NULL == lru_gpu_elem ) { /* We can't find enough room on the GPU. Insert the tiles in the begining of - * the LRU (in order to be reused asap) and return without scheduling the task. 
+ * the LRU (in order to be reused asap) and return with error. */ - release_temp_and_return: + release_temp_and_return: #if defined(PARSEC_DEBUG_NOISIER) PARSEC_DEBUG_VERBOSE(2, parsec_gpu_output_stream, "GPU[%s]:%s:\tRequest space on GPU failed for flow %s index %d/%d for task %s", gpu_device->super.name, task_name, flow->name, i, this_task->task_class->nb_flows, task_name ); #endif /* defined(PARSEC_DEBUG_NOISIER) */ - for( j = 0; j < i; j++ ) { + for( j = 0; j <= i; j++ ) { /* This flow could be a control flow */ if( NULL == temp_loc[j] ) continue; /* This flow could be non-parsec-owned, in which case we can't reclaim it */ @@ -970,7 +969,6 @@ parsec_gpu_data_reserve_device_space( parsec_device_cuda_module_t* cuda_device, parsec_atomic_unlock(&master->lock); return PARSEC_HOOK_RETURN_AGAIN; } - PARSEC_LIST_ITEM_SINGLETON(lru_gpu_elem); PARSEC_DEBUG_VERBOSE(20, parsec_gpu_output_stream, "GPU[%s]:%s: Evaluate LRU-retrieved CUDA copy %p [ref_count %d] original %p", @@ -978,6 +976,14 @@ parsec_gpu_data_reserve_device_space( parsec_device_cuda_module_t* cuda_device, lru_gpu_elem, lru_gpu_elem->super.super.obj_reference_count, lru_gpu_elem->original); + if( gpu_mem_lru_cycling == lru_gpu_elem ) { + PARSEC_DEBUG_VERBOSE(2, parsec_gpu_output_stream, + "GPU[%s]: Cycle detected on allocating memory for %s", + gpu_device->super.name, task_name); + temp_loc[i] = lru_gpu_elem; /* save it such that it gets pushed back into the LRU */ + goto release_temp_and_return; + } + /* If there are pending readers, let the gpu_elem loose. This is a weak coordination * protocol between here and the parsec_gpu_data_stage_in, where the readers don't necessarily * always remove the data from the LRU. @@ -987,7 +993,12 @@ parsec_gpu_data_reserve_device_space( parsec_device_cuda_module_t* cuda_device, "GPU[%s]:%s: Drop LRU-retrieved CUDA copy %p [readers %d, ref_count %d] original %p", gpu_device->super.name, task_name, lru_gpu_elem, lru_gpu_elem->readers, lru_gpu_elem->super.super.obj_reference_count, lru_gpu_elem->original); - goto find_another_data; // TODO: add an assert of some sort to check for leaks here? + /* We do not add the copy back into the LRU. This means that for now this copy is not + * tracked via the LRU (despite being only used in read mode) and instead is dangling + * on other tasks. Thus, it will eventually need to be added back into the LRU when + * current task using it completes. + */ + goto find_another_data; } /* It's also possible that the ref_count of that element is bigger than 1 * In that case, it's because some task completion did not execute yet, and @@ -997,29 +1008,20 @@ parsec_gpu_data_reserve_device_space( parsec_device_cuda_module_t* cuda_device, /* It's also possible (although unlikely) that we livelock here: * if gpu_mem_lru has *only* elements with readers == 0 but * ref_count > 1, then we might pop/push forever. We save the - * earliest element found and if we see it again it means we - * run over the entire list without finding a suitable replacement. - * We need to make progress on something else. This remains safe for as long as the - * LRU is only modified by a single thread (in this case the current thread). - */ + * earliest element found and if we see it again it means we + * run over the entire list without finding a suitable replacement. + * We need to make progress on something else. This remains safe for as long as the + * LRU is only modified by a single thread (in this case the current thread). 
+ */ PARSEC_DEBUG_VERBOSE(20, parsec_gpu_output_stream, "GPU[%s]:%s: Push back LRU-retrieved CUDA copy %p [readers %d, ref_count %d] original %p", gpu_device->super.name, task_name, lru_gpu_elem, lru_gpu_elem->readers, lru_gpu_elem->super.super.obj_reference_count, lru_gpu_elem->original); assert(0 != (lru_gpu_elem->flags & PARSEC_DATA_FLAG_PARSEC_OWNED) ); parsec_list_push_back(&gpu_device->gpu_mem_lru, &lru_gpu_elem->super); - if( gpu_mem_lru_cycling == lru_gpu_elem ) { - PARSEC_DEBUG_VERBOSE(2, parsec_gpu_output_stream, - "GPU[%s]: Cycle detected on allocating memory for %s", - gpu_device->super.name, task_name); - goto release_temp_and_return; - } - if( NULL == gpu_mem_lru_cycling ) { - gpu_mem_lru_cycling = lru_gpu_elem; - } - goto find_another_data; + gpu_mem_lru_cycling = (NULL == gpu_mem_lru_cycling) ? lru_gpu_elem : gpu_mem_lru_cycling; /* update the cycle detector */ + goto find_another_data; } - /* Make sure the new GPU element is clean and ready to be used */ assert( master != lru_gpu_elem->original ); if ( NULL != lru_gpu_elem->original ) { @@ -1031,16 +1033,8 @@ parsec_gpu_data_reserve_device_space( parsec_device_cuda_module_t* cuda_device, * need to protect all accesses to gpu_mem_lru with the locked version */ assert(0 != (lru_gpu_elem->flags & PARSEC_DATA_FLAG_PARSEC_OWNED) ); parsec_list_push_back(&gpu_device->gpu_mem_lru, &lru_gpu_elem->super); - if( gpu_mem_lru_cycling == lru_gpu_elem ) { - PARSEC_DEBUG_VERBOSE(2, parsec_gpu_output_stream, - "GPU[%s]: Cycle detected on allocating memory for %s", - gpu_device->super.name, task_name); - goto release_temp_and_return; - } - if( NULL == gpu_mem_lru_cycling ) { - gpu_mem_lru_cycling = lru_gpu_elem; - } - goto find_another_data; + gpu_mem_lru_cycling = (NULL == gpu_mem_lru_cycling) ? lru_gpu_elem : gpu_mem_lru_cycling; /* update the cycle detector */ + goto find_another_data; } for( j = 0; j < i; j++ ) { if( NULL == this_task->data[j].data_in ) continue; @@ -1052,19 +1046,31 @@ parsec_gpu_data_reserve_device_space( parsec_device_cuda_module_t* cuda_device, /* If we are the owner of this tile we need to make sure it remains available for * other tasks or we run in deadlock situations. */ - assert( temp_loc[j] == lru_gpu_elem ); /* dont understand how this cannot be true */ parsec_atomic_unlock( &oldmaster->lock ); goto find_another_data; } } - if( lru_gpu_elem->readers != 0 ) { - /* Damn, another thread started to use this data (as source for an NVLINK transfer). */ + /* There is still one last thing to ensure: if another accelerator uses this copy as a source + * for a d2d transfer it will mark it by atomically increasing the readers. So, we need to + * avoid altering the copy while they are using it, by protecting the access to the readers + * with a cas. + */ + if( !parsec_atomic_cas_int32(&lru_gpu_elem->readers, 0, INT_MIN) ) { + /* we can't use this copy, push it back */ + parsec_list_push_back(&gpu_device->gpu_mem_lru, &lru_gpu_elem->super); + gpu_mem_lru_cycling = (NULL == gpu_mem_lru_cycling) ? lru_gpu_elem : gpu_mem_lru_cycling; /* update the cycle detector */ + PARSEC_DEBUG_VERBOSE(20, parsec_gpu_output_stream, + "GPU[%s]:%s: Push back LRU-retrieved CUDA copy %p [readers %d, ref_count %d] original %p : Concurrent accesses", + gpu_device->super.name, task_name, + lru_gpu_elem, lru_gpu_elem->readers, lru_gpu_elem->super.super.obj_reference_count, + lru_gpu_elem->original); parsec_atomic_unlock( &oldmaster->lock ); goto find_another_data; } + /* Check if this copy is the last dangling reference to the oldmaster. 
This is safe to do as we own one of the data refcounts. */ int do_unlock = oldmaster->super.obj_reference_count != 1; parsec_data_copy_detach(oldmaster, lru_gpu_elem, gpu_device->super.device_index); - /* detach could have released the oldmaster if it only had a single refcount */ + /* detach could have released the oldmaster if it only had a single refcount */ if( do_unlock ) parsec_atomic_unlock( &oldmaster->lock ); assert(lru_gpu_elem->readers == 0); @@ -1120,8 +1126,8 @@ parsec_gpu_data_reserve_device_space( parsec_device_cuda_module_t* cuda_device, gpu_elem, gpu_elem->device_private, gpu_elem->super.super.obj_reference_count, master); #if defined(PARSEC_PROF_TRACE) if((parsec_gpu_trackable_events & PARSEC_PROFILE_GPU_TRACK_MEM_USE) && - (gpu_device->exec_stream[0]->prof_event_track_enable || - gpu_device->exec_stream[1]->prof_event_track_enable)) { + (gpu_device->exec_stream[0]->prof_event_track_enable || + gpu_device->exec_stream[1]->prof_event_track_enable)) { parsec_profiling_trace_flags(gpu_device->exec_stream[0]->profiling, parsec_gpu_allocate_memory_key, (int64_t)gpu_elem->device_private, gpu_device->super.device_index, @@ -1131,9 +1137,15 @@ parsec_gpu_data_reserve_device_space( parsec_device_cuda_module_t* cuda_device, #else gpu_elem = lru_gpu_elem; #endif - assert( 0 == gpu_elem->readers ); + + /* Do not push it back into the LRU for now to prevent others from discovering + * this copy and trying to acquire it. If we fail to find all the copies we need + * we will push it back in the release_temp_and_return, otherwise they will become + * available once properly updated. + */ gpu_elem->coherency_state = PARSEC_DATA_COHERENCY_INVALID; - gpu_elem->version = 0; + gpu_elem->version = UINT_MAX; /* scrap value for now */ + gpu_elem->readers = 0; /* safe to set to no access */ PARSEC_DEBUG_VERBOSE(10, parsec_gpu_output_stream, "GPU[%s]: GPU copy %p [ref_count %d] gets created with version 0 at %s:%d", gpu_device->super.name, @@ -1149,7 +1161,6 @@ parsec_gpu_data_reserve_device_space( parsec_device_cuda_module_t* cuda_device, gpu_device->super.name, task_name, gpu_elem, gpu_elem->super.super.obj_reference_count); assert(0 != (gpu_elem->flags & PARSEC_DATA_FLAG_PARSEC_OWNED) ); - parsec_list_push_back(&gpu_device->gpu_mem_lru, (parsec_list_item_t*)gpu_elem); parsec_atomic_unlock(&master->lock); } if( data_avail_epoch ) { @@ -1242,7 +1253,7 @@ parsec_default_cuda_stage_out(parsec_gpu_task_t *gtask, /** * If the most current version of the data is not yet available on the GPU memory * schedule a transfer. - * Returns hook special retrun codes or a positive number: + * Returns hook special return codes or a positive number: * HOOK_DONE: The most recent version of the data is already available on the GPU * 1: A copy has been scheduled on the corresponding stream * HOOK_ERROR: A copy cannot be issued due to CUDA. 
@@ -1295,7 +1306,7 @@ parsec_gpu_data_stage_in( parsec_device_cuda_module_t* cuda_device, transfer_from = parsec_data_start_transfer_ownership_to_copy(original, gpu_device->super.device_index, (uint8_t)type); if( PARSEC_FLOW_ACCESS_WRITE & type && gpu_task->task_type != PARSEC_GPU_TASK_TYPE_PREFETCH ) { - gpu_elem->version++; /* on to the next version */ + gpu_elem->version = candidate->version + 1; /* on to the next version */ PARSEC_DEBUG_VERBOSE(10, parsec_gpu_output_stream, "GPU[%s]: GPU copy %p [ref_count %d] increments version to %d at %s:%d", gpu_device->super.name, @@ -1385,7 +1396,15 @@ parsec_gpu_data_stage_in( parsec_device_cuda_module_t* cuda_device, PARSEC_DEBUG_VERBOSE(10, parsec_gpu_output_stream, "GPU[%s]:\tData copy %p [ref_count %d] on CUDA device %d is the best candidate to do Device to Device copy, increasing its readers to %d", gpu_device->super.name, candidate, candidate->super.super.obj_reference_count, target->cuda_index, candidate->readers+1); - parsec_atomic_fetch_inc_int32( &candidate->readers ); + int readers = parsec_atomic_fetch_inc_int32(&candidate->readers); + if( readers < 0 ) { + PARSEC_DEBUG_VERBOSE(10, parsec_gpu_output_stream, + "GPU[%s]:\tCandidate %p [ref_count %d] on CUDA device %d is being repurposed by owner device. Looking for another candidate", + gpu_device->super.name, candidate, candidate->super.super.obj_reference_count, target->cuda_index); + /* We are trying to use a candidate that is repurposed by the owner device. Let's find another one */ + parsec_atomic_fetch_add_int32(&candidate->readers, -1); + continue; + } candidate_dev = target; goto src_selected; } @@ -2040,8 +2059,11 @@ progress_stream( parsec_device_gpu_module_t* gpu_device, } /** - * @brief This function schedule the move of all the data required for a - * specific task from the main memory into the GPU memory. + * @brief This function prepare memory on the target device for all the inputs and output + * of the task, and then initiate the necessary copies from the best location of the input + * data. The best location is defined as any other accelerator that has the same version + * of the data (taking advantage of faster accelerator-to-accelerator connectors, such as + * NVLink), or from the CPU memory if no other candidate is found. * * @returns * a positive number: the number of data to be moved. @@ -2061,7 +2083,7 @@ parsec_cuda_kernel_push( parsec_device_gpu_module_t *gpu_device, char tmp[MAX_TASK_STRLEN]; #endif - /* if not changes were made to the available memory dont waste time */ + /* if no changes were made to the available memory dont waste time */ if( gpu_task->last_data_check_epoch == gpu_device->data_avail_epoch ) return PARSEC_HOOK_RETURN_AGAIN; PARSEC_DEBUG_VERBOSE(10, parsec_gpu_output_stream,
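
The core of this change is a small lock-free handshake on the readers counter of a GPU copy: the owner device may only repurpose a copy if it can CAS readers from 0 to INT_MIN, while a peer device that wants the copy as a device-to-device source increments readers and backs off if the previous value was negative. Below is a minimal, self-contained sketch of that protocol using standard C11 atomics; the struct and function names are illustrative stand-ins rather than the PaRSEC API (the patch itself uses parsec_atomic_cas_int32 and parsec_atomic_fetch_inc_int32 on the readers field of parsec_gpu_data_copy_t).

#include <stdatomic.h>
#include <stdbool.h>
#include <stdint.h>

/* Illustrative stand-in for the readers counter carried by a GPU data copy. */
typedef struct {
    _Atomic int32_t readers;
} copy_sketch_t;

/* Owner device: try to reclaim a copy popped from the LRU. Succeeds only if no
 * peer currently reads it; INT32_MIN marks the copy as being repurposed so that
 * late readers can detect the conflict. */
static bool owner_try_reclaim(copy_sketch_t *copy)
{
    int32_t expected = 0;
    return atomic_compare_exchange_strong(&copy->readers, &expected, INT32_MIN);
}

/* Peer device: try to use the copy as the source of a device-to-device transfer.
 * If the previous value was negative, the owner is repurposing the copy, so undo
 * the increment and look for another candidate. */
static bool peer_try_acquire_as_source(copy_sketch_t *copy)
{
    int32_t prev = atomic_fetch_add(&copy->readers, 1);
    if (prev < 0) {
        atomic_fetch_sub(&copy->readers, 1);
        return false;
    }
    return true;
}

Because INT32_MIN sits far below any realistic reader count, the sign of the counter doubles as a "being repurposed" flag, so the two devices coordinate without introducing an extra lock around the LRU.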