Prevent race condition in accelerator copies management
Prevent the owner device from repurposing a GPU copy while another GPU
is planning to use it as a candidate for a d2d transfer.

Signed-off-by: George Bosilca <[email protected]>
bosilca committed Sep 30, 2023
1 parent c174b39 commit 4a4fad0
Showing 1 changed file with 77 additions and 55 deletions.
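A minimal standalone sketch of the coordination protocol this commit introduces, written with C11 atomics in place of the parsec_atomic_* wrappers: the owner device claims an otherwise idle copy by CAS-ing its readers count from 0 to INT_MIN, while a peer device that wants the copy as a device-to-device source registers itself with an atomic increment and backs off if it observes a negative (claimed) value. The types and function names below (copy_t, owner_try_claim, peer_try_acquire) are invented for illustration only; the real code operates on parsec_gpu_data_copy_t inside parsec_gpu_data_reserve_device_space and parsec_gpu_data_stage_in.

#include <limits.h>
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

typedef struct {
    atomic_int readers;   /* > 0: active readers, 0: idle, INT_MIN: claimed by its owner device */
} copy_t;

/* Owner device: try to repurpose an idle copy it just pulled off its LRU. */
static bool owner_try_claim(copy_t *c)
{
    int expected = 0;
    /* Succeeds only if no peer registered itself as a reader in the meantime. */
    return atomic_compare_exchange_strong(&c->readers, &expected, INT_MIN);
}

/* Peer device: try to register as a reader before using the copy as a d2d source. */
static bool peer_try_acquire(copy_t *c)
{
    int prev = atomic_fetch_add(&c->readers, 1);
    if (prev < 0) {                      /* the owner already claimed it: undo and give up */
        atomic_fetch_sub(&c->readers, 1);
        return false;
    }
    return true;
}

int main(void)
{
    copy_t c;
    atomic_init(&c.readers, 0);
    bool peer_ok  = peer_try_acquire(&c);   /* peer registers first ... */
    bool owner_ok = owner_try_claim(&c);    /* ... so the owner must back off */
    printf("peer=%d owner=%d\n", peer_ok, owner_ok);  /* prints: peer=1 owner=0 */
    return 0;
}

The same back-off appears in parsec_gpu_data_stage_in in the diff below: a fetch-and-increment that observes a negative readers count is immediately undone and another transfer candidate is selected.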
132 changes: 77 additions & 55 deletions parsec/mca/device/cuda/device_cuda_module.c
@@ -875,7 +875,7 @@ parsec_gpu_data_reserve_device_space( parsec_device_cuda_module_t* cuda_device,
parsec_data_t* master, *oldmaster;
const parsec_flow_t *flow;
int i, j, data_avail_epoch = 0;
parsec_gpu_data_copy_t *gpu_mem_lru_cycling;
parsec_gpu_data_copy_t *gpu_mem_lru_cycling = NULL;
parsec_device_gpu_module_t *gpu_device = &cuda_device->super;

#if defined(PARSEC_DEBUG_NOISIER)
@@ -888,7 +888,6 @@ parsec_gpu_data_reserve_device_space( parsec_device_cuda_module_t* cuda_device,
* corresponding data on the GPU available.
*/
for( i = 0; i < this_task->task_class->nb_flows; i++ ) {
gpu_mem_lru_cycling = NULL;
flow = gpu_task->flow[i];
assert( flow && (flow->flow_index == i) );

@@ -902,7 +901,6 @@ parsec_gpu_data_reserve_device_space( parsec_device_cuda_module_t* cuda_device,
if (this_task->data[i].data_in == NULL)
continue;

/* BEWARE: here we are using the CPU copy as an input */
master = this_task->data[i].data_in->original;
parsec_atomic_lock(&master->lock);
gpu_elem = PARSEC_DATA_GET_COPY(master, gpu_device->super.device_index);
@@ -916,10 +914,10 @@ parsec_gpu_data_reserve_device_space( parsec_device_cuda_module_t* cuda_device,
flow->name, i, gpu_elem,
gpu_elem->data_transfer_status == PARSEC_DATA_STATUS_UNDER_TRANSFER ? " [in transfer]" : "");
if ( gpu_elem->data_transfer_status == PARSEC_DATA_STATUS_UNDER_TRANSFER ) {
/* We might want to do something special if the data is under transfer, but in the current
* version we don't need to because an event is always generated for the push_in of each
* task on the unique push_in stream.
*/
/* We might want to do something special if the data is under transfer, but in the current
* version we don't need to because an event is always generated for the push_in of each
* task on the unique push_in stream.
*/
}
parsec_atomic_unlock(&master->lock);
continue;
@@ -932,27 +930,28 @@ parsec_gpu_data_reserve_device_space( parsec_device_cuda_module_t* cuda_device,
gpu_device->super.name, task_name,
gpu_elem, gpu_task->flow_nb_elts[i], gpu_elem->super.super.obj_reference_count, master);
gpu_elem->flags = PARSEC_DATA_FLAG_PARSEC_OWNED | PARSEC_DATA_FLAG_PARSEC_MANAGED;
malloc_data:
malloc_data:
assert(0 != (gpu_elem->flags & PARSEC_DATA_FLAG_PARSEC_OWNED) );
gpu_elem->device_private = zone_malloc(gpu_device->memory, gpu_task->flow_nb_elts[i]);
if( NULL == gpu_elem->device_private ) {
#endif

find_another_data:
find_another_data:
temp_loc[i] = NULL;
/* Look for a data_copy to free */
lru_gpu_elem = (parsec_gpu_data_copy_t*)parsec_list_pop_front(&gpu_device->gpu_mem_lru);
if( NULL == lru_gpu_elem ) {
/* We can't find enough room on the GPU. Insert the tiles in the begining of
* the LRU (in order to be reused asap) and return without scheduling the task.
* the LRU (in order to be reused asap) and return with error.
*/
release_temp_and_return:
release_temp_and_return:
#if defined(PARSEC_DEBUG_NOISIER)
PARSEC_DEBUG_VERBOSE(2, parsec_gpu_output_stream,
"GPU[%s]:%s:\tRequest space on GPU failed for flow %s index %d/%d for task %s",
gpu_device->super.name, task_name,
flow->name, i, this_task->task_class->nb_flows, task_name );
#endif /* defined(PARSEC_DEBUG_NOISIER) */
for( j = 0; j < i; j++ ) {
for( j = 0; j <= i; j++ ) {
/* This flow could be a control flow */
if( NULL == temp_loc[j] ) continue;
/* This flow could be non-parsec-owned, in which case we can't reclaim it */
@@ -970,14 +969,21 @@ parsec_gpu_data_reserve_device_space( parsec_device_cuda_module_t* cuda_device,
parsec_atomic_unlock(&master->lock);
return PARSEC_HOOK_RETURN_AGAIN;
}

PARSEC_LIST_ITEM_SINGLETON(lru_gpu_elem);
PARSEC_DEBUG_VERBOSE(20, parsec_gpu_output_stream,
"GPU[%s]:%s: Evaluate LRU-retrieved CUDA copy %p [ref_count %d] original %p",
gpu_device->super.name, task_name,
lru_gpu_elem, lru_gpu_elem->super.super.obj_reference_count,
lru_gpu_elem->original);

if( gpu_mem_lru_cycling == lru_gpu_elem ) {
PARSEC_DEBUG_VERBOSE(2, parsec_gpu_output_stream,
"GPU[%s]: Cycle detected on allocating memory for %s",
gpu_device->super.name, task_name);
temp_loc[i] = lru_gpu_elem; /* save it such that it gets pushed back into the LRU */
goto release_temp_and_return;
}

/* If there are pending readers, let the gpu_elem loose. This is a weak coordination
* protocol between here and the parsec_gpu_data_stage_in, where the readers don't necessarily
* always remove the data from the LRU.
@@ -987,7 +993,12 @@ parsec_gpu_data_reserve_device_space( parsec_device_cuda_module_t* cuda_device,
"GPU[%s]:%s: Drop LRU-retrieved CUDA copy %p [readers %d, ref_count %d] original %p",
gpu_device->super.name, task_name,
lru_gpu_elem, lru_gpu_elem->readers, lru_gpu_elem->super.super.obj_reference_count, lru_gpu_elem->original);
goto find_another_data; // TODO: add an assert of some sort to check for leaks here?
/* We do not add the copy back into the LRU. This means that for now this copy is not
* tracked via the LRU (despite being only used in read mode) and instead is dangling
* on other tasks. Thus, it will eventually need to be added back into the LRU when
* current task using it completes.
*/
goto find_another_data;
}
/* It's also possible that the ref_count of that element is bigger than 1
* In that case, it's because some task completion did not execute yet, and
@@ -997,29 +1008,20 @@ parsec_gpu_data_reserve_device_space( parsec_device_cuda_module_t* cuda_device,
/* It's also possible (although unlikely) that we livelock here:
* if gpu_mem_lru has *only* elements with readers == 0 but
* ref_count > 1, then we might pop/push forever. We save the
* earliest element found and if we see it again it means we
* run over the entire list without finding a suitable replacement.
* We need to make progress on something else. This remains safe for as long as the
* LRU is only modified by a single thread (in this case the current thread).
*/
* earliest element found and if we see it again it means we
* run over the entire list without finding a suitable replacement.
* We need to make progress on something else. This remains safe for as long as the
* LRU is only modified by a single thread (in this case the current thread).
*/
PARSEC_DEBUG_VERBOSE(20, parsec_gpu_output_stream,
"GPU[%s]:%s: Push back LRU-retrieved CUDA copy %p [readers %d, ref_count %d] original %p",
gpu_device->super.name, task_name,
lru_gpu_elem, lru_gpu_elem->readers, lru_gpu_elem->super.super.obj_reference_count, lru_gpu_elem->original);
assert(0 != (lru_gpu_elem->flags & PARSEC_DATA_FLAG_PARSEC_OWNED) );
parsec_list_push_back(&gpu_device->gpu_mem_lru, &lru_gpu_elem->super);
if( gpu_mem_lru_cycling == lru_gpu_elem ) {
PARSEC_DEBUG_VERBOSE(2, parsec_gpu_output_stream,
"GPU[%s]: Cycle detected on allocating memory for %s",
gpu_device->super.name, task_name);
goto release_temp_and_return;
}
if( NULL == gpu_mem_lru_cycling ) {
gpu_mem_lru_cycling = lru_gpu_elem;
}
goto find_another_data;
gpu_mem_lru_cycling = (NULL == gpu_mem_lru_cycling) ? lru_gpu_elem : gpu_mem_lru_cycling; /* update the cycle detector */
goto find_another_data;
}

/* Make sure the new GPU element is clean and ready to be used */
assert( master != lru_gpu_elem->original );
if ( NULL != lru_gpu_elem->original ) {
@@ -1031,16 +1033,8 @@ parsec_gpu_data_reserve_device_space( parsec_device_cuda_module_t* cuda_device,
* need to protect all accesses to gpu_mem_lru with the locked version */
assert(0 != (lru_gpu_elem->flags & PARSEC_DATA_FLAG_PARSEC_OWNED) );
parsec_list_push_back(&gpu_device->gpu_mem_lru, &lru_gpu_elem->super);
if( gpu_mem_lru_cycling == lru_gpu_elem ) {
PARSEC_DEBUG_VERBOSE(2, parsec_gpu_output_stream,
"GPU[%s]: Cycle detected on allocating memory for %s",
gpu_device->super.name, task_name);
goto release_temp_and_return;
}
if( NULL == gpu_mem_lru_cycling ) {
gpu_mem_lru_cycling = lru_gpu_elem;
}
goto find_another_data;
gpu_mem_lru_cycling = (NULL == gpu_mem_lru_cycling) ? lru_gpu_elem : gpu_mem_lru_cycling; /* update the cycle detector */
goto find_another_data;
}
for( j = 0; j < i; j++ ) {
if( NULL == this_task->data[j].data_in ) continue;
@@ -1052,19 +1046,31 @@ parsec_gpu_data_reserve_device_space( parsec_device_cuda_module_t* cuda_device,
/* If we are the owner of this tile we need to make sure it remains available for
* other tasks or we run in deadlock situations.
*/
assert( temp_loc[j] == lru_gpu_elem ); /* dont understand how this cannot be true */
parsec_atomic_unlock( &oldmaster->lock );
goto find_another_data;
}
}
if( lru_gpu_elem->readers != 0 ) {
/* Damn, another thread started to use this data (as source for an NVLINK transfer). */
/* There is still one last thing to ensure: if another accelerator uses this copy as a source
* for a d2d transfer it will mark it by atomically increasing the readers. So, we need to
* avoid altering the copy while they are using it, by protecting the access to the readers
* with a cas.
*/
if( !parsec_atomic_cas_int32(&lru_gpu_elem->readers, 0, INT_MIN) ) {
/* we can't use this copy, push it back */
parsec_list_push_back(&gpu_device->gpu_mem_lru, &lru_gpu_elem->super);
gpu_mem_lru_cycling = (NULL == gpu_mem_lru_cycling) ? lru_gpu_elem : gpu_mem_lru_cycling; /* update the cycle detector */
PARSEC_DEBUG_VERBOSE(20, parsec_gpu_output_stream,
"GPU[%s]:%s: Push back LRU-retrieved CUDA copy %p [readers %d, ref_count %d] original %p : Concurrent accesses",
gpu_device->super.name, task_name,
lru_gpu_elem, lru_gpu_elem->readers, lru_gpu_elem->super.super.obj_reference_count,
lru_gpu_elem->original);
parsec_atomic_unlock( &oldmaster->lock );
goto find_another_data;
}
/* Check if this copy is the last dangling reference to the oldmaster. This is safe to do as we own one of the data refcounts. */
int do_unlock = oldmaster->super.obj_reference_count != 1;
parsec_data_copy_detach(oldmaster, lru_gpu_elem, gpu_device->super.device_index);
/* detach could have released the oldmaster if it only had a single refcount */
/* detach could have released the oldmaster if it only had a single refcount */
if( do_unlock )
parsec_atomic_unlock( &oldmaster->lock );
assert(lru_gpu_elem->readers == 0);
@@ -1120,8 +1126,8 @@ parsec_gpu_data_reserve_device_space( parsec_device_cuda_module_t* cuda_device,
gpu_elem, gpu_elem->device_private, gpu_elem->super.super.obj_reference_count, master);
#if defined(PARSEC_PROF_TRACE)
if((parsec_gpu_trackable_events & PARSEC_PROFILE_GPU_TRACK_MEM_USE) &&
(gpu_device->exec_stream[0]->prof_event_track_enable ||
gpu_device->exec_stream[1]->prof_event_track_enable)) {
(gpu_device->exec_stream[0]->prof_event_track_enable ||
gpu_device->exec_stream[1]->prof_event_track_enable)) {
parsec_profiling_trace_flags(gpu_device->exec_stream[0]->profiling,
parsec_gpu_allocate_memory_key, (int64_t)gpu_elem->device_private,
gpu_device->super.device_index,
@@ -1131,9 +1137,15 @@ parsec_gpu_data_reserve_device_space( parsec_device_cuda_module_t* cuda_device,
#else
gpu_elem = lru_gpu_elem;
#endif
assert( 0 == gpu_elem->readers );

/* Do not push it back into the LRU for now to prevent others from discovering
* this copy and trying to acquire it. If we fail to find all the copies we need
* we will push it back in the release_temp_and_return, otherwise they will become
* available once properly updated.
*/
gpu_elem->coherency_state = PARSEC_DATA_COHERENCY_INVALID;
gpu_elem->version = 0;
gpu_elem->version = UINT_MAX; /* scrap value for now */
gpu_elem->readers = 0; /* safe to set to no access */
PARSEC_DEBUG_VERBOSE(10, parsec_gpu_output_stream,
"GPU[%s]: GPU copy %p [ref_count %d] gets created with version 0 at %s:%d",
gpu_device->super.name,
Expand All @@ -1149,7 +1161,6 @@ parsec_gpu_data_reserve_device_space( parsec_device_cuda_module_t* cuda_device,
gpu_device->super.name, task_name,
gpu_elem, gpu_elem->super.super.obj_reference_count);
assert(0 != (gpu_elem->flags & PARSEC_DATA_FLAG_PARSEC_OWNED) );
parsec_list_push_back(&gpu_device->gpu_mem_lru, (parsec_list_item_t*)gpu_elem);
parsec_atomic_unlock(&master->lock);
}
if( data_avail_epoch ) {
@@ -1242,7 +1253,7 @@ parsec_default_cuda_stage_out(parsec_gpu_task_t *gtask,
/**
* If the most current version of the data is not yet available on the GPU memory
* schedule a transfer.
* Returns hook special retrun codes or a positive number:
* Returns hook special return codes or a positive number:
* HOOK_DONE: The most recent version of the data is already available on the GPU
* 1: A copy has been scheduled on the corresponding stream
* HOOK_ERROR: A copy cannot be issued due to CUDA.
@@ -1295,7 +1306,7 @@ parsec_gpu_data_stage_in( parsec_device_cuda_module_t* cuda_device,
transfer_from = parsec_data_start_transfer_ownership_to_copy(original, gpu_device->super.device_index, (uint8_t)type);

if( PARSEC_FLOW_ACCESS_WRITE & type && gpu_task->task_type != PARSEC_GPU_TASK_TYPE_PREFETCH ) {
gpu_elem->version++; /* on to the next version */
gpu_elem->version = candidate->version + 1; /* on to the next version */
PARSEC_DEBUG_VERBOSE(10, parsec_gpu_output_stream,
"GPU[%s]: GPU copy %p [ref_count %d] increments version to %d at %s:%d",
gpu_device->super.name,
@@ -1385,7 +1396,15 @@ parsec_gpu_data_stage_in( parsec_device_cuda_module_t* cuda_device,
PARSEC_DEBUG_VERBOSE(10, parsec_gpu_output_stream,
"GPU[%s]:\tData copy %p [ref_count %d] on CUDA device %d is the best candidate to do Device to Device copy, increasing its readers to %d",
gpu_device->super.name, candidate, candidate->super.super.obj_reference_count, target->cuda_index, candidate->readers+1);
parsec_atomic_fetch_inc_int32( &candidate->readers );
int readers = parsec_atomic_fetch_inc_int32(&candidate->readers);
if( readers < 0 ) {
PARSEC_DEBUG_VERBOSE(10, parsec_gpu_output_stream,
"GPU[%s]:\tCandidate %p [ref_count %d] on CUDA device %d is being repurposed by owner device. Looking for another candidate",
gpu_device->super.name, candidate, candidate->super.super.obj_reference_count, target->cuda_index);
/* We are trying to use a candidate that is repurposed by the owner device. Let's find another one */
parsec_atomic_fetch_add_int32(&candidate->readers, -1);
continue;
}
candidate_dev = target;
goto src_selected;
}
@@ -2040,8 +2059,11 @@ progress_stream( parsec_device_gpu_module_t* gpu_device,
}

/**
* @brief This function schedule the move of all the data required for a
* specific task from the main memory into the GPU memory.
* @brief This function prepare memory on the target device for all the inputs and output
* of the task, and then initiate the necessary copies from the best location of the input
* data. The best location is defined as any other accelerator that has the same version
* of the data (taking advantage of faster accelerator-to-accelerator connectors, such as
* NVLink), or from the CPU memory if no other candidate is found.
*
* @returns
* a positive number: the number of data to be moved.
Expand All @@ -2061,7 +2083,7 @@ parsec_cuda_kernel_push( parsec_device_gpu_module_t *gpu_device,
char tmp[MAX_TASK_STRLEN];
#endif

/* if not changes were made to the available memory dont waste time */
/* if no changes were made to the available memory dont waste time */
if( gpu_task->last_data_check_epoch == gpu_device->data_avail_epoch )
return PARSEC_HOOK_RETURN_AGAIN;
PARSEC_DEBUG_VERBOSE(10, parsec_gpu_output_stream,
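A companion sketch of the livelock guard described in the comments of the find_another_data loop above: the first unsuitable copy that gets pushed back is remembered (gpu_mem_lru_cycling in the commit), and popping it a second time means the whole LRU was traversed without finding a reusable copy, so the search gives up instead of spinning forever. The circular-queue LRU and every name below (copy_t, pop_front, push_back) are invented for illustration; only the label name find_another_data is borrowed from the diff.

#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>

typedef struct { int id; bool reusable; } copy_t;

#define CAP 8
static copy_t *lru[CAP];
static size_t head = 0, count = 0;

static copy_t *pop_front(void)
{
    if (0 == count) return NULL;
    copy_t *c = lru[head];
    head = (head + 1) % CAP;
    count--;
    return c;
}

static void push_back(copy_t *c)
{
    lru[(head + count) % CAP] = c;
    count++;
}

/* Walk the LRU looking for a reusable copy; remember the first rejected one
 * (the role of gpu_mem_lru_cycling) and give up once it comes around again,
 * which means the whole list was scanned without success. */
static copy_t *find_another_data(void)
{
    copy_t *first_rejected = NULL;
    for (;;) {
        copy_t *c = pop_front();
        if (NULL == c) return NULL;                /* LRU empty */
        if (c == first_rejected) {                 /* cycle detected */
            push_back(c);                          /* keep the LRU intact and bail out */
            return NULL;
        }
        if (c->reusable) return c;                 /* suitable candidate found */
        push_back(c);                              /* unsuitable: recycle it ... */
        if (NULL == first_rejected) first_rejected = c;   /* ... and remember the first one */
    }
}

int main(void)
{
    static copy_t a = { 1, false }, b = { 2, false }, c = { 3, true };
    push_back(&a); push_back(&b); push_back(&c);
    copy_t *found = find_another_data();
    printf("found copy %d\n", found ? found->id : -1);   /* prints: found copy 3 */
    return 0;
}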
