From 4a4fad091dc80a4ec05a2458ffb88454cc702073 Mon Sep 17 00:00:00 2001 From: George Bosilca Date: Sat, 30 Sep 2023 19:19:36 +0000 Subject: [PATCH] Prevent race condition in accelerator copies management Prevent the owner device from repurposing a GPU copy while another GPU is planning to use it as a candidate for a d2d transfer. Signed-off-by: George Bosilca --- parsec/mca/device/cuda/device_cuda_module.c | 132 ++++++++++++-------- 1 file changed, 77 insertions(+), 55 deletions(-) diff --git a/parsec/mca/device/cuda/device_cuda_module.c b/parsec/mca/device/cuda/device_cuda_module.c index c3822e130..d5862634e 100644 --- a/parsec/mca/device/cuda/device_cuda_module.c +++ b/parsec/mca/device/cuda/device_cuda_module.c @@ -875,7 +875,7 @@ parsec_gpu_data_reserve_device_space( parsec_device_cuda_module_t* cuda_device, parsec_data_t* master, *oldmaster; const parsec_flow_t *flow; int i, j, data_avail_epoch = 0; - parsec_gpu_data_copy_t *gpu_mem_lru_cycling; + parsec_gpu_data_copy_t *gpu_mem_lru_cycling = NULL; parsec_device_gpu_module_t *gpu_device = &cuda_device->super; #if defined(PARSEC_DEBUG_NOISIER) @@ -888,7 +888,6 @@ parsec_gpu_data_reserve_device_space( parsec_device_cuda_module_t* cuda_device, * corresponding data on the GPU available. */ for( i = 0; i < this_task->task_class->nb_flows; i++ ) { - gpu_mem_lru_cycling = NULL; flow = gpu_task->flow[i]; assert( flow && (flow->flow_index == i) ); @@ -902,7 +901,6 @@ parsec_gpu_data_reserve_device_space( parsec_device_cuda_module_t* cuda_device, if (this_task->data[i].data_in == NULL) continue; - /* BEWARE: here we are using the CPU copy as an input */ master = this_task->data[i].data_in->original; parsec_atomic_lock(&master->lock); gpu_elem = PARSEC_DATA_GET_COPY(master, gpu_device->super.device_index); @@ -916,10 +914,10 @@ parsec_gpu_data_reserve_device_space( parsec_device_cuda_module_t* cuda_device, flow->name, i, gpu_elem, gpu_elem->data_transfer_status == PARSEC_DATA_STATUS_UNDER_TRANSFER ? " [in transfer]" : ""); if ( gpu_elem->data_transfer_status == PARSEC_DATA_STATUS_UNDER_TRANSFER ) { - /* We might want to do something special if the data is under transfer, but in the current - * version we don't need to because an event is always generated for the push_in of each - * task on the unique push_in stream. - */ + /* We might want to do something special if the data is under transfer, but in the current + * version we don't need to because an event is always generated for the push_in of each + * task on the unique push_in stream. + */ } parsec_atomic_unlock(&master->lock); continue; @@ -932,27 +930,28 @@ parsec_gpu_data_reserve_device_space( parsec_device_cuda_module_t* cuda_device, gpu_device->super.name, task_name, gpu_elem, gpu_task->flow_nb_elts[i], gpu_elem->super.super.obj_reference_count, master); gpu_elem->flags = PARSEC_DATA_FLAG_PARSEC_OWNED | PARSEC_DATA_FLAG_PARSEC_MANAGED; - malloc_data: + malloc_data: assert(0 != (gpu_elem->flags & PARSEC_DATA_FLAG_PARSEC_OWNED) ); gpu_elem->device_private = zone_malloc(gpu_device->memory, gpu_task->flow_nb_elts[i]); if( NULL == gpu_elem->device_private ) { #endif - find_another_data: + find_another_data: + temp_loc[i] = NULL; /* Look for a data_copy to free */ lru_gpu_elem = (parsec_gpu_data_copy_t*)parsec_list_pop_front(&gpu_device->gpu_mem_lru); if( NULL == lru_gpu_elem ) { /* We can't find enough room on the GPU. Insert the tiles in the begining of - * the LRU (in order to be reused asap) and return without scheduling the task. 
+ * the LRU (in order to be reused asap) and return with error. */ - release_temp_and_return: + release_temp_and_return: #if defined(PARSEC_DEBUG_NOISIER) PARSEC_DEBUG_VERBOSE(2, parsec_gpu_output_stream, "GPU[%s]:%s:\tRequest space on GPU failed for flow %s index %d/%d for task %s", gpu_device->super.name, task_name, flow->name, i, this_task->task_class->nb_flows, task_name ); #endif /* defined(PARSEC_DEBUG_NOISIER) */ - for( j = 0; j < i; j++ ) { + for( j = 0; j <= i; j++ ) { /* This flow could be a control flow */ if( NULL == temp_loc[j] ) continue; /* This flow could be non-parsec-owned, in which case we can't reclaim it */ @@ -970,7 +969,6 @@ parsec_gpu_data_reserve_device_space( parsec_device_cuda_module_t* cuda_device, parsec_atomic_unlock(&master->lock); return PARSEC_HOOK_RETURN_AGAIN; } - PARSEC_LIST_ITEM_SINGLETON(lru_gpu_elem); PARSEC_DEBUG_VERBOSE(20, parsec_gpu_output_stream, "GPU[%s]:%s: Evaluate LRU-retrieved CUDA copy %p [ref_count %d] original %p", @@ -978,6 +976,14 @@ parsec_gpu_data_reserve_device_space( parsec_device_cuda_module_t* cuda_device, lru_gpu_elem, lru_gpu_elem->super.super.obj_reference_count, lru_gpu_elem->original); + if( gpu_mem_lru_cycling == lru_gpu_elem ) { + PARSEC_DEBUG_VERBOSE(2, parsec_gpu_output_stream, + "GPU[%s]: Cycle detected on allocating memory for %s", + gpu_device->super.name, task_name); + temp_loc[i] = lru_gpu_elem; /* save it such that it gets pushed back into the LRU */ + goto release_temp_and_return; + } + /* If there are pending readers, let the gpu_elem loose. This is a weak coordination * protocol between here and the parsec_gpu_data_stage_in, where the readers don't necessarily * always remove the data from the LRU. @@ -987,7 +993,12 @@ parsec_gpu_data_reserve_device_space( parsec_device_cuda_module_t* cuda_device, "GPU[%s]:%s: Drop LRU-retrieved CUDA copy %p [readers %d, ref_count %d] original %p", gpu_device->super.name, task_name, lru_gpu_elem, lru_gpu_elem->readers, lru_gpu_elem->super.super.obj_reference_count, lru_gpu_elem->original); - goto find_another_data; // TODO: add an assert of some sort to check for leaks here? + /* We do not add the copy back into the LRU. This means that for now this copy is not + * tracked via the LRU (despite being only used in read mode) and instead is dangling + * on other tasks. Thus, it will eventually need to be added back into the LRU when + * current task using it completes. + */ + goto find_another_data; } /* It's also possible that the ref_count of that element is bigger than 1 * In that case, it's because some task completion did not execute yet, and @@ -997,29 +1008,20 @@ parsec_gpu_data_reserve_device_space( parsec_device_cuda_module_t* cuda_device, /* It's also possible (although unlikely) that we livelock here: * if gpu_mem_lru has *only* elements with readers == 0 but * ref_count > 1, then we might pop/push forever. We save the - * earliest element found and if we see it again it means we - * run over the entire list without finding a suitable replacement. - * We need to make progress on something else. This remains safe for as long as the - * LRU is only modified by a single thread (in this case the current thread). - */ + * earliest element found and if we see it again it means we + * run over the entire list without finding a suitable replacement. + * We need to make progress on something else. This remains safe for as long as the + * LRU is only modified by a single thread (in this case the current thread). 
+ */ PARSEC_DEBUG_VERBOSE(20, parsec_gpu_output_stream, "GPU[%s]:%s: Push back LRU-retrieved CUDA copy %p [readers %d, ref_count %d] original %p", gpu_device->super.name, task_name, lru_gpu_elem, lru_gpu_elem->readers, lru_gpu_elem->super.super.obj_reference_count, lru_gpu_elem->original); assert(0 != (lru_gpu_elem->flags & PARSEC_DATA_FLAG_PARSEC_OWNED) ); parsec_list_push_back(&gpu_device->gpu_mem_lru, &lru_gpu_elem->super); - if( gpu_mem_lru_cycling == lru_gpu_elem ) { - PARSEC_DEBUG_VERBOSE(2, parsec_gpu_output_stream, - "GPU[%s]: Cycle detected on allocating memory for %s", - gpu_device->super.name, task_name); - goto release_temp_and_return; - } - if( NULL == gpu_mem_lru_cycling ) { - gpu_mem_lru_cycling = lru_gpu_elem; - } - goto find_another_data; + gpu_mem_lru_cycling = (NULL == gpu_mem_lru_cycling) ? lru_gpu_elem : gpu_mem_lru_cycling; /* update the cycle detector */ + goto find_another_data; } - /* Make sure the new GPU element is clean and ready to be used */ assert( master != lru_gpu_elem->original ); if ( NULL != lru_gpu_elem->original ) { @@ -1031,16 +1033,8 @@ parsec_gpu_data_reserve_device_space( parsec_device_cuda_module_t* cuda_device, * need to protect all accesses to gpu_mem_lru with the locked version */ assert(0 != (lru_gpu_elem->flags & PARSEC_DATA_FLAG_PARSEC_OWNED) ); parsec_list_push_back(&gpu_device->gpu_mem_lru, &lru_gpu_elem->super); - if( gpu_mem_lru_cycling == lru_gpu_elem ) { - PARSEC_DEBUG_VERBOSE(2, parsec_gpu_output_stream, - "GPU[%s]: Cycle detected on allocating memory for %s", - gpu_device->super.name, task_name); - goto release_temp_and_return; - } - if( NULL == gpu_mem_lru_cycling ) { - gpu_mem_lru_cycling = lru_gpu_elem; - } - goto find_another_data; + gpu_mem_lru_cycling = (NULL == gpu_mem_lru_cycling) ? lru_gpu_elem : gpu_mem_lru_cycling; /* update the cycle detector */ + goto find_another_data; } for( j = 0; j < i; j++ ) { if( NULL == this_task->data[j].data_in ) continue; @@ -1052,19 +1046,31 @@ parsec_gpu_data_reserve_device_space( parsec_device_cuda_module_t* cuda_device, /* If we are the owner of this tile we need to make sure it remains available for * other tasks or we run in deadlock situations. */ - assert( temp_loc[j] == lru_gpu_elem ); /* dont understand how this cannot be true */ parsec_atomic_unlock( &oldmaster->lock ); goto find_another_data; } } - if( lru_gpu_elem->readers != 0 ) { - /* Damn, another thread started to use this data (as source for an NVLINK transfer). */ + /* There is still one last thing to ensure: if another accelerator uses this copy as a source + * for a d2d transfer it will mark it by atomically increasing the readers. So, we need to + * avoid altering the copy while they are using it, by protecting the access to the readers + * with a cas. + */ + if( !parsec_atomic_cas_int32(&lru_gpu_elem->readers, 0, INT_MIN) ) { + /* we can't use this copy, push it back */ + parsec_list_push_back(&gpu_device->gpu_mem_lru, &lru_gpu_elem->super); + gpu_mem_lru_cycling = (NULL == gpu_mem_lru_cycling) ? lru_gpu_elem : gpu_mem_lru_cycling; /* update the cycle detector */ + PARSEC_DEBUG_VERBOSE(20, parsec_gpu_output_stream, + "GPU[%s]:%s: Push back LRU-retrieved CUDA copy %p [readers %d, ref_count %d] original %p : Concurrent accesses", + gpu_device->super.name, task_name, + lru_gpu_elem, lru_gpu_elem->readers, lru_gpu_elem->super.super.obj_reference_count, + lru_gpu_elem->original); parsec_atomic_unlock( &oldmaster->lock ); goto find_another_data; } + /* Check if this copy is the last dangling reference to the oldmaster. 
This is safe to do as we own one of the data refcounts. */ int do_unlock = oldmaster->super.obj_reference_count != 1; parsec_data_copy_detach(oldmaster, lru_gpu_elem, gpu_device->super.device_index); - /* detach could have released the oldmaster if it only had a single refcount */ + /* detach could have released the oldmaster if it only had a single refcount */ if( do_unlock ) parsec_atomic_unlock( &oldmaster->lock ); assert(lru_gpu_elem->readers == 0); @@ -1120,8 +1126,8 @@ parsec_gpu_data_reserve_device_space( parsec_device_cuda_module_t* cuda_device, gpu_elem, gpu_elem->device_private, gpu_elem->super.super.obj_reference_count, master); #if defined(PARSEC_PROF_TRACE) if((parsec_gpu_trackable_events & PARSEC_PROFILE_GPU_TRACK_MEM_USE) && - (gpu_device->exec_stream[0]->prof_event_track_enable || - gpu_device->exec_stream[1]->prof_event_track_enable)) { + (gpu_device->exec_stream[0]->prof_event_track_enable || + gpu_device->exec_stream[1]->prof_event_track_enable)) { parsec_profiling_trace_flags(gpu_device->exec_stream[0]->profiling, parsec_gpu_allocate_memory_key, (int64_t)gpu_elem->device_private, gpu_device->super.device_index, @@ -1131,9 +1137,15 @@ parsec_gpu_data_reserve_device_space( parsec_device_cuda_module_t* cuda_device, #else gpu_elem = lru_gpu_elem; #endif - assert( 0 == gpu_elem->readers ); + + /* Do not push it back into the LRU for now to prevent others from discovering + * this copy and trying to acquire it. If we fail to find all the copies we need + * we will push it back in the release_temp_and_return, otherwise they will become + * available once properly updated. + */ gpu_elem->coherency_state = PARSEC_DATA_COHERENCY_INVALID; - gpu_elem->version = 0; + gpu_elem->version = UINT_MAX; /* scrap value for now */ + gpu_elem->readers = 0; /* safe to set to no access */ PARSEC_DEBUG_VERBOSE(10, parsec_gpu_output_stream, "GPU[%s]: GPU copy %p [ref_count %d] gets created with version 0 at %s:%d", gpu_device->super.name, @@ -1149,7 +1161,6 @@ parsec_gpu_data_reserve_device_space( parsec_device_cuda_module_t* cuda_device, gpu_device->super.name, task_name, gpu_elem, gpu_elem->super.super.obj_reference_count); assert(0 != (gpu_elem->flags & PARSEC_DATA_FLAG_PARSEC_OWNED) ); - parsec_list_push_back(&gpu_device->gpu_mem_lru, (parsec_list_item_t*)gpu_elem); parsec_atomic_unlock(&master->lock); } if( data_avail_epoch ) { @@ -1242,7 +1253,7 @@ parsec_default_cuda_stage_out(parsec_gpu_task_t *gtask, /** * If the most current version of the data is not yet available on the GPU memory * schedule a transfer. - * Returns hook special retrun codes or a positive number: + * Returns hook special return codes or a positive number: * HOOK_DONE: The most recent version of the data is already available on the GPU * 1: A copy has been scheduled on the corresponding stream * HOOK_ERROR: A copy cannot be issued due to CUDA. 
@@ -1295,7 +1306,7 @@ parsec_gpu_data_stage_in( parsec_device_cuda_module_t* cuda_device, transfer_from = parsec_data_start_transfer_ownership_to_copy(original, gpu_device->super.device_index, (uint8_t)type); if( PARSEC_FLOW_ACCESS_WRITE & type && gpu_task->task_type != PARSEC_GPU_TASK_TYPE_PREFETCH ) { - gpu_elem->version++; /* on to the next version */ + gpu_elem->version = candidate->version + 1; /* on to the next version */ PARSEC_DEBUG_VERBOSE(10, parsec_gpu_output_stream, "GPU[%s]: GPU copy %p [ref_count %d] increments version to %d at %s:%d", gpu_device->super.name, @@ -1385,7 +1396,15 @@ parsec_gpu_data_stage_in( parsec_device_cuda_module_t* cuda_device, PARSEC_DEBUG_VERBOSE(10, parsec_gpu_output_stream, "GPU[%s]:\tData copy %p [ref_count %d] on CUDA device %d is the best candidate to do Device to Device copy, increasing its readers to %d", gpu_device->super.name, candidate, candidate->super.super.obj_reference_count, target->cuda_index, candidate->readers+1); - parsec_atomic_fetch_inc_int32( &candidate->readers ); + int readers = parsec_atomic_fetch_inc_int32(&candidate->readers); + if( readers < 0 ) { + PARSEC_DEBUG_VERBOSE(10, parsec_gpu_output_stream, + "GPU[%s]:\tCandidate %p [ref_count %d] on CUDA device %d is being repurposed by owner device. Looking for another candidate", + gpu_device->super.name, candidate, candidate->super.super.obj_reference_count, target->cuda_index); + /* We are trying to use a candidate that is repurposed by the owner device. Let's find another one */ + parsec_atomic_fetch_add_int32(&candidate->readers, -1); + continue; + } candidate_dev = target; goto src_selected; } @@ -2040,8 +2059,11 @@ progress_stream( parsec_device_gpu_module_t* gpu_device, } /** - * @brief This function schedule the move of all the data required for a - * specific task from the main memory into the GPU memory. + * @brief This function prepare memory on the target device for all the inputs and output + * of the task, and then initiate the necessary copies from the best location of the input + * data. The best location is defined as any other accelerator that has the same version + * of the data (taking advantage of faster accelerator-to-accelerator connectors, such as + * NVLink), or from the CPU memory if no other candidate is found. * * @returns * a positive number: the number of data to be moved. @@ -2061,7 +2083,7 @@ parsec_cuda_kernel_push( parsec_device_gpu_module_t *gpu_device, char tmp[MAX_TASK_STRLEN]; #endif - /* if not changes were made to the available memory dont waste time */ + /* if no changes were made to the available memory dont waste time */ if( gpu_task->last_data_check_epoch == gpu_device->data_avail_epoch ) return PARSEC_HOOK_RETURN_AGAIN; PARSEC_DEBUG_VERBOSE(10, parsec_gpu_output_stream,
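
The core of this change is a small lock-free handshake on the readers counter of a GPU copy: the owner device may only repurpose a copy if it can CAS readers from 0 to INT_MIN, while a peer device that wants the copy as a device-to-device source increments readers and backs off if the previous value was negative. Below is a minimal, self-contained sketch of that protocol using standard C11 atomics; the struct and function names are illustrative stand-ins rather than the PaRSEC API (the patch itself uses parsec_atomic_cas_int32 and parsec_atomic_fetch_inc_int32 on the readers field of parsec_gpu_data_copy_t).

#include <stdatomic.h>
#include <stdbool.h>
#include <stdint.h>

/* Illustrative stand-in for the readers counter carried by a GPU data copy. */
typedef struct {
    _Atomic int32_t readers;
} copy_sketch_t;

/* Owner device: try to reclaim a copy popped from the LRU. Succeeds only if no
 * peer currently reads it; INT32_MIN marks the copy as being repurposed so that
 * late readers can detect the conflict. */
static bool owner_try_reclaim(copy_sketch_t *copy)
{
    int32_t expected = 0;
    return atomic_compare_exchange_strong(&copy->readers, &expected, INT32_MIN);
}

/* Peer device: try to use the copy as the source of a device-to-device transfer.
 * If the previous value was negative, the owner is repurposing the copy, so undo
 * the increment and look for another candidate. */
static bool peer_try_acquire_as_source(copy_sketch_t *copy)
{
    int32_t prev = atomic_fetch_add(&copy->readers, 1);
    if (prev < 0) {
        atomic_fetch_sub(&copy->readers, 1);
        return false;
    }
    return true;
}

Because INT32_MIN sits far below any realistic reader count, the sign of the counter doubles as a "being repurposed" flag, so the two devices coordinate without introducing an extra lock around the LRU.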