From ac4799d74a0bfd37532290319cd70e18ec34baee Mon Sep 17 00:00:00 2001 From: George Bosilca Date: Mon, 2 Oct 2023 19:13:12 +0000 Subject: [PATCH 1/2] Fix the device statistics. Signed-off-by: George Bosilca --- parsec/mca/device/device.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/parsec/mca/device/device.c b/parsec/mca/device/device.c index c89df8ca6..0ea0ddf31 100644 --- a/parsec/mca/device/device.c +++ b/parsec/mca/device/device.c @@ -385,7 +385,7 @@ void parsec_mca_device_dump_and_reset_statistics(parsec_context_t* parsec_contex required_out[device->device_index] += device->required_data_out; d2dtmp = 0; for(unsigned int j = 1; j < device->data_in_array_size; j++) { - d2dtmp += device->data_in_from_device[i]; + d2dtmp += device->data_in_from_device[j]; } d2d[device->device_index] += d2dtmp; /* Update the context-level statistics */ From f69460cbae5ab6be34b4f7c16c44ea973718c287 Mon Sep 17 00:00:00 2001 From: George Bosilca Date: Sat, 30 Sep 2023 19:19:36 +0000 Subject: [PATCH 2/2] Prevent race condition in accelerator copies management Prevent the owner device from repurposing a GPU copy while another GPU is planning to use it as a candidate for a d2d transfer. Signed-off-by: George Bosilca --- parsec/data.c | 6 +- parsec/mca/device/cuda/device_cuda_module.c | 192 ++++++++++++-------- 2 files changed, 114 insertions(+), 84 deletions(-) diff --git a/parsec/data.c b/parsec/data.c index 525b04bfd..dff05f287 100644 --- a/parsec/data.c +++ b/parsec/data.c @@ -329,11 +329,11 @@ int parsec_data_start_transfer_ownership_to_copy(parsec_data_t* data, copy = data->device_copies[device]; assert( NULL != copy ); - + PARSEC_DEBUG_VERBOSE(10, parsec_debug_output, "DEV[%d]: start transfer ownership of data %p to copy %p in mode %d", device, data, copy, access_mode); - + switch( copy->coherency_state ) { case PARSEC_DATA_COHERENCY_INVALID: transfer_required = 1; @@ -415,8 +415,6 @@ int parsec_data_start_transfer_ownership_to_copy(parsec_data_t* data, } } - assert( (!transfer_required) || (data->device_copies[valid_copy]->version >= copy->version) ); - if( PARSEC_FLOW_ACCESS_READ & access_mode ) { copy->readers++; } diff --git a/parsec/mca/device/cuda/device_cuda_module.c b/parsec/mca/device/cuda/device_cuda_module.c index 0f79c92a6..07c0da75e 100644 --- a/parsec/mca/device/cuda/device_cuda_module.c +++ b/parsec/mca/device/cuda/device_cuda_module.c @@ -27,6 +27,8 @@ #include #include +#define PARSEC_CUDA_DATA_COPY_ATOMIC_SENTINEL 1024 + static int parsec_cuda_data_advise(parsec_device_module_t *dev, parsec_data_t *data, int advice); /** * According to @@ -874,8 +876,8 @@ parsec_gpu_data_reserve_device_space( parsec_device_cuda_module_t* cuda_device, parsec_gpu_data_copy_t* temp_loc[MAX_PARAM_COUNT], *gpu_elem, *lru_gpu_elem; parsec_data_t* master, *oldmaster; const parsec_flow_t *flow; - int i, j, data_avail_epoch = 0; - parsec_gpu_data_copy_t *gpu_mem_lru_cycling; + int i, j, data_avail_epoch = 0, copy_readers_update = 0; + parsec_gpu_data_copy_t *gpu_mem_lru_cycling = NULL; parsec_device_gpu_module_t *gpu_device = &cuda_device->super; #if defined(PARSEC_DEBUG_NOISIER) @@ -888,7 +890,6 @@ parsec_gpu_data_reserve_device_space( parsec_device_cuda_module_t* cuda_device, * corresponding data on the GPU available. 
*/ for( i = 0; i < this_task->task_class->nb_flows; i++ ) { - gpu_mem_lru_cycling = NULL; flow = gpu_task->flow[i]; assert( flow && (flow->flow_index == i) ); @@ -902,7 +903,6 @@ parsec_gpu_data_reserve_device_space( parsec_device_cuda_module_t* cuda_device, if (this_task->data[i].data_in == NULL) continue; - /* BEWARE: here we are using the CPU copy as an input */ master = this_task->data[i].data_in->original; parsec_atomic_lock(&master->lock); gpu_elem = PARSEC_DATA_GET_COPY(master, gpu_device->super.device_index); @@ -916,10 +916,10 @@ parsec_gpu_data_reserve_device_space( parsec_device_cuda_module_t* cuda_device, flow->name, i, gpu_elem, gpu_elem->data_transfer_status == PARSEC_DATA_STATUS_UNDER_TRANSFER ? " [in transfer]" : ""); if ( gpu_elem->data_transfer_status == PARSEC_DATA_STATUS_UNDER_TRANSFER ) { - /* We might want to do something special if the data is under transfer, but in the current - * version we don't need to because an event is always generated for the push_in of each - * task on the unique push_in stream. - */ + /* We might want to do something special if the data is under transfer, but in the current + * version we don't need to because an event is always generated for the push_in of each + * task on the unique push_in stream. + */ } parsec_atomic_unlock(&master->lock); continue; @@ -932,27 +932,29 @@ parsec_gpu_data_reserve_device_space( parsec_device_cuda_module_t* cuda_device, gpu_device->super.name, task_name, gpu_elem, gpu_task->flow_nb_elts[i], gpu_elem->super.super.obj_reference_count, master); gpu_elem->flags = PARSEC_DATA_FLAG_PARSEC_OWNED | PARSEC_DATA_FLAG_PARSEC_MANAGED; - malloc_data: + malloc_data: assert(0 != (gpu_elem->flags & PARSEC_DATA_FLAG_PARSEC_OWNED) ); gpu_elem->device_private = zone_malloc(gpu_device->memory, gpu_task->flow_nb_elts[i]); if( NULL == gpu_elem->device_private ) { #endif - find_another_data: + find_another_data: + copy_readers_update = 1; + temp_loc[i] = NULL; /* Look for a data_copy to free */ lru_gpu_elem = (parsec_gpu_data_copy_t*)parsec_list_pop_front(&gpu_device->gpu_mem_lru); if( NULL == lru_gpu_elem ) { /* We can't find enough room on the GPU. Insert the tiles in the begining of - * the LRU (in order to be reused asap) and return without scheduling the task. + * the LRU (in order to be reused asap) and return with error. 
 */
-  release_temp_and_return:
+    release_temp_and_return:
 #if defined(PARSEC_DEBUG_NOISIER)
            PARSEC_DEBUG_VERBOSE(2, parsec_gpu_output_stream,
                                 "GPU[%s]:%s:\tRequest space on GPU failed for flow %s index %d/%d for task %s",
                                 gpu_device->super.name, task_name,
                                 flow->name, i, this_task->task_class->nb_flows, task_name );
 #endif  /* defined(PARSEC_DEBUG_NOISIER) */
-            for( j = 0; j < i; j++ ) {
+            for( j = 0; j <= i; j++ ) {
                 /* This flow could be a control flow */
                 if( NULL == temp_loc[j] ) continue;
                 /* This flow could be non-parsec-owned, in which case we can't reclaim it */
@@ -970,7 +972,6 @@ parsec_gpu_data_reserve_device_space( parsec_device_cuda_module_t* cuda_device,
            parsec_atomic_unlock(&master->lock);
            return PARSEC_HOOK_RETURN_AGAIN;
        }
-
        PARSEC_LIST_ITEM_SINGLETON(lru_gpu_elem);
        PARSEC_DEBUG_VERBOSE(20, parsec_gpu_output_stream,
                             "GPU[%s]:%s: Evaluate LRU-retrieved CUDA copy %p [ref_count %d] original %p",
                             gpu_device->super.name, task_name,
                             lru_gpu_elem, lru_gpu_elem->super.super.obj_reference_count,
                             lru_gpu_elem->original);
+       if( gpu_mem_lru_cycling == lru_gpu_elem ) {
+           PARSEC_DEBUG_VERBOSE(2, parsec_gpu_output_stream,
+                                "GPU[%s]: Cycle detected on allocating memory for %s",
+                                gpu_device->super.name, task_name);
+           temp_loc[i] = lru_gpu_elem;  /* save it such that it gets pushed back into the LRU */
+           goto release_temp_and_return;
+       }
+
        /* If there are pending readers, let the gpu_elem loose. This is a weak coordination
         * protocol between here and the parsec_gpu_data_stage_in, where the readers don't necessarily
         * always remove the data from the LRU.
         */
@@ -987,7 +996,12 @@ parsec_gpu_data_reserve_device_space( parsec_device_cuda_module_t* cuda_device,
                                 "GPU[%s]:%s: Drop LRU-retrieved CUDA copy %p [readers %d, ref_count %d] original %p",
                                 gpu_device->super.name, task_name,
                                 lru_gpu_elem, lru_gpu_elem->readers, lru_gpu_elem->super.super.obj_reference_count,
                                 lru_gpu_elem->original);
-           goto find_another_data; // TODO: add an assert of some sort to check for leaks here?
+           /* We do not add the copy back into the LRU. This means that for now this copy is not
+            * tracked via the LRU (despite being only used in read mode) and instead is dangling
+            * on other tasks. Thus, it will eventually need to be added back into the LRU when
+            * the current task using it completes.
+            */
+           goto find_another_data;
        }
        /* It's also possible that the ref_count of that element is bigger than 1
         * In that case, it's because some task completion did not execute yet, and
@@ -997,29 +1011,20 @@ parsec_gpu_data_reserve_device_space( parsec_device_cuda_module_t* cuda_device,
            /* It's also possible (although unlikely) that we livelock here:
             * if gpu_mem_lru has *only* elements with readers == 0 but
             * ref_count > 1, then we might pop/push forever. We save the
-             * earliest element found and if we see it again it means we
-             * run over the entire list without finding a suitable replacement.
-             * We need to make progress on something else. This remains safe for as long as the
-             * LRU is only modified by a single thread (in this case the current thread).
-             */
+            * earliest element found and if we see it again it means we
+            * run over the entire list without finding a suitable replacement.
+            * We need to make progress on something else. This remains safe for as long as the
+            * LRU is only modified by a single thread (in this case the current thread).
+            */
            PARSEC_DEBUG_VERBOSE(20, parsec_gpu_output_stream,
                                 "GPU[%s]:%s: Push back LRU-retrieved CUDA copy %p [readers %d, ref_count %d] original %p",
                                 gpu_device->super.name, task_name,
                                 lru_gpu_elem, lru_gpu_elem->readers, lru_gpu_elem->super.super.obj_reference_count,
                                 lru_gpu_elem->original);
            assert(0 != (lru_gpu_elem->flags & PARSEC_DATA_FLAG_PARSEC_OWNED) );
            parsec_list_push_back(&gpu_device->gpu_mem_lru, &lru_gpu_elem->super);
-           if( gpu_mem_lru_cycling == lru_gpu_elem ) {
-               PARSEC_DEBUG_VERBOSE(2, parsec_gpu_output_stream,
-                                    "GPU[%s]: Cycle detected on allocating memory for %s",
-                                    gpu_device->super.name, task_name);
-               goto release_temp_and_return;
-           }
-           if( NULL == gpu_mem_lru_cycling ) {
-               gpu_mem_lru_cycling = lru_gpu_elem;
-           }
-           goto find_another_data;
+           gpu_mem_lru_cycling = (NULL == gpu_mem_lru_cycling) ? lru_gpu_elem : gpu_mem_lru_cycling;  /* update the cycle detector */
+           goto find_another_data;
        }
-       /* Make sure the new GPU element is clean and ready to be used */
        assert( master != lru_gpu_elem->original );
        if ( NULL != lru_gpu_elem->original ) {
@@ -1031,16 +1036,8 @@ parsec_gpu_data_reserve_device_space( parsec_device_cuda_module_t* cuda_device,
             * need to protect all accesses to gpu_mem_lru with the locked version */
            assert(0 != (lru_gpu_elem->flags & PARSEC_DATA_FLAG_PARSEC_OWNED) );
            parsec_list_push_back(&gpu_device->gpu_mem_lru, &lru_gpu_elem->super);
-           if( gpu_mem_lru_cycling == lru_gpu_elem ) {
-               PARSEC_DEBUG_VERBOSE(2, parsec_gpu_output_stream,
-                                    "GPU[%s]: Cycle detected on allocating memory for %s",
-                                    gpu_device->super.name, task_name);
-               goto release_temp_and_return;
-           }
-           if( NULL == gpu_mem_lru_cycling ) {
-               gpu_mem_lru_cycling = lru_gpu_elem;
-           }
-           goto find_another_data;
+           gpu_mem_lru_cycling = (NULL == gpu_mem_lru_cycling) ? lru_gpu_elem : gpu_mem_lru_cycling;  /* update the cycle detector */
+           goto find_another_data;
        }
        for( j = 0; j < i; j++ ) {
            if( NULL == this_task->data[j].data_in ) continue;
@@ -1052,22 +1049,37 @@ parsec_gpu_data_reserve_device_space( parsec_device_cuda_module_t* cuda_device,
                /* If we are the owner of this tile we need to make sure it remains available for
                 * other tasks or we run in deadlock situations. */
-               assert( temp_loc[j] == lru_gpu_elem ); /* dont understand how this cannot be true */
                parsec_atomic_unlock( &oldmaster->lock );
                goto find_another_data;
            }
        }
-       if( lru_gpu_elem->readers != 0 ) {
-           /* Damn, another thread started to use this data (as source for an NVLINK transfer). */
+       /* There is still one last thing to ensure: if another accelerator uses this copy as a source
+        * for a d2d transfer it will mark it by atomically increasing the readers. So, we need to
+        * avoid altering the copy while they are using it, by protecting the access to the readers
+        * with a CAS.
+        */
+       if( !parsec_atomic_cas_int32(&lru_gpu_elem->readers, 0, -PARSEC_CUDA_DATA_COPY_ATOMIC_SENTINEL) ) {
+           assert(lru_gpu_elem->readers > 0);
+           /* we can't use this copy, push it back */
+           parsec_list_push_back(&gpu_device->gpu_mem_lru, &lru_gpu_elem->super);
+           gpu_mem_lru_cycling = (NULL == gpu_mem_lru_cycling) ? lru_gpu_elem : gpu_mem_lru_cycling;  /* update the cycle detector */
+           PARSEC_DEBUG_VERBOSE(20, parsec_gpu_output_stream,
+                                "GPU[%s]:%s: Push back LRU-retrieved CUDA copy %p [readers %d, ref_count %d] original %p : Concurrent accesses",
+                                gpu_device->super.name, task_name,
+                                lru_gpu_elem, lru_gpu_elem->readers, lru_gpu_elem->super.super.obj_reference_count,
+                                lru_gpu_elem->original);
            parsec_atomic_unlock( &oldmaster->lock );
            goto find_another_data;
        }
+       copy_readers_update = PARSEC_CUDA_DATA_COPY_ATOMIC_SENTINEL;
+
        /* Check if this copy is the last dangling reference to the oldmaster. This is safe to do as we own one of the data refcounts. */
        int do_unlock = oldmaster->super.obj_reference_count != 1;
        parsec_data_copy_detach(oldmaster, lru_gpu_elem, gpu_device->super.device_index);
-       /* detach could have released the oldmaster if it only had a single refcount */
+       parsec_atomic_wmb();
+       /* detach could have released the oldmaster if it only had a single refcount */
        if( do_unlock ) parsec_atomic_unlock( &oldmaster->lock );
-       assert(lru_gpu_elem->readers == 0);
+
        /* The data is not used, it's not one of ours, and it has been detached from the device
         * so no other device can use it as a source for their copy : we can free it or reuse it */
        PARSEC_DEBUG_VERBOSE(20, parsec_gpu_output_stream,
@@ -1120,8 +1132,8 @@ parsec_gpu_data_reserve_device_space( parsec_device_cuda_module_t* cuda_device,
                             gpu_elem, gpu_elem->device_private, gpu_elem->super.super.obj_reference_count, master);
 #if defined(PARSEC_PROF_TRACE)
        if((parsec_gpu_trackable_events & PARSEC_PROFILE_GPU_TRACK_MEM_USE) &&
-            (gpu_device->exec_stream[0]->prof_event_track_enable ||
-             gpu_device->exec_stream[1]->prof_event_track_enable)) {
+           (gpu_device->exec_stream[0]->prof_event_track_enable ||
+            gpu_device->exec_stream[1]->prof_event_track_enable)) {
            parsec_profiling_trace_flags(gpu_device->exec_stream[0]->profiling,
                                         parsec_gpu_allocate_memory_key, (int64_t)gpu_elem->device_private,
                                         gpu_device->super.device_index,
@@ -1131,9 +1143,18 @@
 #else
        gpu_elem = lru_gpu_elem;
 #endif
-       assert( 0 == gpu_elem->readers );
+
+       /* Do not push it back into the LRU for now to prevent others from discovering
+        * this copy and trying to acquire it. If we fail to find all the copies we need,
+        * we will push it back in release_temp_and_return; otherwise it will become
+        * available once properly updated.
+        */
        gpu_elem->coherency_state = PARSEC_DATA_COHERENCY_INVALID;
-       gpu_elem->version = 0;
+       gpu_elem->version = UINT_MAX;  /* scrap value for now */
+       /* The readers must be manipulated via atomic operations to avoid race conditions
+        * with threads that would use them as a candidate for updating their own copies.
+        */
+       parsec_atomic_fetch_add_int32(&gpu_elem->readers, copy_readers_update);
        PARSEC_DEBUG_VERBOSE(10, parsec_gpu_output_stream,
                             "GPU[%s]: GPU copy %p [ref_count %d] gets created with version 0 at %s:%d",
                             gpu_device->super.name,
@@ -1149,7 +1170,6 @@ parsec_gpu_data_reserve_device_space( parsec_device_cuda_module_t* cuda_device,
                             gpu_device->super.name, task_name,
                             gpu_elem, gpu_elem->super.super.obj_reference_count);
        assert(0 != (gpu_elem->flags & PARSEC_DATA_FLAG_PARSEC_OWNED) );
-       parsec_list_push_back(&gpu_device->gpu_mem_lru, (parsec_list_item_t*)gpu_elem);
        parsec_atomic_unlock(&master->lock);
    }
    if( data_avail_epoch ) {
@@ -1242,7 +1262,7 @@ parsec_default_cuda_stage_out(parsec_gpu_task_t *gtask,
 /**
  * If the most current version of the data is not yet available on the GPU memory
  * schedule a transfer.
- * Returns hook special retrun codes or a positive number:
+ * Returns hook special return codes or a positive number:
  * HOOK_DONE: The most recent version of the data is already available on the GPU
  * 1: A copy has been scheduled on the corresponding stream
  * HOOK_ERROR: A copy cannot be issued due to CUDA.
@@ -1295,7 +1315,7 @@ parsec_gpu_data_stage_in( parsec_device_cuda_module_t* cuda_device,
    transfer_from = parsec_data_start_transfer_ownership_to_copy(original, gpu_device->super.device_index, (uint8_t)type);
    if( PARSEC_FLOW_ACCESS_WRITE & type && gpu_task->task_type != PARSEC_GPU_TASK_TYPE_PREFETCH ) {
-       gpu_elem->version++;  /* on to the next version */
+       gpu_elem->version = candidate->version + 1;  /* on to the next version */
        PARSEC_DEBUG_VERBOSE(10, parsec_gpu_output_stream,
                             "GPU[%s]: GPU copy %p [ref_count %d] increments version to %d at %s:%d",
                             gpu_device->super.name,
@@ -1380,14 +1400,26 @@ parsec_gpu_data_stage_in( parsec_device_cuda_module_t* cuda_device,
                             gpu_device->super.name, candidate, candidate->super.super.obj_reference_count, target->cuda_index);
            continue;
        }
-       /* candidate is the best candidate to do D2D. Let's register as a reader for this
-        * data copy, and we can unlock and schedule the D2D. */
+       /* We have a candidate for the d2d transfer. */
+       int readers = parsec_atomic_fetch_inc_int32(&candidate->readers);
+       if( readers >= 0 ) {
+           parsec_atomic_rmb();
+           /* Coordination protocol with the owner of the candidate. If the owner had repurposed the copy, by the
+            * time we successfully increase the readers, the device copy will be associated with different data.
+            */
+           if( (candidate->original == original) && (candidate->version == task_data->data_in->version) ) {
+               PARSEC_DEBUG_VERBOSE(10, parsec_gpu_output_stream,
+                                    "GPU[%s]:\tData copy %p [ref_count %d] on CUDA device %d is the best candidate to do Device to Device copy, increasing its readers to %d",
+                                    gpu_device->super.name, candidate, candidate->super.super.obj_reference_count, target->cuda_index, candidate->readers+1);
+               candidate_dev = target;
+               goto src_selected;
+           }
+       }
        PARSEC_DEBUG_VERBOSE(10, parsec_gpu_output_stream,
-                            "GPU[%s]:\tData copy %p [ref_count %d] on CUDA device %d is the best candidate to do Device to Device copy, increasing its readers to %d",
-                            gpu_device->super.name, candidate, candidate->super.super.obj_reference_count, target->cuda_index, candidate->readers+1);
-       parsec_atomic_fetch_inc_int32( &candidate->readers );
-       candidate_dev = target;
-       goto src_selected;
+                            "GPU[%s]:\tCandidate %p [ref_count %d] on CUDA device %d is being repurposed by the owner device. Looking for another candidate",
+                            gpu_device->super.name, candidate, candidate->super.super.obj_reference_count, target->cuda_index);
+       /* We are trying to use a candidate that is being repurposed by the owner device. Let's find another one */
+       parsec_atomic_fetch_add_int32(&candidate->readers, -1);
    }
    if( potential_alt_src ) {
        /* We found a potential alternative source, but it's not ready now,
@@ -1413,8 +1445,6 @@ parsec_gpu_data_stage_in( parsec_device_cuda_module_t* cuda_device,
                         candidate_dev->super.super.device_index, candidate->version, (void*)candidate->device_private, candidate, candidate->super.super.obj_reference_count,
                         gpu_device->super.device_index, gpu_elem->version, (void*)gpu_elem->device_private);
-
-   assert((gpu_elem->version < candidate->version) || (gpu_elem->data_transfer_status == PARSEC_DATA_STATUS_NOT_TRANSFER));
 #if defined(PARSEC_PROF_TRACE)
    if( gpu_stream->prof_event_track_enable ) {
        parsec_profile_data_collection_info_t info;
@@ -1822,15 +1852,15 @@ parsec_gpu_callback_complete_push(parsec_device_gpu_module_t *gpu_device,
            /* Nobody is at the door to handle that event on the source of that data...
             * we do the command directly */
            parsec_atomic_lock( &source->original->lock );
-           source->readers--;
+           int readers = parsec_atomic_fetch_sub_int32(&source->readers, 1) - 1;
            PARSEC_DEBUG_VERBOSE(20, parsec_gpu_output_stream,
                                 "GPU[%s]:\tExecuting D2D transfer complete for copy %p [ref_count %d] for "
                                 "device %s -- readers now %d",
                                 gpu_device->super.name, source, source->super.super.obj_reference_count,
                                 src_device->super.name,
-                                source->readers);
-           assert(source->readers >= 0);
-           if(0 == source->readers) {
+                                readers);
+           assert(readers >= 0);
+           if(0 == readers) {
                PARSEC_DEBUG_VERBOSE(20, parsec_gpu_output_stream,
                                     "GPU[%s]:\tMake read-only copy %p [ref_count %d] available",
                                     gpu_device->super.name, source,
@@ -1892,8 +1922,8 @@ parsec_gpu_callback_complete_push(parsec_device_gpu_module_t *gpu_device,
                                 tmp, gpu_copy->readers,
                                 gpu_copy->device_index, gpu_copy->version, gpu_copy->flags, gpu_copy->coherency_state, gpu_copy->data_transfer_status);
-           gpu_copy->readers--;
-           if( 0 == gpu_copy->readers ) {
+           int readers = parsec_atomic_fetch_sub_int32(&gpu_copy->readers, 1) - 1;
+           if( 0 == readers ) {
                parsec_list_item_ring_chop((parsec_list_item_t*)gpu_copy);
                PARSEC_LIST_ITEM_SINGLETON(gpu_copy);
                PARSEC_DEBUG_VERBOSE(3, parsec_gpu_output_stream,
@@ -2040,8 +2070,11 @@ progress_stream( parsec_device_gpu_module_t* gpu_device,
 }
 
 /**
- * @brief This function schedule the move of all the data required for a
- * specific task from the main memory into the GPU memory.
+ * @brief This function prepares memory on the target device for all the inputs and outputs
+ * of the task, and then initiates the necessary copies from the best location of the input
+ * data. The best location is defined as any other accelerator that has the same version
+ * of the data (taking advantage of faster accelerator-to-accelerator connectors, such as
+ * NVLink), or the CPU memory if no other candidate is found.
  *
  * @returns
  *     a positive number: the number of data to be moved.
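
The owner/reader handshake this patch introduces (CAS to a negative sentinel on the owner side, optimistic increment plus revalidation on the reader side) can be modeled outside of PaRSEC with plain C11 atomics. In the standalone sketch below, copy_t, OWNER_SENTINEL, owner_try_repurpose and reader_try_register are illustrative names only, not PaRSEC API; the real code additionally works under the data lock and issues parsec_atomic_rmb() before revalidating:

/* Minimal model of the coordination protocol: the owner repurposes a copy
 * only if it can CAS readers from 0 to a large negative sentinel; a reader
 * optimistically increments readers and then revalidates the copy identity. */
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

#define OWNER_SENTINEL 1024   /* mirrors PARSEC_CUDA_DATA_COPY_ATOMIC_SENTINEL */

typedef struct {
    atomic_int readers;    /* > 0: active d2d readers; < 0: claimed by the owner */
    int version;
    const void *original;  /* identity of the data this copy currently caches */
} copy_t;

/* Owner side: succeeds only when no reader is registered; after success any
 * concurrent increment still leaves readers negative, so readers back off. */
static bool owner_try_repurpose(copy_t *c)
{
    int expected = 0;
    return atomic_compare_exchange_strong(&c->readers, &expected, -OWNER_SENTINEL);
}

/* Reader side: register first, then check that the copy was not repurposed
 * in the meantime; undo the registration when the check fails. */
static bool reader_try_register(copy_t *c, const void *original, int version)
{
    int prev = atomic_fetch_add(&c->readers, 1);
    if( prev >= 0 && c->original == original && c->version == version )
        return true;                   /* safe to use as a d2d source */
    atomic_fetch_sub(&c->readers, 1);  /* lost the race against the owner */
    return false;
}

int main(void)
{
    int payload = 42;
    copy_t c = { .version = 3, .original = &payload };
    atomic_init(&c.readers, 0);

    printf("reader registered: %d\n", reader_try_register(&c, &payload, 3)); /* 1 */
    printf("owner claim while read: %d\n", owner_try_repurpose(&c));         /* 0 */
    atomic_fetch_sub(&c.readers, 1);  /* reader completes */
    printf("owner claim after read: %d\n", owner_try_repurpose(&c));         /* 1 */
    return 0;
}

Under this scheme neither side ever blocks the other on the readers counter, and the sentinel magnitude only needs to exceed the largest plausible concurrent reader count.
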
@@ -2061,7 +2094,7 @@ parsec_cuda_kernel_push( parsec_device_gpu_module_t *gpu_device,
    char tmp[MAX_TASK_STRLEN];
 #endif
-   /* if not changes were made to the available memory dont waste time */
+   /* if no changes were made to the available memory, don't waste time */
    if( gpu_task->last_data_check_epoch == gpu_device->data_avail_epoch )
        return PARSEC_HOOK_RETURN_AGAIN;
    PARSEC_DEBUG_VERBOSE(10, parsec_gpu_output_stream,
@@ -2268,18 +2301,17 @@ parsec_cuda_kernel_pop( parsec_device_gpu_module_t *gpu_device,
        }
        parsec_atomic_lock(&original->lock);
        if( flow->flow_flags & PARSEC_FLOW_ACCESS_READ ) {
-           gpu_copy->readers--;
-           if( gpu_copy->readers < 0 ) {
+           int current_readers = parsec_atomic_fetch_sub_int32(&gpu_copy->readers, 1) - 1;
+           if( current_readers < 0 ) {
               PARSEC_DEBUG_VERBOSE(10, parsec_gpu_output_stream,
                                    "GPU[%s]: While trying to Pop %s, gpu_copy %p [ref_count %d] on flow %d with original %p had a negative number of readers (%d)",
                                    gpu_device->super.name, parsec_task_snprintf(tmp, MAX_TASK_STRLEN, this_task),
                                    gpu_copy, gpu_copy->super.super.obj_reference_count,
-                                   i, original, gpu_copy->readers);
+                                   i, original, current_readers);
           }
-           assert(gpu_copy->readers >= 0);
-           if( (0 == gpu_copy->readers) &&
-               !(flow->flow_flags & PARSEC_FLOW_ACCESS_WRITE) ) {
+           assert(current_readers >= 0);
+           if( (0 == current_readers) && !(flow->flow_flags & PARSEC_FLOW_ACCESS_WRITE) ) {
               PARSEC_DEBUG_VERBOSE(20, parsec_gpu_output_stream,
                                    "GPU[%s]:\tMake read-only copy %p [ref_count %d] available on flow %s",
                                    gpu_device->super.name, gpu_copy, gpu_copy->super.super.obj_reference_count, flow->name);
@@ -2292,7 +2324,7 @@ parsec_cuda_kernel_pop( parsec_device_gpu_module_t *gpu_device,
           }
           PARSEC_DEBUG_VERBOSE(20, parsec_gpu_output_stream,
                                "GPU[%s]:\tread copy %p [ref_count %d] on flow %s has readers (%i)",
-                               gpu_device->super.name, gpu_copy, gpu_copy->super.super.obj_reference_count, flow->name, gpu_copy->readers);
+                               gpu_device->super.name, gpu_copy, gpu_copy->super.super.obj_reference_count, flow->name, current_readers);
       }
       if( flow->flow_flags & PARSEC_FLOW_ACCESS_WRITE ) {
           assert( gpu_copy == parsec_data_get_copy(gpu_copy->original, gpu_device->super.device_index) );
@@ -2444,7 +2476,7 @@ parsec_cuda_kernel_epilog( parsec_device_gpu_module_t *gpu_device,
         */
        this_task->data[i].data_out = cpu_copy;
-       assert( 0 == gpu_copy->readers );
+       assert( 0 <= gpu_copy->readers );
        if( gpu_task->pushout & (1 << i) ) {
            PARSEC_DEBUG_VERBOSE(20, parsec_gpu_output_stream,
@@ -2665,7 +2697,7 @@ parsec_cuda_kernel_scheduler( parsec_execution_stream_t *es,
        /* Reschedule the task. As the chore_id has been modified, another incarnation of the
           task will be executed. */
        if( NULL != progress_task ) {
-               assert(PARSEC_HOOK_RETURN_NEXT == rc);
+           assert(PARSEC_HOOK_RETURN_NEXT == rc);
            parsec_cuda_kernel_cleanout(gpu_device, progress_task);
            __parsec_reschedule(es, progress_task->ec);
            gpu_task = progress_task;
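
One subtlety behind the conversions from readers-- to the atomic calls above: a fetch-and-subtract primitive such as parsec_atomic_fetch_sub_int32 (like C11 atomic_fetch_sub) returns the value the counter held before the subtraction, hence the "- 1" adjustment wherever the code wants the post-decrement reader count. A minimal sketch of the decrement-and-test idiom, with plain C11 atomics standing in for the PaRSEC wrappers:

/* Decrement-and-test idiom: exactly one releasing thread observes the count
 * reach zero, and that thread is responsible for recycling the copy (e.g.
 * pushing it back onto the LRU). */
#include <stdatomic.h>
#include <assert.h>
#include <stdio.h>

static atomic_int readers;

static int release_reader(void)
{
    int current = atomic_fetch_sub(&readers, 1) - 1;  /* value after the decrement */
    assert(current >= 0);  /* a release without a matching acquire is a bug */
    return current == 0;   /* true for exactly one caller: the last reader */
}

int main(void)
{
    atomic_init(&readers, 2);
    printf("last reader? %d\n", release_reader());  /* 0: one reader still active */
    printf("last reader? %d\n", release_reader());  /* 1: this call was the last */
    return 0;
}

Testing against the pre-decrement value instead (as two of the sites did before the "- 1" fix) would fire the "last reader" branch one release too late, after the counter had already gone negative.
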