diff --git a/CMakeLists.txt b/CMakeLists.txt
index dd2c7ce1a..0e2934523 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -151,6 +151,7 @@ set(PARSEC_MAX_LOCAL_COUNT 20 CACHE STRING "Number of local variables for tasks
 set(PARSEC_MAX_PARAM_COUNT 20 CACHE STRING "Number of parameters for tasks (default 20)")
 set(PARSEC_MAX_DEP_IN_COUNT 10 CACHE STRING "Number of input flows for each task (default 10)")
 set(PARSEC_MAX_DEP_OUT_COUNT 10 CACHE STRING "Number of output flows for each task (default 10)")
+set(PARSEC_MAX_DEVICE_FLOWS ${PARSEC_MAX_PARAM_COUNT} CACHE STRING "Number of flows for device tasks (default: same as PARSEC_MAX_PARAM_COUNT)")
 
 ### PaRSEC PP options
 set(PARSEC_PTGPP_FLAGS "--noline" CACHE STRING "Additional parsec-ptgpp precompiling flags (separate flags with ';')" )
diff --git a/parsec/include/parsec/parsec_config_bottom.h b/parsec/include/parsec/parsec_config_bottom.h
index 88a9d1045..bda96035f 100644
--- a/parsec/include/parsec/parsec_config_bottom.h
+++ b/parsec/include/parsec/parsec_config_bottom.h
@@ -160,6 +160,24 @@ typedef int32_t parsec_dependency_t;
 typedef int32_t parsec_dependency_t;
 #endif
 
+#if ((MAX_PARAM_COUNT <= 16) && (PARSEC_MAX_DEVICE_FLOWS <= 16))
+typedef uint16_t parsec_flow_mask_t;
+#elif ((MAX_PARAM_COUNT <= 32) && (PARSEC_MAX_DEVICE_FLOWS <= 32))
+typedef uint32_t parsec_flow_mask_t;
+#elif ((MAX_PARAM_COUNT <= 64) && (PARSEC_MAX_DEVICE_FLOWS <= 64))
+typedef uint64_t parsec_flow_mask_t;
+#elif ((MAX_PARAM_COUNT <= 128) && (PARSEC_MAX_DEVICE_FLOWS <= 128)) && defined(PARSEC_HAVE_INT128)
+typedef __uint128_t parsec_flow_mask_t;
+#else
+#error Failed to find a proper type for the PaRSEC flow mask. \
+       Make sure MAX_PARAM_COUNT and PARSEC_MAX_DEVICE_FLOWS \
+       are at most 128, or at most 64 if 128-bit integers are not supported.
+#endif
+
+#define PARSEC_FLOW_MASK(_id) (((parsec_flow_mask_t)1) << (_id))
+#define PARSEC_CHECK_FLOW_MASK(_mask, _id) (!!((_mask) & PARSEC_FLOW_MASK(_id)))
+
+
 /*
  * A set of constants defining the capabilities of the underlying
  * runtime.
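
A minimal standalone sketch (not part of the patch) of how the type selection and the two helper macros behave once more than 32 flows are configured; the value 40 for PARSEC_MAX_DEVICE_FLOWS, and the resulting uint64_t selection, are illustrative assumptions:

    /* Illustrative only: assumes PARSEC_MAX_DEVICE_FLOWS was configured to 40,
     * so the #if chain above selects uint64_t for parsec_flow_mask_t. */
    #include <stdint.h>
    #include <stdio.h>

    typedef uint64_t parsec_flow_mask_t;
    #define PARSEC_FLOW_MASK(_id) (((parsec_flow_mask_t)1) << (_id))
    #define PARSEC_CHECK_FLOW_MASK(_mask, _id) (!!((_mask) & PARSEC_FLOW_MASK(_id)))

    int main(void)
    {
        parsec_flow_mask_t mask = 0;
        /* Mark flows 3 and 37; flow 37 would not fit in the previous
         * '1U << i' arithmetic on a 32-bit mask, which is the limitation
         * this patch lifts. */
        mask |= PARSEC_FLOW_MASK(3);
        mask |= PARSEC_FLOW_MASK(37);
        for (int i = 0; i < 40; i++) {
            if (PARSEC_CHECK_FLOW_MASK(mask, i))
                printf("flow %d is in the mask\n", i);
        }
        return 0;
    }
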
diff --git a/parsec/include/parsec/parsec_options.h.in b/parsec/include/parsec/parsec_options.h.in
index 85af2db8e..b925adff1 100644
--- a/parsec/include/parsec/parsec_options.h.in
+++ b/parsec/include/parsec/parsec_options.h.in
@@ -152,6 +152,9 @@
 /* The max number of output dependencies (not flows) for each task */
 #define MAX_DEP_OUT_COUNT @PARSEC_MAX_DEP_OUT_COUNT@
 
+/* The max number of flows handled by device tasks */
+#define PARSEC_MAX_DEVICE_FLOWS @PARSEC_MAX_DEVICE_FLOWS@
+
 #include "parsec/parsec_config_bottom.h"
 
 #endif  /* PARSEC_CONFIG_H_HAS_BEEN_INCLUDED */
diff --git a/parsec/mca/device/device_gpu.c b/parsec/mca/device/device_gpu.c
index d932e975e..02b08e899 100644
--- a/parsec/mca/device/device_gpu.c
+++ b/parsec/mca/device/device_gpu.c
@@ -841,7 +841,7 @@ parsec_device_data_reserve_space( parsec_device_gpu_module_t* gpu_device,
                                   parsec_gpu_task_t *gpu_task )
 {
     parsec_task_t *this_task = gpu_task->ec;
-    parsec_gpu_data_copy_t* temp_loc[MAX_PARAM_COUNT], *gpu_elem, *lru_gpu_elem;
+    parsec_gpu_data_copy_t* temp_loc[PARSEC_MAX_DEVICE_FLOWS], *gpu_elem, *lru_gpu_elem;
     parsec_data_t* master, *oldmaster;
     const parsec_flow_t *flow;
     int i, j, data_avail_epoch = 0, copy_readers_update = 0;
@@ -1163,7 +1163,7 @@ parsec_device_data_reserve_space( parsec_device_gpu_module_t* gpu_device,
  */
 int
 parsec_default_gpu_stage_in(parsec_gpu_task_t *gtask,
-                            uint32_t flow_mask,
+                            parsec_flow_mask_t flow_mask,
                             parsec_gpu_exec_stream_t *gpu_stream)
 {
     int ret;
@@ -1176,7 +1176,7 @@ parsec_default_gpu_stage_in(parsec_gpu_task_t *gtask,
     parsec_device_transfer_direction_t dir;
 
     for(int i = 0; i < task->task_class->nb_flows; i++) {
-        if( !(flow_mask & (1U << i)) ) continue;
+        if( !PARSEC_CHECK_FLOW_MASK(flow_mask, i) ) continue;
         source = gtask->sources[i];
         dest = task->data[i].data_out;
         src_dev = (parsec_device_gpu_module_t*)parsec_mca_device_get(source->device_index);
@@ -1213,7 +1213,7 @@ parsec_default_gpu_stage_in(parsec_gpu_task_t *gtask,
  */
 int
 parsec_default_gpu_stage_out(parsec_gpu_task_t *gtask,
-                             uint32_t flow_mask,
+                             parsec_flow_mask_t flow_mask,
                              parsec_gpu_exec_stream_t *gpu_stream)
 {
     int ret;
@@ -1225,7 +1225,7 @@ parsec_default_gpu_stage_out(parsec_gpu_task_t *gtask,
     parsec_device_transfer_direction_t dir;
     int i;
     for(i = 0; i < task->task_class->nb_flows; i++){
-        if(flow_mask & (1U << i)){
+        if( PARSEC_CHECK_FLOW_MASK(flow_mask, i) ){
            source = task->data[i].data_out;
            dest = source->original->device_copies[0];
            dst_dev = (parsec_device_gpu_module_t*)parsec_mca_device_get(dest->device_index);
@@ -1497,7 +1497,7 @@ parsec_device_data_stage_in( parsec_device_gpu_module_t* gpu_device,
 #endif
         gpu_task->sources[flow->flow_index] = candidate; /* save the candidate for release on transfer completion */
         /* Push data into the GPU from the source device */
-        int rc = gpu_task->stage_in ? gpu_task->stage_in(gpu_task, (1U << flow->flow_index), gpu_stream): PARSEC_SUCCESS;
+        int rc = gpu_task->stage_in ? gpu_task->stage_in(gpu_task, PARSEC_FLOW_MASK(flow->flow_index), gpu_stream): PARSEC_SUCCESS;
         if(PARSEC_SUCCESS != rc) {
             parsec_warning( "GPU[%d:%s]: gpu_task->stage_in to device rc=%d @%s:%d\n"
                             "\t<<%p on device %d:%s>> -> <<%p on device %d:%s>> [%zu, %s]",
@@ -2117,7 +2117,7 @@ parsec_device_kernel_pop( parsec_device_gpu_module_t *gpu_device,
             /* If the gpu copy is not owned by parsec, we don't manage it at all */
             if( 0 == (gpu_copy->flags & PARSEC_DATA_FLAG_PARSEC_OWNED) ) continue;
             original = gpu_copy->original;
-            rc = gpu_task->stage_out? gpu_task->stage_out(gpu_task, (1U << i), gpu_stream): PARSEC_SUCCESS;
+            rc = gpu_task->stage_out? gpu_task->stage_out(gpu_task, PARSEC_FLOW_MASK(i), gpu_stream): PARSEC_SUCCESS;
             if(PARSEC_SUCCESS != rc) {
                 parsec_warning( "GPU[%d:%s]: gpu_task->stage_out from device rc=%d @%s:%d\n"
                                 "\tdata %s <<%p>> -> <<%p>>\n",
@@ -2206,7 +2206,7 @@ parsec_device_kernel_pop( parsec_device_gpu_module_t *gpu_device,
             assert( ((parsec_list_item_t*)gpu_copy)->list_prev == (parsec_list_item_t*)gpu_copy );
             assert( PARSEC_DATA_COHERENCY_OWNED == gpu_copy->coherency_state );
 
-            if( gpu_task->pushout & (1 << i) ) {
+            if( PARSEC_CHECK_FLOW_MASK(gpu_task->pushout, i) ) {
                 /* TODO: make sure no readers are working on the CPU version */
                 original = gpu_copy->original;
                 PARSEC_DEBUG_VERBOSE(10, parsec_gpu_output_stream,
@@ -2238,7 +2238,7 @@ parsec_device_kernel_pop( parsec_device_gpu_module_t *gpu_device,
                 }
 #endif
                 /* Move the data back into main memory */
-                rc = gpu_task->stage_out? gpu_task->stage_out(gpu_task, (1U << flow->flow_index), gpu_stream): PARSEC_SUCCESS;
+                rc = gpu_task->stage_out? gpu_task->stage_out(gpu_task, PARSEC_FLOW_MASK(flow->flow_index), gpu_stream): PARSEC_SUCCESS;
                 if(PARSEC_SUCCESS != rc) {
                     parsec_warning( "GPU[%d:%s]: gpu_task->stage_out from device rc=%d @%s:%d\n"
                                     "\tdata %s <<%p>> -> <<%p>>\n",
@@ -2342,7 +2342,7 @@ parsec_device_kernel_epilog( parsec_device_gpu_module_t *gpu_device,
 
         assert( 0 <= gpu_copy->readers );
 
-        if( gpu_task->pushout & (1 << i) ) {
+        if( PARSEC_CHECK_FLOW_MASK(gpu_task->pushout, i) ) {
             PARSEC_DEBUG_VERBOSE(20, parsec_gpu_output_stream,
                                  "GPU copy %p [ref_count %d] moved to the read LRU in %s",
                                  gpu_copy, gpu_copy->super.super.obj_reference_count, __func__);
diff --git a/parsec/mca/device/device_gpu.h b/parsec/mca/device/device_gpu.h
index fa25b87a3..37bca14b8 100644
--- a/parsec/mca/device/device_gpu.h
+++ b/parsec/mca/device/device_gpu.h
@@ -60,7 +60,7 @@ typedef int (*parsec_advance_task_function_t)(parsec_device_gpu_module_t *gpu_d
  *
  */
 typedef int (parsec_stage_in_function_t)(parsec_gpu_task_t *gtask,
-                                         uint32_t flow_mask,
+                                         parsec_flow_mask_t flow_mask,
                                          parsec_gpu_exec_stream_t *gpu_stream);
 
 
@@ -74,7 +74,7 @@ typedef int (parsec_stage_in_function_t)(parsec_gpu_task_t *gtask,
  *
  */
 typedef int (parsec_stage_out_function_t)(parsec_gpu_task_t *gtask,
-                                          uint32_t flow_mask,
+                                          parsec_flow_mask_t flow_mask,
                                           parsec_gpu_exec_stream_t *gpu_stream);
 
 /* Function type for releasing a device task. The DSL is responsible for allocating such tasks,
@@ -86,8 +86,8 @@ typedef void (*parsec_release_device_task_function_t)(void*);
 struct parsec_gpu_task_s {
     parsec_list_item_t list_item;
     uint16_t task_type;
-    uint16_t pushout;
     int32_t last_status;
+    parsec_flow_mask_t pushout;
     parsec_advance_task_function_t submit;
     parsec_complete_stage_function_t complete_stage;
     parsec_stage_in_function_t *stage_in;
@@ -102,23 +102,23 @@ struct parsec_gpu_task_s {
         struct {
             parsec_task_t *ec;
             uint64_t last_data_check_epoch;
-            const parsec_flow_t *flow[MAX_PARAM_COUNT]; /* There is no consistent way to access the flows from the task_class,
-                                                         * so the DSL need to provide these flows here.
-                                                         */
-            size_t flow_nb_elts[MAX_PARAM_COUNT]; /* for each flow, size of the data to be allocated
-                                                   * on the GPU.
-                                                   */
-            parsec_data_collection_t *flow_dc[MAX_PARAM_COUNT]; /* for each flow, data collection from which the data
-                                                                 * to be transferred logically belongs to.
-                                                                 * This gives the user the chance to indicate on the JDF
-                                                                 * a data collection to inspect during GPU transfer.
-                                                                 * User may want info from the DC (e.g. mtype),
-                                                                 * & otherwise remote copies don't have any info.
-                                                                 */
+            const parsec_flow_t *flow[PARSEC_MAX_DEVICE_FLOWS]; /* There is no consistent way to access the flows from the task_class,
+                                                                 * so the DSL needs to provide these flows here.
+                                                                 */
+            size_t flow_nb_elts[PARSEC_MAX_DEVICE_FLOWS]; /* for each flow, size of the data to be allocated
+                                                           * on the GPU.
+                                                           */
+            parsec_data_collection_t *flow_dc[PARSEC_MAX_DEVICE_FLOWS]; /* for each flow, the data collection to which the data
+                                                                         * to be transferred logically belongs.
+                                                                         * This gives the user the chance to indicate on the JDF
+                                                                         * a data collection to inspect during GPU transfer.
+                                                                         * User may want info from the DC (e.g. mtype),
+                                                                         * as otherwise remote copies don't have any info.
+                                                                         */
             /* These are private and should not be used outside the device driver */
-            parsec_data_copy_t *sources[MAX_PARAM_COUNT]; /* If the driver decides to acquire the data from a different
-                                                           * source, it will temporary store the best candidate here.
-                                                           */
+            parsec_data_copy_t *sources[PARSEC_MAX_DEVICE_FLOWS]; /* If the driver decides to acquire the data from a different
+                                                                   * source, it will temporarily store the best candidate here.
+                                                                   */
         };
         struct {
             parsec_data_copy_t *copy;
@@ -376,7 +376,7 @@ parsec_device_kernel_scheduler( parsec_device_module_t *module,
  */
 int
 parsec_default_gpu_stage_in(parsec_gpu_task_t *gtask,
-                            uint32_t flow_mask,
+                            parsec_flow_mask_t flow_mask,
                             parsec_gpu_exec_stream_t *gpu_stream);
 
 /* Default stage_out function to transfer data from the GPU device.
@@ -390,7 +390,7 @@ parsec_default_gpu_stage_in(parsec_gpu_task_t *gtask,
  */
 int
 parsec_default_gpu_stage_out(parsec_gpu_task_t *gtask,
-                             uint32_t flow_mask,
+                             parsec_flow_mask_t flow_mask,
                              parsec_gpu_exec_stream_t *gpu_stream);
 
 END_C_DECLS
diff --git a/parsec/mca/device/transfer_gpu.c b/parsec/mca/device/transfer_gpu.c
index 5d0afb61b..1683a2ff7 100644
--- a/parsec/mca/device/transfer_gpu.c
+++ b/parsec/mca/device/transfer_gpu.c
@@ -106,7 +106,7 @@ static int
 datatype_lookup_of_gpu_d2h_task( parsec_execution_stream_t * es,
                                  const parsec_gpu_d2h_task_t* this_task,
                                  const parsec_task_t * parent_task,
-                                 uint32_t * flow_mask,
+                                 parsec_flow_mask_t * flow_mask,
                                  parsec_dep_data_description_t * data)
 {
     (void)es; (void)this_task; (void)parent_task; (void)flow_mask; (void)data;
@@ -183,9 +183,9 @@ int32_t parsec_gpu_d2h_max_flows = 0;
 static const parsec_task_class_t parsec_gpu_d2h_task_class = {
     .name = "GPU D2H data transfer",
     .task_class_id = 0,
-    .nb_flows = MAX_PARAM_COUNT,  /* This value will have an impact on the duration of the
-                                   * search for additional data to move. As this search is linear
-                                   * we need to keep this upper bound set to a reasonable value. */
+    .nb_flows = PARSEC_MAX_DEVICE_FLOWS,  /* This value will have an impact on the duration of the
+                                           * search for additional data to move. As this search is linear,
+                                           * we need to keep this upper bound set to a reasonable value. */
     .nb_parameters = 1,
     .nb_locals = 0,
     .params = {&symb_gpu_d2h_task_param},
@@ -217,7 +217,7 @@ static const parsec_task_class_t parsec_gpu_d2h_task_class = {
 
 /**
- * Transfer at most the MAX_PARAM_COUNT oldest data from the GPU back
+ * Transfer at most the PARSEC_MAX_DEVICE_FLOWS oldest data from the GPU back
  * to main memory. Create a single task to move them all out, then switch the
  * GPU data copy in shared mode.
  */
 
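
The widened pushout field pairs naturally with the new macros. Below is a hypothetical helper, not part of this patch, sketching how a DSL could derive a pushout mask covering every WRITE flow of a task; the PARSEC_FLOW_ACCESS_WRITE constant and the flow_flags field are assumed to match the existing PaRSEC flow definitions:

    #include "parsec/mca/device/device_gpu.h"

    /* Hypothetical helper: request an eager device-to-host transfer for
     * every WRITE flow of a task. Assumes the DSL already filled
     * gpu_task->flow[] as documented in device_gpu.h above. */
    static parsec_flow_mask_t
    build_pushout_mask(const parsec_gpu_task_t *gpu_task)
    {
        parsec_flow_mask_t mask = 0;
        const parsec_task_t *task = gpu_task->ec;
        for (int i = 0; i < task->task_class->nb_flows; i++) {
            const parsec_flow_t *flow = gpu_task->flow[i];
            if (NULL != flow && (flow->flow_flags & PARSEC_FLOW_ACCESS_WRITE))
                mask |= PARSEC_FLOW_MASK(i);
        }
        return mask;
    }

A DSL would then set gpu_task->pushout = build_pushout_mask(gpu_task) before handing the task to the device scheduler.
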
diff --git a/tests/runtime/cuda/stage_custom.jdf b/tests/runtime/cuda/stage_custom.jdf
index 7df99800f..91b3a9b9d 100644
--- a/tests/runtime/cuda/stage_custom.jdf
+++ b/tests/runtime/cuda/stage_custom.jdf
@@ -27,7 +27,7 @@ extern "C" %{
 
 static int
 stage_stride_in(parsec_gpu_task_t *gtask,
-                uint32_t flow_mask,
+                parsec_flow_mask_t flow_mask,
                 parsec_gpu_exec_stream_t *gpu_stream){
     parsec_cuda_exec_stream_t *cuda_stream = (parsec_cuda_exec_stream_t *)gpu_stream;
     cudaError_t ret = 0;
@@ -39,7 +39,7 @@ stage_stride_in(parsec_gpu_task_t *gtask,
     int elem_sz;
     int i;
     for(i = 0; i < task->task_class->nb_flows; i++){
-        if(flow_mask & (1U << i)){
+        if(PARSEC_CHECK_FLOW_MASK(flow_mask, i)){
            copy_in = task->data[i].data_in;
            copy_out = task->data[i].data_out;
            dc = (parsec_tiled_matrix_t*)gtask->flow_dc[i];
@@ -75,7 +75,7 @@ stage_stride_in(parsec_gpu_task_t *gtask,
 
 static int
 stage_stride_out(parsec_gpu_task_t *gtask,
-                 uint32_t flow_mask,
+                 parsec_flow_mask_t flow_mask,
                  parsec_gpu_exec_stream_t *gpu_stream){
     parsec_cuda_exec_stream_t *cuda_stream = (parsec_cuda_exec_stream_t*)gpu_stream;
     cudaError_t ret;
@@ -86,7 +86,7 @@ stage_stride_out(parsec_gpu_task_t *gtask,
     int elem_sz;
     int i;
     for(i = 0; i < task->task_class->nb_flows; i++){
-        if(flow_mask & (1U << i)){
+        if(PARSEC_CHECK_FLOW_MASK(flow_mask, i)){
            copy_in = task->data[i].data_out;
            copy_out = copy_in->original->device_copies[0];
            dc = (parsec_tiled_matrix_t*)gtask->flow_dc[i];
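
For completeness, a sketch of how a DSL body might attach these custom staging callbacks; my_gpu_task is a hypothetical, already-allocated parsec_gpu_task_t, and the only change this patch requires of such callers is the parsec_flow_mask_t parameter type:

    /* Sketch only: 'my_gpu_task' is hypothetical. The fields below are the
     * ones declared in parsec_gpu_task_s in device_gpu.h. */
    my_gpu_task->stage_in  = stage_stride_in;
    my_gpu_task->stage_out = stage_stride_out;
    /* Ask the runtime to push flow 0 back to main memory at completion. */
    my_gpu_task->pushout  |= PARSEC_FLOW_MASK(0);
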