diff --git a/include/ur_api.h b/include/ur_api.h index eb8b07221c..1de876cb7f 100644 --- a/include/ur_api.h +++ b/include/ur_api.h @@ -9560,6 +9560,7 @@ typedef enum ur_exp_launch_property_id_t { UR_EXP_LAUNCH_PROPERTY_ID_IGNORE = 0, ///< The property has no effect UR_EXP_LAUNCH_PROPERTY_ID_COOPERATIVE = 1, ///< Whether to launch a cooperative kernel UR_EXP_LAUNCH_PROPERTY_ID_CLUSTER_DIMENSION = 2, ///< work-group cluster dimensions + UR_EXP_LAUNCH_PROPERTY_ID_WORK_GROUP_MEMORY = 3, ///< Implicit work group memory allocation /// @cond UR_EXP_LAUNCH_PROPERTY_ID_FORCE_UINT32 = 0x7fffffff /// @endcond @@ -9573,10 +9574,12 @@ typedef enum ur_exp_launch_property_id_t { /// _Analogues_ /// - **CUlaunchAttributeValue** typedef union ur_exp_launch_property_value_t { - uint32_t clusterDim[3]; ///< [in] dimensions of the cluster (units of work-group) (x, y, z). Each - ///< value must be a divisor of the corresponding global work-size - ///< dimension (in units of work-group). - int cooperative; ///< [in] non-zero value indicates a cooperative kernel + uint32_t clusterDim[3]; ///< [in] dimensions of the cluster (units of work-group) (x, y, z). Each + ///< value must be a divisor of the corresponding global work-size + ///< dimension (in units of work-group). + int cooperative; ///< [in] non-zero value indicates a cooperative kernel + size_t workgroup_mem_size; ///< [in] non-zero value indicates the amount of work group memory to + ///< allocate in bytes } ur_exp_launch_property_value_t; @@ -9617,6 +9620,7 @@ typedef struct ur_exp_launch_property_t { /// + NULL == hQueue /// + NULL == hKernel /// - ::UR_RESULT_ERROR_INVALID_NULL_POINTER +/// + `NULL == pGlobalWorkOffset` /// + `NULL == pGlobalWorkSize` /// + `NULL == launchPropList` /// + NULL == pGlobalWorkSize @@ -9645,6 +9649,8 @@ urEnqueueKernelLaunchCustomExp( ur_kernel_handle_t hKernel, ///< [in] handle of the kernel object uint32_t workDim, ///< [in] number of dimensions, from 1 to 3, to specify the global and ///< work-group work-items + const size_t *pGlobalWorkOffset, ///< [in] pointer to an array of workDim unsigned values that specify the + ///< offset used to calculate the global ID of a work-item const size_t *pGlobalWorkSize, ///< [in] pointer to an array of workDim unsigned values that specify the ///< number of global work-items in workDim that will execute the kernel ///< function @@ -11554,6 +11560,7 @@ typedef struct ur_enqueue_kernel_launch_custom_exp_params_t { ur_queue_handle_t *phQueue; ur_kernel_handle_t *phKernel; uint32_t *pworkDim; + const size_t **ppGlobalWorkOffset; const size_t **ppGlobalWorkSize; const size_t **ppLocalWorkSize; uint32_t *pnumPropsInLaunchPropList; diff --git a/include/ur_ddi.h b/include/ur_ddi.h index 40a6c5c269..cdf90eda6d 100644 --- a/include/ur_ddi.h +++ b/include/ur_ddi.h @@ -1467,6 +1467,7 @@ typedef ur_result_t(UR_APICALL *ur_pfnEnqueueKernelLaunchCustomExp_t)( uint32_t, const size_t *, const size_t *, + const size_t *, uint32_t, const ur_exp_launch_property_t *, uint32_t, diff --git a/include/ur_print.hpp b/include/ur_print.hpp index 8888a74f91..190d3f9cd5 100644 --- a/include/ur_print.hpp +++ b/include/ur_print.hpp @@ -10397,6 +10397,9 @@ inline std::ostream &operator<<(std::ostream &os, enum ur_exp_launch_property_id case UR_EXP_LAUNCH_PROPERTY_ID_CLUSTER_DIMENSION: os << "UR_EXP_LAUNCH_PROPERTY_ID_CLUSTER_DIMENSION"; break; + case UR_EXP_LAUNCH_PROPERTY_ID_WORK_GROUP_MEMORY: + os << "UR_EXP_LAUNCH_PROPERTY_ID_WORK_GROUP_MEMORY"; + break; default: os << "unknown enumerator"; break; @@ -10433,6 +10436,13 @@ inline ur_result_t printUnion( os << (params.cooperative); + break; + case UR_EXP_LAUNCH_PROPERTY_ID_WORK_GROUP_MEMORY: + + os << ".workgroup_mem_size = "; + + os << (params.workgroup_mem_size); + break; default: os << ""; @@ -15100,6 +15110,12 @@ inline std::ostream &operator<<(std::ostream &os, [[maybe_unused]] const struct os << *(params->pworkDim); + os << ", "; + os << ".pGlobalWorkOffset = "; + + ur::details::printPtr(os, + *(params->ppGlobalWorkOffset)); + os << ", "; os << ".pGlobalWorkSize = "; diff --git a/scripts/core/exp-launch-properties.yml b/scripts/core/exp-launch-properties.yml index 9e66e9ea06..ca28421815 100644 --- a/scripts/core/exp-launch-properties.yml +++ b/scripts/core/exp-launch-properties.yml @@ -29,6 +29,8 @@ etors: desc: "Whether to launch a cooperative kernel" - name: CLUSTER_DIMENSION desc: "work-group cluster dimensions" + - name: WORK_GROUP_MEMORY + desc: "Implicit work group memory allocation" --- #-------------------------------------------------------------------------- type: union desc: "Specifies a launch property value" @@ -45,6 +47,10 @@ members: name: cooperative desc: "[in] non-zero value indicates a cooperative kernel" tag: $X_EXP_LAUNCH_PROPERTY_ID_COOPERATIVE + - type: size_t + name: workgroup_mem_size + desc: "[in] non-zero value indicates the amount of work group memory to allocate in bytes" + tag: $X_EXP_LAUNCH_PROPERTY_ID_WORK_GROUP_MEMORY --- #-------------------------------------------------------------------------- type: struct desc: "Kernel launch property" @@ -82,6 +88,9 @@ params: - type: uint32_t name: workDim desc: "[in] number of dimensions, from 1 to 3, to specify the global and work-group work-items" + - type: "const size_t*" + name: pGlobalWorkOffset + desc: "[in] pointer to an array of workDim unsigned values that specify the offset used to calculate the global ID of a work-item" - type: const size_t* name: pGlobalWorkSize desc: "[in] pointer to an array of workDim unsigned values that specify the number of global work-items in workDim that will execute the kernel function" @@ -97,10 +106,10 @@ params: - type: uint32_t name: numEventsInWaitList desc: "[in] size of the event wait list" - - type: const ur_event_handle_t* + - type: const $x_event_handle_t* name: phEventWaitList desc: "[in][optional][range(0, numEventsInWaitList)] pointer to a list of events that must be complete before the kernel execution. If nullptr, the numEventsInWaitList must be 0, indicating that no wait event. " - - type: ur_event_handle_t* + - type: $x_event_handle_t* name: phEvent desc: "[out][optional] return an event object that identifies this particular kernel execution instance. If phEventWaitList and phEvent are not NULL, phEvent must not refer to an element of the phEventWaitList array." returns: diff --git a/source/adapters/cuda/enqueue.cpp b/source/adapters/cuda/enqueue.cpp index 54a0f778fb..2a4a2cf54f 100644 --- a/source/adapters/cuda/enqueue.cpp +++ b/source/adapters/cuda/enqueue.cpp @@ -422,11 +422,13 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueEventsWait( phEventWaitList, phEvent); } -UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch( - ur_queue_handle_t hQueue, ur_kernel_handle_t hKernel, uint32_t workDim, - const size_t *pGlobalWorkOffset, const size_t *pGlobalWorkSize, - const size_t *pLocalWorkSize, uint32_t numEventsInWaitList, - const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) { +static ur_result_t +enqueueKernelLaunch(ur_queue_handle_t hQueue, ur_kernel_handle_t hKernel, + uint32_t workDim, const size_t *pGlobalWorkOffset, + const size_t *pGlobalWorkSize, const size_t *pLocalWorkSize, + uint32_t numEventsInWaitList, + const ur_event_handle_t *phEventWaitList, + ur_event_handle_t *phEvent, size_t WorkGroupMemory) { // Preconditions UR_ASSERT(hQueue->getDevice() == hKernel->getProgram()->getDevice(), UR_RESULT_ERROR_INVALID_KERNEL); @@ -444,6 +446,9 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch( size_t ThreadsPerBlock[3] = {32u, 1u, 1u}; size_t BlocksPerGrid[3] = {1u, 1u, 1u}; + // Set work group memory so we can compute the whole memory requirement + if (WorkGroupMemory) + hKernel->setWorkGroupMemory(WorkGroupMemory); uint32_t LocalSize = hKernel->getLocalSize(); CUfunction CuFunc = hKernel->get(); @@ -503,6 +508,17 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch( return UR_RESULT_SUCCESS; } +UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch( + ur_queue_handle_t hQueue, ur_kernel_handle_t hKernel, uint32_t workDim, + const size_t *pGlobalWorkOffset, const size_t *pGlobalWorkSize, + const size_t *pLocalWorkSize, uint32_t numEventsInWaitList, + const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) { + return enqueueKernelLaunch(hQueue, hKernel, workDim, pGlobalWorkOffset, + pGlobalWorkSize, pLocalWorkSize, + numEventsInWaitList, phEventWaitList, phEvent, + /*WorkGroupMemory=*/0); +} + UR_APIEXPORT ur_result_t UR_APICALL urEnqueueCooperativeKernelLaunchExp( ur_queue_handle_t hQueue, ur_kernel_handle_t hKernel, uint32_t workDim, const size_t *pGlobalWorkOffset, const size_t *pGlobalWorkSize, @@ -513,8 +529,9 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueCooperativeKernelLaunchExp( coop_prop.id = UR_EXP_LAUNCH_PROPERTY_ID_COOPERATIVE; coop_prop.value.cooperative = 1; return urEnqueueKernelLaunchCustomExp( - hQueue, hKernel, workDim, pGlobalWorkSize, pLocalWorkSize, 1, - &coop_prop, numEventsInWaitList, phEventWaitList, phEvent); + hQueue, hKernel, workDim, pGlobalWorkOffset, pGlobalWorkSize, + pLocalWorkSize, 1, &coop_prop, numEventsInWaitList, phEventWaitList, + phEvent); } return urEnqueueKernelLaunch(hQueue, hKernel, workDim, pGlobalWorkOffset, pGlobalWorkSize, pLocalWorkSize, @@ -523,16 +540,29 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueCooperativeKernelLaunchExp( UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunchCustomExp( ur_queue_handle_t hQueue, ur_kernel_handle_t hKernel, uint32_t workDim, - const size_t *pGlobalWorkSize, const size_t *pLocalWorkSize, - uint32_t numPropsInLaunchPropList, + const size_t *pGlobalWorkOffset, const size_t *pGlobalWorkSize, + const size_t *pLocalWorkSize, uint32_t numPropsInLaunchPropList, const ur_exp_launch_property_t *launchPropList, uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) { - if (numPropsInLaunchPropList == 0) { - urEnqueueKernelLaunch(hQueue, hKernel, workDim, nullptr, pGlobalWorkSize, - pLocalWorkSize, numEventsInWaitList, phEventWaitList, - phEvent); + size_t WorkGroupMemory = [&]() -> size_t { + const ur_exp_launch_property_t *WorkGroupMemoryProp = std::find_if( + launchPropList, launchPropList + numPropsInLaunchPropList, + [](const ur_exp_launch_property_t &Prop) { + return Prop.id == UR_EXP_LAUNCH_PROPERTY_ID_WORK_GROUP_MEMORY; + }); + if (WorkGroupMemoryProp != launchPropList + numPropsInLaunchPropList) + return WorkGroupMemoryProp->value.workgroup_mem_size; + return 0; + }(); + + if (numPropsInLaunchPropList == 0 || + (WorkGroupMemory && numPropsInLaunchPropList == 1)) { + return enqueueKernelLaunch(hQueue, hKernel, workDim, pGlobalWorkOffset, + pGlobalWorkSize, pLocalWorkSize, + numEventsInWaitList, phEventWaitList, phEvent, + WorkGroupMemory); } #if CUDA_VERSION >= 11080 // Preconditions @@ -545,7 +575,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunchCustomExp( return UR_RESULT_ERROR_INVALID_NULL_POINTER; } - std::vector launch_attribute(numPropsInLaunchPropList); + std::vector launch_attribute; + launch_attribute.reserve(numPropsInLaunchPropList); // Early exit for zero size kernel if (*pGlobalWorkSize == 0) { @@ -558,40 +589,35 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunchCustomExp( size_t ThreadsPerBlock[3] = {32u, 1u, 1u}; size_t BlocksPerGrid[3] = {1u, 1u, 1u}; + // Set work group memory so we can compute the whole memory requirement + if (WorkGroupMemory) + hKernel->setWorkGroupMemory(WorkGroupMemory); uint32_t LocalSize = hKernel->getLocalSize(); CUfunction CuFunc = hKernel->get(); for (uint32_t i = 0; i < numPropsInLaunchPropList; i++) { switch (launchPropList[i].id) { case UR_EXP_LAUNCH_PROPERTY_ID_IGNORE: { - launch_attribute[i].id = CU_LAUNCH_ATTRIBUTE_IGNORE; + auto &attr = launch_attribute.emplace_back(); + attr.id = CU_LAUNCH_ATTRIBUTE_IGNORE; break; } case UR_EXP_LAUNCH_PROPERTY_ID_CLUSTER_DIMENSION: { - - launch_attribute[i].id = CU_LAUNCH_ATTRIBUTE_CLUSTER_DIMENSION; + auto &attr = launch_attribute.emplace_back(); + attr.id = CU_LAUNCH_ATTRIBUTE_CLUSTER_DIMENSION; // Note that cuda orders from right to left wrt SYCL dimensional order. if (workDim == 3) { - launch_attribute[i].value.clusterDim.x = - launchPropList[i].value.clusterDim[2]; - launch_attribute[i].value.clusterDim.y = - launchPropList[i].value.clusterDim[1]; - launch_attribute[i].value.clusterDim.z = - launchPropList[i].value.clusterDim[0]; + attr.value.clusterDim.x = launchPropList[i].value.clusterDim[2]; + attr.value.clusterDim.y = launchPropList[i].value.clusterDim[1]; + attr.value.clusterDim.z = launchPropList[i].value.clusterDim[0]; } else if (workDim == 2) { - launch_attribute[i].value.clusterDim.x = - launchPropList[i].value.clusterDim[1]; - launch_attribute[i].value.clusterDim.y = - launchPropList[i].value.clusterDim[0]; - launch_attribute[i].value.clusterDim.z = - launchPropList[i].value.clusterDim[2]; + attr.value.clusterDim.x = launchPropList[i].value.clusterDim[1]; + attr.value.clusterDim.y = launchPropList[i].value.clusterDim[0]; + attr.value.clusterDim.z = launchPropList[i].value.clusterDim[2]; } else { - launch_attribute[i].value.clusterDim.x = - launchPropList[i].value.clusterDim[0]; - launch_attribute[i].value.clusterDim.y = - launchPropList[i].value.clusterDim[1]; - launch_attribute[i].value.clusterDim.z = - launchPropList[i].value.clusterDim[2]; + attr.value.clusterDim.x = launchPropList[i].value.clusterDim[0]; + attr.value.clusterDim.y = launchPropList[i].value.clusterDim[1]; + attr.value.clusterDim.z = launchPropList[i].value.clusterDim[2]; } UR_CHECK_ERROR(cuFuncSetAttribute( @@ -600,9 +626,12 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunchCustomExp( break; } case UR_EXP_LAUNCH_PROPERTY_ID_COOPERATIVE: { - launch_attribute[i].id = CU_LAUNCH_ATTRIBUTE_COOPERATIVE; - launch_attribute[i].value.cooperative = - launchPropList[i].value.cooperative; + auto &attr = launch_attribute.emplace_back(); + attr.id = CU_LAUNCH_ATTRIBUTE_COOPERATIVE; + attr.value.cooperative = launchPropList[i].value.cooperative; + break; + } + case UR_EXP_LAUNCH_PROPERTY_ID_WORK_GROUP_MEMORY: { break; } default: { @@ -615,8 +644,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunchCustomExp( // using the standard UR_CHECK_ERROR if (ur_result_t Ret = setKernelParams(hQueue->getContext(), hQueue->Device, workDim, - nullptr, pGlobalWorkSize, pLocalWorkSize, hKernel, - CuFunc, ThreadsPerBlock, BlocksPerGrid); + pGlobalWorkOffset, pGlobalWorkSize, pLocalWorkSize, + hKernel, CuFunc, ThreadsPerBlock, BlocksPerGrid); Ret != UR_RESULT_SUCCESS) return Ret; @@ -664,7 +693,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunchCustomExp( launch_config.sharedMemBytes = LocalSize; launch_config.hStream = CuStream; launch_config.attrs = &launch_attribute[0]; - launch_config.numAttrs = numPropsInLaunchPropList; + launch_config.numAttrs = launch_attribute.size(); UR_CHECK_ERROR(cuLaunchKernelEx(&launch_config, CuFunc, const_cast(ArgIndices.data()), diff --git a/source/adapters/cuda/kernel.hpp b/source/adapters/cuda/kernel.hpp index 2b04dfba43..d1b3b61244 100644 --- a/source/adapters/cuda/kernel.hpp +++ b/source/adapters/cuda/kernel.hpp @@ -76,6 +76,7 @@ struct ur_kernel_handle_t_ { /// padded to appropriate alignment. Zero if the argument at the index /// isn't a local memory argument. args_size_t OriginalLocalMemSize; + size_t WorkGroupMemory = 0; // A struct to keep track of memargs so that we can do dependency analysis // at urEnqueueKernelLaunch @@ -134,9 +135,10 @@ struct ur_kernel_handle_t_ { OriginalLocalMemSize[Index] = Size; // Calculate the current starting offset into local data - const size_t LocalOffset = std::accumulate( - std::begin(AlignedLocalMemSize), - std::next(std::begin(AlignedLocalMemSize), Index), size_t{0}); + const size_t LocalOffset = + std::accumulate(std::begin(AlignedLocalMemSize), + std::next(std::begin(AlignedLocalMemSize), Index), + size_t{WorkGroupMemory}); // Maximum required alignment is the size of the largest vector type const size_t MaxAlignment = sizeof(double) * 16; @@ -156,20 +158,11 @@ struct ur_kernel_handle_t_ { return std::make_pair(AlignedLocalSize, AlignedLocalOffset); } - void addLocalArg(size_t Index, size_t Size) { - // Get the aligned argument size and offset into local data - auto [AlignedLocalSize, AlignedLocalOffset] = - calcAlignedLocalArgument(Index, Size); - - // Store argument details - addArg(Index, sizeof(size_t), (const void *)&(AlignedLocalOffset), - AlignedLocalSize); - - // For every existing local argument which follows at later argument - // indices, update the offset and pointer into the kernel local memory. - // Required as padding will need to be recalculated. + // Iterate over all existing local argument which follows StartIndex + // index, update the offset and pointer into the kernel local memory. + void updateLocalArgOffset(size_t StartIndex) { const size_t NumArgs = Indices.size() - 1; // Accounts for implicit arg - for (auto SuccIndex = Index + 1; SuccIndex < NumArgs; SuccIndex++) { + for (auto SuccIndex = StartIndex; SuccIndex < NumArgs; SuccIndex++) { const size_t OriginalLocalSize = OriginalLocalMemSize[SuccIndex]; if (OriginalLocalSize == 0) { // Skip if successor argument isn't a local memory arg @@ -192,6 +185,20 @@ struct ur_kernel_handle_t_ { } } + void addLocalArg(size_t Index, size_t Size) { + // Get the aligned argument size and offset into local data + auto [AlignedLocalSize, AlignedLocalOffset] = + calcAlignedLocalArgument(Index, Size); + + // Store argument details + addArg(Index, sizeof(size_t), (const void *)&(AlignedLocalOffset), + AlignedLocalSize); + // For every existing local argument which follows at later argument + // indices, update the offset and pointer into the kernel local memory. + // Required as padding will need to be recalculated. + updateLocalArgOffset(Index + 1); + } + void addMemObjArg(int Index, ur_mem_handle_t hMem, ur_mem_flags_t Flags) { assert(hMem && "Invalid mem handle"); // To avoid redundancy we are not storing mem obj with index i at index @@ -206,6 +213,16 @@ struct ur_kernel_handle_t_ { MemObjArgs.push_back(arguments::mem_obj_arg{hMem, Index, Flags}); } + void setWorkGroupMemory(size_t MemSize) { + // If the WorkGroupMemory is the same as MemSize, then all accessors + // offsets accounted for this extra memory + if (WorkGroupMemory == MemSize) + return; + WorkGroupMemory = MemSize; + // Update local accessor offsets + updateLocalArgOffset(/*StartIndex=*/0); + } + void setImplicitOffset(size_t Size, std::uint32_t *ImplicitOffset) { assert(Size == sizeof(std::uint32_t) * 3); std::memcpy(ImplicitOffsetArgs, ImplicitOffset, Size); @@ -215,7 +232,8 @@ struct ur_kernel_handle_t_ { uint32_t getLocalSize() const { return std::accumulate(std::begin(AlignedLocalMemSize), - std::end(AlignedLocalMemSize), 0); + std::end(AlignedLocalMemSize), 0) + + WorkGroupMemory; } } Args; @@ -300,6 +318,7 @@ struct ur_kernel_handle_t_ { return Args.getIndices(); } + void setWorkGroupMemory(size_t MemSize) { Args.setWorkGroupMemory(MemSize); } uint32_t getLocalSize() const noexcept { return Args.getLocalSize(); } size_t getRegsPerThread() const noexcept { return RegsPerThread; }; diff --git a/source/adapters/level_zero/queue.cpp b/source/adapters/level_zero/queue.cpp index c4598f3472..95c8d026a7 100644 --- a/source/adapters/level_zero/queue.cpp +++ b/source/adapters/level_zero/queue.cpp @@ -902,14 +902,15 @@ ur_result_t urQueueFlush( ur_result_t urEnqueueKernelLaunchCustomExp( ur_queue_handle_t hQueue, ur_kernel_handle_t hKernel, uint32_t workDim, - const size_t *pGlobalWorkSize, const size_t *pLocalWorkSize, - uint32_t numPropsInLaunchPropList, + const size_t *pGlobalWorkOffset, const size_t *pGlobalWorkSize, + const size_t *pLocalWorkSize, uint32_t numPropsInLaunchPropList, const ur_exp_launch_property_t *launchPropList, uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) { std::ignore = hQueue; std::ignore = hKernel; std::ignore = workDim; + std::ignore = pGlobalWorkOffset; std::ignore = pGlobalWorkSize; std::ignore = pLocalWorkSize; std::ignore = numPropsInLaunchPropList; diff --git a/source/adapters/level_zero/ur_interface_loader.hpp b/source/adapters/level_zero/ur_interface_loader.hpp index 1215d6449e..0832303b50 100644 --- a/source/adapters/level_zero/ur_interface_loader.hpp +++ b/source/adapters/level_zero/ur_interface_loader.hpp @@ -694,8 +694,8 @@ ur_result_t urEnqueueTimestampRecordingExp( const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent); ur_result_t urEnqueueKernelLaunchCustomExp( ur_queue_handle_t hQueue, ur_kernel_handle_t hKernel, uint32_t workDim, - const size_t *pGlobalWorkSize, const size_t *pLocalWorkSize, - uint32_t numPropsInLaunchPropList, + const size_t *pGlobalWorkOffset, const size_t *pGlobalWorkSize, + const size_t *pLocalWorkSize, uint32_t numPropsInLaunchPropList, const ur_exp_launch_property_t *launchPropList, uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent); diff --git a/source/adapters/level_zero/v2/queue_api.cpp b/source/adapters/level_zero/v2/queue_api.cpp index b7b45625a2..e4659b5f2c 100644 --- a/source/adapters/level_zero/v2/queue_api.cpp +++ b/source/adapters/level_zero/v2/queue_api.cpp @@ -391,13 +391,13 @@ ur_result_t urEnqueueTimestampRecordingExp( } ur_result_t urEnqueueKernelLaunchCustomExp( ur_queue_handle_t hQueue, ur_kernel_handle_t hKernel, uint32_t workDim, - const size_t *pGlobalWorkSize, const size_t *pLocalWorkSize, - uint32_t numPropsInLaunchPropList, + const size_t *pGlobalWorkOffset, const size_t *pGlobalWorkSize, + const size_t *pLocalWorkSize, uint32_t numPropsInLaunchPropList, const ur_exp_launch_property_t *launchPropList, uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) try { return hQueue->enqueueKernelLaunchCustomExp( - hKernel, workDim, pGlobalWorkSize, pLocalWorkSize, + hKernel, workDim, pGlobalWorkOffset, pGlobalWorkSize, pLocalWorkSize, numPropsInLaunchPropList, launchPropList, numEventsInWaitList, phEventWaitList, phEvent); } catch (...) { diff --git a/source/adapters/level_zero/v2/queue_api.hpp b/source/adapters/level_zero/v2/queue_api.hpp index 7cb039ccdd..c59f084fc4 100644 --- a/source/adapters/level_zero/v2/queue_api.hpp +++ b/source/adapters/level_zero/v2/queue_api.hpp @@ -144,9 +144,9 @@ struct ur_queue_handle_t_ { const ur_event_handle_t *, ur_event_handle_t *) = 0; virtual ur_result_t enqueueKernelLaunchCustomExp( - ur_kernel_handle_t, uint32_t, const size_t *, const size_t *, uint32_t, - const ur_exp_launch_property_t *, uint32_t, const ur_event_handle_t *, - ur_event_handle_t *) = 0; + ur_kernel_handle_t, uint32_t, const size_t *, const size_t *, + const size_t *, uint32_t, const ur_exp_launch_property_t *, uint32_t, + const ur_event_handle_t *, ur_event_handle_t *) = 0; virtual ur_result_t enqueueEventsWaitWithBarrierExt(const ur_exp_enqueue_ext_properties_t *, uint32_t, const ur_event_handle_t *, diff --git a/source/adapters/level_zero/v2/queue_immediate_in_order.cpp b/source/adapters/level_zero/v2/queue_immediate_in_order.cpp index 519b0ffc1e..05e48c8740 100644 --- a/source/adapters/level_zero/v2/queue_immediate_in_order.cpp +++ b/source/adapters/level_zero/v2/queue_immediate_in_order.cpp @@ -1069,13 +1069,15 @@ ur_result_t ur_queue_immediate_in_order_t::enqueueTimestampRecordingExp( } ur_result_t ur_queue_immediate_in_order_t::enqueueKernelLaunchCustomExp( - ur_kernel_handle_t hKernel, uint32_t workDim, const size_t *pGlobalWorkSize, + ur_kernel_handle_t hKernel, uint32_t workDim, + const size_t *pGlobalWorkOffset, const size_t *pGlobalWorkSize, const size_t *pLocalWorkSize, uint32_t numPropsInLaunchPropList, const ur_exp_launch_property_t *launchPropList, uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) { std::ignore = hKernel; std::ignore = workDim; + std::ignore = pGlobalWorkOffset; std::ignore = pGlobalWorkSize; std::ignore = pLocalWorkSize; std::ignore = numPropsInLaunchPropList; diff --git a/source/adapters/level_zero/v2/queue_immediate_in_order.hpp b/source/adapters/level_zero/v2/queue_immediate_in_order.hpp index 33e060ded3..bdd3009d63 100644 --- a/source/adapters/level_zero/v2/queue_immediate_in_order.hpp +++ b/source/adapters/level_zero/v2/queue_immediate_in_order.hpp @@ -263,8 +263,8 @@ struct ur_queue_immediate_in_order_t : _ur_object, public ur_queue_handle_t_ { ur_event_handle_t *phEvent) override; ur_result_t enqueueKernelLaunchCustomExp( ur_kernel_handle_t hKernel, uint32_t workDim, - const size_t *pGlobalWorkSize, const size_t *pLocalWorkSize, - uint32_t numPropsInLaunchPropList, + const size_t *pGlobalWorkOffset, const size_t *pGlobalWorkSize, + const size_t *pLocalWorkSize, uint32_t numPropsInLaunchPropList, const ur_exp_launch_property_t *launchPropList, uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) override; diff --git a/source/adapters/mock/ur_mockddi.cpp b/source/adapters/mock/ur_mockddi.cpp index 42c342444d..c8ce408756 100644 --- a/source/adapters/mock/ur_mockddi.cpp +++ b/source/adapters/mock/ur_mockddi.cpp @@ -10126,6 +10126,9 @@ __urdlllocal ur_result_t UR_APICALL urEnqueueKernelLaunchCustomExp( uint32_t workDim, ///< [in] number of dimensions, from 1 to 3, to specify the global and ///< work-group work-items + const size_t * + pGlobalWorkOffset, ///< [in] pointer to an array of workDim unsigned values that specify the + ///< offset used to calculate the global ID of a work-item const size_t * pGlobalWorkSize, ///< [in] pointer to an array of workDim unsigned values that specify the ///< number of global work-items in workDim that will execute the kernel @@ -10153,11 +10156,17 @@ __urdlllocal ur_result_t UR_APICALL urEnqueueKernelLaunchCustomExp( ur_result_t result = UR_RESULT_SUCCESS; ur_enqueue_kernel_launch_custom_exp_params_t params = { - &hQueue, &hKernel, - &workDim, &pGlobalWorkSize, - &pLocalWorkSize, &numPropsInLaunchPropList, - &launchPropList, &numEventsInWaitList, - &phEventWaitList, &phEvent}; + &hQueue, + &hKernel, + &workDim, + &pGlobalWorkOffset, + &pGlobalWorkSize, + &pLocalWorkSize, + &numPropsInLaunchPropList, + &launchPropList, + &numEventsInWaitList, + &phEventWaitList, + &phEvent}; auto beforeCallback = reinterpret_cast( mock::getCallbacks().get_before_callback( @@ -10176,6 +10185,10 @@ __urdlllocal ur_result_t UR_APICALL urEnqueueKernelLaunchCustomExp( result = replaceCallback(¶ms); } else { + // optional output handle + if (phEvent) { + *phEvent = mock::createDummyHandle(); + } result = UR_RESULT_SUCCESS; } diff --git a/source/loader/layers/tracing/ur_trcddi.cpp b/source/loader/layers/tracing/ur_trcddi.cpp index 64489c39ac..afd1411ae8 100644 --- a/source/loader/layers/tracing/ur_trcddi.cpp +++ b/source/loader/layers/tracing/ur_trcddi.cpp @@ -8698,6 +8698,9 @@ __urdlllocal ur_result_t UR_APICALL urEnqueueKernelLaunchCustomExp( uint32_t workDim, ///< [in] number of dimensions, from 1 to 3, to specify the global and ///< work-group work-items + const size_t * + pGlobalWorkOffset, ///< [in] pointer to an array of workDim unsigned values that specify the + ///< offset used to calculate the global ID of a work-item const size_t * pGlobalWorkSize, ///< [in] pointer to an array of workDim unsigned values that specify the ///< number of global work-items in workDim that will execute the kernel @@ -8730,11 +8733,17 @@ __urdlllocal ur_result_t UR_APICALL urEnqueueKernelLaunchCustomExp( } ur_enqueue_kernel_launch_custom_exp_params_t params = { - &hQueue, &hKernel, - &workDim, &pGlobalWorkSize, - &pLocalWorkSize, &numPropsInLaunchPropList, - &launchPropList, &numEventsInWaitList, - &phEventWaitList, &phEvent}; + &hQueue, + &hKernel, + &workDim, + &pGlobalWorkOffset, + &pGlobalWorkSize, + &pLocalWorkSize, + &numPropsInLaunchPropList, + &launchPropList, + &numEventsInWaitList, + &phEventWaitList, + &phEvent}; uint64_t instance = getContext()->notify_begin(UR_FUNCTION_ENQUEUE_KERNEL_LAUNCH_CUSTOM_EXP, "urEnqueueKernelLaunchCustomExp", ¶ms); @@ -8743,9 +8752,9 @@ __urdlllocal ur_result_t UR_APICALL urEnqueueKernelLaunchCustomExp( logger.info(" ---> urEnqueueKernelLaunchCustomExp\n"); ur_result_t result = pfnKernelLaunchCustomExp( - hQueue, hKernel, workDim, pGlobalWorkSize, pLocalWorkSize, - numPropsInLaunchPropList, launchPropList, numEventsInWaitList, - phEventWaitList, phEvent); + hQueue, hKernel, workDim, pGlobalWorkOffset, pGlobalWorkSize, + pLocalWorkSize, numPropsInLaunchPropList, launchPropList, + numEventsInWaitList, phEventWaitList, phEvent); getContext()->notify_end(UR_FUNCTION_ENQUEUE_KERNEL_LAUNCH_CUSTOM_EXP, "urEnqueueKernelLaunchCustomExp", ¶ms, &result, diff --git a/source/loader/layers/validation/ur_valddi.cpp b/source/loader/layers/validation/ur_valddi.cpp index b3969de10f..c2dcc7be6f 100644 --- a/source/loader/layers/validation/ur_valddi.cpp +++ b/source/loader/layers/validation/ur_valddi.cpp @@ -9726,6 +9726,9 @@ __urdlllocal ur_result_t UR_APICALL urEnqueueKernelLaunchCustomExp( uint32_t workDim, ///< [in] number of dimensions, from 1 to 3, to specify the global and ///< work-group work-items + const size_t * + pGlobalWorkOffset, ///< [in] pointer to an array of workDim unsigned values that specify the + ///< offset used to calculate the global ID of a work-item const size_t * pGlobalWorkSize, ///< [in] pointer to an array of workDim unsigned values that specify the ///< number of global work-items in workDim that will execute the kernel @@ -9766,6 +9769,10 @@ __urdlllocal ur_result_t UR_APICALL urEnqueueKernelLaunchCustomExp( return UR_RESULT_ERROR_INVALID_NULL_HANDLE; } + if (NULL == pGlobalWorkOffset) { + return UR_RESULT_ERROR_INVALID_NULL_POINTER; + } + if (NULL == pGlobalWorkSize) { return UR_RESULT_ERROR_INVALID_NULL_POINTER; } @@ -9794,9 +9801,9 @@ __urdlllocal ur_result_t UR_APICALL urEnqueueKernelLaunchCustomExp( } ur_result_t result = pfnKernelLaunchCustomExp( - hQueue, hKernel, workDim, pGlobalWorkSize, pLocalWorkSize, - numPropsInLaunchPropList, launchPropList, numEventsInWaitList, - phEventWaitList, phEvent); + hQueue, hKernel, workDim, pGlobalWorkOffset, pGlobalWorkSize, + pLocalWorkSize, numPropsInLaunchPropList, launchPropList, + numEventsInWaitList, phEventWaitList, phEvent); return result; } diff --git a/source/loader/ur_ldrddi.cpp b/source/loader/ur_ldrddi.cpp index 86a6ad95a0..602b8f1a82 100644 --- a/source/loader/ur_ldrddi.cpp +++ b/source/loader/ur_ldrddi.cpp @@ -8866,6 +8866,9 @@ __urdlllocal ur_result_t UR_APICALL urEnqueueKernelLaunchCustomExp( uint32_t workDim, ///< [in] number of dimensions, from 1 to 3, to specify the global and ///< work-group work-items + const size_t * + pGlobalWorkOffset, ///< [in] pointer to an array of workDim unsigned values that specify the + ///< offset used to calculate the global ID of a work-item const size_t * pGlobalWorkSize, ///< [in] pointer to an array of workDim unsigned values that specify the ///< number of global work-items in workDim that will execute the kernel @@ -8908,11 +8911,35 @@ __urdlllocal ur_result_t UR_APICALL urEnqueueKernelLaunchCustomExp( // convert loader handle to platform handle hKernel = reinterpret_cast(hKernel)->handle; + // convert loader handles to platform handles + auto phEventWaitListLocal = + std::vector(numEventsInWaitList); + for (size_t i = 0; i < numEventsInWaitList; ++i) { + phEventWaitListLocal[i] = + reinterpret_cast(phEventWaitList[i])->handle; + } + // forward to device-platform - result = pfnKernelLaunchCustomExp(hQueue, hKernel, workDim, pGlobalWorkSize, - pLocalWorkSize, numPropsInLaunchPropList, - launchPropList, numEventsInWaitList, - phEventWaitList, phEvent); + result = pfnKernelLaunchCustomExp( + hQueue, hKernel, workDim, pGlobalWorkOffset, pGlobalWorkSize, + pLocalWorkSize, numPropsInLaunchPropList, launchPropList, + numEventsInWaitList, phEventWaitListLocal.data(), phEvent); + + // In the event of ERROR_ADAPTER_SPECIFIC we should still attempt to wrap any output handles below. + if (UR_RESULT_SUCCESS != result && + UR_RESULT_ERROR_ADAPTER_SPECIFIC != result) { + return result; + } + try { + // convert platform handle to loader handle + if (nullptr != phEvent) { + *phEvent = reinterpret_cast( + context->factories.ur_event_factory.getInstance(*phEvent, + dditable)); + } + } catch (std::bad_alloc &) { + result = UR_RESULT_ERROR_OUT_OF_HOST_MEMORY; + } return result; } diff --git a/source/loader/ur_libapi.cpp b/source/loader/ur_libapi.cpp index 3340363737..b2e26a8b8b 100644 --- a/source/loader/ur_libapi.cpp +++ b/source/loader/ur_libapi.cpp @@ -8992,6 +8992,7 @@ ur_result_t UR_APICALL urEnqueueTimestampRecordingExp( /// + NULL == hQueue /// + NULL == hKernel /// - ::UR_RESULT_ERROR_INVALID_NULL_POINTER +/// + `NULL == pGlobalWorkOffset` /// + `NULL == pGlobalWorkSize` /// + `NULL == launchPropList` /// + NULL == pGlobalWorkSize @@ -9020,6 +9021,9 @@ ur_result_t UR_APICALL urEnqueueKernelLaunchCustomExp( uint32_t workDim, ///< [in] number of dimensions, from 1 to 3, to specify the global and ///< work-group work-items + const size_t * + pGlobalWorkOffset, ///< [in] pointer to an array of workDim unsigned values that specify the + ///< offset used to calculate the global ID of a work-item const size_t * pGlobalWorkSize, ///< [in] pointer to an array of workDim unsigned values that specify the ///< number of global work-items in workDim that will execute the kernel @@ -9050,10 +9054,10 @@ ur_result_t UR_APICALL urEnqueueKernelLaunchCustomExp( return UR_RESULT_ERROR_UNINITIALIZED; } - return pfnKernelLaunchCustomExp(hQueue, hKernel, workDim, pGlobalWorkSize, - pLocalWorkSize, numPropsInLaunchPropList, - launchPropList, numEventsInWaitList, - phEventWaitList, phEvent); + return pfnKernelLaunchCustomExp( + hQueue, hKernel, workDim, pGlobalWorkOffset, pGlobalWorkSize, + pLocalWorkSize, numPropsInLaunchPropList, launchPropList, + numEventsInWaitList, phEventWaitList, phEvent); } catch (...) { return exceptionToResult(std::current_exception()); } diff --git a/source/ur_api.cpp b/source/ur_api.cpp index 853d61472e..0b2e6a0f74 100644 --- a/source/ur_api.cpp +++ b/source/ur_api.cpp @@ -7625,6 +7625,7 @@ ur_result_t UR_APICALL urEnqueueTimestampRecordingExp( /// + NULL == hQueue /// + NULL == hKernel /// - ::UR_RESULT_ERROR_INVALID_NULL_POINTER +/// + `NULL == pGlobalWorkOffset` /// + `NULL == pGlobalWorkSize` /// + `NULL == launchPropList` /// + NULL == pGlobalWorkSize @@ -7653,6 +7654,9 @@ ur_result_t UR_APICALL urEnqueueKernelLaunchCustomExp( uint32_t workDim, ///< [in] number of dimensions, from 1 to 3, to specify the global and ///< work-group work-items + const size_t * + pGlobalWorkOffset, ///< [in] pointer to an array of workDim unsigned values that specify the + ///< offset used to calculate the global ID of a work-item const size_t * pGlobalWorkSize, ///< [in] pointer to an array of workDim unsigned values that specify the ///< number of global work-items in workDim that will execute the kernel diff --git a/test/conformance/exp_launch_properties/launch_properties.cpp b/test/conformance/exp_launch_properties/launch_properties.cpp index a54a44ecaf..23ba56ff4b 100644 --- a/test/conformance/exp_launch_properties/launch_properties.cpp +++ b/test/conformance/exp_launch_properties/launch_properties.cpp @@ -95,8 +95,8 @@ TEST_P(urEnqueueKernelLaunchCustomTest, Success) { AddPodArg(val); ASSERT_SUCCESS(urEnqueueKernelLaunchCustomExp( - queue, kernel, n_dimensions, &global_size, nullptr, 1, &props[0], 0, - nullptr, nullptr)); + queue, kernel, n_dimensions, &global_offset, &global_size, nullptr, 1, + &props[0], 0, nullptr, nullptr)); ASSERT_SUCCESS(urQueueFinish(queue)); ValidateBuffer(buffer, sizeof(val) * global_size, val); }