diff --git a/include/ur_api.h b/include/ur_api.h
index eb8b07221c..1de876cb7f 100644
--- a/include/ur_api.h
+++ b/include/ur_api.h
@@ -9560,6 +9560,7 @@ typedef enum ur_exp_launch_property_id_t {
     UR_EXP_LAUNCH_PROPERTY_ID_IGNORE = 0,            ///< The property has no effect
     UR_EXP_LAUNCH_PROPERTY_ID_COOPERATIVE = 1,       ///< Whether to launch a cooperative kernel
     UR_EXP_LAUNCH_PROPERTY_ID_CLUSTER_DIMENSION = 2, ///< work-group cluster dimensions
+    UR_EXP_LAUNCH_PROPERTY_ID_WORK_GROUP_MEMORY = 3, ///< Implicit work group memory allocation
     /// @cond
     UR_EXP_LAUNCH_PROPERTY_ID_FORCE_UINT32 = 0x7fffffff
     /// @endcond
@@ -9573,10 +9574,12 @@ typedef enum ur_exp_launch_property_id_t {
 ///   _Analogues_
 ///     - **CUlaunchAttributeValue**
 typedef union ur_exp_launch_property_value_t {
-    uint32_t clusterDim[3]; ///< [in] dimensions of the cluster (units of work-group) (x, y, z). Each
-                            ///< value must be a divisor of the corresponding global work-size
-                            ///< dimension (in units of work-group).
-    int cooperative;        ///< [in] non-zero value indicates a cooperative kernel
+    uint32_t clusterDim[3];    ///< [in] dimensions of the cluster (units of work-group) (x, y, z). Each
+                               ///< value must be a divisor of the corresponding global work-size
+                               ///< dimension (in units of work-group).
+    int cooperative;           ///< [in] non-zero value indicates a cooperative kernel
+    size_t workgroup_mem_size; ///< [in] non-zero value indicates the amount of work group memory to
+                               ///< allocate in bytes
 
 } ur_exp_launch_property_value_t;
 
@@ -9617,6 +9620,7 @@ typedef struct ur_exp_launch_property_t {
 ///         + NULL == hQueue
 ///         + NULL == hKernel
 ///     - ::UR_RESULT_ERROR_INVALID_NULL_POINTER
+///         + `NULL == pGlobalWorkOffset`
 ///         + `NULL == pGlobalWorkSize`
 ///         + `NULL == launchPropList`
 ///         + NULL == pGlobalWorkSize
@@ -9645,6 +9649,8 @@ urEnqueueKernelLaunchCustomExp(
     ur_kernel_handle_t hKernel,                     ///< [in] handle of the kernel object
     uint32_t workDim,                               ///< [in] number of dimensions, from 1 to 3, to specify the global and
                                                     ///< work-group work-items
+    const size_t *pGlobalWorkOffset,                ///< [in] pointer to an array of workDim unsigned values that specify the
+                                                    ///< offset used to calculate the global ID of a work-item
     const size_t *pGlobalWorkSize,                  ///< [in] pointer to an array of workDim unsigned values that specify the
                                                     ///< number of global work-items in workDim that will execute the kernel
                                                     ///< function
@@ -11554,6 +11560,7 @@ typedef struct ur_enqueue_kernel_launch_custom_exp_params_t {
     ur_queue_handle_t *phQueue;
     ur_kernel_handle_t *phKernel;
     uint32_t *pworkDim;
+    const size_t **ppGlobalWorkOffset;
     const size_t **ppGlobalWorkSize;
     const size_t **ppLocalWorkSize;
     uint32_t *pnumPropsInLaunchPropList;
diff --git a/include/ur_ddi.h b/include/ur_ddi.h
index 40a6c5c269..cdf90eda6d 100644
--- a/include/ur_ddi.h
+++ b/include/ur_ddi.h
@@ -1467,6 +1467,7 @@ typedef ur_result_t(UR_APICALL *ur_pfnEnqueueKernelLaunchCustomExp_t)(
     uint32_t,
     const size_t *,
     const size_t *,
+    const size_t *,
     uint32_t,
     const ur_exp_launch_property_t *,
     uint32_t,
diff --git a/include/ur_print.hpp b/include/ur_print.hpp
index 8888a74f91..190d3f9cd5 100644
--- a/include/ur_print.hpp
+++ b/include/ur_print.hpp
@@ -10397,6 +10397,9 @@ inline std::ostream &operator<<(std::ostream &os, enum ur_exp_launch_property_id
     case UR_EXP_LAUNCH_PROPERTY_ID_CLUSTER_DIMENSION:
         os << "UR_EXP_LAUNCH_PROPERTY_ID_CLUSTER_DIMENSION";
         break;
+    case UR_EXP_LAUNCH_PROPERTY_ID_WORK_GROUP_MEMORY:
+        os << "UR_EXP_LAUNCH_PROPERTY_ID_WORK_GROUP_MEMORY";
+        break;
     default:
         os << "unknown enumerator";
         break;
@@ -10433,6 +10436,13 @@ inline ur_result_t printUnion(
 
         os << (params.cooperative);
 
+        break;
+    case UR_EXP_LAUNCH_PROPERTY_ID_WORK_GROUP_MEMORY:
+
+        os << ".workgroup_mem_size = ";
+
+        os << (params.workgroup_mem_size);
+
         break;
     default:
         os << "<unknown>";
@@ -15100,6 +15110,12 @@ inline std::ostream &operator<<(std::ostream &os, [[maybe_unused]] const struct
 
     os << *(params->pworkDim);
 
+    os << ", ";
+    os << ".pGlobalWorkOffset = ";
+
+    ur::details::printPtr(os,
+                          *(params->ppGlobalWorkOffset));
+
     os << ", ";
     os << ".pGlobalWorkSize = ";
 
diff --git a/scripts/core/exp-launch-properties.yml b/scripts/core/exp-launch-properties.yml
index 9e66e9ea06..ca28421815 100644
--- a/scripts/core/exp-launch-properties.yml
+++ b/scripts/core/exp-launch-properties.yml
@@ -29,6 +29,8 @@ etors:
       desc: "Whether to launch a cooperative kernel"
     - name: CLUSTER_DIMENSION
       desc: "work-group cluster dimensions"
+    - name: WORK_GROUP_MEMORY
+      desc: "Implicit work group memory allocation"
 --- #--------------------------------------------------------------------------
 type: union
 desc: "Specifies a launch property value"
@@ -45,6 +47,10 @@ members:
       name: cooperative
       desc: "[in] non-zero value indicates a cooperative kernel"
       tag: $X_EXP_LAUNCH_PROPERTY_ID_COOPERATIVE
+    - type: size_t
+      name: workgroup_mem_size
+      desc: "[in] non-zero value indicates the amount of work group memory to allocate in bytes"
+      tag: $X_EXP_LAUNCH_PROPERTY_ID_WORK_GROUP_MEMORY
 --- #--------------------------------------------------------------------------
 type: struct
 desc: "Kernel launch property"
@@ -82,6 +88,9 @@ params:
     - type: uint32_t
       name: workDim
       desc: "[in] number of dimensions, from 1 to 3, to specify the global and work-group work-items"
+    - type: "const size_t*"
+      name: pGlobalWorkOffset
+      desc: "[in] pointer to an array of workDim unsigned values that specify the offset used to calculate the global ID of a work-item"
     - type: const size_t*
       name: pGlobalWorkSize
       desc: "[in] pointer to an array of workDim unsigned values that specify the number of global work-items in workDim that will execute the kernel function"
@@ -97,10 +106,10 @@ params:
     - type: uint32_t
       name: numEventsInWaitList
       desc: "[in] size of the event wait list"
-    - type: const ur_event_handle_t*
+    - type: const $x_event_handle_t*
       name: phEventWaitList
       desc: "[in][optional][range(0, numEventsInWaitList)] pointer to a list of events that must be complete before the kernel execution. If nullptr, the numEventsInWaitList must be 0, indicating that no wait event. "
-    - type: ur_event_handle_t*
+    - type: $x_event_handle_t*
       name: phEvent
       desc: "[out][optional] return an event object that identifies this particular kernel execution instance. If phEventWaitList and phEvent are not NULL, phEvent must not refer to an element of the phEventWaitList array."
 returns:
diff --git a/source/adapters/cuda/enqueue.cpp b/source/adapters/cuda/enqueue.cpp
index 54a0f778fb..2a4a2cf54f 100644
--- a/source/adapters/cuda/enqueue.cpp
+++ b/source/adapters/cuda/enqueue.cpp
@@ -422,11 +422,13 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueEventsWait(
                                         phEventWaitList, phEvent);
 }
 
-UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch(
-    ur_queue_handle_t hQueue, ur_kernel_handle_t hKernel, uint32_t workDim,
-    const size_t *pGlobalWorkOffset, const size_t *pGlobalWorkSize,
-    const size_t *pLocalWorkSize, uint32_t numEventsInWaitList,
-    const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) {
+static ur_result_t
+enqueueKernelLaunch(ur_queue_handle_t hQueue, ur_kernel_handle_t hKernel,
+                    uint32_t workDim, const size_t *pGlobalWorkOffset,
+                    const size_t *pGlobalWorkSize, const size_t *pLocalWorkSize,
+                    uint32_t numEventsInWaitList,
+                    const ur_event_handle_t *phEventWaitList,
+                    ur_event_handle_t *phEvent, size_t WorkGroupMemory) {
   // Preconditions
   UR_ASSERT(hQueue->getDevice() == hKernel->getProgram()->getDevice(),
             UR_RESULT_ERROR_INVALID_KERNEL);
@@ -444,6 +446,9 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch(
   size_t ThreadsPerBlock[3] = {32u, 1u, 1u};
   size_t BlocksPerGrid[3] = {1u, 1u, 1u};
 
+  // Set work group memory so we can compute the whole memory requirement
+  if (WorkGroupMemory)
+    hKernel->setWorkGroupMemory(WorkGroupMemory);
   uint32_t LocalSize = hKernel->getLocalSize();
   CUfunction CuFunc = hKernel->get();
 
@@ -503,6 +508,17 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch(
   return UR_RESULT_SUCCESS;
 }
 
+UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch(
+    ur_queue_handle_t hQueue, ur_kernel_handle_t hKernel, uint32_t workDim,
+    const size_t *pGlobalWorkOffset, const size_t *pGlobalWorkSize,
+    const size_t *pLocalWorkSize, uint32_t numEventsInWaitList,
+    const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) {
+  return enqueueKernelLaunch(hQueue, hKernel, workDim, pGlobalWorkOffset,
+                             pGlobalWorkSize, pLocalWorkSize,
+                             numEventsInWaitList, phEventWaitList, phEvent,
+                             /*WorkGroupMemory=*/0);
+}
+
 UR_APIEXPORT ur_result_t UR_APICALL urEnqueueCooperativeKernelLaunchExp(
     ur_queue_handle_t hQueue, ur_kernel_handle_t hKernel, uint32_t workDim,
     const size_t *pGlobalWorkOffset, const size_t *pGlobalWorkSize,
@@ -513,8 +529,9 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueCooperativeKernelLaunchExp(
     coop_prop.id = UR_EXP_LAUNCH_PROPERTY_ID_COOPERATIVE;
     coop_prop.value.cooperative = 1;
     return urEnqueueKernelLaunchCustomExp(
-        hQueue, hKernel, workDim, pGlobalWorkSize, pLocalWorkSize, 1,
-        &coop_prop, numEventsInWaitList, phEventWaitList, phEvent);
+        hQueue, hKernel, workDim, pGlobalWorkOffset, pGlobalWorkSize,
+        pLocalWorkSize, 1, &coop_prop, numEventsInWaitList, phEventWaitList,
+        phEvent);
   }
   return urEnqueueKernelLaunch(hQueue, hKernel, workDim, pGlobalWorkOffset,
                                pGlobalWorkSize, pLocalWorkSize,
@@ -523,16 +540,29 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueCooperativeKernelLaunchExp(
 
 UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunchCustomExp(
     ur_queue_handle_t hQueue, ur_kernel_handle_t hKernel, uint32_t workDim,
-    const size_t *pGlobalWorkSize, const size_t *pLocalWorkSize,
-    uint32_t numPropsInLaunchPropList,
+    const size_t *pGlobalWorkOffset, const size_t *pGlobalWorkSize,
+    const size_t *pLocalWorkSize, uint32_t numPropsInLaunchPropList,
     const ur_exp_launch_property_t *launchPropList,
     uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList,
     ur_event_handle_t *phEvent) {
 
-  if (numPropsInLaunchPropList == 0) {
-    urEnqueueKernelLaunch(hQueue, hKernel, workDim, nullptr, pGlobalWorkSize,
-                          pLocalWorkSize, numEventsInWaitList, phEventWaitList,
-                          phEvent);
+  size_t WorkGroupMemory = [&]() -> size_t {
+    const ur_exp_launch_property_t *WorkGroupMemoryProp = std::find_if(
+        launchPropList, launchPropList + numPropsInLaunchPropList,
+        [](const ur_exp_launch_property_t &Prop) {
+          return Prop.id == UR_EXP_LAUNCH_PROPERTY_ID_WORK_GROUP_MEMORY;
+        });
+    if (WorkGroupMemoryProp != launchPropList + numPropsInLaunchPropList)
+      return WorkGroupMemoryProp->value.workgroup_mem_size;
+    return 0;
+  }();
+
+  if (numPropsInLaunchPropList == 0 ||
+      (WorkGroupMemory && numPropsInLaunchPropList == 1)) {
+    return enqueueKernelLaunch(hQueue, hKernel, workDim, pGlobalWorkOffset,
+                               pGlobalWorkSize, pLocalWorkSize,
+                               numEventsInWaitList, phEventWaitList, phEvent,
+                               WorkGroupMemory);
   }
 #if CUDA_VERSION >= 11080
   // Preconditions
@@ -545,7 +575,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunchCustomExp(
     return UR_RESULT_ERROR_INVALID_NULL_POINTER;
   }
 
-  std::vector<CUlaunchAttribute> launch_attribute(numPropsInLaunchPropList);
+  std::vector<CUlaunchAttribute> launch_attribute;
+  launch_attribute.reserve(numPropsInLaunchPropList);
 
   // Early exit for zero size kernel
   if (*pGlobalWorkSize == 0) {
@@ -558,40 +589,35 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunchCustomExp(
   size_t ThreadsPerBlock[3] = {32u, 1u, 1u};
   size_t BlocksPerGrid[3] = {1u, 1u, 1u};
 
+  // Set work group memory so we can compute the whole memory requirement
+  if (WorkGroupMemory)
+    hKernel->setWorkGroupMemory(WorkGroupMemory);
   uint32_t LocalSize = hKernel->getLocalSize();
   CUfunction CuFunc = hKernel->get();
 
   for (uint32_t i = 0; i < numPropsInLaunchPropList; i++) {
     switch (launchPropList[i].id) {
     case UR_EXP_LAUNCH_PROPERTY_ID_IGNORE: {
-      launch_attribute[i].id = CU_LAUNCH_ATTRIBUTE_IGNORE;
+      auto &attr = launch_attribute.emplace_back();
+      attr.id = CU_LAUNCH_ATTRIBUTE_IGNORE;
       break;
     }
     case UR_EXP_LAUNCH_PROPERTY_ID_CLUSTER_DIMENSION: {
-
-      launch_attribute[i].id = CU_LAUNCH_ATTRIBUTE_CLUSTER_DIMENSION;
+      auto &attr = launch_attribute.emplace_back();
+      attr.id = CU_LAUNCH_ATTRIBUTE_CLUSTER_DIMENSION;
       // Note that cuda orders from right to left wrt SYCL dimensional order.
       if (workDim == 3) {
-        launch_attribute[i].value.clusterDim.x =
-            launchPropList[i].value.clusterDim[2];
-        launch_attribute[i].value.clusterDim.y =
-            launchPropList[i].value.clusterDim[1];
-        launch_attribute[i].value.clusterDim.z =
-            launchPropList[i].value.clusterDim[0];
+        attr.value.clusterDim.x = launchPropList[i].value.clusterDim[2];
+        attr.value.clusterDim.y = launchPropList[i].value.clusterDim[1];
+        attr.value.clusterDim.z = launchPropList[i].value.clusterDim[0];
       } else if (workDim == 2) {
-        launch_attribute[i].value.clusterDim.x =
-            launchPropList[i].value.clusterDim[1];
-        launch_attribute[i].value.clusterDim.y =
-            launchPropList[i].value.clusterDim[0];
-        launch_attribute[i].value.clusterDim.z =
-            launchPropList[i].value.clusterDim[2];
+        attr.value.clusterDim.x = launchPropList[i].value.clusterDim[1];
+        attr.value.clusterDim.y = launchPropList[i].value.clusterDim[0];
+        attr.value.clusterDim.z = launchPropList[i].value.clusterDim[2];
       } else {
-        launch_attribute[i].value.clusterDim.x =
-            launchPropList[i].value.clusterDim[0];
-        launch_attribute[i].value.clusterDim.y =
-            launchPropList[i].value.clusterDim[1];
-        launch_attribute[i].value.clusterDim.z =
-            launchPropList[i].value.clusterDim[2];
+        attr.value.clusterDim.x = launchPropList[i].value.clusterDim[0];
+        attr.value.clusterDim.y = launchPropList[i].value.clusterDim[1];
+        attr.value.clusterDim.z = launchPropList[i].value.clusterDim[2];
       }
 
       UR_CHECK_ERROR(cuFuncSetAttribute(
@@ -600,9 +626,12 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunchCustomExp(
       break;
     }
     case UR_EXP_LAUNCH_PROPERTY_ID_COOPERATIVE: {
-      launch_attribute[i].id = CU_LAUNCH_ATTRIBUTE_COOPERATIVE;
-      launch_attribute[i].value.cooperative =
-          launchPropList[i].value.cooperative;
+      auto &attr = launch_attribute.emplace_back();
+      attr.id = CU_LAUNCH_ATTRIBUTE_COOPERATIVE;
+      attr.value.cooperative = launchPropList[i].value.cooperative;
+      break;
+    }
+    case UR_EXP_LAUNCH_PROPERTY_ID_WORK_GROUP_MEMORY: {
       break;
     }
     default: {
@@ -615,8 +644,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunchCustomExp(
   // using the standard UR_CHECK_ERROR
   if (ur_result_t Ret =
           setKernelParams(hQueue->getContext(), hQueue->Device, workDim,
-                          nullptr, pGlobalWorkSize, pLocalWorkSize, hKernel,
-                          CuFunc, ThreadsPerBlock, BlocksPerGrid);
+                          pGlobalWorkOffset, pGlobalWorkSize, pLocalWorkSize,
+                          hKernel, CuFunc, ThreadsPerBlock, BlocksPerGrid);
       Ret != UR_RESULT_SUCCESS)
     return Ret;
 
@@ -664,7 +693,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunchCustomExp(
     launch_config.sharedMemBytes = LocalSize;
     launch_config.hStream = CuStream;
     launch_config.attrs = &launch_attribute[0];
-    launch_config.numAttrs = numPropsInLaunchPropList;
+    launch_config.numAttrs = launch_attribute.size();
 
     UR_CHECK_ERROR(cuLaunchKernelEx(&launch_config, CuFunc,
                                     const_cast<void **>(ArgIndices.data()),
diff --git a/source/adapters/cuda/kernel.hpp b/source/adapters/cuda/kernel.hpp
index 2b04dfba43..d1b3b61244 100644
--- a/source/adapters/cuda/kernel.hpp
+++ b/source/adapters/cuda/kernel.hpp
@@ -76,6 +76,7 @@ struct ur_kernel_handle_t_ {
     /// padded to appropriate alignment. Zero if the argument at the index
     /// isn't a local memory argument.
     args_size_t OriginalLocalMemSize;
+    size_t WorkGroupMemory = 0;
 
     // A struct to keep track of memargs so that we can do dependency analysis
     // at urEnqueueKernelLaunch
@@ -134,9 +135,10 @@ struct ur_kernel_handle_t_ {
       OriginalLocalMemSize[Index] = Size;
 
       // Calculate the current starting offset into local data
-      const size_t LocalOffset = std::accumulate(
-          std::begin(AlignedLocalMemSize),
-          std::next(std::begin(AlignedLocalMemSize), Index), size_t{0});
+      const size_t LocalOffset =
+          std::accumulate(std::begin(AlignedLocalMemSize),
+                          std::next(std::begin(AlignedLocalMemSize), Index),
+                          size_t{WorkGroupMemory});
 
       // Maximum required alignment is the size of the largest vector type
       const size_t MaxAlignment = sizeof(double) * 16;
@@ -156,20 +158,11 @@ struct ur_kernel_handle_t_ {
       return std::make_pair(AlignedLocalSize, AlignedLocalOffset);
     }
 
-    void addLocalArg(size_t Index, size_t Size) {
-      // Get the aligned argument size and offset into local data
-      auto [AlignedLocalSize, AlignedLocalOffset] =
-          calcAlignedLocalArgument(Index, Size);
-
-      // Store argument details
-      addArg(Index, sizeof(size_t), (const void *)&(AlignedLocalOffset),
-             AlignedLocalSize);
-
-      // For every existing local argument which follows at later argument
-      // indices, update the offset and pointer into the kernel local memory.
-      // Required as padding will need to be recalculated.
+    // Iterate over all existing local argument which follows StartIndex
+    // index, update the offset and pointer into the kernel local memory.
+    void updateLocalArgOffset(size_t StartIndex) {
       const size_t NumArgs = Indices.size() - 1; // Accounts for implicit arg
-      for (auto SuccIndex = Index + 1; SuccIndex < NumArgs; SuccIndex++) {
+      for (auto SuccIndex = StartIndex; SuccIndex < NumArgs; SuccIndex++) {
         const size_t OriginalLocalSize = OriginalLocalMemSize[SuccIndex];
         if (OriginalLocalSize == 0) {
           // Skip if successor argument isn't a local memory arg
@@ -192,6 +185,20 @@ struct ur_kernel_handle_t_ {
       }
     }
 
+    void addLocalArg(size_t Index, size_t Size) {
+      // Get the aligned argument size and offset into local data
+      auto [AlignedLocalSize, AlignedLocalOffset] =
+          calcAlignedLocalArgument(Index, Size);
+
+      // Store argument details
+      addArg(Index, sizeof(size_t), (const void *)&(AlignedLocalOffset),
+             AlignedLocalSize);
+      // For every existing local argument which follows at later argument
+      // indices, update the offset and pointer into the kernel local memory.
+      // Required as padding will need to be recalculated.
+      updateLocalArgOffset(Index + 1);
+    }
+
     void addMemObjArg(int Index, ur_mem_handle_t hMem, ur_mem_flags_t Flags) {
       assert(hMem && "Invalid mem handle");
       // To avoid redundancy we are not storing mem obj with index i at index
@@ -206,6 +213,16 @@ struct ur_kernel_handle_t_ {
       MemObjArgs.push_back(arguments::mem_obj_arg{hMem, Index, Flags});
     }
 
+    void setWorkGroupMemory(size_t MemSize) {
+      // If the WorkGroupMemory is the same as MemSize, then all accessors
+      // offsets accounted for this extra memory
+      if (WorkGroupMemory == MemSize)
+        return;
+      WorkGroupMemory = MemSize;
+      // Update local accessor offsets
+      updateLocalArgOffset(/*StartIndex=*/0);
+    }
+
     void setImplicitOffset(size_t Size, std::uint32_t *ImplicitOffset) {
       assert(Size == sizeof(std::uint32_t) * 3);
       std::memcpy(ImplicitOffsetArgs, ImplicitOffset, Size);
@@ -215,7 +232,8 @@ struct ur_kernel_handle_t_ {
 
     uint32_t getLocalSize() const {
       return std::accumulate(std::begin(AlignedLocalMemSize),
-                             std::end(AlignedLocalMemSize), 0);
+                             std::end(AlignedLocalMemSize), 0) +
+             WorkGroupMemory;
     }
   } Args;
 
@@ -300,6 +318,7 @@ struct ur_kernel_handle_t_ {
     return Args.getIndices();
   }
 
+  void setWorkGroupMemory(size_t MemSize) { Args.setWorkGroupMemory(MemSize); }
   uint32_t getLocalSize() const noexcept { return Args.getLocalSize(); }
 
   size_t getRegsPerThread() const noexcept { return RegsPerThread; };
diff --git a/source/adapters/level_zero/queue.cpp b/source/adapters/level_zero/queue.cpp
index c4598f3472..95c8d026a7 100644
--- a/source/adapters/level_zero/queue.cpp
+++ b/source/adapters/level_zero/queue.cpp
@@ -902,14 +902,15 @@ ur_result_t urQueueFlush(
 
 ur_result_t urEnqueueKernelLaunchCustomExp(
     ur_queue_handle_t hQueue, ur_kernel_handle_t hKernel, uint32_t workDim,
-    const size_t *pGlobalWorkSize, const size_t *pLocalWorkSize,
-    uint32_t numPropsInLaunchPropList,
+    const size_t *pGlobalWorkOffset, const size_t *pGlobalWorkSize,
+    const size_t *pLocalWorkSize, uint32_t numPropsInLaunchPropList,
     const ur_exp_launch_property_t *launchPropList,
     uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList,
     ur_event_handle_t *phEvent) {
   std::ignore = hQueue;
   std::ignore = hKernel;
   std::ignore = workDim;
+  std::ignore = pGlobalWorkOffset;
   std::ignore = pGlobalWorkSize;
   std::ignore = pLocalWorkSize;
   std::ignore = numPropsInLaunchPropList;
diff --git a/source/adapters/level_zero/ur_interface_loader.hpp b/source/adapters/level_zero/ur_interface_loader.hpp
index 1215d6449e..0832303b50 100644
--- a/source/adapters/level_zero/ur_interface_loader.hpp
+++ b/source/adapters/level_zero/ur_interface_loader.hpp
@@ -694,8 +694,8 @@ ur_result_t urEnqueueTimestampRecordingExp(
     const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent);
 ur_result_t urEnqueueKernelLaunchCustomExp(
     ur_queue_handle_t hQueue, ur_kernel_handle_t hKernel, uint32_t workDim,
-    const size_t *pGlobalWorkSize, const size_t *pLocalWorkSize,
-    uint32_t numPropsInLaunchPropList,
+    const size_t *pGlobalWorkOffset, const size_t *pGlobalWorkSize,
+    const size_t *pLocalWorkSize, uint32_t numPropsInLaunchPropList,
     const ur_exp_launch_property_t *launchPropList,
     uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList,
     ur_event_handle_t *phEvent);
diff --git a/source/adapters/level_zero/v2/queue_api.cpp b/source/adapters/level_zero/v2/queue_api.cpp
index b7b45625a2..e4659b5f2c 100644
--- a/source/adapters/level_zero/v2/queue_api.cpp
+++ b/source/adapters/level_zero/v2/queue_api.cpp
@@ -391,13 +391,13 @@ ur_result_t urEnqueueTimestampRecordingExp(
 }
 ur_result_t urEnqueueKernelLaunchCustomExp(
     ur_queue_handle_t hQueue, ur_kernel_handle_t hKernel, uint32_t workDim,
-    const size_t *pGlobalWorkSize, const size_t *pLocalWorkSize,
-    uint32_t numPropsInLaunchPropList,
+    const size_t *pGlobalWorkOffset, const size_t *pGlobalWorkSize,
+    const size_t *pLocalWorkSize, uint32_t numPropsInLaunchPropList,
     const ur_exp_launch_property_t *launchPropList,
     uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList,
     ur_event_handle_t *phEvent) try {
   return hQueue->enqueueKernelLaunchCustomExp(
-      hKernel, workDim, pGlobalWorkSize, pLocalWorkSize,
+      hKernel, workDim, pGlobalWorkOffset, pGlobalWorkSize, pLocalWorkSize,
       numPropsInLaunchPropList, launchPropList, numEventsInWaitList,
       phEventWaitList, phEvent);
 } catch (...) {
diff --git a/source/adapters/level_zero/v2/queue_api.hpp b/source/adapters/level_zero/v2/queue_api.hpp
index 7cb039ccdd..c59f084fc4 100644
--- a/source/adapters/level_zero/v2/queue_api.hpp
+++ b/source/adapters/level_zero/v2/queue_api.hpp
@@ -144,9 +144,9 @@ struct ur_queue_handle_t_ {
                                                    const ur_event_handle_t *,
                                                    ur_event_handle_t *) = 0;
   virtual ur_result_t enqueueKernelLaunchCustomExp(
-      ur_kernel_handle_t, uint32_t, const size_t *, const size_t *, uint32_t,
-      const ur_exp_launch_property_t *, uint32_t, const ur_event_handle_t *,
-      ur_event_handle_t *) = 0;
+      ur_kernel_handle_t, uint32_t, const size_t *, const size_t *,
+      const size_t *, uint32_t, const ur_exp_launch_property_t *, uint32_t,
+      const ur_event_handle_t *, ur_event_handle_t *) = 0;
   virtual ur_result_t
   enqueueEventsWaitWithBarrierExt(const ur_exp_enqueue_ext_properties_t *,
                                   uint32_t, const ur_event_handle_t *,
diff --git a/source/adapters/level_zero/v2/queue_immediate_in_order.cpp b/source/adapters/level_zero/v2/queue_immediate_in_order.cpp
index 519b0ffc1e..05e48c8740 100644
--- a/source/adapters/level_zero/v2/queue_immediate_in_order.cpp
+++ b/source/adapters/level_zero/v2/queue_immediate_in_order.cpp
@@ -1069,13 +1069,15 @@ ur_result_t ur_queue_immediate_in_order_t::enqueueTimestampRecordingExp(
 }
 
 ur_result_t ur_queue_immediate_in_order_t::enqueueKernelLaunchCustomExp(
-    ur_kernel_handle_t hKernel, uint32_t workDim, const size_t *pGlobalWorkSize,
+    ur_kernel_handle_t hKernel, uint32_t workDim,
+    const size_t *pGlobalWorkOffset, const size_t *pGlobalWorkSize,
     const size_t *pLocalWorkSize, uint32_t numPropsInLaunchPropList,
     const ur_exp_launch_property_t *launchPropList,
     uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList,
     ur_event_handle_t *phEvent) {
   std::ignore = hKernel;
   std::ignore = workDim;
+  std::ignore = pGlobalWorkOffset;
   std::ignore = pGlobalWorkSize;
   std::ignore = pLocalWorkSize;
   std::ignore = numPropsInLaunchPropList;
diff --git a/source/adapters/level_zero/v2/queue_immediate_in_order.hpp b/source/adapters/level_zero/v2/queue_immediate_in_order.hpp
index 33e060ded3..bdd3009d63 100644
--- a/source/adapters/level_zero/v2/queue_immediate_in_order.hpp
+++ b/source/adapters/level_zero/v2/queue_immediate_in_order.hpp
@@ -263,8 +263,8 @@ struct ur_queue_immediate_in_order_t : _ur_object, public ur_queue_handle_t_ {
                                ur_event_handle_t *phEvent) override;
   ur_result_t enqueueKernelLaunchCustomExp(
       ur_kernel_handle_t hKernel, uint32_t workDim,
-      const size_t *pGlobalWorkSize, const size_t *pLocalWorkSize,
-      uint32_t numPropsInLaunchPropList,
+      const size_t *pGlobalWorkOffset, const size_t *pGlobalWorkSize,
+      const size_t *pLocalWorkSize, uint32_t numPropsInLaunchPropList,
       const ur_exp_launch_property_t *launchPropList,
       uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList,
       ur_event_handle_t *phEvent) override;
diff --git a/source/adapters/mock/ur_mockddi.cpp b/source/adapters/mock/ur_mockddi.cpp
index 42c342444d..c8ce408756 100644
--- a/source/adapters/mock/ur_mockddi.cpp
+++ b/source/adapters/mock/ur_mockddi.cpp
@@ -10126,6 +10126,9 @@ __urdlllocal ur_result_t UR_APICALL urEnqueueKernelLaunchCustomExp(
     uint32_t
         workDim, ///< [in] number of dimensions, from 1 to 3, to specify the global and
                  ///< work-group work-items
+    const size_t *
+        pGlobalWorkOffset, ///< [in] pointer to an array of workDim unsigned values that specify the
+    ///< offset used to calculate the global ID of a work-item
     const size_t *
         pGlobalWorkSize, ///< [in] pointer to an array of workDim unsigned values that specify the
     ///< number of global work-items in workDim that will execute the kernel
@@ -10153,11 +10156,17 @@ __urdlllocal ur_result_t UR_APICALL urEnqueueKernelLaunchCustomExp(
     ur_result_t result = UR_RESULT_SUCCESS;
 
     ur_enqueue_kernel_launch_custom_exp_params_t params = {
-        &hQueue,          &hKernel,
-        &workDim,         &pGlobalWorkSize,
-        &pLocalWorkSize,  &numPropsInLaunchPropList,
-        &launchPropList,  &numEventsInWaitList,
-        &phEventWaitList, &phEvent};
+        &hQueue,
+        &hKernel,
+        &workDim,
+        &pGlobalWorkOffset,
+        &pGlobalWorkSize,
+        &pLocalWorkSize,
+        &numPropsInLaunchPropList,
+        &launchPropList,
+        &numEventsInWaitList,
+        &phEventWaitList,
+        &phEvent};
 
     auto beforeCallback = reinterpret_cast<ur_mock_callback_t>(
         mock::getCallbacks().get_before_callback(
@@ -10176,6 +10185,10 @@ __urdlllocal ur_result_t UR_APICALL urEnqueueKernelLaunchCustomExp(
         result = replaceCallback(&params);
     } else {
 
+        // optional output handle
+        if (phEvent) {
+            *phEvent = mock::createDummyHandle<ur_event_handle_t>();
+        }
         result = UR_RESULT_SUCCESS;
     }
 
diff --git a/source/loader/layers/tracing/ur_trcddi.cpp b/source/loader/layers/tracing/ur_trcddi.cpp
index 64489c39ac..afd1411ae8 100644
--- a/source/loader/layers/tracing/ur_trcddi.cpp
+++ b/source/loader/layers/tracing/ur_trcddi.cpp
@@ -8698,6 +8698,9 @@ __urdlllocal ur_result_t UR_APICALL urEnqueueKernelLaunchCustomExp(
     uint32_t
         workDim, ///< [in] number of dimensions, from 1 to 3, to specify the global and
                  ///< work-group work-items
+    const size_t *
+        pGlobalWorkOffset, ///< [in] pointer to an array of workDim unsigned values that specify the
+    ///< offset used to calculate the global ID of a work-item
     const size_t *
         pGlobalWorkSize, ///< [in] pointer to an array of workDim unsigned values that specify the
     ///< number of global work-items in workDim that will execute the kernel
@@ -8730,11 +8733,17 @@ __urdlllocal ur_result_t UR_APICALL urEnqueueKernelLaunchCustomExp(
     }
 
     ur_enqueue_kernel_launch_custom_exp_params_t params = {
-        &hQueue,          &hKernel,
-        &workDim,         &pGlobalWorkSize,
-        &pLocalWorkSize,  &numPropsInLaunchPropList,
-        &launchPropList,  &numEventsInWaitList,
-        &phEventWaitList, &phEvent};
+        &hQueue,
+        &hKernel,
+        &workDim,
+        &pGlobalWorkOffset,
+        &pGlobalWorkSize,
+        &pLocalWorkSize,
+        &numPropsInLaunchPropList,
+        &launchPropList,
+        &numEventsInWaitList,
+        &phEventWaitList,
+        &phEvent};
     uint64_t instance =
         getContext()->notify_begin(UR_FUNCTION_ENQUEUE_KERNEL_LAUNCH_CUSTOM_EXP,
                                    "urEnqueueKernelLaunchCustomExp", &params);
@@ -8743,9 +8752,9 @@ __urdlllocal ur_result_t UR_APICALL urEnqueueKernelLaunchCustomExp(
     logger.info("   ---> urEnqueueKernelLaunchCustomExp\n");
 
     ur_result_t result = pfnKernelLaunchCustomExp(
-        hQueue, hKernel, workDim, pGlobalWorkSize, pLocalWorkSize,
-        numPropsInLaunchPropList, launchPropList, numEventsInWaitList,
-        phEventWaitList, phEvent);
+        hQueue, hKernel, workDim, pGlobalWorkOffset, pGlobalWorkSize,
+        pLocalWorkSize, numPropsInLaunchPropList, launchPropList,
+        numEventsInWaitList, phEventWaitList, phEvent);
 
     getContext()->notify_end(UR_FUNCTION_ENQUEUE_KERNEL_LAUNCH_CUSTOM_EXP,
                              "urEnqueueKernelLaunchCustomExp", &params, &result,
diff --git a/source/loader/layers/validation/ur_valddi.cpp b/source/loader/layers/validation/ur_valddi.cpp
index b3969de10f..c2dcc7be6f 100644
--- a/source/loader/layers/validation/ur_valddi.cpp
+++ b/source/loader/layers/validation/ur_valddi.cpp
@@ -9726,6 +9726,9 @@ __urdlllocal ur_result_t UR_APICALL urEnqueueKernelLaunchCustomExp(
     uint32_t
         workDim, ///< [in] number of dimensions, from 1 to 3, to specify the global and
                  ///< work-group work-items
+    const size_t *
+        pGlobalWorkOffset, ///< [in] pointer to an array of workDim unsigned values that specify the
+    ///< offset used to calculate the global ID of a work-item
     const size_t *
         pGlobalWorkSize, ///< [in] pointer to an array of workDim unsigned values that specify the
     ///< number of global work-items in workDim that will execute the kernel
@@ -9766,6 +9769,10 @@ __urdlllocal ur_result_t UR_APICALL urEnqueueKernelLaunchCustomExp(
             return UR_RESULT_ERROR_INVALID_NULL_HANDLE;
         }
 
+        if (NULL == pGlobalWorkOffset) {
+            return UR_RESULT_ERROR_INVALID_NULL_POINTER;
+        }
+
         if (NULL == pGlobalWorkSize) {
             return UR_RESULT_ERROR_INVALID_NULL_POINTER;
         }
@@ -9794,9 +9801,9 @@ __urdlllocal ur_result_t UR_APICALL urEnqueueKernelLaunchCustomExp(
     }
 
     ur_result_t result = pfnKernelLaunchCustomExp(
-        hQueue, hKernel, workDim, pGlobalWorkSize, pLocalWorkSize,
-        numPropsInLaunchPropList, launchPropList, numEventsInWaitList,
-        phEventWaitList, phEvent);
+        hQueue, hKernel, workDim, pGlobalWorkOffset, pGlobalWorkSize,
+        pLocalWorkSize, numPropsInLaunchPropList, launchPropList,
+        numEventsInWaitList, phEventWaitList, phEvent);
 
     return result;
 }
diff --git a/source/loader/ur_ldrddi.cpp b/source/loader/ur_ldrddi.cpp
index 86a6ad95a0..602b8f1a82 100644
--- a/source/loader/ur_ldrddi.cpp
+++ b/source/loader/ur_ldrddi.cpp
@@ -8866,6 +8866,9 @@ __urdlllocal ur_result_t UR_APICALL urEnqueueKernelLaunchCustomExp(
     uint32_t
         workDim, ///< [in] number of dimensions, from 1 to 3, to specify the global and
                  ///< work-group work-items
+    const size_t *
+        pGlobalWorkOffset, ///< [in] pointer to an array of workDim unsigned values that specify the
+    ///< offset used to calculate the global ID of a work-item
     const size_t *
         pGlobalWorkSize, ///< [in] pointer to an array of workDim unsigned values that specify the
     ///< number of global work-items in workDim that will execute the kernel
@@ -8908,11 +8911,35 @@ __urdlllocal ur_result_t UR_APICALL urEnqueueKernelLaunchCustomExp(
     // convert loader handle to platform handle
     hKernel = reinterpret_cast<ur_kernel_object_t *>(hKernel)->handle;
 
+    // convert loader handles to platform handles
+    auto phEventWaitListLocal =
+        std::vector<ur_event_handle_t>(numEventsInWaitList);
+    for (size_t i = 0; i < numEventsInWaitList; ++i) {
+        phEventWaitListLocal[i] =
+            reinterpret_cast<ur_event_object_t *>(phEventWaitList[i])->handle;
+    }
+
     // forward to device-platform
-    result = pfnKernelLaunchCustomExp(hQueue, hKernel, workDim, pGlobalWorkSize,
-                                      pLocalWorkSize, numPropsInLaunchPropList,
-                                      launchPropList, numEventsInWaitList,
-                                      phEventWaitList, phEvent);
+    result = pfnKernelLaunchCustomExp(
+        hQueue, hKernel, workDim, pGlobalWorkOffset, pGlobalWorkSize,
+        pLocalWorkSize, numPropsInLaunchPropList, launchPropList,
+        numEventsInWaitList, phEventWaitListLocal.data(), phEvent);
+
+    // In the event of ERROR_ADAPTER_SPECIFIC we should still attempt to wrap any output handles below.
+    if (UR_RESULT_SUCCESS != result &&
+        UR_RESULT_ERROR_ADAPTER_SPECIFIC != result) {
+        return result;
+    }
+    try {
+        // convert platform handle to loader handle
+        if (nullptr != phEvent) {
+            *phEvent = reinterpret_cast<ur_event_handle_t>(
+                context->factories.ur_event_factory.getInstance(*phEvent,
+                                                                dditable));
+        }
+    } catch (std::bad_alloc &) {
+        result = UR_RESULT_ERROR_OUT_OF_HOST_MEMORY;
+    }
 
     return result;
 }
diff --git a/source/loader/ur_libapi.cpp b/source/loader/ur_libapi.cpp
index 3340363737..b2e26a8b8b 100644
--- a/source/loader/ur_libapi.cpp
+++ b/source/loader/ur_libapi.cpp
@@ -8992,6 +8992,7 @@ ur_result_t UR_APICALL urEnqueueTimestampRecordingExp(
 ///         + NULL == hQueue
 ///         + NULL == hKernel
 ///     - ::UR_RESULT_ERROR_INVALID_NULL_POINTER
+///         + `NULL == pGlobalWorkOffset`
 ///         + `NULL == pGlobalWorkSize`
 ///         + `NULL == launchPropList`
 ///         + NULL == pGlobalWorkSize
@@ -9020,6 +9021,9 @@ ur_result_t UR_APICALL urEnqueueKernelLaunchCustomExp(
     uint32_t
         workDim, ///< [in] number of dimensions, from 1 to 3, to specify the global and
                  ///< work-group work-items
+    const size_t *
+        pGlobalWorkOffset, ///< [in] pointer to an array of workDim unsigned values that specify the
+    ///< offset used to calculate the global ID of a work-item
     const size_t *
         pGlobalWorkSize, ///< [in] pointer to an array of workDim unsigned values that specify the
     ///< number of global work-items in workDim that will execute the kernel
@@ -9050,10 +9054,10 @@ ur_result_t UR_APICALL urEnqueueKernelLaunchCustomExp(
         return UR_RESULT_ERROR_UNINITIALIZED;
     }
 
-    return pfnKernelLaunchCustomExp(hQueue, hKernel, workDim, pGlobalWorkSize,
-                                    pLocalWorkSize, numPropsInLaunchPropList,
-                                    launchPropList, numEventsInWaitList,
-                                    phEventWaitList, phEvent);
+    return pfnKernelLaunchCustomExp(
+        hQueue, hKernel, workDim, pGlobalWorkOffset, pGlobalWorkSize,
+        pLocalWorkSize, numPropsInLaunchPropList, launchPropList,
+        numEventsInWaitList, phEventWaitList, phEvent);
 } catch (...) {
     return exceptionToResult(std::current_exception());
 }
diff --git a/source/ur_api.cpp b/source/ur_api.cpp
index 853d61472e..0b2e6a0f74 100644
--- a/source/ur_api.cpp
+++ b/source/ur_api.cpp
@@ -7625,6 +7625,7 @@ ur_result_t UR_APICALL urEnqueueTimestampRecordingExp(
 ///         + NULL == hQueue
 ///         + NULL == hKernel
 ///     - ::UR_RESULT_ERROR_INVALID_NULL_POINTER
+///         + `NULL == pGlobalWorkOffset`
 ///         + `NULL == pGlobalWorkSize`
 ///         + `NULL == launchPropList`
 ///         + NULL == pGlobalWorkSize
@@ -7653,6 +7654,9 @@ ur_result_t UR_APICALL urEnqueueKernelLaunchCustomExp(
     uint32_t
         workDim, ///< [in] number of dimensions, from 1 to 3, to specify the global and
                  ///< work-group work-items
+    const size_t *
+        pGlobalWorkOffset, ///< [in] pointer to an array of workDim unsigned values that specify the
+    ///< offset used to calculate the global ID of a work-item
     const size_t *
         pGlobalWorkSize, ///< [in] pointer to an array of workDim unsigned values that specify the
     ///< number of global work-items in workDim that will execute the kernel
diff --git a/test/conformance/exp_launch_properties/launch_properties.cpp b/test/conformance/exp_launch_properties/launch_properties.cpp
index a54a44ecaf..23ba56ff4b 100644
--- a/test/conformance/exp_launch_properties/launch_properties.cpp
+++ b/test/conformance/exp_launch_properties/launch_properties.cpp
@@ -95,8 +95,8 @@ TEST_P(urEnqueueKernelLaunchCustomTest, Success) {
     AddPodArg(val);
 
     ASSERT_SUCCESS(urEnqueueKernelLaunchCustomExp(
-        queue, kernel, n_dimensions, &global_size, nullptr, 1, &props[0], 0,
-        nullptr, nullptr));
+        queue, kernel, n_dimensions, &global_offset, &global_size, nullptr, 1,
+        &props[0], 0, nullptr, nullptr));
     ASSERT_SUCCESS(urQueueFinish(queue));
     ValidateBuffer(buffer, sizeof(val) * global_size, val);
 }