From 42d4377a97e025d34c8f38b9504b85151b5473e9 Mon Sep 17 00:00:00 2001 From: Sergey Shlyapnikov Date: Wed, 6 Nov 2024 19:47:51 +0400 Subject: [PATCH] [GPU] Fix accuracy issue in PagedAttention kernel for large prompts (#27433) ### Details: - Fix an accuracy issue in the PagedAttention kernel for large prompts (4K/8K tokens) by correcting the index passed to sub_group_broadcast (partition_num % SUBGROUP_SIZE instead of partition_num), so that the value is broadcast from the correct lane within the subgroup ### Tickets: - [CVS-154881](https://jira.devtools.intel.com/browse/CVS-154881) --- .../intel_gpu/src/kernel_selector/cl_kernels/pa_sdpa_opt.cl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/pa_sdpa_opt.cl b/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/pa_sdpa_opt.cl index a3bdd7e12dcd49..00c43829d02ea7 100644 --- a/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/pa_sdpa_opt.cl +++ b/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/pa_sdpa_opt.cl @@ -436,7 +436,7 @@ KERNEL(pa_sdpa_finalization_stage)( partition_num * HEAD_SIZE + head_size_idx; OUTPUT_TYPE out_val = tmp_out[tmp_out_offset]; - acc += TO_SOFTMAX_ACCUMULATOR_TYPE(out_val) * TO_SOFTMAX_ACCUMULATOR_TYPE(sub_group_broadcast(exp_sum[partition_num / SUBGROUP_SIZE], partition_num)) / TO_SOFTMAX_ACCUMULATOR_TYPE(global_sum); + acc += TO_SOFTMAX_ACCUMULATOR_TYPE(out_val) * TO_SOFTMAX_ACCUMULATOR_TYPE(sub_group_broadcast(exp_sum[partition_num / SUBGROUP_SIZE], partition_num % SUBGROUP_SIZE)) / TO_SOFTMAX_ACCUMULATOR_TYPE(global_sum); } const uint out_offset = seq_idx * (HEADS_NUM * HEAD_SIZE) + head_num_idx * HEAD_SIZE +