From 3f4374a3e4b765b1755f7f81542c789a00638e2a Mon Sep 17 00:00:00 2001
From: Tao He
Date: Thu, 1 Feb 2024 01:00:13 +0800
Subject: [PATCH] Fixes assertion failure in prefix caching: the lora index
 mapping should respect prefix_len (#2688)

Signed-off-by: Tao He
---
 vllm/worker/model_runner.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/vllm/worker/model_runner.py b/vllm/worker/model_runner.py
index 2a12152a70863..2df9fd5215a2d 100644
--- a/vllm/worker/model_runner.py
+++ b/vllm/worker/model_runner.py
@@ -142,10 +142,10 @@ def _prepare_prompt(
             if lora_id > 0:
                 lora_requests.add(seq_group_metadata.lora_request)
 
-            lora_index_mapping.append([lora_id] * prompt_len)
+            lora_index_mapping.append([lora_id] * (prompt_len - prefix_len))
             lora_prompt_mapping.extend(
                 [lora_id] *
-                (prompt_len
+                (prompt_len - prefix_len
                  if seq_group_metadata.sampling_params.prompt_logprobs else 1))
 
             if seq_group_metadata.block_tables is None: