From 6ffb033b5aff15d6a097ffa469ac79536f6681ef Mon Sep 17 00:00:00 2001
From: Woosuk Kwon
Date: Tue, 14 Jan 2025 20:21:28 -0800
Subject: [PATCH] [V1][BugFix] Fix edge case in VLM scheduling (#12065)

Signed-off-by: Woosuk Kwon
---
 vllm/v1/core/scheduler.py | 26 +++++++++++++++-----------
 1 file changed, 15 insertions(+), 11 deletions(-)

diff --git a/vllm/v1/core/scheduler.py b/vllm/v1/core/scheduler.py
index f04e529891287..2503d136aea7e 100644
--- a/vllm/v1/core/scheduler.py
+++ b/vllm/v1/core/scheduler.py
@@ -373,18 +373,22 @@ def _try_schedule_encoder_inputs(
             if self.encoder_cache_manager.has_cache(request, i):
                 # The encoder input is already computed and cached.
                 continue
-            if not self.encoder_cache_manager.can_allocate(request, i):
-                # The encoder cache is full. We can only schedule the decoder
-                # tokens just before the encoder input.
-                num_new_tokens = start_pos - num_computed_tokens
-                break
-            if num_encoder_tokens > encoder_budget:
-                # The encoder budget is exhausted. We can only schedule the
-                # decoder tokens up until the encoder input.
-                # NOTE(woosuk): We assume that the encoder tokens should be
-                # processed altogether, as the encoder usually uses
+            if (not self.encoder_cache_manager.can_allocate(request, i)
+                    or num_encoder_tokens > encoder_budget):
+                # The encoder cache is full or the encoder budget is exhausted.
+                # NOTE(woosuk): We assume that the encoder input tokens should
+                # be processed altogether, as the encoder usually uses
                 # bidirectional attention.
-                num_new_tokens = start_pos - num_computed_tokens
+                if num_computed_tokens < start_pos:
+                    # We only schedule the decoder tokens just before the
+                    # encoder input.
+                    num_new_tokens = start_pos - num_computed_tokens
+                else:
+                    # Because of prefix caching, num_computed_tokens is greater
+                    # than start_pos even though its encoder input is not
+                    # available. In this case, we can't schedule any token for
+                    # the request in this step.
+                    num_new_tokens = 0
                 break
             encoder_budget -= num_encoder_tokens
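
Note for reviewers: the sketch below is a minimal, standalone distillation of the decision this hunk changes, not vLLM's real code path. The function name schedulable_decoder_tokens and its boolean flags are hypothetical stand-ins for EncoderCacheManager.has_cache(request, i), can_allocate(request, i), and the encoder_budget bookkeeping inside _try_schedule_encoder_inputs.

    # Hedged sketch of the scheduling decision fixed by this patch.
    # All names here are illustrative; only the control flow mirrors the hunk.

    def schedulable_decoder_tokens(
        num_computed_tokens: int,  # tokens already computed (prefix-cache hits included)
        num_new_tokens: int,       # decoder tokens we would like to schedule this step
        start_pos: int,            # position where the encoder input (e.g. an image) begins
        num_encoder_tokens: int,   # size of that encoder input
        encoder_budget: int,       # remaining encoder compute budget this step
        cache_has_input: bool,     # stand-in for encoder_cache_manager.has_cache
        cache_can_allocate: bool,  # stand-in for encoder_cache_manager.can_allocate
    ) -> int:
        """Return how many decoder tokens can be scheduled for this request."""
        if cache_has_input:
            # Encoder output is already cached; decoder tokens are unconstrained.
            return num_new_tokens
        if not cache_can_allocate or num_encoder_tokens > encoder_budget:
            # The encoder input cannot run this step.
            if num_computed_tokens < start_pos:
                # Pre-existing behavior, still valid here: schedule decoder
                # tokens up to, but not past, the encoder input.
                return start_pos - num_computed_tokens
            # Edge case fixed by the patch: prefix caching has already advanced
            # num_computed_tokens to or past start_pos while the encoder output
            # is unavailable, so no tokens can be scheduled this step.
            return 0
        return num_new_tokens

    # Replay of the edge case: 512 tokens hit the prefix cache, the image
    # starts at position 256, but its encoder output is not cached and the
    # encoder budget is exhausted.
    assert schedulable_decoder_tokens(512, 128, 256, 576, 0, False, False) == 0

Before the fix, this case fell through to start_pos - num_computed_tokens and produced a non-positive token count (256 - 512 above); the patch makes the "schedule nothing this step" outcome explicit.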