Skip to content

Commit

Permalink
fix neuron performance issue (vllm-project#13589)
Browse files (browse the repository at this point in the history)
  • Loading branch information
ajayvohra2005 authored Feb 20, 2025
1 parent d3ea501 commit 6a417b8
Showing 1 changed file with 2 additions and 2 deletions.
4 changes: 2 additions & 2 deletions vllm/worker/neuron_worker.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -76,7 +76,7 @@ def determine_num_available_blocks(self) -> Tuple[int, int]:
# Set the number of GPU blocks to be the same as the maximum number of
# sequences that can be processed in a single batch. This is equivalent
# to schedule without PagedAttention.
- num_gpu_blocks = self.scheduler_config.max_num_seqs
+ num_gpu_blocks = self.scheduler_config.max_num_seqs + 1

# Swap not yet supported with Neuron backend.
num_cpu_blocks = 0
Expand All @@ -90,7 +90,7 @@ def initialize_cache(self, num_gpu_blocks: int,

# Different values are not tested.
assert num_cpu_blocks == 0
- assert num_gpu_blocks == self.scheduler_config.max_num_seqs
+ assert num_gpu_blocks == self.scheduler_config.max_num_seqs + 1

self.cache_config.num_gpu_blocks = num_gpu_blocks
self.cache_config.num_cpu_blocks = num_cpu_blocks
Expand Down

0 comments on commit 6a417b8

Please sign in to comment.