[Core] Sliding window for block manager v2 #4545

Merged
merged 58 commits, May 28, 2024

Changes from 19 commits

Commits (58)
6b1d3b2
merge
Apr 10, 2024
d55db5e
enable sliding window (doesn't work)
mmoskal Apr 30, 2024
be53001
add null_block
mmoskal May 2, 2024
ac070a1
ruff, ruff
mmoskal May 2, 2024
b37f028
yapf
mmoskal May 2, 2024
368c3ee
add AttentionBackend.zero_block()
mmoskal May 2, 2024
c4e533d
zero-out null_block
mmoskal May 2, 2024
7bc88f2
comment out debug assertions
mmoskal May 2, 2024
2825cd7
merge main
mmoskal May 2, 2024
0ade169
fix mypy
mmoskal May 2, 2024
f703af2
basic correctness test for sliding window
mmoskal May 3, 2024
9661776
allocate block progressively for chunked prefill
mmoskal May 3, 2024
a0459b4
Revert "allocate block progressively for chunked prefill"
mmoskal May 3, 2024
6326fbc
fix sliding_window+chunked_prefill
mmoskal May 3, 2024
785aa19
add test for chunked prefill + sliding window
mmoskal May 3, 2024
165b7a8
merge main
mmoskal May 3, 2024
e26631e
spelling + formatting
mmoskal May 3, 2024
57678f4
remove junk
mmoskal May 3, 2024
22e9bb8
simplify test
mmoskal May 3, 2024
be984d1
testcase PR feedback
mmoskal May 8, 2024
34b0fef
Revert "add AttentionBackend.zero_block()"
mmoskal May 8, 2024
b0261fd
zero-out whole KV cache on alloc
mmoskal May 8, 2024
0a3d3b6
add comments
mmoskal May 8, 2024
c947103
require num_computed_slots for sliding window
mmoskal May 8, 2024
a28116f
formatting
mmoskal May 8, 2024
2a5436d
add NullBlock proxy class
mmoskal May 9, 2024
cc1467f
improve comments
mmoskal May 9, 2024
1edc9be
assert sliding window size
mmoskal May 9, 2024
85d8fd9
add docstring
mmoskal May 9, 2024
29fd0d5
add comment
mmoskal May 9, 2024
09c192a
format
mmoskal May 9, 2024
54a5e93
bump test size
mmoskal May 9, 2024
609a9ce
start on sliding window support in paged attn decode kernel
mmoskal May 9, 2024
5079d9a
Revert "start on sliding window support in paged attn decode kernel"
mmoskal May 9, 2024
83f82d9
construct correct block tables in sliding window decode phase
mmoskal May 10, 2024
0bb1f67
remove debug out
mmoskal May 10, 2024
7e7bd02
bump test len again
mmoskal May 10, 2024
728b722
ruff
mmoskal May 10, 2024
96b71a0
add sliding window v2 test
mmoskal May 10, 2024
0ac37c0
Merge branch 'main' into sliding_window_v2
mmoskal May 11, 2024
e086854
fix possible issue with kernel
mmoskal May 11, 2024
bdad3bb
make test pass
mmoskal May 11, 2024
d9521ba
update comments
mmoskal May 11, 2024
24844dd
ruff
mmoskal May 11, 2024
f4ede62
rename: block_sliding_window => max_block_sliding_window
mmoskal May 23, 2024
63d9e50
merge main
mmoskal May 23, 2024
7f00204
finish merge
mmoskal May 24, 2024
fa1ea2f
fix conftest import
mmoskal May 24, 2024
97aa968
allow v2+chunked_prefill+sliding_window
mmoskal May 24, 2024
ce04cde
restore assert
mmoskal May 24, 2024
0a5a73d
formatting
mmoskal May 24, 2024
60f0d25
fix sliding window computation
mmoskal May 25, 2024
2855de6
formatting
mmoskal May 25, 2024
14f33b1
force rebuild
mmoskal May 25, 2024
d58aefd
Merge branch 'main' into sliding_window_v2
mmoskal May 25, 2024
507d2dc
prefix caching fix
mmoskal May 25, 2024
cc6cab4
add assertion message
mmoskal May 25, 2024
dffca79
re-run build
mmoskal May 26, 2024
130 changes: 130 additions & 0 deletions tests/core/block/e2e/test_correctness_sliding_window.py
@@ -0,0 +1,130 @@
import random
from typing import Iterable, List

import pytest

from vllm import LLM, SamplingParams

# relatively small model with 4k sliding window
MODEL = "bigcode/starcoder2-3b"


# the prompt is just under 10k tokens; sliding window is 4k
# so the answer is outside sliding window, but should still be correct
def prep_prompts(batch_size: int):
    prompts: List[str] = []
    answer: List[int] = []
    indices: List[int] = []
    random.seed(1)
    for _ in range(batch_size):
        idx = random.randint(30, 90)
        indices.append(idx)
        prompt = "```python\n# We set a number of variables, " + \
            f"x{idx} will be important later\n"
        ln = random.randint(800, 1100)
        for k in range(30, ln):
            v = random.randint(10, 99)
            if k == idx:
                answer.append(v)
            prompt += f"x{k} = {v}\n"
        prompt += f"# Now, we check the value of x{idx}:\n"
        prompt += f"assert x{idx} == "
        prompts.append(prompt)
    return prompts, answer, indices


def check_answers(indices: List[int], answer: List[int], outputs: List[str]):
    answer2 = [int(text[0:2].strip()) for text in outputs]
    print(list(zip(indices, zip(answer, answer2))))
    numok = 0
    for a1, a2 in zip(answer, answer2):
        if a1 == a2:
            numok += 1
    frac_ok = numok / len(answer)
    print(f"Numok: {numok}/{len(answer)} {frac_ok}")
    assert frac_ok > 0.7


@pytest.mark.parametrize(
    "common_llm_kwargs",
    [{
        "model": MODEL,

        # skip cuda graph creation for fast test.
        "enforce_eager": True,
        "block_size": 16,
        "num_gpu_blocks_override": 100000 // 16,
    }])
@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
@pytest.mark.parametrize("baseline_llm_kwargs", [{
    "use_v2_block_manager": False
}])
@pytest.mark.parametrize("test_llm_kwargs", [{"use_v2_block_manager": True}])
@pytest.mark.parametrize("batch_size", [5])
@pytest.mark.parametrize("seed", [1])
def test_sliding_window_retrival(baseline_llm_generator, test_llm_generator,
                                 batch_size, seed):
    sampling_params = SamplingParams(
        max_tokens=128,
        ignore_eos=True,
        temperature=0.0,
    )

    prompts, answer, indices = prep_prompts(batch_size)

    print('Getting token ids from block manager v1')
    baseline_texts = get_text_from_llm_generator(baseline_llm_generator,
                                                 prompts, sampling_params)

    check_answers(indices, answer, baseline_texts)

    print('Getting token ids from block manager v2')
    test_texts = get_text_from_llm_generator(test_llm_generator, prompts,
                                             sampling_params)
    check_answers(indices, answer, test_texts)

    for expected_text, actual_text in zip(baseline_texts, test_texts):
        assert expected_text == actual_text


@pytest.mark.parametrize(
    "common_llm_kwargs",
    [{
        "model": MODEL,

        # skip cuda graph creation for fast test.
        "enforce_eager": True,
        "block_size": 16,
        "num_gpu_blocks_override": 100000 // 16,
    }])
@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
@pytest.mark.parametrize("test_llm_kwargs", [{
    "use_v2_block_manager": True,
    "enable_chunked_prefill": True
}])
@pytest.mark.parametrize("batch_size", [5])
@pytest.mark.parametrize("seed", [1])
def test_sliding_window_chunked_prefill(test_llm_generator, batch_size, seed):
    sampling_params = SamplingParams(
        max_tokens=10,

Collaborator:
I'm afraid these tests won't catch issues with the block mapping. E.g. I expect errors to accumulate over many tokens before we see a significant divergence in attention scores. Generating 10 of 4096 window tokens is not very much; the same goes for 128/4096, although it's better.

WDYT? Is my intuition right? Should we test with a larger generation size? Another option is to patch sliding_window to be smaller (e.g. two blocks) so the impact of any error is larger. If we go with patching sliding_window we could even use one of the 68m models for a faster test.

Collaborator:
+1.

Contributor Author (mmoskal):
There was an issue with the block tables passed to single-token decode, which was causing different output with 1024 tokens. I have now fixed that and bumped the test size to 1024.

However, it's still slightly incorrect, because the decode kernel does not support the sliding window natively: as it works now, it just takes all the blocks passed in (up to seq_len). With the v1 manager, the sliding window uses blocks in a "ring buffer" fashion, so this is not a problem. With the new block manager we potentially need to start the attention computation in the middle of a block; otherwise we attend to a few tokens too many. It doesn't seem to affect this test, though.

I have started fixing the decode kernel, but I think that should be a separate PR.

Collaborator:
I haven't looked at the changes, but want to say that yes, this problem is known and we should fix it eventually (awesome if you want to do it). Let's get this PR in with good tests for where we're at, and a future PR can fix the decode kernel.

        ignore_eos=True,
        temperature=0.0,
    )

    prompts, answer, indices = prep_prompts(batch_size)

    # We don't compare with the baseline model here, since the results are
    # slightly different due to different tailing in attention.
    test_texts = get_text_from_llm_generator(test_llm_generator, prompts,
                                             sampling_params)
    check_answers(indices, answer, test_texts)


def get_text_from_llm_generator(llm_generator: Iterable[LLM], prompts,
                                sampling_params):
    for llm in llm_generator:
        outputs = llm.generate(prompts, sampling_params, use_tqdm=True)
        text = [output.outputs[0].text for output in outputs]
        del llm

    return text
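
The review discussion above notes that, with the v2 block manager, decode may attend to "a few tokens too many", because block tables are block-granular while the window boundary is not. A rough, self-contained sketch of that effect — all names and numbers below are illustrative and not part of this PR:

# Sketch: how many extra tokens a block-granular window attends to,
# assuming a 16-token block size and a 4096-token sliding window.
BLOCK_SIZE = 16
SLIDING_WINDOW = 4096

def extra_tokens_attended(seq_len: int) -> int:
    # Exact start of the window for the current decode position.
    window_start = max(0, seq_len - SLIDING_WINDOW)
    # Block-granular start: beginning of the block containing window_start.
    block_start = (window_start // BLOCK_SIZE) * BLOCK_SIZE
    return window_start - block_start

for seq_len in (4096, 4100, 5000, 10000):
    print(seq_len, extra_tokens_attended(seq_len))  # at most BLOCK_SIZE - 1 extra

The extra context is bounded by one block (at most 15 tokens here), which is consistent with the observation above that it does not change the outcome of these tests.
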
8 changes: 8 additions & 0 deletions vllm/attention/backends/abstract.py
@@ -45,6 +45,14 @@ def copy_blocks(
    ) -> None:
        raise NotImplementedError

    @staticmethod
    @abstractmethod
    def zero_block(
        kv_caches: List[torch.Tensor],
        block_id: int,
    ) -> None:
        raise NotImplementedError


@dataclass
class AttentionMetadataPerStage:
7 changes: 7 additions & 0 deletions vllm/attention/backends/flash_attn.py
@@ -52,6 +52,13 @@ def copy_blocks(
    ) -> None:
        PagedAttention.copy_blocks(kv_caches, src_to_dists)

    @staticmethod
    def zero_block(
        kv_caches: List[torch.Tensor],
        block_id: int,
    ) -> None:
        PagedAttention.zero_block(kv_caches, block_id)


@dataclass
class FlashAttentionMetadata(AttentionMetadataPerStage,
7 changes: 7 additions & 0 deletions vllm/attention/backends/rocm_flash_attn.py
@@ -50,6 +50,13 @@ def copy_blocks(
    ) -> None:
        PagedAttention.copy_blocks(kv_caches, src_to_dists)

    @staticmethod
    def zero_block(
        kv_caches: List[torch.Tensor],
        block_id: int,
    ) -> None:
        PagedAttention.zero_block(kv_caches, block_id)


@dataclass
class ROCmFlashAttentionMetadata(AttentionMetadataPerStage,
7 changes: 7 additions & 0 deletions vllm/attention/backends/torch_sdpa.py
@@ -48,6 +48,13 @@ def copy_blocks(
    ) -> None:
        PagedAttention.copy_blocks(kv_caches, src_to_dists)

    @staticmethod
    def zero_block(
        kv_caches: List[torch.Tensor],
        block_id: int,
    ) -> None:
        PagedAttention.zero_block(kv_caches, block_id)


@dataclass
class TorchSDPAMetadata(AttentionMetadata, PagedAttentionMetadata,
7 changes: 7 additions & 0 deletions vllm/attention/backends/xformers.py
@@ -53,6 +53,13 @@ def copy_blocks(
    ) -> None:
        PagedAttention.copy_blocks(kv_caches, src_to_dists)

    @staticmethod
    def zero_block(
        kv_caches: List[torch.Tensor],
        block_id: int,
    ) -> None:
        PagedAttention.zero_block(kv_caches, block_id)


@dataclass
class XFormersMetadata(AttentionMetadataPerStage, PagedAttentionMetadata):
8 changes: 8 additions & 0 deletions vllm/attention/ops/paged_attn.py
@@ -214,3 +214,11 @@ def copy_blocks(
        key_caches = [kv_cache[0] for kv_cache in kv_caches]
        value_caches = [kv_cache[1] for kv_cache in kv_caches]
        ops.copy_blocks(key_caches, value_caches, src_to_dists)

    @staticmethod
    def zero_block(
        kv_caches: List[torch.Tensor],
        block_id: int,
    ) -> None:
        for kv_cache in kv_caches:
            kv_cache[:, block_id, :].zero_()
23 changes: 21 additions & 2 deletions vllm/core/block/block_table.py
@@ -37,13 +37,15 @@ def __init__(
        block_size: int,
        block_allocator: DeviceAwareBlockAllocator,
        _blocks: Optional[List[Block]] = None,
        block_sliding_window: Optional[int] = None,
    ):
        self._block_size = block_size
        self._allocator = block_allocator
        if _blocks is None:
            _blocks = []
        self._blocks: List[Block] = _blocks

        self._block_sliding_window = block_sliding_window
        # Use helper method instead of directly calculating, as blocks
        # may not be allocated.
        self._num_full_slots = len(self._get_all_token_ids())
@@ -89,7 +91,8 @@ def allocate(self,

    def append_token_ids(self,
                         token_ids: List[int],
                         num_lookahead_slots: int = 0,
                         num_computed_slots: Optional[int] = None) -> None:
        """Appends a sequence of token IDs to the existing blocks in the
        BlockTable.

@@ -105,12 +108,27 @@ def append_token_ids(self,
        Args:
            token_ids (List[int]): The sequence of token IDs to be appended.
        """
        assert self._is_allocated, "no blocks have been allocated"
        assert len(self._blocks) > 0

        if self._block_sliding_window is not None:
            null_block = self._allocator.null_block
            if num_computed_slots is None:

Collaborator: can we have some comments on this code? (what is this branch for?)

Contributor Author (@mmoskal, May 10, 2024): added test

                num_computed_slots = self._num_full_slots
            end_idx = (num_computed_slots //
                       self._block_size) - self._block_sliding_window
            for idx in range(0, end_idx):
                b = self._blocks[idx]
                if b is not null_block:
                    self._allocator.free(b)
                    self._blocks[idx] = null_block

        # Ensure there are enough empty slots for the new tokens plus
        # lookahead slots
        self.ensure_num_empty_slots(num_empty_slots=len(token_ids) +
                                    num_lookahead_slots)

        # Update the blocks with the new tokens
        blocks = self._blocks[self._num_full_slots // self._block_size:]
        token_blocks = self._chunk_token_blocks_for_append(token_ids)

@@ -168,6 +186,7 @@ def fork(self) -> "BlockTable":
            block_size=self._block_size,
            block_allocator=self._allocator,
            _blocks=forked_blocks,
            block_sliding_window=self._block_sliding_window,
        )

    def free(self) -> None:
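
To make the freeing rule in append_token_ids above concrete, here is a small standalone sketch of the index arithmetic; the numbers are made up for illustration and are not taken from the PR:

# Blocks with index below num_computed_slots // block_size - block_sliding_window
# can no longer be touched by the window, so they are handed back to the
# allocator and replaced with the shared null block.
block_size = 16
block_sliding_window = 4096 // block_size + 2  # mirrors BlockManagerV2 (258)
num_computed_slots = 4600                      # tokens already computed (example)

end_idx = num_computed_slots // block_size - block_sliding_window
print(end_idx)  # 287 - 258 = 29 -> blocks [0, 29) become the null block
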
16 changes: 16 additions & 0 deletions vllm/core/block/cpu_gpu_block_allocator.py
@@ -105,11 +105,24 @@ def __init__(
            Device.GPU: gpu_block_allocator,
        }

        self._null_block: Optional[Block] = None

        self._block_ids_to_allocator: Dict[int, BlockAllocator] = {}
        for _, allocator in self._allocators.items():
            for block_id in allocator.all_block_ids:
                self._block_ids_to_allocator[block_id] = allocator

    @property
    def null_block(self) -> Block:
        if self._null_block is None:
            self._null_block = self.allocate_mutable(None, Device.GPU)

Collaborator: Not super familiar with the interface, but if we use allocate_immutable, isn't this supposed to guarantee the hack below?

Contributor Author: It doesn't seem to prevent modifications, at least for the NaiveBlocks - we now use a NullBlock wrapper anyway. allocate_immutable() makes the block participate in the hash tables of valid prefixes. I'm not sure we want the null block there (and even then, it's probably not very useful), so I kept allocate_mutable().

            def fail(token_ids: List[int]):
                raise ValueError("null_block should not be modified")

            self._null_block.append_token_ids = fail  # type: ignore
        return self._null_block

    def allocate_mutable(self, prev_block: Optional[Block],
                         device: Device) -> Block:
        """Allocates a new mutable block on the specified device.
@@ -149,6 +162,8 @@ def free(self, block: Block) -> None:
        Args:
            block (Block): The block to be freed.
        """
        if block is self._null_block:
            return
        block_id = block.block_id
        assert block_id is not None
        allocator = self._block_ids_to_allocator[block_id]
@@ -165,6 +180,7 @@ def fork(self, last_block: Block) -> List[Block]:
            List[Block]: A new list of blocks that shares the same memory as the
            original sequence.
        """
        assert last_block is not self._null_block
        block_id = last_block.block_id
        assert block_id is not None
        allocator = self._block_ids_to_allocator[block_id]
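
The thread above mentions that the monkey-patched append_token_ids was later replaced by a NullBlock wrapper class. A minimal sketch of that proxy idea, under the assumption that it only needs to forward reads and reject writes (the actual NullBlock added later in this PR may differ):

class NullBlock:
    # Illustrative proxy: forwards attribute reads to the wrapped block
    # and rejects any attempt to write token IDs into it.
    def __init__(self, proxied):
        self._proxied = proxied

    def append_token_ids(self, token_ids):
        raise ValueError("null_block should not be modified")

    def __getattr__(self, name):
        # Everything else (block_id, token_ids, ...) comes from the real block.
        return getattr(self._proxied, name)
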
5 changes: 5 additions & 0 deletions vllm/core/block/interfaces.py
@@ -203,3 +203,8 @@ def mark_blocks_as_computed(self, block_ids: List[int]) -> None:
    def get_common_computed_block_ids(
            self, seq_block_ids: List[List[int]]) -> List[int]:
        pass

    @property
    @abstractmethod
    def null_block(self) -> Block:
        pass
19 changes: 15 additions & 4 deletions vllm/core/block_manager_v2.py
@@ -65,9 +65,13 @@ def __init__(
        self.num_total_gpu_blocks = num_gpu_blocks
        self.num_total_cpu_blocks = num_cpu_blocks

        assert sliding_window is None, "Sliding window not yet supported"

        self.sliding_window = sliding_window
        # block_sliding_window is the max number of blocks that need to be
        # allocated.
        # We generally need up to 1 block more due to the way BlockTable works.
        self.block_sliding_window = None
        if sliding_window is not None:
            self.block_sliding_window = sliding_window // block_size + 2

        self.watermark = watermark
        assert watermark >= 0.0
@@ -82,6 +86,12 @@ def __init__(
            num_cpu_blocks=num_cpu_blocks,
            block_size=block_size,
        )
        if self.sliding_window is not None:
            # Allocate the null_block first, so it gets ID of 0.
            # CacheEngine makes sure the first block is always zeroed-out
            # so we don't get some nasty NaNs in there.
            null_block = self.block_allocator.null_block
            assert null_block.block_id == 0

        self.block_tables: Dict[SeqId, BlockTable] = {}

@@ -95,7 +105,6 @@ def can_allocate(self, seq_group: SequenceGroup) -> AllocStatus:
            block_size=self.block_size,
        )

        assert self.block_sliding_window is None
        if self.block_sliding_window is not None:
            num_required_blocks = min(num_required_blocks,
                                      self.block_sliding_window)
@@ -124,8 +133,9 @@ def allocate(self, seq_group: SequenceGroup) -> None:
        block_table = BlockTable(
            block_size=self.block_size,
            block_allocator=self.block_allocator,
            block_sliding_window=self.block_sliding_window,
        )
        assert self.block_sliding_window is None

        block_table.allocate(seq.get_token_ids())
        self.block_tables[seq.seq_id] = block_table

@@ -173,6 +183,7 @@ def append_slots(
        block_table.append_token_ids(
            token_ids=block_table.get_unseen_token_ids(seq.get_token_ids()),
            num_lookahead_slots=num_lookahead_slots,
            num_computed_slots=seq.data.get_num_computed_tokens(),
        )

        # Return any new copy-on-writes.
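
A quick numerical sketch of how the sliding-window cap in can_allocate above changes the block requirement for a long prompt; the values are chosen for illustration only:

# With a 4k window and 16-token blocks, a 10k-token prompt needs only the
# capped number of blocks rather than one block per 16 prompt tokens.
block_size = 16
sliding_window = 4096
prompt_len = 10000

num_required_blocks = -(-prompt_len // block_size)        # ceil division: 625
block_sliding_window = sliding_window // block_size + 2   # 258
print(min(num_required_blocks, block_sliding_window))     # 258
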
3 changes: 3 additions & 0 deletions vllm/worker/cache_engine.py
@@ -48,6 +48,9 @@ def __init__(
        # Initialize the cache.
        self.gpu_cache = self._allocate_kv_cache(self.num_gpu_blocks, "cuda")
        self.cpu_cache = self._allocate_kv_cache(self.num_cpu_blocks, "cpu")
        # Zero out the first block in the cache, in case it gets used as
        # 'null_block' in the CpuGpuBlockAllocator
        self.attn_backend.zero_block(self.gpu_cache, 0)

    def _allocate_kv_cache(
        self,