[Kernel] Flashinfer for prefill & decode, with Cudagraph support for decode #4628
Changes from 7 commits
First changed file (the FlashInfer attention backend):

@@ -1,10 +1,18 @@
 from dataclasses import dataclass
 from typing import Any, Dict, List, Optional, Set, Tuple, Type

-import flashinfer
+try:
+    import flashinfer
+    from flash_attn import flash_attn_varlen_func
+    from flashinfer import BatchDecodeWithPagedKVCacheWrapper
+    from flashinfer.prefill import BatchPrefillWithPagedKVCacheWrapper
+except ImportError:
+    flashinfer = None
+    flash_attn_varlen_func = None
+    BatchDecodeWithPagedKVCacheWrapper = None
+    BatchPrefillWithPagedKVCacheWrapper = None
+
 import torch
-from flashinfer import BatchDecodeWithPagedKVCacheWrapper
-from vllm_flash_attn import flash_attn_varlen_func

 from vllm import _custom_ops as ops
 from vllm.attention.backends.abstract import (AttentionBackend, AttentionImpl,
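Since flashinfer and the related symbols now degrade to None when the package is missing, a backend that is actually selected would otherwise only fail later with an opaque AttributeError. A minimal, hypothetical guard illustrating the idea (the helper name and message are not part of this PR):

```python
try:
    import flashinfer
except ImportError:
    flashinfer = None  # mirrors the optional import in the diff above


def _require_flashinfer() -> None:
    # Hypothetical helper, not part of this diff: fail fast with an
    # actionable message when the FlashInfer backend is selected but the
    # optional dependency is missing.
    if flashinfer is None:
        raise ImportError(
            "The FlashInfer backend requires the `flashinfer` package; "
            "install a wheel matching your CUDA/torch build, e.g. from "
            "https://flashinfer.ai/whl/cu121/torch2.3/.")
```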
@@ -62,11 +70,12 @@ class FlashInferMetadata(AttentionMetadata):

     use_cuda_graph: bool = False

+    prefill_wrapper: Optional[BatchPrefillWithPagedKVCacheWrapper] = None
     decode_wrapper: Optional[BatchDecodeWithPagedKVCacheWrapper] = None

-    # Metadata for the prefill stage since we still
-    # use flash attention for prefill.
+    # Metadata for the prefill stage
     seq_start_loc: Optional[torch.Tensor] = None
+    query_start_loc: Optional[torch.Tensor] = None
     block_tables: Optional[torch.Tensor] = None

     # Metadata for the decode stage
@@ -109,11 +118,15 @@ def __post_init__(self):
                 f"Only {supported_head_sizes} are supported for head_dim,",
                 f"received {self.head_dim}.")

-        # When using flashinfer, we are also creating the FlashInferMetadata,
-        # which will also call post_init by default, here we want to skip the
-        # post_init if it's the prefill phase.
-        if self.num_prefills == 0:
-            assert self.num_decode_tokens > 0
+        if self.num_prefill_tokens > 0:
+            self.prefill_wrapper = \
+                flashinfer.BatchPrefillWithPagedKVCacheWrapper(
+                    self.workspace_buffer, "NHD")
+            self.prefill_wrapper.begin_forward(
+                self.query_start_loc, self.paged_kv_indptr,
+                self.paged_kv_indices, self.paged_kv_last_page_len,
+                self.num_qo_heads, self.num_kv_heads, self.head_dim)
+        else:
             self.decode_wrapper = flashinfer.BatchDecodeWithPagedKVCacheWrapper(
                 self.workspace_buffer, "NHD")
             self.decode_wrapper.begin_forward(
@@ -133,8 +146,9 @@ def asdict_zerocopy(self,
     ) -> Dict[str, Any]:
         if skip_fields is None:
             skip_fields = set()
-        # We need to skip the decode_wrapper field since it cannot be
+        # We need to skip the prefill/decode_wrapper field since it cannot be
         # broadcasted with nccl when TP is enabled.
+        skip_fields.add('prefill_wrapper')
         skip_fields.add('decode_wrapper')
         return super().asdict_zerocopy(skip_fields)
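The skip is needed because the wrapper objects hold CUDA workspace state that cannot be serialized and broadcast over NCCL to the other tensor-parallel ranks; each rank drops them here and rebuilds its own wrappers in __post_init__. A rough standalone sketch of the skip-fields pattern (hypothetical class, not vLLM's actual implementation):

```python
from dataclasses import dataclass, fields
from typing import Any, Dict, Optional, Set


@dataclass
class _MetadataSketch:
    # Plain fields survive the broadcast to other tensor-parallel ranks...
    num_decode_tokens: int = 0
    # ...but wrapper objects holding CUDA workspace state do not, so they
    # are stripped here and recreated locally on each rank.
    decode_wrapper: Optional[object] = None

    def asdict_zerocopy(self,
                        skip_fields: Optional[Set[str]] = None
                        ) -> Dict[str, Any]:
        skip_fields = skip_fields or set()
        skip_fields.add('decode_wrapper')
        return {
            f.name: getattr(self, f.name)
            for f in fields(self)
            if f.name not in skip_fields
        }


print(_MetadataSketch(num_decode_tokens=8).asdict_zerocopy())
# -> {'num_decode_tokens': 8}
```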
@@ -168,6 +182,7 @@ def __init__(
         alibi_slopes: Optional[List[float]],
         sliding_window: Optional[int],
         kv_cache_dtype: str,
+        blocksparse_params: Optional[Dict[str, Any]] = None,
     ) -> None:
         self.num_heads = num_heads
         self.head_size = head_size
@@ -217,10 +232,14 @@ def forward(
             self.kv_cache_dtype,
         )

+        query = query.contiguous(
+        )  # Flashinfer requires query to be contiguous
         if prefill_meta := attn_metadata.prefill_metadata:
-            # Prompt run.
-            assert prefill_meta.block_tables is not None
-            if kv_cache is None or prefill_meta.block_tables.numel() == 0:
+            # We will use flash attention for prefill
+            # when kv_cache is not provided.
+            # This happens when vllm runs the profiling to
+            # determine the number of blocks.
+            if kv_cache is None:
                 output = flash_attn_varlen_func(
                     q=query,
                     k=key,

Review thread on `query = query.contiguous(`:
Comment: I think we discussed this before, but what is the overhead of this call?
Reply: For llama7b on A100, the shape of query is [256, 12, 64], and this line takes ~0.037 ms. (A timing sketch follows this file's diff.)

Review thread on the `if kv_cache is None:` fallback to flash attention:
Comment: Why is that? Can you add a comment? (Also, is it fundamental?)
Reply: This only happens during the profiling phase, where the cache is initialized (not paged). We use flash attention for the profile run.

@@ -235,13 +254,14 @@ def forward(
                     alibi_slopes=self.alibi_slopes,
                 )
             else:
-                raise NotImplementedError(
-                    "Prefix caching is not supported with flashinfer yet.")
+                assert prefill_meta is not None
+                assert prefill_meta.prefill_wrapper is not None
+                output = prefill_meta.prefill_wrapper.forward(query,
+                                                              kv_cache,
+                                                              causal=True)
         else:
             assert attn_metadata.decode_metadata is not None
             assert attn_metadata.decode_metadata.decode_wrapper is not None
-            query = query.contiguous(
-            )  # Flashinfer requires query to be contiguous
             output = attn_metadata.decode_metadata.decode_wrapper.forward(
                 query,
                 kv_cache,
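The timing sketch referenced in the .contiguous() thread above. The shape and dtype match the numbers quoted in the reply; the warm-up and iteration counts are arbitrary, and exact results will vary by GPU and driver:

```python
import torch

# Time .contiguous() on a deliberately non-contiguous [256, 12, 64] query.
q = torch.randn(12, 256, 64, device="cuda", dtype=torch.float16)
q = q.transpose(0, 1)            # view with shape [256, 12, 64], not contiguous
assert not q.is_contiguous()

start = torch.cuda.Event(enable_timing=True)
end = torch.cuda.Event(enable_timing=True)
for _ in range(10):              # warm-up
    q.contiguous()
torch.cuda.synchronize()
start.record()
for _ in range(100):
    q.contiguous()               # materializes a contiguous copy each call
end.record()
torch.cuda.synchronize()
print(f"contiguous(): {start.elapsed_time(end) / 100:.4f} ms per call")
```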
Second changed file (the model runner, `_prepare_model_input`):

@@ -369,21 +369,13 @@ def _prepare_model_input(
                     if curr_sliding_window_blocks is not None:
                         block_table = block_table[
                             -curr_sliding_window_blocks:]
-                    if self.attn_backend.get_name() == "flashinfer":
-                        paged_kv_indices.extend(block_table)
-                        paged_kv_indptr.append(paged_kv_indptr[-1] +
-                                               len(block_table))
-                        last_page_len = seq_data.get_len(
-                        ) % self.block_size
-                        if last_page_len == 0:
-                            last_page_len = self.block_size
-                        paged_kv_last_page_len.append(last_page_len)
                 else:
                     # Only happens when memory profiling runs.
                     block_table = []
            else:
                # Prefill without chunked prefill or memory profiling.
                block_table = []

            block_tables.append(block_table)

            seq_lens.append(sliding_seq_len)
@@ -460,6 +452,15 @@ def _prepare_model_input(
                 slot = block_number * self.block_size + block_offset
                 slot_mapping.append(slot)

+            if self.attn_backend.get_name() == "flashinfer":
+                paged_kv_indices.extend(block_table)
+                paged_kv_indptr.append(paged_kv_indptr[-1] +
+                                       len(block_table))
+                last_page_len = seq_len % self.block_size
+                if last_page_len == 0:
+                    last_page_len = self.block_size
+                paged_kv_last_page_len.append(last_page_len)
+
         batch_size = len(input_tokens)
         max_query_len = max(query_lens)
         max_prefill_seq_len = max(prefill_seq_lens, default=0)

Review comment on `if self.attn_backend.get_name() == "flashinfer":`: Maybe add a comment saying this is to handle an incomplete block?
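A worked example of the paged KV metadata built above, with made-up block tables and sequence lengths (block_size = 16, matching the page_size=16 passed to the metadata below). paged_kv_indptr is a CSR-style offset array into paged_kv_indices, and paged_kv_last_page_len records how many slots of each sequence's final block are occupied, which is the incomplete-block case the review comment asks to document:

```python
block_size = 16
# Assumed inputs for three sequences: their block tables and total lengths.
block_tables = [[0, 3], [7], [2, 5, 9]]
seq_lens = [20, 16, 33]

paged_kv_indices = []        # flat list of block numbers for all sequences
paged_kv_indptr = [0]        # CSR-style offsets into paged_kv_indices
paged_kv_last_page_len = []  # filled slots in each sequence's last block

for block_table, seq_len in zip(block_tables, seq_lens):
    paged_kv_indices.extend(block_table)
    paged_kv_indptr.append(paged_kv_indptr[-1] + len(block_table))
    last_page_len = seq_len % block_size
    if last_page_len == 0:
        # The last block is exactly full, not empty.
        last_page_len = block_size
    paged_kv_last_page_len.append(last_page_len)

print(paged_kv_indices)        # [0, 3, 7, 2, 5, 9]
print(paged_kv_indptr)         # [0, 2, 3, 6]
print(paged_kv_last_page_len)  # [4, 16, 1]
```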
@@ -590,6 +591,7 @@ def _prepare_model_input(
                 head_dim=self.model_config.get_head_size(),
                 page_size=16,
                 seq_start_loc=seq_start_loc,
+                query_start_loc=query_start_loc,
                 data_type=kv_cache_dtype)
         else:
             attn_metadata = self.attn_backend.make_metadata(
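query_start_loc, added to the metadata here, and seq_start_loc are both exclusive prefix sums over per-request lengths: request i's tokens occupy [start_loc[i], start_loc[i+1]) in the packed token dimension. A small sketch with made-up lengths (illustrative only, not the model runner's actual code):

```python
import torch

# Illustrative lengths for three requests (values are made up).
query_lens = torch.tensor([5, 3, 7], dtype=torch.int32)
seq_lens = torch.tensor([12, 3, 20], dtype=torch.int32)

# Exclusive prefix sums: entry i marks where request i starts in the
# packed token dimension; the final entry is the total token count.
query_start_loc = torch.zeros(len(query_lens) + 1, dtype=torch.int32)
torch.cumsum(query_lens, dim=0, dtype=torch.int32, out=query_start_loc[1:])
seq_start_loc = torch.zeros(len(seq_lens) + 1, dtype=torch.int32)
torch.cumsum(seq_lens, dim=0, dtype=torch.int32, out=seq_start_loc[1:])

print(query_start_loc)  # tensor([ 0,  5,  8, 15], dtype=torch.int32)
print(seq_start_loc)    # tensor([ 0, 12, 15, 35], dtype=torch.int32)
```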
Review thread (on a Dockerfile change not shown above):
Comment: Is this a test-specific Dockerfile, or is it provided to users?
Reply: We currently cannot add flashinfer to vLLM's requirements.txt because it doesn't support the extra index URL (https://flashinfer.ai/whl/cu121/torch2.3/). Until FlashInfer is available on PyPI, users have to install it themselves.