From e150f3cd37ddaa7ba2262efeeb14181af8433518 Mon Sep 17 00:00:00 2001 From: Artem Yatsenko Date: Sun, 4 Feb 2024 01:26:50 +0000 Subject: [PATCH 01/21] prefill works, needs decoding impl --- benchmark.sh | 6 ++ vllm/model_executor/input_metadata.py | 4 + vllm/model_executor/layers/attention.py | 118 +++++++++++++++++++----- vllm/model_executor/models/mistral.py | 7 +- vllm/worker/cache_engine.py | 68 +++----------- vllm/worker/model_runner.py | 55 ++++++++++- vllm/worker/worker.py | 3 +- 7 files changed, 174 insertions(+), 87 deletions(-) create mode 100644 benchmark.sh diff --git a/benchmark.sh b/benchmark.sh new file mode 100644 index 0000000000000..5b652f3d22f5c --- /dev/null +++ b/benchmark.sh @@ -0,0 +1,6 @@ +python3 benchmarks/benchmark_throughput.py \ + --input-len 550 \ + --output-len 150 \ + --model mistralai/Mistral-7B-v0.1 \ + --num-prompts 10 \ + --enforce-eager diff --git a/vllm/model_executor/input_metadata.py b/vllm/model_executor/input_metadata.py index f0a88ac8e27f8..5e691fc7a68e1 100644 --- a/vllm/model_executor/input_metadata.py +++ b/vllm/model_executor/input_metadata.py @@ -27,6 +27,8 @@ def __init__( block_tables: Optional[torch.Tensor], use_cuda_graph: bool, kv_cache_dtype: str, + decode_wrapper = None, + prefill_wrapper = None ) -> None: self.is_prompt = is_prompt self.prompt_lens = prompt_lens @@ -38,6 +40,8 @@ def __init__( self.block_tables = block_tables self.use_cuda_graph = use_cuda_graph self.kv_cache_dtype = kv_cache_dtype + self.prefill_wrapper = prefill_wrapper + self.decode_wrapper = decode_wrapper # Set during the execution of the first attention op. # FIXME(woosuk): This is a hack. diff --git a/vllm/model_executor/layers/attention.py b/vllm/model_executor/layers/attention.py index 2ce9d60f08d80..15590901a2a77 100644 --- a/vllm/model_executor/layers/attention.py +++ b/vllm/model_executor/layers/attention.py @@ -18,7 +18,7 @@ # Should be the same as PARTITION_SIZE in `paged_attention_v2_launcher`. _PARTITION_SIZE = 512 - +import flashinfer class PagedAttention(nn.Module): """MHA/MQA/GQA layer with PagedAttention. @@ -63,8 +63,8 @@ def forward( query: torch.Tensor, key: torch.Tensor, value: torch.Tensor, - key_cache: Optional[torch.Tensor], - value_cache: Optional[torch.Tensor], + kv_cache: Optional[torch.Tensor], + #value_cache: Optional[torch.Tensor], input_metadata: InputMetadata, ) -> torch.Tensor: """PagedAttention forward pass. @@ -81,9 +81,14 @@ def forward( Returns: shape = [batch_size, seq_len, num_heads * head_size] """ + + prefill_wrapper = input_metadata.prefill_wrapper + decode_wrapper = input_metadata.decode_wrapper + + batch_size, seq_len, hidden_size = query.shape # Reshape the query, key, and value tensors. - query = query.view(-1, self.num_heads, self.head_size) + query = query.view(-1, self.num_heads, self.head_size).contiguous() key = key.view(-1, self.num_kv_heads, self.head_size) value = value.view(-1, self.num_kv_heads, self.head_size) @@ -91,15 +96,19 @@ def forward( # If key_cache and value_cache are not provided, the new key and value # vectors will not be cached. This happens during the initial memory # profiling run. - if key_cache is not None and value_cache is not None: - cache_ops.reshape_and_cache( - key, - value, - key_cache, - value_cache, - input_metadata.slot_mapping.flatten(), - input_metadata.kv_cache_dtype, - ) + if kv_cache is not None: + #flashinfer.page. 
+ pass + + + #cache_ops.reshape_and_cache( + # key, + # value, + # key_cache, + # value_cache, + # input_metadata.slot_mapping.flatten(), + # input_metadata.kv_cache_dtype, + #) if input_metadata.is_prompt: # Prompt run. @@ -118,12 +127,12 @@ def forward( self.num_kv_heads, self.num_queries_per_kv, value.shape[-1]) - # normal attention - if (key_cache is None or value_cache is None - or input_metadata.block_tables.numel() == 0): + # old attn + if kv_cache is None: # Set attention bias if not provided. This typically happens at # the very attention layer of every iteration. # FIXME(woosuk): This is a hack. + if input_metadata.attn_bias is None: if self.alibi_slopes is None: attn_bias = BlockDiagonalCausalMask.from_seqlens( @@ -147,7 +156,56 @@ def forward( query = query.unflatten(0, (batch_size, seq_len)) key = key.unflatten(0, (batch_size, seq_len)) value = value.unflatten(0, (batch_size, seq_len)) + + out = xops.memory_efficient_attention_forward( + query, + key, + value, + attn_bias=input_metadata.attn_bias, + p=0.0, + scale=self.scale, + op=xops.fmha.MemoryEfficientAttentionFlashAttentionOp[0] if + (is_hip()) else None, + ) + output = out.view_as(query) + elif input_metadata.block_tables.numel() == 0: + # Set attention bias if not provided. This typically happens at + # the very attention layer of every iteration. + # FIXME(woosuk): This is a hack. + + if input_metadata.attn_bias is None: + if self.alibi_slopes is None: + attn_bias = BlockDiagonalCausalMask.from_seqlens( + [seq_len] * batch_size) + if self.sliding_window is not None: + attn_bias = attn_bias.make_local_attention( + self.sliding_window) + input_metadata.attn_bias = attn_bias + else: + input_metadata.attn_bias = _make_alibi_bias( + self.alibi_slopes, self.num_kv_heads, batch_size, + seq_len, query.dtype) + # TODO(woosuk): Too many view operations. Let's try to reduce + # them in the future for code readability. + #if self.alibi_slopes is None: + # query = query.unsqueeze(0) + # key = key.unsqueeze(0) + # value = value.unsqueeze(0) + #else: + # query = query.unflatten(0, (batch_size, seq_len)) + # key = key.unflatten(0, (batch_size, seq_len)) + # value = value.unflatten(0, (batch_size, seq_len)) + + #query = query.unflatten(0, (batch_size, seq_len)) + + query = query.view(5510, 32, 128).contiguous() + out = input_metadata.prefill_wrapper.forward( + query.contiguous(), + kv_cache, + causal=True + ) + exit(0) out = xops.memory_efficient_attention_forward( query, key, @@ -179,14 +237,26 @@ def forward( else: # Decoding run. - output = _paged_attention( - query, - key_cache, - value_cache, - input_metadata, - self.num_kv_heads, - self.scale, - self.alibi_slopes, + #output = _paged_attention( + # query, + # key_cache, + # value_cache, + # input_metadata, + # self.num_kv_heads, + # self.scale, + # self.alibi_slopes, + #) + + #print(query.shape) + #print(input_metadata) + #print(key_cache.shape) + + print(kv_cache.shape) + + exit(0) + + output = flashinfer.batch_decode_with_padded_kv_cache( + query, key_cache, value_cache, "NHD", "LLAMA", rope_scale=self.scale, ) # Reshape the output tensor. 
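The flashinfer path being wired in here is plan-then-run: begin_forward() on the prefill or decode wrapper digests the paged-KV metadata for the batch, forward() then runs attention against the unified kv_cache tensor, and end_forward() tears the plan down. That metadata is a CSR-style description of each sequence's pages. A minimal worked example of the four tensors the wrappers consume (qo_indptr, paged_kv_indptr, paged_kv_indices, paged_kv_last_page_len), assuming a page size of 16 and two sequences with 21 and 5 cached tokens -- the numbers are illustrative only, and how these tensors are derived from slot_mapping keeps changing over the next several commits:

    # Sequence 0 occupies pages 0-1 (16 + 5 tokens); sequence 1 occupies page 2 (5 tokens).
    paged_kv_indices       = [0, 1, 2]    # page ids of all sequences, concatenated
    paged_kv_indptr        = [0, 2, 3]    # sequence i owns indices[indptr[i]:indptr[i+1]]
    paged_kv_last_page_len = [5, 5]       # valid slots in each sequence's last page
    qo_indptr              = [0, 21, 26]  # prefill only: cumulative query-token offsets
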
diff --git a/vllm/model_executor/models/mistral.py b/vllm/model_executor/models/mistral.py index 01cde67844122..abceaa976b6d2 100644 --- a/vllm/model_executor/models/mistral.py +++ b/vllm/model_executor/models/mistral.py @@ -88,6 +88,7 @@ def __init__(self, num_kv_heads: int, max_position: int = 4096 * 32, rope_theta: float = 10000, + linear_method: Optional[LinearMethodBase] = None, sliding_window: Optional[int] = None) -> None: super().__init__() @@ -145,13 +146,13 @@ def forward( positions: torch.Tensor, hidden_states: torch.Tensor, kv_cache: KVCache, - input_metadata: InputMetadata, + input_metadata: InputMetadata ) -> torch.Tensor: qkv, _ = self.qkv_proj(hidden_states) q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1) q, k = self.rotary_emb(positions, q, k) - k_cache, v_cache = kv_cache - attn_output = self.attn(q, k, v, k_cache, v_cache, input_metadata) + #k_cache, v_cache = kv_cache + attn_output = self.attn(q, k, v, kv_cache, input_metadata) output, _ = self.o_proj(attn_output) return output diff --git a/vllm/worker/cache_engine.py b/vllm/worker/cache_engine.py index bbe33989fc2a4..9f73be3d9cb6f 100644 --- a/vllm/worker/cache_engine.py +++ b/vllm/worker/cache_engine.py @@ -2,6 +2,7 @@ from typing import Dict, List, Tuple import torch +import flashinfer from vllm._C import cache_ops from vllm.config import CacheConfig, ModelConfig, ParallelConfig @@ -48,72 +49,25 @@ def __init__( self.gpu_cache = self.allocate_gpu_cache() self.cpu_cache = self.allocate_cpu_cache() - # Initialize the stream for caching operations. - self.cache_stream = torch.cuda.Stream() - assert self.cache_stream != torch.cuda.current_stream() - # Initialize the events for stream synchronization. - self.events = [torch.cuda.Event() for _ in range(self.num_layers)] - - def get_key_block_shape(self) -> Tuple[int, int, int, int]: + def get_kv_block_shape(self) -> Tuple[int, int, int, int]: element_size = torch.tensor([], dtype=self.dtype).element_size() x = 16 // element_size - return ( - self.num_heads, - self.head_size // x, - self.block_size, - x, - ) - - def get_value_block_shape(self) -> Tuple[int, int, int]: return ( self.num_heads, self.head_size, - self.block_size, ) - + def allocate_gpu_cache(self) -> List[KVCache]: - gpu_cache: List[KVCache] = [] - key_block_shape = self.get_key_block_shape() - value_block_shape = self.get_value_block_shape() + kv_block_shape = self.get_kv_block_shape() + gpu_cache = [] for _ in range(self.num_layers): - key_blocks = torch.empty( - size=(self.num_gpu_blocks, *key_block_shape), - dtype=self.dtype, - device="cuda", - ) - value_blocks = torch.empty( - size=(self.num_gpu_blocks, *value_block_shape), - dtype=self.dtype, - device="cuda", - ) - gpu_cache.append((key_blocks, value_blocks)) + gpu_blocks = torch.empty(self.num_gpu_blocks, 2, self.block_size, *kv_block_shape, dtype=self.dtype, device="cuda") + gpu_cache.append(gpu_blocks) return gpu_cache - + def allocate_cpu_cache(self) -> List[KVCache]: - cpu_cache: List[KVCache] = [] - key_block_shape = self.get_key_block_shape() - value_block_shape = self.get_value_block_shape() - pin_memory = not in_wsl() - if not pin_memory: - # Pinning memory in WSL is not supported. - # https://docs.nvidia.com/cuda/wsl-user-guide/index.html#known-limitations-for-linux-cuda-applications - logger.warning("Using 'pin_memory=False' as WSL is detected. 
" - "This may slow down the performance.") - for _ in range(self.num_layers): - key_blocks = torch.empty( - size=(self.num_cpu_blocks, *key_block_shape), - dtype=self.dtype, - pin_memory=pin_memory, - device="cpu", - ) - value_blocks = torch.empty( - size=(self.num_cpu_blocks, *value_block_shape), - dtype=self.dtype, - pin_memory=pin_memory, - device="cpu", - ) - cpu_cache.append((key_blocks, value_blocks)) - return cpu_cache + kv_block_shape = self.get_kv_block_shape() + return torch.empty(self.num_gpu_blocks, 2, self.block_size, *kv_block_shape, dtype=self.dtype, device="cpu") def _swap( self, @@ -121,6 +75,8 @@ def _swap( dst: List[KVCache], src_to_dst: Dict[int, int], ) -> None: + #src + #src_key_cache with torch.cuda.stream(self.cache_stream): for i in range(self.num_layers): src_key_cache, src_value_cache = src[i] diff --git a/vllm/worker/model_runner.py b/vllm/worker/model_runner.py index fce0009e3097d..503fe6b77e07e 100644 --- a/vllm/worker/model_runner.py +++ b/vllm/worker/model_runner.py @@ -4,6 +4,7 @@ import numpy as np import torch import torch.nn as nn +import flashinfer from vllm.config import DeviceConfig, ModelConfig, LoRAConfig, ParallelConfig, SchedulerConfig from vllm.logger import init_logger @@ -17,7 +18,6 @@ from vllm.lora.layers import LoRAMapping from vllm.lora.request import LoRARequest from vllm.utils import in_wsl - logger = init_logger(__name__) KVCache = Tuple[torch.Tensor, torch.Tensor] @@ -75,6 +75,14 @@ def __init__( self.in_wsl = in_wsl() self.kv_cache_dtype = kv_cache_dtype + workspace_buffer = torch.empty(16 * 1024 * 1024, dtype=torch.uint8, device="cuda:0") + self.prefill_wrapper = flashinfer.BatchPrefillWithPagedKVCacheWrapper( + workspace_buffer, "NHD" + ) + self.decode_wrapper = flashinfer.BatchDecodeWithPagedKVCacheWrapper( + workspace_buffer, "NHD" + ) + def load_model(self) -> None: self.model = get_model(self.model_config, self.device_config, self.lora_config) @@ -533,11 +541,52 @@ def execute_model( self, seq_group_metadata_list: Optional[List[SequenceGroupMetadata]], kv_caches: List[Tuple[torch.Tensor, torch.Tensor]], + profile = False ) -> Optional[SamplerOutput]: (input_tokens, input_positions, input_metadata, sampling_metadata, lora_requests, lora_mapping) = self.prepare_input_tensors(seq_group_metadata_list) + num_qo_heads = 32 + num_kv_heads = 8 + + if not profile: + input_metadata.prefill_wrapper = self.prefill_wrapper + input_metadata.decode_wrapper = self.decode_wrapper + batch_size = input_tokens.shape[0] + + prefix_lens = input_metadata.prompt_lens + seq_lens = [a + b for a, b in zip(input_metadata.prompt_lens, input_metadata.context_lens)] + extend_seq_lens = input_metadata.context_lens + + qo_indptr = torch.zeros( + (batch_size + 1,), dtype=torch.int32, device="cuda" + ) + qo_indptr[1:] = torch.cumsum(input_metadata.prompt_lens, dim=0) + + paged_kv_indices = input_metadata.slot_mapping.view(-1).type(torch.int32) + paged_kv_indptr = torch.zeros( + (batch_size + 1,), dtype=torch.int32, device="cuda" + ) + paged_kv_indptr[1:] = torch.cumsum(torch.stack(seq_lens, dim=0), dim=0) + paged_kv_last_page_len = torch.ones((batch_size,), dtype=torch.int32, device="cuda") + + + input_metadata.qo_indptr = qo_indptr + input_metadata.paged_kv_indptr = paged_kv_indptr + input_metadata.paged_kv_indices = paged_kv_indices + input_metadata.paged_kv_last_page_len = paged_kv_last_page_len + + if input_metadata.is_prompt: + input_metadata.prefill_wrapper.begin_forward( + qo_indptr, + paged_kv_indptr, + paged_kv_indices, + paged_kv_last_page_len, + 
num_qo_heads, + num_kv_heads + ) + if self.lora_config: self.set_active_loras(lora_requests, lora_mapping) @@ -611,8 +660,8 @@ def profile_run(self) -> None: # Run the model with the dummy inputs. num_layers = self.model_config.get_num_layers(self.parallel_config) - kv_caches = [(None, None)] * num_layers - self.execute_model(seqs, kv_caches) + kv_caches = [None] * num_layers #torch.zeros(1, 2, )[(None, None)] * num_layers + self.execute_model(seqs, kv_caches, profile=True) torch.cuda.synchronize() return diff --git a/vllm/worker/worker.py b/vllm/worker/worker.py index c97e82a55a1ee..b9186692586d3 100644 --- a/vllm/worker/worker.py +++ b/vllm/worker/worker.py @@ -145,7 +145,7 @@ def init_cache_engine(self, cache_config: CacheConfig) -> None: self.cache_config = cache_config self.cache_engine = CacheEngine(self.cache_config, self.model_config, self.parallel_config) - self.cache_events = self.cache_engine.events + #self.cache_events = self.cache_engine.events self.gpu_cache = self.cache_engine.gpu_cache self.model_runner.set_block_size(self.cache_engine.block_size) @@ -218,6 +218,7 @@ def execute_model( output = self.model_runner.execute_model(seq_group_metadata_list, self.gpu_cache) + print("model executed") return output def add_lora(self, lora_request: LoRARequest) -> bool: From dc3e5d4fcf98835cd56b508d745ce49c40c707be Mon Sep 17 00:00:00 2001 From: Artem Yatsenko Date: Sun, 4 Feb 2024 17:19:12 +0000 Subject: [PATCH 02/21] add some decoding code --- vllm/model_executor/layers/attention.py | 50 ++++++++++++++--------- vllm/worker/cache_engine.py | 6 +-- vllm/worker/model_runner.py | 53 +++++++++++++++++++------ 3 files changed, 73 insertions(+), 36 deletions(-) diff --git a/vllm/model_executor/layers/attention.py b/vllm/model_executor/layers/attention.py index 15590901a2a77..5276646861368 100644 --- a/vllm/model_executor/layers/attention.py +++ b/vllm/model_executor/layers/attention.py @@ -98,7 +98,34 @@ def forward( # profiling run. if kv_cache is not None: #flashinfer.page. 
+ pass + """ + + print(key.shape) + print(value.shape) + + append_indptr = torch.zeros( + (batch_size + 1,), dtype=torch.int32, device="cuda" + ) + if input_metadata.is_prompt: + append_indptr[1:] = torch.cumsum(input_metadata.prompt_lens, dim=0) + else: + append_indptr[1:] = torch.arange(1, batch_size + 1) + + print(append_indptr) + + + flashinfer.page.append_paged_kv_cache( + key.contiguous(), + value.contiguous(), + append_indptr, + kv_cache, + input_metadata.paged_kv_indices, + input_metadata.paged_kv_indptr, + input_metadata.paged_kv_last_page_len + ) + """ #cache_ops.reshape_and_cache( @@ -199,24 +226,14 @@ def forward( #query = query.unflatten(0, (batch_size, seq_len)) - query = query.view(5510, 32, 128).contiguous() + query = query.view(-1, 32, 128).contiguous() out = input_metadata.prefill_wrapper.forward( query.contiguous(), kv_cache, causal=True ) - exit(0) - out = xops.memory_efficient_attention_forward( - query, - key, - value, - attn_bias=input_metadata.attn_bias, - p=0.0, - scale=self.scale, - op=xops.fmha.MemoryEfficientAttentionFlashAttentionOp[0] if - (is_hip()) else None, - ) output = out.view_as(query) + else: # prefix-enabled attention output = torch.empty_like(query) @@ -251,12 +268,9 @@ def forward( #print(input_metadata) #print(key_cache.shape) - print(kv_cache.shape) - - exit(0) - - output = flashinfer.batch_decode_with_padded_kv_cache( - query, key_cache, value_cache, "NHD", "LLAMA", rope_scale=self.scale, + output = input_metadata.decode_wrapper.forward( + query.contiguous(), + kv_cache, ) # Reshape the output tensor. diff --git a/vllm/worker/cache_engine.py b/vllm/worker/cache_engine.py index 9f73be3d9cb6f..d0cb3e2c9ec8e 100644 --- a/vllm/worker/cache_engine.py +++ b/vllm/worker/cache_engine.py @@ -47,7 +47,7 @@ def __init__( # Initialize the cache. 
self.gpu_cache = self.allocate_gpu_cache() - self.cpu_cache = self.allocate_cpu_cache() + #self.cpu_cache = self.allocate_cpu_cache() def get_kv_block_shape(self) -> Tuple[int, int, int, int]: element_size = torch.tensor([], dtype=self.dtype).element_size() @@ -65,10 +65,6 @@ def allocate_gpu_cache(self) -> List[KVCache]: gpu_cache.append(gpu_blocks) return gpu_cache - def allocate_cpu_cache(self) -> List[KVCache]: - kv_block_shape = self.get_kv_block_shape() - return torch.empty(self.num_gpu_blocks, 2, self.block_size, *kv_block_shape, dtype=self.dtype, device="cpu") - def _swap( self, src: List[KVCache], diff --git a/vllm/worker/model_runner.py b/vllm/worker/model_runner.py index 503fe6b77e07e..405e26b88b56b 100644 --- a/vllm/worker/model_runner.py +++ b/vllm/worker/model_runner.py @@ -556,28 +556,44 @@ def execute_model( batch_size = input_tokens.shape[0] prefix_lens = input_metadata.prompt_lens - seq_lens = [a + b for a, b in zip(input_metadata.prompt_lens, input_metadata.context_lens)] + + if input_metadata.is_prompt: + seq_lens = torch.stack([a + b for a, b in zip(input_metadata.prompt_lens, input_metadata.context_lens)], dim=0) + else: + seq_lens = input_metadata.context_lens + print(input_metadata.context_lens) extend_seq_lens = input_metadata.context_lens - - qo_indptr = torch.zeros( - (batch_size + 1,), dtype=torch.int32, device="cuda" - ) - qo_indptr[1:] = torch.cumsum(input_metadata.prompt_lens, dim=0) - paged_kv_indices = input_metadata.slot_mapping.view(-1).type(torch.int32) + + if input_metadata.is_prompt: + qo_indptr = torch.zeros( + (batch_size + 1,), dtype=torch.int32, device="cuda" + ) + + qo_indptr[1:] = torch.cumsum(input_metadata.prompt_lens, dim=0) + + input_metadata.qo_indptr = qo_indptr + + kvi = input_metadata.slot_mapping.view(-1).type(torch.int32).to("cuda:0") + paged_kv_indices = kvi // 16 + if not input_metadata.is_prompt: + paged_kv_indices += 1 paged_kv_indptr = torch.zeros( - (batch_size + 1,), dtype=torch.int32, device="cuda" - ) - paged_kv_indptr[1:] = torch.cumsum(torch.stack(seq_lens, dim=0), dim=0) - paged_kv_last_page_len = torch.ones((batch_size,), dtype=torch.int32, device="cuda") + (batch_size + 1,), dtype=torch.int32, device="cuda:0" + ) + paged_kv_indptr[1:] = torch.cumsum(seq_lens, dim=0) + #paged_kv_last_page_len = torch.ones((batch_size,), dtype=torch.int32, device="cuda:0") + paged_kv_last_page_len = kvi[paged_kv_indptr[1:] - 1] % 16 - input_metadata.qo_indptr = qo_indptr input_metadata.paged_kv_indptr = paged_kv_indptr input_metadata.paged_kv_indices = paged_kv_indices input_metadata.paged_kv_last_page_len = paged_kv_last_page_len if input_metadata.is_prompt: + + #print(paged_kv_indices) + input_metadata.prefill_wrapper.begin_forward( qo_indptr, paged_kv_indptr, @@ -585,7 +601,18 @@ def execute_model( paged_kv_last_page_len, num_qo_heads, num_kv_heads - ) + ) + else: + input_metadata.decode_wrapper.begin_forward( + paged_kv_indptr, + paged_kv_indices, + paged_kv_last_page_len, + num_qo_heads, + num_kv_heads, + 128, + 16, + "LLAMA" + ) if self.lora_config: self.set_active_loras(lora_requests, lora_mapping) From 55191f52afb6b885d89175f8bf3aa290c6fea1af Mon Sep 17 00:00:00 2001 From: Artem Yatsenko Date: Sun, 4 Feb 2024 18:59:26 +0000 Subject: [PATCH 03/21] working decoding --- benchmark.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/benchmark.sh b/benchmark.sh index 5b652f3d22f5c..64f7ee8b12d4f 100644 --- a/benchmark.sh +++ b/benchmark.sh @@ -2,5 +2,5 @@ python3 benchmarks/benchmark_throughput.py \ --input-len 550 \ 
--output-len 150 \ --model mistralai/Mistral-7B-v0.1 \ - --num-prompts 10 \ + --num-prompts 100 \ --enforce-eager From 5b5fff0983a97eb1e76e331be97f6f2b26f005af Mon Sep 17 00:00:00 2001 From: Artem Yatsenko Date: Sun, 4 Feb 2024 23:07:33 +0000 Subject: [PATCH 04/21] fix index, remove some ops --- vllm/model_executor/layers/attention.py | 20 ++++++++++++------ vllm/worker/model_runner.py | 28 +++++++++++++++++-------- 2 files changed, 33 insertions(+), 15 deletions(-) diff --git a/vllm/model_executor/layers/attention.py b/vllm/model_executor/layers/attention.py index 5276646861368..b9e2a2c712762 100644 --- a/vllm/model_executor/layers/attention.py +++ b/vllm/model_executor/layers/attention.py @@ -199,6 +199,8 @@ def forward( # Set attention bias if not provided. This typically happens at # the very attention layer of every iteration. # FIXME(woosuk): This is a hack. + + """ if input_metadata.attn_bias is None: if self.alibi_slopes is None: @@ -212,6 +214,7 @@ def forward( input_metadata.attn_bias = _make_alibi_bias( self.alibi_slopes, self.num_kv_heads, batch_size, seq_len, query.dtype) + """ # TODO(woosuk): Too many view operations. Let's try to reduce # them in the future for code readability. @@ -226,13 +229,18 @@ def forward( #query = query.unflatten(0, (batch_size, seq_len)) - query = query.view(-1, 32, 128).contiguous() - out = input_metadata.prefill_wrapper.forward( - query.contiguous(), + query = query.view(-1, 32, 128) + output = input_metadata.prefill_wrapper.forward( + query, kv_cache, - causal=True + causal=True, + allow_fp16_qk_reduction=True ) - output = out.view_as(query) + + #print(query.shape) + #print(out.shape) + #exit(0) + #output = out.view_as(query) else: # prefix-enabled attention @@ -269,7 +277,7 @@ def forward( #print(key_cache.shape) output = input_metadata.decode_wrapper.forward( - query.contiguous(), + query, kv_cache, ) diff --git a/vllm/worker/model_runner.py b/vllm/worker/model_runner.py index 405e26b88b56b..c40d955e5f778 100644 --- a/vllm/worker/model_runner.py +++ b/vllm/worker/model_runner.py @@ -549,6 +549,9 @@ def execute_model( num_qo_heads = 32 num_kv_heads = 8 + if not profile and input_metadata.is_prompt: + if input_metadata.decode_wrapper: + input_metadata.decode_wrapper.end_forward() if not profile: input_metadata.prefill_wrapper = self.prefill_wrapper @@ -558,13 +561,12 @@ def execute_model( prefix_lens = input_metadata.prompt_lens if input_metadata.is_prompt: - seq_lens = torch.stack([a + b for a, b in zip(input_metadata.prompt_lens, input_metadata.context_lens)], dim=0) + seq_lens = input_metadata.prompt_lens + #seq_lens = torch.stack([a + b for a, b in zip(input_metadata.prompt_lens, input_metadata.context_lens)], dim=0) else: seq_lens = input_metadata.context_lens - print(input_metadata.context_lens) extend_seq_lens = input_metadata.context_lens - if input_metadata.is_prompt: qo_indptr = torch.zeros( (batch_size + 1,), dtype=torch.int32, device="cuda" @@ -575,25 +577,28 @@ def execute_model( input_metadata.qo_indptr = qo_indptr kvi = input_metadata.slot_mapping.view(-1).type(torch.int32).to("cuda:0") - paged_kv_indices = kvi // 16 + paged_kv_indices = torch.div(kvi, 16, rounding_mode="floor") if not input_metadata.is_prompt: paged_kv_indices += 1 paged_kv_indptr = torch.zeros( (batch_size + 1,), dtype=torch.int32, device="cuda:0" ) - paged_kv_indptr[1:] = torch.cumsum(seq_lens, dim=0) + if input_metadata.is_prompt: + paged_kv_indptr[1:] = torch.cumsum(seq_lens, dim=0) + else: + paged_kv_indptr[1:] = torch.arange(1, batch_size + 1) 
#paged_kv_last_page_len = torch.ones((batch_size,), dtype=torch.int32, device="cuda:0") - paged_kv_last_page_len = kvi[paged_kv_indptr[1:] - 1] % 16 + if input_metadata.is_prompt: + paged_kv_last_page_len = kvi[paged_kv_indptr[1:] - 1] % 16 + else: + paged_kv_last_page_len = kvi % 16 input_metadata.paged_kv_indptr = paged_kv_indptr input_metadata.paged_kv_indices = paged_kv_indices input_metadata.paged_kv_last_page_len = paged_kv_last_page_len if input_metadata.is_prompt: - - #print(paged_kv_indices) - input_metadata.prefill_wrapper.begin_forward( qo_indptr, paged_kv_indptr, @@ -635,6 +640,11 @@ def execute_model( hidden_states=hidden_states, sampling_metadata=sampling_metadata, ) + + if not profile: + if input_metadata.is_prompt: + input_metadata.prefill_wrapper.end_forward() + return output @torch.inference_mode() From 34f1743a7c15999526d38532ba4aac42b3014533 Mon Sep 17 00:00:00 2001 From: Artem Yatsenko Date: Sun, 4 Feb 2024 23:32:02 +0000 Subject: [PATCH 05/21] 1.5x throughput 1000in/out. TODO fix cache update --- benchmark.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/benchmark.sh b/benchmark.sh index 64f7ee8b12d4f..ede1fbe750216 100644 --- a/benchmark.sh +++ b/benchmark.sh @@ -1,6 +1,6 @@ python3 benchmarks/benchmark_throughput.py \ - --input-len 550 \ - --output-len 150 \ + --input-len 1000 \ + --output-len 1000 \ --model mistralai/Mistral-7B-v0.1 \ --num-prompts 100 \ --enforce-eager From a548101846615d44fafaace08b768cbecf9d3c03 Mon Sep 17 00:00:00 2001 From: Artem Yatsenko Date: Mon, 5 Feb 2024 08:27:05 +0000 Subject: [PATCH 06/21] tweak kernel --- csrc/cache_kernels.cu | 54 ++++++++++++++++--------- vllm/model_executor/layers/attention.py | 29 ++++++------- 2 files changed, 49 insertions(+), 34 deletions(-) diff --git a/csrc/cache_kernels.cu b/csrc/cache_kernels.cu index ceb7347d94670..4f404810bfafb 100644 --- a/csrc/cache_kernels.cu +++ b/csrc/cache_kernels.cu @@ -155,15 +155,16 @@ template __global__ void reshape_and_cache_kernel( const scalar_t* __restrict__ key, // [num_tokens, num_heads, head_size] const scalar_t* __restrict__ value, // [num_tokens, num_heads, head_size] - cache_t* __restrict__ key_cache, // [num_blocks, num_heads, head_size/x, block_size, x] - cache_t* __restrict__ value_cache, // [num_blocks, num_heads, head_size, block_size] + //cache_t* __restrict__ key_cache, // [num_blocks, num_heads, head_size/x, block_size, x] + //cache_t* __restrict__ value_cache, // [num_blocks, num_heads, head_size, block_size] + cacht_t* __restrict__ kv_cache, // [num_blocks, 2, block_size, num_heads, head_size] const int64_t* __restrict__ slot_mapping, // [num_tokens] const int key_stride, const int value_stride, const int num_heads, const int head_size, const int block_size, - const int x) { + ) { const int64_t token_idx = blockIdx.x; const int64_t slot_idx = slot_mapping[token_idx]; if (slot_idx < 0) { @@ -176,14 +177,16 @@ __global__ void reshape_and_cache_kernel( const int n = num_heads * head_size; for (int i = threadIdx.x; i < n; i += blockDim.x) { + //ok const int64_t src_key_idx = token_idx * key_stride + i; const int64_t src_value_idx = token_idx * value_stride + i; + //ok const int head_idx = i / head_size; const int head_offset = i % head_size; - const int x_idx = head_offset / x; - const int x_offset = head_offset % x; - + //const int x_idx = head_offset / x; + //const int x_offset = head_offset % x; + /* const int64_t tgt_key_idx = block_idx * num_heads * (head_size / x) * block_size * x + head_idx * (head_size / x) * block_size * x + 
x_idx * block_size * x @@ -193,18 +196,33 @@ __global__ void reshape_and_cache_kernel( + head_idx * head_size * block_size + head_offset * block_size + block_offset; + [num_blocks, 2, block_size, num_heads, head_size] + [num_blocks, num_heads, head_size, block_size] + */ + const int64_t tgt_key_idx = block_idx * 1 * block_size * num_heads * head_size + + 1 * block_size * num_heads * head_size + + + block_offset * num_heads * head_size + + head_idx * head_size + + head_offset; + + const int64_t tgt_key_idx = block_idx * 2 * block_size * num_heads * head_size + + 2 * block_size * num_heads * head_size + + + block_offset * num_heads * head_size + + head_idx * head_size + + head_offset; + scalar_t tgt_key = key[src_key_idx]; scalar_t tgt_value = value[src_value_idx]; if constexpr (is_fp8_e5m2_kv_cache) { #ifdef ENABLE_FP8_E5M2 - key_cache[tgt_key_idx] = fp8_e5m2_unscaled::vec_conversion(tgt_key); - value_cache[tgt_value_idx] = fp8_e5m2_unscaled::vec_conversion(tgt_value); + kv_cache[tgt_key_idx] = fp8_e5m2_unscaled::vec_conversion(tgt_key); + kv_cache[tgt_value_idx] = fp8_e5m2_unscaled::vec_conversion(tgt_value); #else assert(false); #endif } else { - key_cache[tgt_key_idx] = tgt_key; - value_cache[tgt_value_idx] = tgt_value; + kv_cache[tgt_key_idx] = tgt_key; + kv_cache[tgt_value_idx] = tgt_value; } } } @@ -215,29 +233,29 @@ __global__ void reshape_and_cache_kernel( vllm::reshape_and_cache_kernel<<>>( \ reinterpret_cast(key.data_ptr()), \ reinterpret_cast(value.data_ptr()), \ - reinterpret_cast(key_cache.data_ptr()), \ - reinterpret_cast(value_cache.data_ptr()), \ + reinterpret_cast(kv_cache.data_ptr()), \ slot_mapping.data_ptr(), \ key_stride, \ value_stride, \ num_heads, \ head_size, \ - block_size, \ - x); + block_size, + ); void reshape_and_cache( torch::Tensor& key, // [num_tokens, num_heads, head_size] torch::Tensor& value, // [num_tokens, num_heads, head_size] - torch::Tensor& key_cache, // [num_blocks, num_heads, head_size/x, block_size, x] - torch::Tensor& value_cache, // [num_blocks, num_heads, head_size, block_size] + //torch::Tensor& key_cache, // [num_blocks, num_heads, head_size/x, block_size, x] + //torch::Tensor& value_cache, // [num_blocks, num_heads, head_size, block_size] + torch::Tensor& kv_cache, // [num_blocks, 2, block_size, num_heads, head_size] torch::Tensor& slot_mapping, // [num_tokens] const std::string& kv_cache_dtype) { int num_tokens = key.size(0); int num_heads = key.size(1); int head_size = key.size(2); - int block_size = key_cache.size(3); - int x = key_cache.size(4); + int block_size = kv_cache.size(3); + //int x = kv_cache.size(4); int key_stride = key.stride(0); int value_stride = value.stride(0); diff --git a/vllm/model_executor/layers/attention.py b/vllm/model_executor/layers/attention.py index b9e2a2c712762..89ad3ecdd0f96 100644 --- a/vllm/model_executor/layers/attention.py +++ b/vllm/model_executor/layers/attention.py @@ -98,20 +98,15 @@ def forward( # profiling run. if kv_cache is not None: #flashinfer.page. 
- - pass """ - - print(key.shape) - print(value.shape) - append_indptr = torch.zeros( (batch_size + 1,), dtype=torch.int32, device="cuda" ) if input_metadata.is_prompt: append_indptr[1:] = torch.cumsum(input_metadata.prompt_lens, dim=0) else: - append_indptr[1:] = torch.arange(1, batch_size + 1) + append_indptr = torch.arange(batch_size + 1, dtype=torch.int32).to("cuda:0") + #append_indptr[1:] = torch.arange(1, batch_size + 1) print(append_indptr) @@ -126,16 +121,18 @@ def forward( input_metadata.paged_kv_last_page_len ) """ + + print(kv_cache.shape) + exit(0) - - #cache_ops.reshape_and_cache( - # key, - # value, - # key_cache, - # value_cache, - # input_metadata.slot_mapping.flatten(), - # input_metadata.kv_cache_dtype, - #) + cache_ops.reshape_and_cache( + key, + value, + key_cache, + value_cache, + input_metadata.slot_mapping.flatten(), + input_metadata.kv_cache_dtype, + ) if input_metadata.is_prompt: # Prompt run. From 288717fcdb0beab55f6b3f06561966306193ee10 Mon Sep 17 00:00:00 2001 From: Artem Yatsenko Date: Mon, 5 Feb 2024 08:27:48 +0000 Subject: [PATCH 07/21] fix ind --- csrc/cache_kernels.cu | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/csrc/cache_kernels.cu b/csrc/cache_kernels.cu index 4f404810bfafb..c74c58c089fab 100644 --- a/csrc/cache_kernels.cu +++ b/csrc/cache_kernels.cu @@ -199,7 +199,7 @@ __global__ void reshape_and_cache_kernel( [num_blocks, 2, block_size, num_heads, head_size] [num_blocks, num_heads, head_size, block_size] */ - const int64_t tgt_key_idx = block_idx * 1 * block_size * num_heads * head_size + const int64_t tgt_key_idx = block_idx * 2 * block_size * num_heads * head_size + 1 * block_size * num_heads * head_size + + block_offset * num_heads * head_size + head_idx * head_size From 7c8dfaf186c882670a16f9ca1e96531868951739 Mon Sep 17 00:00:00 2001 From: Artem Yatsenko Date: Mon, 5 Feb 2024 08:46:56 +0000 Subject: [PATCH 08/21] fix kvcache? 
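reshape_and_cache now takes the single unified cache tensor, laid out as [num_blocks, 2, block_size, num_heads, head_size]. A rough Python equivalent of the per-token write the CUDA loop is aiming for (a sketch only; block_idx and block_offset come from the flat slot id as before, and the key/value plane constants are still off by one in this commit -- they are corrected later in the series in "fix kv_cache indexing"):

    def write_kv_slot(kv_cache, key, value, token_idx, slot_idx, block_size):
        # kv_cache: [num_blocks, 2, block_size, num_heads, head_size]
        # key, value: [num_tokens, num_heads, head_size]; page size equals block_size.
        block_idx = slot_idx // block_size
        block_offset = slot_idx % block_size
        kv_cache[block_idx, 0, block_offset] = key[token_idx]    # plane 0: keys
        kv_cache[block_idx, 1, block_offset] = value[token_idx]  # plane 1: values
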
--- csrc/cache.h | 3 +-- csrc/cache_kernels.cu | 9 ++++----- vllm/model_executor/layers/attention.py | 6 +----- 3 files changed, 6 insertions(+), 12 deletions(-) diff --git a/csrc/cache.h b/csrc/cache.h index 21c71830f7942..31a01161477ce 100644 --- a/csrc/cache.h +++ b/csrc/cache.h @@ -18,8 +18,7 @@ void copy_blocks( void reshape_and_cache( torch::Tensor& key, torch::Tensor& value, - torch::Tensor& key_cache, - torch::Tensor& value_cache, + torch::Tensor& kv_cache, torch::Tensor& slot_mapping, const std::string& kv_cache_dtype); diff --git a/csrc/cache_kernels.cu b/csrc/cache_kernels.cu index c74c58c089fab..16780a058e9d1 100644 --- a/csrc/cache_kernels.cu +++ b/csrc/cache_kernels.cu @@ -157,13 +157,13 @@ __global__ void reshape_and_cache_kernel( const scalar_t* __restrict__ value, // [num_tokens, num_heads, head_size] //cache_t* __restrict__ key_cache, // [num_blocks, num_heads, head_size/x, block_size, x] //cache_t* __restrict__ value_cache, // [num_blocks, num_heads, head_size, block_size] - cacht_t* __restrict__ kv_cache, // [num_blocks, 2, block_size, num_heads, head_size] + cache_t* __restrict__ kv_cache, // [num_blocks, 2, block_size, num_heads, head_size] const int64_t* __restrict__ slot_mapping, // [num_tokens] const int key_stride, const int value_stride, const int num_heads, const int head_size, - const int block_size, + const int block_size ) { const int64_t token_idx = blockIdx.x; const int64_t slot_idx = slot_mapping[token_idx]; @@ -205,7 +205,7 @@ __global__ void reshape_and_cache_kernel( + head_idx * head_size + head_offset; - const int64_t tgt_key_idx = block_idx * 2 * block_size * num_heads * head_size + const int64_t tgt_value_idx = block_idx * 2 * block_size * num_heads * head_size + 2 * block_size * num_heads * head_size + + block_offset * num_heads * head_size + head_idx * head_size @@ -239,8 +239,7 @@ __global__ void reshape_and_cache_kernel( value_stride, \ num_heads, \ head_size, \ - block_size, - ); + block_size); void reshape_and_cache( torch::Tensor& key, // [num_tokens, num_heads, head_size] diff --git a/vllm/model_executor/layers/attention.py b/vllm/model_executor/layers/attention.py index 89ad3ecdd0f96..e4aa69fd8ef5b 100644 --- a/vllm/model_executor/layers/attention.py +++ b/vllm/model_executor/layers/attention.py @@ -122,14 +122,10 @@ def forward( ) """ - print(kv_cache.shape) - exit(0) - cache_ops.reshape_and_cache( key, value, - key_cache, - value_cache, + kv_cache, input_metadata.slot_mapping.flatten(), input_metadata.kv_cache_dtype, ) From a6bb43a17eff315c0d10a2f026eb357ca15e9404 Mon Sep 17 00:00:00 2001 From: Artem Yatsenko Date: Mon, 5 Feb 2024 09:14:09 +0000 Subject: [PATCH 09/21] fix gqa shape --- benchmark.sh | 2 +- generate.py | 22 ++++++++++++++++++ vllm/entrypoints/llm.py | 2 ++ vllm/model_executor/layers/attention.py | 31 +++++++++++++------------ 4 files changed, 41 insertions(+), 16 deletions(-) create mode 100644 generate.py diff --git a/benchmark.sh b/benchmark.sh index ede1fbe750216..863e0e5e9fede 100644 --- a/benchmark.sh +++ b/benchmark.sh @@ -2,5 +2,5 @@ python3 benchmarks/benchmark_throughput.py \ --input-len 1000 \ --output-len 1000 \ --model mistralai/Mistral-7B-v0.1 \ - --num-prompts 100 \ + --num-prompts 10 \ --enforce-eager diff --git a/generate.py b/generate.py new file mode 100644 index 0000000000000..c6409f4d2461f --- /dev/null +++ b/generate.py @@ -0,0 +1,22 @@ +from vllm import LLM, SamplingParams +import torch + +prompts = [ + "Hello, my name is", + "The president of the United States is", + "The capital of France is", + "The 
future of AI is", + ] +sampling_params = SamplingParams(temperature=0.8, top_p=0.95) + + +llm = LLM(model="mistralai/Mistral-7B-v0.1", enforce_eager=True, dtype=torch.float16) + +outputs = llm.generate(prompts, sampling_params) + +# Print the outputs. +for output in outputs: + prompt = output.prompt + generated_text = output.outputs[0].text + print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") + diff --git a/vllm/entrypoints/llm.py b/vllm/entrypoints/llm.py index 614e6fa520c8c..61a9e99b4606c 100644 --- a/vllm/entrypoints/llm.py +++ b/vllm/entrypoints/llm.py @@ -216,5 +216,7 @@ def _run_engine(self, use_tqdm: bool) -> List[RequestOutput]: # Sort the outputs by request ID. # This is necessary because some requests may be finished earlier than # its previous requests. + + print(outputs) outputs = sorted(outputs, key=lambda x: int(x.request_id)) return outputs diff --git a/vllm/model_executor/layers/attention.py b/vllm/model_executor/layers/attention.py index e4aa69fd8ef5b..19f28aa5d6a42 100644 --- a/vllm/model_executor/layers/attention.py +++ b/vllm/model_executor/layers/attention.py @@ -132,23 +132,24 @@ def forward( if input_metadata.is_prompt: # Prompt run. - if self.num_kv_heads != self.num_heads: - # As of Nov 2023, xformers only supports MHA. For MQA/GQA, - # project the key and value tensors to the desired number of - # heads. - # TODO(woosuk): Use MQA/GQA kernels for higher performance. - query = query.view(query.shape[0], self.num_kv_heads, - self.num_queries_per_kv, query.shape[-1]) - key = key[:, :, - None, :].expand(key.shape[0], self.num_kv_heads, - self.num_queries_per_kv, - key.shape[-1]) - value = value[:, :, None, :].expand(value.shape[0], - self.num_kv_heads, - self.num_queries_per_kv, - value.shape[-1]) + # old attn if kv_cache is None: + if self.num_kv_heads != self.num_heads: + # As of Nov 2023, xformers only supports MHA. For MQA/GQA, + # project the key and value tensors to the desired number of + # heads. + # TODO(woosuk): Use MQA/GQA kernels for higher performance. + query = query.view(query.shape[0], self.num_kv_heads, + self.num_queries_per_kv, query.shape[-1]) + key = key[:, :, + None, :].expand(key.shape[0], self.num_kv_heads, + self.num_queries_per_kv, + key.shape[-1]) + value = value[:, :, None, :].expand(value.shape[0], + self.num_kv_heads, + self.num_queries_per_kv, + value.shape[-1]) # Set attention bias if not provided. This typically happens at # the very attention layer of every iteration. # FIXME(woosuk): This is a hack. 
From 3922712022033c131893f9ddf846e6c37bfebb1e Mon Sep 17 00:00:00 2001 From: Artem Yatsenko Date: Tue, 6 Feb 2024 03:22:49 +0000 Subject: [PATCH 10/21] rm unnecessary files, add flashinfer as requirement --- benchmark.sh | 6 ------ generate.py | 22 ---------------------- requirements.txt | 1 + 3 files changed, 1 insertion(+), 28 deletions(-) delete mode 100644 benchmark.sh delete mode 100644 generate.py diff --git a/benchmark.sh b/benchmark.sh deleted file mode 100644 index 863e0e5e9fede..0000000000000 --- a/benchmark.sh +++ /dev/null @@ -1,6 +0,0 @@ -python3 benchmarks/benchmark_throughput.py \ - --input-len 1000 \ - --output-len 1000 \ - --model mistralai/Mistral-7B-v0.1 \ - --num-prompts 10 \ - --enforce-eager diff --git a/generate.py b/generate.py deleted file mode 100644 index c6409f4d2461f..0000000000000 --- a/generate.py +++ /dev/null @@ -1,22 +0,0 @@ -from vllm import LLM, SamplingParams -import torch - -prompts = [ - "Hello, my name is", - "The president of the United States is", - "The capital of France is", - "The future of AI is", - ] -sampling_params = SamplingParams(temperature=0.8, top_p=0.95) - - -llm = LLM(model="mistralai/Mistral-7B-v0.1", enforce_eager=True, dtype=torch.float16) - -outputs = llm.generate(prompts, sampling_params) - -# Print the outputs. -for output in outputs: - prompt = output.prompt - generated_text = output.outputs[0].text - print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") - diff --git a/requirements.txt b/requirements.txt index 2bf527ccc3a77..bbe7f53593777 100644 --- a/requirements.txt +++ b/requirements.txt @@ -11,3 +11,4 @@ uvicorn[standard] pydantic >= 2.0 # Required for OpenAI server. aioprometheus[starlette] pynvml == 11.5.0 +flashinfer From 7aa25fac61839541a1d5d782b4269e87b4535b22 Mon Sep 17 00:00:00 2001 From: Artem Yatsenko Date: Tue, 6 Feb 2024 21:41:36 +0000 Subject: [PATCH 11/21] fix kv_cache indexing --- csrc/cache_kernels.cu | 6 +- vllm/model_executor/layers/attention.py | 128 ++++++++++-------------- vllm/worker/cache_engine.py | 1 - vllm/worker/model_runner.py | 48 +++++---- 4 files changed, 88 insertions(+), 95 deletions(-) diff --git a/csrc/cache_kernels.cu b/csrc/cache_kernels.cu index 16780a058e9d1..3989ccc3ad6db 100644 --- a/csrc/cache_kernels.cu +++ b/csrc/cache_kernels.cu @@ -200,13 +200,13 @@ __global__ void reshape_and_cache_kernel( [num_blocks, num_heads, head_size, block_size] */ const int64_t tgt_key_idx = block_idx * 2 * block_size * num_heads * head_size - + 1 * block_size * num_heads * head_size + + + 0 * block_size * num_heads * head_size + + block_offset * num_heads * head_size + head_idx * head_size + head_offset; const int64_t tgt_value_idx = block_idx * 2 * block_size * num_heads * head_size - + 2 * block_size * num_heads * head_size + + + 1 * block_size * num_heads * head_size + + block_offset * num_heads * head_size + head_idx * head_size + head_offset; @@ -253,7 +253,7 @@ void reshape_and_cache( int num_tokens = key.size(0); int num_heads = key.size(1); int head_size = key.size(2); - int block_size = kv_cache.size(3); + int block_size = kv_cache.size(2); //int x = kv_cache.size(4); int key_stride = key.stride(0); diff --git a/vllm/model_executor/layers/attention.py b/vllm/model_executor/layers/attention.py index 19f28aa5d6a42..92d86a83e538a 100644 --- a/vllm/model_executor/layers/attention.py +++ b/vllm/model_executor/layers/attention.py @@ -97,37 +97,12 @@ def forward( # vectors will not be cached. This happens during the initial memory # profiling run. 
if kv_cache is not None: - #flashinfer.page. - """ - append_indptr = torch.zeros( - (batch_size + 1,), dtype=torch.int32, device="cuda" - ) - if input_metadata.is_prompt: - append_indptr[1:] = torch.cumsum(input_metadata.prompt_lens, dim=0) - else: - append_indptr = torch.arange(batch_size + 1, dtype=torch.int32).to("cuda:0") - #append_indptr[1:] = torch.arange(1, batch_size + 1) - - print(append_indptr) - - - flashinfer.page.append_paged_kv_cache( - key.contiguous(), - value.contiguous(), - append_indptr, - kv_cache, - input_metadata.paged_kv_indices, - input_metadata.paged_kv_indptr, - input_metadata.paged_kv_last_page_len - ) - """ - cache_ops.reshape_and_cache( key, value, kv_cache, input_metadata.slot_mapping.flatten(), - input_metadata.kv_cache_dtype, + "auto" ) if input_metadata.is_prompt: @@ -194,34 +169,62 @@ def forward( # the very attention layer of every iteration. # FIXME(woosuk): This is a hack. - """ - - if input_metadata.attn_bias is None: + if kv_cache is not None: + + if self.num_kv_heads != self.num_heads: + # As of Nov 2023, xformers only supports MHA. For MQA/GQA, + # project the key and value tensors to the desired number of + # heads. + # TODO(woosuk): Use MQA/GQA kernels for higher performance. + query = query.view(query.shape[0], self.num_kv_heads, + self.num_queries_per_kv, query.shape[-1]) + key = key[:, :, + None, :].expand(key.shape[0], self.num_kv_heads, + self.num_queries_per_kv, + key.shape[-1]) + value = value[:, :, None, :].expand(value.shape[0], + self.num_kv_heads, + self.num_queries_per_kv, + value.shape[-1]) + + # Set attention bias if not provided. This typically happens at + # the very attention layer of every iteration. + # FIXME(woosuk): This is a hack. + if input_metadata.attn_bias is None: + if self.alibi_slopes is None: + attn_bias = BlockDiagonalCausalMask.from_seqlens( + [seq_len] * batch_size) + if self.sliding_window is not None: + attn_bias = attn_bias.make_local_attention( + self.sliding_window) + input_metadata.attn_bias = attn_bias + else: + input_metadata.attn_bias = _make_alibi_bias( + self.alibi_slopes, self.num_kv_heads, batch_size, + seq_len, query.dtype) + + # TODO(woosuk): Too many view operations. Let's try to reduce + # them in the future for code readability. if self.alibi_slopes is None: - attn_bias = BlockDiagonalCausalMask.from_seqlens( - [seq_len] * batch_size) - if self.sliding_window is not None: - attn_bias = attn_bias.make_local_attention( - self.sliding_window) - input_metadata.attn_bias = attn_bias + query = query.unsqueeze(0) + key = key.unsqueeze(0) + value = value.unsqueeze(0) else: - input_metadata.attn_bias = _make_alibi_bias( - self.alibi_slopes, self.num_kv_heads, batch_size, - seq_len, query.dtype) - """ - - # TODO(woosuk): Too many view operations. Let's try to reduce - # them in the future for code readability. 
- #if self.alibi_slopes is None: - # query = query.unsqueeze(0) - # key = key.unsqueeze(0) - # value = value.unsqueeze(0) - #else: - # query = query.unflatten(0, (batch_size, seq_len)) - # key = key.unflatten(0, (batch_size, seq_len)) - # value = value.unflatten(0, (batch_size, seq_len)) - - #query = query.unflatten(0, (batch_size, seq_len)) + query = query.unflatten(0, (batch_size, seq_len)) + key = key.unflatten(0, (batch_size, seq_len)) + value = value.unflatten(0, (batch_size, seq_len)) + + out = xops.memory_efficient_attention_forward( + query, + value, + key, + attn_bias=input_metadata.attn_bias, + p=0.0, + scale=self.scale, + op=xops.fmha.MemoryEfficientAttentionFlashAttentionOp[0] if + (is_hip()) else None, + ) + output = out.view_as(query) query = query.view(-1, 32, 128) output = input_metadata.prefill_wrapper.forward( @@ -231,11 +234,6 @@ def forward( allow_fp16_qk_reduction=True ) - #print(query.shape) - #print(out.shape) - #exit(0) - #output = out.view_as(query) - else: # prefix-enabled attention output = torch.empty_like(query) @@ -255,27 +253,11 @@ def forward( ) else: - # Decoding run. - #output = _paged_attention( - # query, - # key_cache, - # value_cache, - # input_metadata, - # self.num_kv_heads, - # self.scale, - # self.alibi_slopes, - #) - - #print(query.shape) - #print(input_metadata) - #print(key_cache.shape) - output = input_metadata.decode_wrapper.forward( query, kv_cache, ) - # Reshape the output tensor. return output.view(batch_size, seq_len, hidden_size) diff --git a/vllm/worker/cache_engine.py b/vllm/worker/cache_engine.py index d0cb3e2c9ec8e..eba17680d5b26 100644 --- a/vllm/worker/cache_engine.py +++ b/vllm/worker/cache_engine.py @@ -39,7 +39,6 @@ def __init__( self.block_size = cache_config.block_size self.num_gpu_blocks = cache_config.num_gpu_blocks self.num_cpu_blocks = cache_config.num_cpu_blocks - if cache_config.cache_dtype == "auto": self.dtype = model_config.dtype else: diff --git a/vllm/worker/model_runner.py b/vllm/worker/model_runner.py index c40d955e5f778..b98c042e212c7 100644 --- a/vllm/worker/model_runner.py +++ b/vllm/worker/model_runner.py @@ -571,33 +571,46 @@ def execute_model( qo_indptr = torch.zeros( (batch_size + 1,), dtype=torch.int32, device="cuda" ) - - qo_indptr[1:] = torch.cumsum(input_metadata.prompt_lens, dim=0) - input_metadata.qo_indptr = qo_indptr + qo_indptr[1:] = torch.cumsum(input_metadata.prompt_lens, dim=0) kvi = input_metadata.slot_mapping.view(-1).type(torch.int32).to("cuda:0") - paged_kv_indices = torch.div(kvi, 16, rounding_mode="floor") - if not input_metadata.is_prompt: - paged_kv_indices += 1 + #paged_kv_indices = torch.div(kvi, 16, rounding_mode="floor") + paged_kv_indptr = torch.zeros( (batch_size + 1,), dtype=torch.int32, device="cuda:0" ) - if input_metadata.is_prompt: - paged_kv_indptr[1:] = torch.cumsum(seq_lens, dim=0) - else: - paged_kv_indptr[1:] = torch.arange(1, batch_size + 1) + + #paged_kv_indptr[0] = slot_mapping[0][0] + #if input_metadata.is_prompt: + # paged_kv_indptr[1:] = torch.cumsum(seq_lens, dim=0) + #else: + # paged_kv_indptr[1:] = torch.arange(1, batch_size + 1) #paged_kv_last_page_len = torch.ones((batch_size,), dtype=torch.int32, device="cuda:0") if input_metadata.is_prompt: - paged_kv_last_page_len = kvi[paged_kv_indptr[1:] - 1] % 16 + paged_kv_indptr = torch.tensor([input_metadata.slot_mapping[0][0], input_metadata.slot_mapping[0][batch_size] + 16 ] , dtype=torch.int32).to("cuda:0") // 16 else: - paged_kv_last_page_len = kvi % 16 + paged_kv_indptr = 
torch.tensor([input_metadata.slot_mapping[0][0], input_metadata.slot_mapping[0][0] + 16 ] , dtype=torch.int32).to("cuda:0") // 16 + if input_metadata.is_prompt: + paged_kv_last_page_len = (kvi[qo_indptr[1:] - 1] % 16) + 1 + else: + paged_kv_last_page_len = (kvi % 16) + 1 + + paged_kv_indices = torch.arange(12572).int().to("cuda:0") input_metadata.paged_kv_indptr = paged_kv_indptr - input_metadata.paged_kv_indices = paged_kv_indices + input_metadata.paged_kv_indices = paged_kv_indices input_metadata.paged_kv_last_page_len = paged_kv_last_page_len - + + #print(qo_indptr) + #print(paged_kv_indptr) + #print(paged_kv_indices) + #print(paged_kv_last_page_len) + #exit(0) + #print(qo_indptr) + #exit(0) + if input_metadata.is_prompt: input_metadata.prefill_wrapper.begin_forward( qo_indptr, @@ -616,7 +629,6 @@ def execute_model( num_kv_heads, 128, 16, - "LLAMA" ) if self.lora_config: @@ -641,9 +653,9 @@ def execute_model( sampling_metadata=sampling_metadata, ) - if not profile: - if input_metadata.is_prompt: - input_metadata.prefill_wrapper.end_forward() + #if not profile: + # if input_metadata.is_prompt: + # input_metadata.prefill_wrapper.end_forward() return output From a176b8dc5bb9f57888e0c75835289e834091c16a Mon Sep 17 00:00:00 2001 From: Artem Yatsenko Date: Tue, 6 Feb 2024 22:12:14 +0000 Subject: [PATCH 12/21] batched kvcache indexing --- requirements.txt | 1 - vllm/entrypoints/llm.py | 2 -- vllm/worker/model_runner.py | 30 +++++++----------------------- 3 files changed, 7 insertions(+), 26 deletions(-) diff --git a/requirements.txt b/requirements.txt index bbe7f53593777..2bf527ccc3a77 100644 --- a/requirements.txt +++ b/requirements.txt @@ -11,4 +11,3 @@ uvicorn[standard] pydantic >= 2.0 # Required for OpenAI server. aioprometheus[starlette] pynvml == 11.5.0 -flashinfer diff --git a/vllm/entrypoints/llm.py b/vllm/entrypoints/llm.py index 61a9e99b4606c..614e6fa520c8c 100644 --- a/vllm/entrypoints/llm.py +++ b/vllm/entrypoints/llm.py @@ -216,7 +216,5 @@ def _run_engine(self, use_tqdm: bool) -> List[RequestOutput]: # Sort the outputs by request ID. # This is necessary because some requests may be finished earlier than # its previous requests. - - print(outputs) outputs = sorted(outputs, key=lambda x: int(x.request_id)) return outputs diff --git a/vllm/worker/model_runner.py b/vllm/worker/model_runner.py index b98c042e212c7..27ff82e1c593c 100644 --- a/vllm/worker/model_runner.py +++ b/vllm/worker/model_runner.py @@ -46,6 +46,9 @@ def __init__( self.lora_config = lora_config self.is_driver_worker = is_driver_worker + self.paged_kv_index = torch.arange(12572).int().to("cuda:0") + + # model_config can be None in tests/samplers/test_sampler.py. # FIXME(woosuk): This is a hack to make the tests work. Refactor this. 
self.sliding_window = (model_config.get_sliding_window() @@ -562,7 +565,6 @@ def execute_model( if input_metadata.is_prompt: seq_lens = input_metadata.prompt_lens - #seq_lens = torch.stack([a + b for a, b in zip(input_metadata.prompt_lens, input_metadata.context_lens)], dim=0) else: seq_lens = input_metadata.context_lens extend_seq_lens = input_metadata.context_lens @@ -575,42 +577,24 @@ def execute_model( qo_indptr[1:] = torch.cumsum(input_metadata.prompt_lens, dim=0) kvi = input_metadata.slot_mapping.view(-1).type(torch.int32).to("cuda:0") - #paged_kv_indices = torch.div(kvi, 16, rounding_mode="floor") paged_kv_indptr = torch.zeros( (batch_size + 1,), dtype=torch.int32, device="cuda:0" ) - #paged_kv_indptr[0] = slot_mapping[0][0] - #if input_metadata.is_prompt: - # paged_kv_indptr[1:] = torch.cumsum(seq_lens, dim=0) - #else: - # paged_kv_indptr[1:] = torch.arange(1, batch_size + 1) - #paged_kv_last_page_len = torch.ones((batch_size,), dtype=torch.int32, device="cuda:0") + paged_kv_indptr = input_metadata.slot_mapping[:, -1].flip(dims=[0]) // 16 + paged_kv_indptr = torch.cat([paged_kv_indptr, (paged_kv_indptr[-1] + 1).unsqueeze(0)]).int() - if input_metadata.is_prompt: - paged_kv_indptr = torch.tensor([input_metadata.slot_mapping[0][0], input_metadata.slot_mapping[0][batch_size] + 16 ] , dtype=torch.int32).to("cuda:0") // 16 - else: - paged_kv_indptr = torch.tensor([input_metadata.slot_mapping[0][0], input_metadata.slot_mapping[0][0] + 16 ] , dtype=torch.int32).to("cuda:0") // 16 if input_metadata.is_prompt: paged_kv_last_page_len = (kvi[qo_indptr[1:] - 1] % 16) + 1 else: paged_kv_last_page_len = (kvi % 16) + 1 - - paged_kv_indices = torch.arange(12572).int().to("cuda:0") + paged_kv_indices = self.paged_kv_index input_metadata.paged_kv_indptr = paged_kv_indptr - input_metadata.paged_kv_indices = paged_kv_indices + input_metadata.paged_kv_indices = paged_kv_indices input_metadata.paged_kv_last_page_len = paged_kv_last_page_len - #print(qo_indptr) - #print(paged_kv_indptr) - #print(paged_kv_indices) - #print(paged_kv_last_page_len) - #exit(0) - #print(qo_indptr) - #exit(0) - if input_metadata.is_prompt: input_metadata.prefill_wrapper.begin_forward( qo_indptr, From 037a54a2a6fb8a22929dc08a901c639bd92a74a0 Mon Sep 17 00:00:00 2001 From: Artem Yatsenko Date: Wed, 7 Feb 2024 20:40:06 +0000 Subject: [PATCH 13/21] working version. last commit before formatting + tests --- vllm/entrypoints/llm.py | 1 + vllm/model_executor/layers/attention.py | 62 ++----------------------- vllm/worker/model_runner.py | 35 +++++++++----- vllm/worker/worker.py | 1 - 4 files changed, 28 insertions(+), 71 deletions(-) diff --git a/vllm/entrypoints/llm.py b/vllm/entrypoints/llm.py index 614e6fa520c8c..9e11a334287cb 100644 --- a/vllm/entrypoints/llm.py +++ b/vllm/entrypoints/llm.py @@ -217,4 +217,5 @@ def _run_engine(self, use_tqdm: bool) -> List[RequestOutput]: # This is necessary because some requests may be finished earlier than # its previous requests. outputs = sorted(outputs, key=lambda x: int(x.request_id)) + print(outputs) return outputs diff --git a/vllm/model_executor/layers/attention.py b/vllm/model_executor/layers/attention.py index 92d86a83e538a..d4c0b98e0c622 100644 --- a/vllm/model_executor/layers/attention.py +++ b/vllm/model_executor/layers/attention.py @@ -106,8 +106,6 @@ def forward( ) if input_metadata.is_prompt: - # Prompt run. - # old attn if kv_cache is None: if self.num_kv_heads != self.num_heads: @@ -169,63 +167,6 @@ def forward( # the very attention layer of every iteration. 
# FIXME(woosuk): This is a hack. - if kv_cache is not None: - - if self.num_kv_heads != self.num_heads: - # As of Nov 2023, xformers only supports MHA. For MQA/GQA, - # project the key and value tensors to the desired number of - # heads. - # TODO(woosuk): Use MQA/GQA kernels for higher performance. - query = query.view(query.shape[0], self.num_kv_heads, - self.num_queries_per_kv, query.shape[-1]) - key = key[:, :, - None, :].expand(key.shape[0], self.num_kv_heads, - self.num_queries_per_kv, - key.shape[-1]) - value = value[:, :, None, :].expand(value.shape[0], - self.num_kv_heads, - self.num_queries_per_kv, - value.shape[-1]) - - # Set attention bias if not provided. This typically happens at - # the very attention layer of every iteration. - # FIXME(woosuk): This is a hack. - if input_metadata.attn_bias is None: - if self.alibi_slopes is None: - attn_bias = BlockDiagonalCausalMask.from_seqlens( - [seq_len] * batch_size) - if self.sliding_window is not None: - attn_bias = attn_bias.make_local_attention( - self.sliding_window) - input_metadata.attn_bias = attn_bias - else: - input_metadata.attn_bias = _make_alibi_bias( - self.alibi_slopes, self.num_kv_heads, batch_size, - seq_len, query.dtype) - - # TODO(woosuk): Too many view operations. Let's try to reduce - # them in the future for code readability. - if self.alibi_slopes is None: - query = query.unsqueeze(0) - key = key.unsqueeze(0) - value = value.unsqueeze(0) - else: - query = query.unflatten(0, (batch_size, seq_len)) - key = key.unflatten(0, (batch_size, seq_len)) - value = value.unflatten(0, (batch_size, seq_len)) - - out = xops.memory_efficient_attention_forward( - query, - value, - key, - attn_bias=input_metadata.attn_bias, - p=0.0, - scale=self.scale, - op=xops.fmha.MemoryEfficientAttentionFlashAttentionOp[0] if - (is_hip()) else None, - ) - output = out.view_as(query) - query = query.view(-1, 32, 128) output = input_metadata.prefill_wrapper.forward( query, @@ -234,6 +175,9 @@ def forward( allow_fp16_qk_reduction=True ) + #output = flashinfer.single_prefill_with_kv_cache(query, key.contiguous(), value.contiguous(), causal=True, + #allow_fp16_qk_reduction=True) + else: # prefix-enabled attention output = torch.empty_like(query) diff --git a/vllm/worker/model_runner.py b/vllm/worker/model_runner.py index 27ff82e1c593c..f1d62d563479e 100644 --- a/vllm/worker/model_runner.py +++ b/vllm/worker/model_runner.py @@ -568,33 +568,46 @@ def execute_model( else: seq_lens = input_metadata.context_lens extend_seq_lens = input_metadata.context_lens + + if input_metadata.is_prompt: + seq_lens = input_metadata.prompt_lens + #seq_lens = torch.stack([a + b for a, b in zip(input_metadata.prompt_lens, input_metadata.context_lens)], dim=0) + else: + seq_lens = input_metadata.context_lens + extend_seq_lens = input_metadata.context_lens if input_metadata.is_prompt: qo_indptr = torch.zeros( (batch_size + 1,), dtype=torch.int32, device="cuda" ) - + qo_indptr[1:] = torch.cumsum(input_metadata.prompt_lens, dim=0) - kvi = input_metadata.slot_mapping.view(-1).type(torch.int32).to("cuda:0") + input_metadata.qo_indptr = qo_indptr + kvi = input_metadata.slot_mapping.view(-1).type(torch.int32).to("cuda:0") + paged_kv_indices = torch.div(kvi, 16, rounding_mode="floor") + paged_kv_indptr = torch.zeros( (batch_size + 1,), dtype=torch.int32, device="cuda:0" ) + + #print(block_sizes_per_seq) + + if input_metadata.is_prompt: + block_sizes_per_seq = torch.tensor([len(input_metadata.slot_mapping[i].unique()) for i in range(batch_size)]) - paged_kv_indptr = 
input_metadata.slot_mapping[:, -1].flip(dims=[0]) // 16 - paged_kv_indptr = torch.cat([paged_kv_indptr, (paged_kv_indptr[-1] + 1).unsqueeze(0)]).int() + paged_kv_indptr[1:] = torch.cumsum(block_sizes_per_seq, dim=0) + + else: + paged_kv_indptr[1:] = torch.arange(1, batch_size + 1) if input_metadata.is_prompt: - paged_kv_last_page_len = (kvi[qo_indptr[1:] - 1] % 16) + 1 + paged_kv_last_page_len = (kvi[paged_kv_indptr[1:] - 1] % 16) + 1 else: paged_kv_last_page_len = (kvi % 16) + 1 - paged_kv_indices = self.paged_kv_index - - input_metadata.paged_kv_indptr = paged_kv_indptr - input_metadata.paged_kv_indices = paged_kv_indices - input_metadata.paged_kv_last_page_len = paged_kv_last_page_len - + + if input_metadata.is_prompt: input_metadata.prefill_wrapper.begin_forward( qo_indptr, diff --git a/vllm/worker/worker.py b/vllm/worker/worker.py index b9186692586d3..410bb47b020e1 100644 --- a/vllm/worker/worker.py +++ b/vllm/worker/worker.py @@ -218,7 +218,6 @@ def execute_model( output = self.model_runner.execute_model(seq_group_metadata_list, self.gpu_cache) - print("model executed") return output def add_lora(self, lora_request: LoRARequest) -> bool: From 8368d0ba6a9cd54e2e34fc1b0d908bf346d6b66c Mon Sep 17 00:00:00 2001 From: Artem Yatsenko Date: Wed, 7 Feb 2024 21:08:09 +0000 Subject: [PATCH 14/21] formatting and compatibility pt 1 --- vllm/entrypoints/llm.py | 1 - vllm/model_executor/layers/attention.py | 48 +++++------ vllm/worker/cache_engine.py | 29 +++++-- vllm/worker/model_runner.py | 108 +++++++++++------------- 4 files changed, 89 insertions(+), 97 deletions(-) diff --git a/vllm/entrypoints/llm.py b/vllm/entrypoints/llm.py index 9e11a334287cb..614e6fa520c8c 100644 --- a/vllm/entrypoints/llm.py +++ b/vllm/entrypoints/llm.py @@ -217,5 +217,4 @@ def _run_engine(self, use_tqdm: bool) -> List[RequestOutput]: # This is necessary because some requests may be finished earlier than # its previous requests. outputs = sorted(outputs, key=lambda x: int(x.request_id)) - print(outputs) return outputs diff --git a/vllm/model_executor/layers/attention.py b/vllm/model_executor/layers/attention.py index d4c0b98e0c622..1c4dae6829344 100644 --- a/vllm/model_executor/layers/attention.py +++ b/vllm/model_executor/layers/attention.py @@ -18,7 +18,7 @@ # Should be the same as PARTITION_SIZE in `paged_attention_v2_launcher`. _PARTITION_SIZE = 512 -import flashinfer + class PagedAttention(nn.Module): """MHA/MQA/GQA layer with PagedAttention. @@ -82,10 +82,6 @@ def forward( shape = [batch_size, seq_len, num_heads * head_size] """ - prefill_wrapper = input_metadata.prefill_wrapper - decode_wrapper = input_metadata.decode_wrapper - - batch_size, seq_len, hidden_size = query.shape # Reshape the query, key, and value tensors. query = query.view(-1, self.num_heads, self.head_size).contiguous() @@ -97,13 +93,9 @@ def forward( # vectors will not be cached. This happens during the initial memory # profiling run. if kv_cache is not None: - cache_ops.reshape_and_cache( - key, - value, - kv_cache, - input_metadata.slot_mapping.flatten(), - "auto" - ) + cache_ops.reshape_and_cache(key, value, kv_cache, + input_metadata.slot_mapping.flatten(), + "auto") if input_metadata.is_prompt: # old attn @@ -114,19 +106,21 @@ def forward( # heads. # TODO(woosuk): Use MQA/GQA kernels for higher performance. 
query = query.view(query.shape[0], self.num_kv_heads, - self.num_queries_per_kv, query.shape[-1]) + self.num_queries_per_kv, + query.shape[-1]) key = key[:, :, - None, :].expand(key.shape[0], self.num_kv_heads, - self.num_queries_per_kv, - key.shape[-1]) - value = value[:, :, None, :].expand(value.shape[0], - self.num_kv_heads, - self.num_queries_per_kv, - value.shape[-1]) + None, :].expand(key.shape[0], self.num_kv_heads, + self.num_queries_per_kv, + key.shape[-1]) + value = value[:, :, + None, :].expand(value.shape[0], + self.num_kv_heads, + self.num_queries_per_kv, + value.shape[-1]) # Set attention bias if not provided. This typically happens at # the very attention layer of every iteration. # FIXME(woosuk): This is a hack. - + if input_metadata.attn_bias is None: if self.alibi_slopes is None: attn_bias = BlockDiagonalCausalMask.from_seqlens( @@ -150,7 +144,7 @@ def forward( query = query.unflatten(0, (batch_size, seq_len)) key = key.unflatten(0, (batch_size, seq_len)) value = value.unflatten(0, (batch_size, seq_len)) - + out = xops.memory_efficient_attention_forward( query, key, @@ -169,11 +163,7 @@ def forward( query = query.view(-1, 32, 128) output = input_metadata.prefill_wrapper.forward( - query, - kv_cache, - causal=True, - allow_fp16_qk_reduction=True - ) + query, kv_cache, causal=True, allow_fp16_qk_reduction=True) #output = flashinfer.single_prefill_with_kv_cache(query, key.contiguous(), value.contiguous(), causal=True, #allow_fp16_qk_reduction=True) @@ -186,8 +176,8 @@ def forward( key, value, output, - key_cache, - value_cache, + #key_cache, + #value_cache, input_metadata.block_tables, # [BS, max_block_per_request] input_metadata.start_loc, input_metadata.prompt_lens, diff --git a/vllm/worker/cache_engine.py b/vllm/worker/cache_engine.py index eba17680d5b26..14f45bc32a34b 100644 --- a/vllm/worker/cache_engine.py +++ b/vllm/worker/cache_engine.py @@ -2,7 +2,6 @@ from typing import Dict, List, Tuple import torch -import flashinfer from vllm._C import cache_ops from vllm.config import CacheConfig, ModelConfig, ParallelConfig @@ -46,24 +45,40 @@ def __init__( # Initialize the cache. 
self.gpu_cache = self.allocate_gpu_cache() - #self.cpu_cache = self.allocate_cpu_cache() + self.cpu_cache = self.allocate_cpu_cache() def get_kv_block_shape(self) -> Tuple[int, int, int, int]: - element_size = torch.tensor([], dtype=self.dtype).element_size() - x = 16 // element_size return ( self.num_heads, self.head_size, ) - + def allocate_gpu_cache(self) -> List[KVCache]: kv_block_shape = self.get_kv_block_shape() gpu_cache = [] for _ in range(self.num_layers): - gpu_blocks = torch.empty(self.num_gpu_blocks, 2, self.block_size, *kv_block_shape, dtype=self.dtype, device="cuda") + gpu_blocks = torch.empty(self.num_gpu_blocks, + 2, + self.block_size, + *kv_block_shape, + dtype=self.dtype, + device="cuda") gpu_cache.append(gpu_blocks) return gpu_cache - + + def allocate_cpu_cache(self) -> List[KVCache]: + kv_block_shape = self.get_kv_block_shape() + cpu_cache = [] + for _ in range(self.num_layers): + cpu_blocks = torch.empty(self.num_gpu_blocks, + 2, + self.block_size, + *kv_block_shape, + dtype=self.dtype, + device="cpu") + cpu_cache.append(cpu_blocks) + return cpu_cache + def _swap( self, src: List[KVCache], diff --git a/vllm/worker/model_runner.py b/vllm/worker/model_runner.py index f1d62d563479e..515f7df739556 100644 --- a/vllm/worker/model_runner.py +++ b/vllm/worker/model_runner.py @@ -18,6 +18,7 @@ from vllm.lora.layers import LoRAMapping from vllm.lora.request import LoRARequest from vllm.utils import in_wsl + logger = init_logger(__name__) KVCache = Tuple[torch.Tensor, torch.Tensor] @@ -45,10 +46,7 @@ def __init__( self.scheduler_config = scheduler_config self.lora_config = lora_config self.is_driver_worker = is_driver_worker - - self.paged_kv_index = torch.arange(12572).int().to("cuda:0") - - + # model_config can be None in tests/samplers/test_sampler.py. # FIXME(woosuk): This is a hack to make the tests work. Refactor this. 
self.sliding_window = (model_config.get_sliding_window() @@ -78,13 +76,13 @@ def __init__( self.in_wsl = in_wsl() self.kv_cache_dtype = kv_cache_dtype - workspace_buffer = torch.empty(16 * 1024 * 1024, dtype=torch.uint8, device="cuda:0") + workspace_buffer = torch.empty(16 * 1024 * 1024, + dtype=torch.uint8, + device="cuda:0") self.prefill_wrapper = flashinfer.BatchPrefillWithPagedKVCacheWrapper( - workspace_buffer, "NHD" - ) - self.decode_wrapper = flashinfer.BatchDecodeWithPagedKVCacheWrapper( - workspace_buffer, "NHD" - ) + workspace_buffer, "NHD") + self.decode_wrapper = flashinfer.BatchDecodeWithPagedKVCacheWrapper( + workspace_buffer, "NHD") def load_model(self) -> None: self.model = get_model(self.model_config, self.device_config, @@ -540,83 +538,71 @@ def prepare_input_tensors( sampling_metadata, lora_requests, lora_mapping) @torch.inference_mode() - def execute_model( - self, - seq_group_metadata_list: Optional[List[SequenceGroupMetadata]], - kv_caches: List[Tuple[torch.Tensor, torch.Tensor]], - profile = False - ) -> Optional[SamplerOutput]: + def execute_model(self, + seq_group_metadata_list: Optional[ + List[SequenceGroupMetadata]], + kv_caches: List[Tuple[torch.Tensor, torch.Tensor]], + profile=False) -> Optional[SamplerOutput]: (input_tokens, input_positions, input_metadata, sampling_metadata, lora_requests, lora_mapping) = self.prepare_input_tensors(seq_group_metadata_list) - num_qo_heads = 32 - num_kv_heads = 8 - if not profile and input_metadata.is_prompt: - if input_metadata.decode_wrapper: - input_metadata.decode_wrapper.end_forward() + num_qo_heads = self.model.config.num_attention_heads + num_kv_heads = self.model.config.num_key_value_heads + + if not profile and input_metadata.is_prompt and input_metadata.decode_wrapper: + input_metadata.decode_wrapper.end_forward() if not profile: input_metadata.prefill_wrapper = self.prefill_wrapper input_metadata.decode_wrapper = self.decode_wrapper batch_size = input_tokens.shape[0] - - prefix_lens = input_metadata.prompt_lens - if input_metadata.is_prompt: - seq_lens = input_metadata.prompt_lens - else: - seq_lens = input_metadata.context_lens - extend_seq_lens = input_metadata.context_lens - - if input_metadata.is_prompt: - seq_lens = input_metadata.prompt_lens - #seq_lens = torch.stack([a + b for a, b in zip(input_metadata.prompt_lens, input_metadata.context_lens)], dim=0) - else: - seq_lens = input_metadata.context_lens - extend_seq_lens = input_metadata.context_lens + #if input_metadata.is_prompt: + # seq_lens = input_metadata.prompt_lens + #else: + # seq_lens = input_metadata.context_lens if input_metadata.is_prompt: - qo_indptr = torch.zeros( - (batch_size + 1,), dtype=torch.int32, device="cuda" - ) - + qo_indptr = torch.zeros((batch_size + 1, ), + dtype=torch.int32, + device="cuda") + qo_indptr[1:] = torch.cumsum(input_metadata.prompt_lens, dim=0) input_metadata.qo_indptr = qo_indptr - kvi = input_metadata.slot_mapping.view(-1).type(torch.int32).to("cuda:0") + kvi = input_metadata.slot_mapping.view(-1).type( + torch.int32).to("cuda:0") + paged_kv_indices = torch.div(kvi, 16, rounding_mode="floor") - - paged_kv_indptr = torch.zeros( - (batch_size + 1,), dtype=torch.int32, device="cuda:0" - ) - + + paged_kv_indptr = torch.zeros((batch_size + 1, ), + dtype=torch.int32, + device="cuda:0") #print(block_sizes_per_seq) - + if input_metadata.is_prompt: - block_sizes_per_seq = torch.tensor([len(input_metadata.slot_mapping[i].unique()) for i in range(batch_size)]) + block_sizes_per_seq = torch.tensor([ + 
len(input_metadata.slot_mapping[i].unique()) + for i in range(batch_size) + ]) paged_kv_indptr[1:] = torch.cumsum(block_sizes_per_seq, dim=0) - + else: paged_kv_indptr[1:] = torch.arange(1, batch_size + 1) if input_metadata.is_prompt: - paged_kv_last_page_len = (kvi[paged_kv_indptr[1:] - 1] % 16) + 1 + paged_kv_last_page_len = (kvi[paged_kv_indptr[1:] - 1] % + 16) + 1 else: paged_kv_last_page_len = (kvi % 16) + 1 - - + if input_metadata.is_prompt: input_metadata.prefill_wrapper.begin_forward( - qo_indptr, - paged_kv_indptr, - paged_kv_indices, - paged_kv_last_page_len, - num_qo_heads, - num_kv_heads - ) + qo_indptr, paged_kv_indptr, paged_kv_indices, + paged_kv_last_page_len, num_qo_heads, num_kv_heads) else: input_metadata.decode_wrapper.begin_forward( paged_kv_indptr, @@ -626,7 +612,7 @@ def execute_model( num_kv_heads, 128, 16, - ) + ) if self.lora_config: self.set_active_loras(lora_requests, lora_mapping) @@ -706,7 +692,9 @@ def profile_run(self) -> None: # Run the model with the dummy inputs. num_layers = self.model_config.get_num_layers(self.parallel_config) - kv_caches = [None] * num_layers #torch.zeros(1, 2, )[(None, None)] * num_layers + kv_caches = [ + None + ] * num_layers #torch.zeros(1, 2, )[(None, None)] * num_layers self.execute_model(seqs, kv_caches, profile=True) torch.cuda.synchronize() return From 8276872f0ec6b071c33dbc533fe8ef9fa5a48013 Mon Sep 17 00:00:00 2001 From: Artem Yatsenko Date: Wed, 7 Feb 2024 21:09:07 +0000 Subject: [PATCH 15/21] formatting and compatibility pt 2 --- vllm/worker/cache_engine.py | 2 +- vllm/worker/model_runner.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/vllm/worker/cache_engine.py b/vllm/worker/cache_engine.py index 14f45bc32a34b..0ba5ef8c3e16c 100644 --- a/vllm/worker/cache_engine.py +++ b/vllm/worker/cache_engine.py @@ -6,7 +6,7 @@ from vllm._C import cache_ops from vllm.config import CacheConfig, ModelConfig, ParallelConfig from vllm.logger import init_logger -from vllm.utils import in_wsl, STR_DTYPE_TO_TORCH_DTYPE +from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE logger = init_logger(__name__) diff --git a/vllm/worker/model_runner.py b/vllm/worker/model_runner.py index 515f7df739556..cf0ea8c5d7cf9 100644 --- a/vllm/worker/model_runner.py +++ b/vllm/worker/model_runner.py @@ -46,7 +46,7 @@ def __init__( self.scheduler_config = scheduler_config self.lora_config = lora_config self.is_driver_worker = is_driver_worker - + # model_config can be None in tests/samplers/test_sampler.py. # FIXME(woosuk): This is a hack to make the tests work. Refactor this. self.sliding_window = (model_config.get_sliding_window() From 4d0613528fb5870d418e19aad8f267efc628e3bd Mon Sep 17 00:00:00 2001 From: Artem Yatsenko Date: Wed, 7 Feb 2024 21:50:50 +0000 Subject: [PATCH 16/21] fix reqs --- requirements.txt | 2 ++ vllm/model_executor/models/opt.py | 4 ++-- vllm/worker/model_runner.py | 10 ++++++++-- 3 files changed, 12 insertions(+), 4 deletions(-) diff --git a/requirements.txt b/requirements.txt index 2bf527ccc3a77..0937aa163ff00 100644 --- a/requirements.txt +++ b/requirements.txt @@ -11,3 +11,5 @@ uvicorn[standard] pydantic >= 2.0 # Required for OpenAI server. 
aioprometheus[starlette] pynvml == 11.5.0 +flashinfer @ https://github.com/flashinfer-ai/flashinfer/releases/download/v0.0.1/flashinfer-0.0.1+cu121-cp310-cp310-linux_x86_64.whl + diff --git a/vllm/model_executor/models/opt.py b/vllm/model_executor/models/opt.py index 393b2dcabcd5a..a8cee240472db 100644 --- a/vllm/model_executor/models/opt.py +++ b/vllm/model_executor/models/opt.py @@ -101,8 +101,8 @@ def forward( ) -> torch.Tensor: qkv, _ = self.qkv_proj(hidden_states) q, k, v = qkv.chunk(chunks=3, dim=-1) - key_cache, value_cache = kv_cache - attn_output = self.attn(q, k, v, key_cache, value_cache, + #key_cache, value_cache = kv_cache + attn_output = self.attn(q, k, v, kv_cache, input_metadata) output, _ = self.out_proj(attn_output) return output diff --git a/vllm/worker/model_runner.py b/vllm/worker/model_runner.py index cf0ea8c5d7cf9..6932508734c44 100644 --- a/vllm/worker/model_runner.py +++ b/vllm/worker/model_runner.py @@ -547,8 +547,14 @@ def execute_model(self, lora_requests, lora_mapping) = self.prepare_input_tensors(seq_group_metadata_list) - num_qo_heads = self.model.config.num_attention_heads - num_kv_heads = self.model.config.num_key_value_heads + + if "num_key_value_heads" in self.model.config.__dict__.keys(): + num_qo_heads = self.model.config.num_attention_heads + num_kv_heads = self.model.config.num_key_value_heads + + else: + num_qo_heads = self.model.config.num_attention_heads + num_kv_heads = self.model.config.num_attention_heads if not profile and input_metadata.is_prompt and input_metadata.decode_wrapper: input_metadata.decode_wrapper.end_forward() From 57874d58903c11f3d3c1f719bde03459ab164a85 Mon Sep 17 00:00:00 2001 From: Artem Yatsenko Date: Wed, 7 Feb 2024 22:29:54 +0000 Subject: [PATCH 17/21] rm hardcoded values, passes tests w/ eager mode --- tests/async_engine/api_server_async_engine.py | 2 ++ tests/test_regression.py | 6 ++++-- vllm/model_executor/layers/attention.py | 2 +- vllm/worker/model_runner.py | 4 +++- 4 files changed, 10 insertions(+), 4 deletions(-) diff --git a/tests/async_engine/api_server_async_engine.py b/tests/async_engine/api_server_async_engine.py index 1be76fdc8d868..e331ee3060df4 100644 --- a/tests/async_engine/api_server_async_engine.py +++ b/tests/async_engine/api_server_async_engine.py @@ -39,6 +39,8 @@ def stats() -> Response: parser = AsyncEngineArgs.add_cli_args(parser) args = parser.parse_args() + args["enforce_eager"] = True + engine_args = AsyncEngineArgs.from_cli_args(args) engine = AsyncLLMEngineWithStats.from_engine_args(engine_args) vllm.entrypoints.api_server.engine = engine diff --git a/tests/test_regression.py b/tests/test_regression.py index c48e474bd889f..61996dbf2fe69 100644 --- a/tests/test_regression.py +++ b/tests/test_regression.py @@ -15,7 +15,8 @@ def test_duplicated_ignored_sequence_group(): max_tokens=256) llm = LLM(model="facebook/opt-125m", max_num_batched_tokens=4096, - tensor_parallel_size=1) + tensor_parallel_size=1, + enforce_eager=True) prompts = ["This is a short prompt", "This is a very long prompt " * 1000] outputs = llm.generate(prompts, sampling_params=sampling_params) @@ -28,7 +29,8 @@ def test_max_tokens_none(): max_tokens=None) llm = LLM(model="facebook/opt-125m", max_num_batched_tokens=4096, - tensor_parallel_size=1) + tensor_parallel_size=1, + enforce_eager=True) prompts = ["Just say hello!"] outputs = llm.generate(prompts, sampling_params=sampling_params) diff --git a/vllm/model_executor/layers/attention.py b/vllm/model_executor/layers/attention.py index 1c4dae6829344..3a2cd77a17fb1 100644 --- 
a/vllm/model_executor/layers/attention.py +++ b/vllm/model_executor/layers/attention.py @@ -161,7 +161,7 @@ def forward( # the very attention layer of every iteration. # FIXME(woosuk): This is a hack. - query = query.view(-1, 32, 128) + query = query.view(-1, self.num_kv_heads, self.head_size) output = input_metadata.prefill_wrapper.forward( query, kv_cache, causal=True, allow_fp16_qk_reduction=True) diff --git a/vllm/worker/model_runner.py b/vllm/worker/model_runner.py index 6932508734c44..129adaf9131d3 100644 --- a/vllm/worker/model_runner.py +++ b/vllm/worker/model_runner.py @@ -548,6 +548,8 @@ def execute_model(self, lora_mapping) = self.prepare_input_tensors(seq_group_metadata_list) + hidden_size = self.model.config.hidden_size + if "num_key_value_heads" in self.model.config.__dict__.keys(): num_qo_heads = self.model.config.num_attention_heads num_kv_heads = self.model.config.num_key_value_heads @@ -616,7 +618,7 @@ def execute_model(self, paged_kv_last_page_len, num_qo_heads, num_kv_heads, - 128, + hidden_size // num_kv_heads, 16, ) From cddf14d865d4c2030412be892b8f5fa81d0a250b Mon Sep 17 00:00:00 2001 From: Artem Yatsenko Date: Thu, 8 Feb 2024 18:27:51 +0000 Subject: [PATCH 18/21] fix indexing --- vllm/worker/model_runner.py | 46 +++++++++++++++++++++++++------------ 1 file changed, 31 insertions(+), 15 deletions(-) diff --git a/vllm/worker/model_runner.py b/vllm/worker/model_runner.py index 129adaf9131d3..f21fbc4c4ecb7 100644 --- a/vllm/worker/model_runner.py +++ b/vllm/worker/model_runner.py @@ -565,12 +565,7 @@ def execute_model(self, input_metadata.prefill_wrapper = self.prefill_wrapper input_metadata.decode_wrapper = self.decode_wrapper batch_size = input_tokens.shape[0] - - #if input_metadata.is_prompt: - # seq_lens = input_metadata.prompt_lens - #else: - # seq_lens = input_metadata.context_lens - + if input_metadata.is_prompt: qo_indptr = torch.zeros((batch_size + 1, ), dtype=torch.int32, @@ -583,23 +578,33 @@ def execute_model(self, kvi = input_metadata.slot_mapping.view(-1).type( torch.int32).to("cuda:0") - paged_kv_indices = torch.div(kvi, 16, rounding_mode="floor") - - paged_kv_indptr = torch.zeros((batch_size + 1, ), - dtype=torch.int32, - device="cuda:0") - #print(block_sizes_per_seq) + kvd = input_metadata.block_tables.view(-1) if input_metadata.is_prompt: + paged_kv_indices = torch.div(kvi, 16, rounding_mode="floor") block_sizes_per_seq = torch.tensor([ len(input_metadata.slot_mapping[i].unique()) for i in range(batch_size) ]) + else: + paged_kv_indices = kvd #torch.div(kvd, 16, rounding_mode="floor") + block_sizes_per_seq = torch.tensor([ + len(input_metadata.block_tables[i]) + for i in range(batch_size) + ]) - paged_kv_indptr[1:] = torch.cumsum(block_sizes_per_seq, dim=0) + paged_kv_indptr = torch.zeros((batch_size + 1, ), + dtype=torch.int32, + device="cuda:0") - else: - paged_kv_indptr[1:] = torch.arange(1, batch_size + 1) + paged_kv_indices = torch.where(paged_kv_indices == -1, torch.tensor(1), paged_kv_indices) + paged_kv_indptr[1:] = torch.cumsum(block_sizes_per_seq, dim=0) + + #if input_metadata.is_prompt: + # paged_kv_indptr[1:] = torch.cumsum(block_sizes_per_seq, dim=0) + # + #else: + # paged_kv_indptr[1:] = torch.arange(1, batch_size + 1) if input_metadata.is_prompt: paged_kv_last_page_len = (kvi[paged_kv_indptr[1:] - 1] % @@ -607,6 +612,17 @@ def execute_model(self, else: paged_kv_last_page_len = (kvi % 16) + 1 + if not input_metadata.is_prompt: + print(f"indptr: {paged_kv_indptr}") + print(paged_kv_indices) + print(paged_kv_last_page_len) + print(f"kvi: 
{kvi // 16}") + print(f"kvd: {kvd // 16}") + + print() + print(input_metadata.block_tables) + print() + if input_metadata.is_prompt: input_metadata.prefill_wrapper.begin_forward( qo_indptr, paged_kv_indptr, paged_kv_indices, From 35f3e18d7d4437f27f1effb4307f1caa25d3c165 Mon Sep 17 00:00:00 2001 From: Artem Yatsenko Date: Thu, 8 Feb 2024 23:56:01 +0000 Subject: [PATCH 19/21] rm prefill --- run.py | 17 ++++ vllm/model_executor/layers/attention.py | 8 +- vllm/worker/model_runner.py | 101 +++++++----------------- 3 files changed, 51 insertions(+), 75 deletions(-) create mode 100644 run.py diff --git a/run.py b/run.py new file mode 100644 index 0000000000000..fcc5f7430241a --- /dev/null +++ b/run.py @@ -0,0 +1,17 @@ +from vllm import LLM, SamplingParams + +prompts = [ + "Hello, my name is", +] + +sampling_params = SamplingParams(temperature=0.8, top_p=0.95, max_tokens=16) + +llm = LLM(model="facebook/opt-125m", enforce_eager=True) + +outputs = llm.generate(prompts, sampling_params) + +# Print the outputs. +for output in outputs: + prompt = output.prompt + generated_text = output.outputs[0].text + print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") diff --git a/vllm/model_executor/layers/attention.py b/vllm/model_executor/layers/attention.py index 3a2cd77a17fb1..064300e9a16cb 100644 --- a/vllm/model_executor/layers/attention.py +++ b/vllm/model_executor/layers/attention.py @@ -17,7 +17,7 @@ _SUPPORTED_HEAD_SIZES = [64, 80, 96, 112, 128, 256] # Should be the same as PARTITION_SIZE in `paged_attention_v2_launcher`. _PARTITION_SIZE = 512 - +import flashinfer class PagedAttention(nn.Module): """MHA/MQA/GQA layer with PagedAttention. @@ -162,10 +162,10 @@ def forward( # FIXME(woosuk): This is a hack. query = query.view(-1, self.num_kv_heads, self.head_size) - output = input_metadata.prefill_wrapper.forward( - query, kv_cache, causal=True, allow_fp16_qk_reduction=True) + #output = input_metadata.prefill_wrapper.forward( + # query, kv_cache, causal=True) - #output = flashinfer.single_prefill_with_kv_cache(query, key.contiguous(), value.contiguous(), causal=True, + output = flashinfer.single_prefill_with_kv_cache(query, key.contiguous(), value.contiguous(), causal=True) #allow_fp16_qk_reduction=True) else: diff --git a/vllm/worker/model_runner.py b/vllm/worker/model_runner.py index f21fbc4c4ecb7..15122abe238a1 100644 --- a/vllm/worker/model_runner.py +++ b/vllm/worker/model_runner.py @@ -75,7 +75,7 @@ def __init__( # cache in_wsl result self.in_wsl = in_wsl() self.kv_cache_dtype = kv_cache_dtype - + self.forward_set = False workspace_buffer = torch.empty(16 * 1024 * 1024, dtype=torch.uint8, device="cuda:0") @@ -547,7 +547,6 @@ def execute_model(self, lora_requests, lora_mapping) = self.prepare_input_tensors(seq_group_metadata_list) - hidden_size = self.model.config.hidden_size if "num_key_value_heads" in self.model.config.__dict__.keys(): @@ -560,84 +559,48 @@ def execute_model(self, if not profile and input_metadata.is_prompt and input_metadata.decode_wrapper: input_metadata.decode_wrapper.end_forward() + self.forward_set = False - if not profile: - input_metadata.prefill_wrapper = self.prefill_wrapper + if not profile and not input_metadata.is_prompt: input_metadata.decode_wrapper = self.decode_wrapper batch_size = input_tokens.shape[0] - - if input_metadata.is_prompt: - qo_indptr = torch.zeros((batch_size + 1, ), - dtype=torch.int32, - device="cuda") - - qo_indptr[1:] = torch.cumsum(input_metadata.prompt_lens, dim=0) - - input_metadata.qo_indptr = qo_indptr kvi = 
input_metadata.slot_mapping.view(-1).type( torch.int32).to("cuda:0") - kvd = input_metadata.block_tables.view(-1) + kvd = input_metadata.block_tables.to("cuda:0") - if input_metadata.is_prompt: - paged_kv_indices = torch.div(kvi, 16, rounding_mode="floor") - block_sizes_per_seq = torch.tensor([ - len(input_metadata.slot_mapping[i].unique()) - for i in range(batch_size) - ]) - else: - paged_kv_indices = kvd #torch.div(kvd, 16, rounding_mode="floor") - block_sizes_per_seq = torch.tensor([ - len(input_metadata.block_tables[i]) - for i in range(batch_size) - ]) + kvi = kvi[kvi != -1] + kvd = kvd[kvd != 0] - paged_kv_indptr = torch.zeros((batch_size + 1, ), - dtype=torch.int32, - device="cuda:0") - paged_kv_indices = torch.where(paged_kv_indices == -1, torch.tensor(1), paged_kv_indices) - paged_kv_indptr[1:] = torch.cumsum(block_sizes_per_seq, dim=0) + paged_kv_indices = kvd + bsi = [] + for i in range(batch_size): + mask = input_metadata.block_tables[i] != 0 + bsi.append(len(input_metadata.block_tables[i][mask].unique_consecutive())) - #if input_metadata.is_prompt: - # paged_kv_indptr[1:] = torch.cumsum(block_sizes_per_seq, dim=0) - # - #else: - # paged_kv_indptr[1:] = torch.arange(1, batch_size + 1) + + block_sizes_per_seq = torch.tensor(bsi) - if input_metadata.is_prompt: - paged_kv_last_page_len = (kvi[paged_kv_indptr[1:] - 1] % - 16) + 1 - else: - paged_kv_last_page_len = (kvi % 16) + 1 - - if not input_metadata.is_prompt: - print(f"indptr: {paged_kv_indptr}") - print(paged_kv_indices) - print(paged_kv_last_page_len) - print(f"kvi: {kvi // 16}") - print(f"kvd: {kvd // 16}") - - print() - print(input_metadata.block_tables) - print() - - if input_metadata.is_prompt: - input_metadata.prefill_wrapper.begin_forward( - qo_indptr, paged_kv_indptr, paged_kv_indices, - paged_kv_last_page_len, num_qo_heads, num_kv_heads) - else: - input_metadata.decode_wrapper.begin_forward( - paged_kv_indptr, - paged_kv_indices, - paged_kv_last_page_len, - num_qo_heads, - num_kv_heads, - hidden_size // num_kv_heads, - 16, - ) + paged_kv_indptr = torch.zeros((batch_size + 1, ), + dtype=torch.int32, + device="cuda:0") + paged_kv_indptr[1:] = torch.cumsum(block_sizes_per_seq, dim=0 + + paged_kv_last_page_len = kvi % 16 + 1 + + input_metadata.decode_wrapper.begin_forward( + paged_kv_indptr, + paged_kv_indices, + paged_kv_last_page_len, + num_qo_heads, + num_kv_heads, + hidden_size // num_kv_heads, + 16, + ) + if self.lora_config: self.set_active_loras(lora_requests, lora_mapping) @@ -660,10 +623,6 @@ def execute_model(self, sampling_metadata=sampling_metadata, ) - #if not profile: - # if input_metadata.is_prompt: - # input_metadata.prefill_wrapper.end_forward() - return output @torch.inference_mode() From f430e5031ac06cd8398fa33182e756b3e7ec839e Mon Sep 17 00:00:00 2001 From: Artem Yatsenko Date: Thu, 8 Feb 2024 23:56:21 +0000 Subject: [PATCH 20/21] remove file --- run.py | 17 ----------------- 1 file changed, 17 deletions(-) delete mode 100644 run.py diff --git a/run.py b/run.py deleted file mode 100644 index fcc5f7430241a..0000000000000 --- a/run.py +++ /dev/null @@ -1,17 +0,0 @@ -from vllm import LLM, SamplingParams - -prompts = [ - "Hello, my name is", -] - -sampling_params = SamplingParams(temperature=0.8, top_p=0.95, max_tokens=16) - -llm = LLM(model="facebook/opt-125m", enforce_eager=True) - -outputs = llm.generate(prompts, sampling_params) - -# Print the outputs. 
-for output in outputs: - prompt = output.prompt - generated_text = output.outputs[0].text - print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") From af69cd6e8296ac1c5fb008ff3a1d62f0f45fd813 Mon Sep 17 00:00:00 2001 From: Artem yatsenko Date: Fri, 9 Feb 2024 00:28:28 +0000 Subject: [PATCH 21/21] fix device for tp --- vllm/worker/model_runner.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/vllm/worker/model_runner.py b/vllm/worker/model_runner.py index 15122abe238a1..37f4835369b99 100644 --- a/vllm/worker/model_runner.py +++ b/vllm/worker/model_runner.py @@ -78,7 +78,7 @@ def __init__( self.forward_set = False workspace_buffer = torch.empty(16 * 1024 * 1024, dtype=torch.uint8, - device="cuda:0") + device=self.device) self.prefill_wrapper = flashinfer.BatchPrefillWithPagedKVCacheWrapper( workspace_buffer, "NHD") self.decode_wrapper = flashinfer.BatchDecodeWithPagedKVCacheWrapper( @@ -566,9 +566,9 @@ def execute_model(self, batch_size = input_tokens.shape[0] kvi = input_metadata.slot_mapping.view(-1).type( - torch.int32).to("cuda:0") + torch.int32).to(self.device) - kvd = input_metadata.block_tables.to("cuda:0") + kvd = input_metadata.block_tables.to(self.device) kvi = kvi[kvi != -1] kvd = kvd[kvd != 0] @@ -585,9 +585,9 @@ def execute_model(self, paged_kv_indptr = torch.zeros((batch_size + 1, ), dtype=torch.int32, - device="cuda:0") + device=self.device) - paged_kv_indptr[1:] = torch.cumsum(block_sizes_per_seq, dim=0 + paged_kv_indptr[1:] = torch.cumsum(block_sizes_per_seq, dim=0) paged_kv_last_page_len = kvi % 16 + 1
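
Note on the final state of the series: after PATCH 21, prompt processing runs through flashinfer.single_prefill_with_kv_cache while decoding uses flashinfer's BatchDecodeWithPagedKVCacheWrapper, whose begin_forward call is driven by three tensors assembled in execute_model: paged_kv_indices, paged_kv_indptr, and paged_kv_last_page_len. The standalone sketch below is not part of the patch series; it only reconstructs that bookkeeping under the conventions visible in the diffs above (page size 16, block_tables padded with 0, slot_mapping padded with -1), and the toy block-table and slot values are invented purely for illustration.

    import torch

    BLOCK_SIZE = 16  # the page size hard-coded throughout the series

    # Toy inputs. In vLLM these come from InputMetadata: `block_tables` lists the
    # physical KV-cache blocks owned by each sequence (0-padded here, as in the
    # diffs) and `slot_mapping` holds the cache slot of the token being decoded.
    block_tables = torch.tensor([[3, 7], [5, 0]], dtype=torch.int32)
    slot_mapping = torch.tensor([7 * BLOCK_SIZE + 4, 5 * BLOCK_SIZE], dtype=torch.int32)
    batch_size = block_tables.shape[0]

    # Page indices: flatten the block table and drop the 0-padding, mirroring the
    # `kvd[kvd != 0]` filter in the patch (a real implementation would still have
    # to distinguish padding from a genuine physical block 0).
    flat = block_tables.view(-1)
    paged_kv_indices = flat[flat != 0]

    # CSR-style offsets into `paged_kv_indices`: one entry per sequence plus one,
    # using the same per-row unique_consecutive counting as the patch.
    blocks_per_seq = torch.tensor(
        [len(row[row != 0].unique_consecutive()) for row in block_tables])
    paged_kv_indptr = torch.zeros(batch_size + 1, dtype=torch.int32)
    paged_kv_indptr[1:] = torch.cumsum(blocks_per_seq, dim=0)

    # Number of valid slots in each sequence's last page.
    paged_kv_last_page_len = slot_mapping % BLOCK_SIZE + 1

    print(paged_kv_indices)        # tensor([3, 7, 5], dtype=torch.int32)
    print(paged_kv_indptr)         # tensor([0, 2, 3], dtype=torch.int32)
    print(paged_kv_last_page_len)  # tensor([5, 1], dtype=torch.int32)

Together with num_qo_heads, num_kv_heads, the head size, and the page size 16, these three tensors are exactly the arguments the series hands to decode_wrapper.begin_forward(...) before running the model.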