From 0eb0757bef728c632be17e61a88cfa9dc1a760c4 Mon Sep 17 00:00:00 2001 From: Michael Goin Date: Tue, 23 Jul 2024 14:04:04 -0400 Subject: [PATCH 001/167] [Misc] Add ignored layers for `fp8` quantization (#6657) --- .../quantization/compressed_tensors/utils.py | 14 +++---- .../layers/quantization/fbgemm_fp8.py | 39 ++----------------- .../model_executor/layers/quantization/fp8.py | 13 ++++++- .../layers/quantization/utils/quant_utils.py | 38 ++++++++++++++++++ 4 files changed, 57 insertions(+), 47 deletions(-) diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/utils.py b/vllm/model_executor/layers/quantization/compressed_tensors/utils.py index 7e8e70806a0fc..7912cbde5721f 100644 --- a/vllm/model_executor/layers/quantization/compressed_tensors/utils.py +++ b/vllm/model_executor/layers/quantization/compressed_tensors/utils.py @@ -5,6 +5,9 @@ from pydantic import BaseModel, Field from torch.nn import Module +from vllm.model_executor.layers.quantization.utils.quant_utils import ( + FUSED_LAYER_NAME_MAPPING) + class CompressionFormat(Enum): dense = "dense" @@ -86,13 +89,6 @@ def is_activation_quantization_format(format: str) -> bool: return format in _ACTIVATION_QUANTIZATION_FORMATS -# fused_name: List[shard_name] -_FUSED_LAYER_NAME_MAPPING = { - "qkv_proj": ["q_proj", "k_proj", "v_proj"], - "gate_up_proj": ["gate_proj", "up_proj"] -} - - def should_ignore_layer(layer_name: Optional[str], ignore: Iterable[str]) -> bool: if layer_name is None: @@ -106,8 +102,8 @@ def should_ignore_layer(layer_name: Optional[str], # in the safetensors checkpoint. So, we convert the name # from the fused version to unfused + check to make sure that # each shard of the fused layer has the same scheme. - if proj_name in _FUSED_LAYER_NAME_MAPPING: - shard_proj_names = _FUSED_LAYER_NAME_MAPPING[proj_name] + if proj_name in FUSED_LAYER_NAME_MAPPING: + shard_proj_names = FUSED_LAYER_NAME_MAPPING[proj_name] # Convert fused_name --> [shard_names] shard_names = [ diff --git a/vllm/model_executor/layers/quantization/fbgemm_fp8.py b/vllm/model_executor/layers/quantization/fbgemm_fp8.py index 6b329231ec3af..5e8d1f1947421 100644 --- a/vllm/model_executor/layers/quantization/fbgemm_fp8.py +++ b/vllm/model_executor/layers/quantization/fbgemm_fp8.py @@ -11,6 +11,8 @@ QuantizationConfig, QuantizeMethodBase) from vllm.model_executor.layers.quantization.utils.marlin_utils_fp8 import ( apply_fp8_marlin_linear, prepare_fp8_layer_for_marlin) +from vllm.model_executor.layers.quantization.utils.quant_utils import ( + is_layer_skipped) from vllm.model_executor.layers.quantization.utils.w8a8_utils import ( apply_fp8_linear, create_per_channel_scale_param) from vllm.model_executor.utils import set_weight_attrs @@ -18,14 +20,6 @@ logger = init_logger(__name__) -# Note: this is a hack. We should update each model to register the -# stacked params and get it from there instead in a future PR. 
-# fused_name: List[shard_name] -_FUSED_LAYER_NAME_MAPPING = { - "qkv_proj": ["q_proj", "k_proj", "v_proj"], - "gate_up_proj": ["gate_proj", "up_proj"] -} - class FBGEMMFp8Config(QuantizationConfig): """Config class for FBGEMM Fp8.""" @@ -62,37 +56,10 @@ def from_config(cls, config: Dict[str, Any]) -> "FBGEMMFp8Config": input_scale_ub = cls.get_from_keys(config, ["activation_scale_ub"]) return cls(ignore_list=ignore_list, input_scale_ub=input_scale_ub) - def _is_layer_skipped(self, prefix: str) -> bool: - # prefix: model.layers.0.self_attn.q_proj - # proj_name: q_proj - proj_name = prefix.split(".")[-1] - if proj_name in _FUSED_LAYER_NAME_MAPPING: - shard_prefixes = [ - prefix.replace(proj_name, shard_proj_name) - for shard_proj_name in _FUSED_LAYER_NAME_MAPPING[proj_name] - ] - - is_skipped = None - for shard_prefix in shard_prefixes: - is_shard_skipped = shard_prefix in self.ignore_list - - if is_skipped is None: - is_skipped = is_shard_skipped - elif is_shard_skipped != is_skipped: - raise ValueError( - f"Detected some but not all shards of {prefix} " - "are quantized. All shards of fused layers " - "to have the same precision.") - else: - is_skipped = prefix in self.ignore_list - - assert is_skipped is not None - return is_skipped - def get_quant_method(self, layer: torch.nn.Module, prefix: str) -> Optional["QuantizeMethodBase"]: if isinstance(layer, LinearBase): - if self._is_layer_skipped(prefix): + if is_layer_skipped(prefix, self.ignore_list): return UnquantizedLinearMethod() return FBGEMMFp8LinearMethod(self) return None diff --git a/vllm/model_executor/layers/quantization/fp8.py b/vllm/model_executor/layers/quantization/fp8.py index b2a1b0a9534e8..3a4f2a49a3497 100644 --- a/vllm/model_executor/layers/quantization/fp8.py +++ b/vllm/model_executor/layers/quantization/fp8.py @@ -8,12 +8,15 @@ from vllm.logger import init_logger from vllm.model_executor.layers.fused_moe import (FusedMoE, FusedMoEMethodBase, fused_moe) -from vllm.model_executor.layers.linear import LinearBase, LinearMethodBase +from vllm.model_executor.layers.linear import (LinearBase, LinearMethodBase, + UnquantizedLinearMethod) from vllm.model_executor.layers.quantization.base_config import ( QuantizationConfig, QuantizeMethodBase) from vllm.model_executor.layers.quantization.kv_cache import BaseKVCacheMethod from vllm.model_executor.layers.quantization.utils.marlin_utils_fp8 import ( apply_fp8_marlin_linear, prepare_fp8_layer_for_marlin) +from vllm.model_executor.layers.quantization.utils.quant_utils import ( + is_layer_skipped) from vllm.model_executor.layers.quantization.utils.w8a8_utils import ( all_close_1d, apply_fp8_linear, create_per_tensor_scale_param, cutlass_fp8_supported, per_tensor_dequantize, requantize_with_max_scale) @@ -33,6 +36,7 @@ def __init__( self, is_checkpoint_fp8_serialized: bool = False, activation_scheme: str = "dynamic", + ignored_layers: Optional[List[str]] = None, ) -> None: self.is_checkpoint_fp8_serialized = is_checkpoint_fp8_serialized if is_checkpoint_fp8_serialized: @@ -42,6 +46,7 @@ def __init__( raise ValueError( f"Unsupported activation scheme {activation_scheme}") self.activation_scheme = activation_scheme + self.ignored_layers = ignored_layers or [] @classmethod def get_name(cls) -> str: @@ -64,14 +69,18 @@ def from_config(cls, config: Dict[str, Any]) -> "Fp8Config": quant_method = cls.get_from_keys(config, ["quant_method"]) is_checkpoint_fp8_serialized = ("fp8" in quant_method) activation_scheme = cls.get_from_keys(config, ["activation_scheme"]) + ignored_layers = 
cls.get_from_keys_or(config, ["ignored_layers"], None) return cls(is_checkpoint_fp8_serialized=is_checkpoint_fp8_serialized, - activation_scheme=activation_scheme) + activation_scheme=activation_scheme, + ignored_layers=ignored_layers) def get_quant_method(self, layer: torch.nn.Module, prefix: str) -> Optional["QuantizeMethodBase"]: from vllm.attention.layer import Attention # Avoid circular import if isinstance(layer, LinearBase): + if is_layer_skipped(prefix, self.ignored_layers): + return UnquantizedLinearMethod() return Fp8LinearMethod(self) elif isinstance(layer, FusedMoE): return Fp8MoEMethod(self) diff --git a/vllm/model_executor/layers/quantization/utils/quant_utils.py b/vllm/model_executor/layers/quantization/utils/quant_utils.py index 7abe919f859ca..2ba6a9a810ec0 100644 --- a/vllm/model_executor/layers/quantization/utils/quant_utils.py +++ b/vllm/model_executor/layers/quantization/utils/quant_utils.py @@ -1,10 +1,48 @@ """This file is used for /tests and /benchmarks""" +from typing import List + import numpy import torch SUPPORTED_NUM_BITS = [4, 8] SUPPORTED_GROUP_SIZES = [-1, 32, 64, 128] +# Note: this is a hack. We should update each model to register the +# stacked params and get it from there instead in a future PR. +# fused_name: List[shard_name] +FUSED_LAYER_NAME_MAPPING = { + "qkv_proj": ["q_proj", "k_proj", "v_proj"], + "gate_up_proj": ["gate_proj", "up_proj"] +} + + +def is_layer_skipped(prefix: str, ignored_layers: List[str]) -> bool: + # prefix: model.layers.0.self_attn.q_proj + # proj_name: q_proj + proj_name = prefix.split(".")[-1] + if proj_name in FUSED_LAYER_NAME_MAPPING: + shard_prefixes = [ + prefix.replace(proj_name, shard_proj_name) + for shard_proj_name in FUSED_LAYER_NAME_MAPPING[proj_name] + ] + + is_skipped = None + for shard_prefix in shard_prefixes: + is_shard_skipped = shard_prefix in ignored_layers + + if is_skipped is None: + is_skipped = is_shard_skipped + elif is_shard_skipped != is_skipped: + raise ValueError( + f"Detected some but not all shards of {prefix} " + "are quantized. All shards of fused layers " + "to have the same precision.") + else: + is_skipped = prefix in ignored_layers + + assert is_skipped is not None + return is_skipped + def get_pack_factor(num_bits): assert num_bits in SUPPORTED_NUM_BITS, f"Unsupported num_bits = {num_bits}" From 58f53034add8767c9e5d92431220faa409fa3dc2 Mon Sep 17 00:00:00 2001 From: Yehoshua Cohen <61619195+yecohn@users.noreply.github.com> Date: Tue, 23 Jul 2024 21:41:55 +0300 Subject: [PATCH 002/167] [Frontend] Add Usage data in each chunk for chat_serving. 
#6540 (#6652) --- tests/entrypoints/openai/test_chat.py | 40 ++++++++++++++++---- vllm/entrypoints/openai/serving_chat.py | 50 +++++++++++++++++++++++-- 2 files changed, 78 insertions(+), 12 deletions(-) diff --git a/tests/entrypoints/openai/test_chat.py b/tests/entrypoints/openai/test_chat.py index 1abaa01ae192a..c96d602b63438 100644 --- a/tests/entrypoints/openai/test_chat.py +++ b/tests/entrypoints/openai/test_chat.py @@ -295,14 +295,19 @@ async def test_chat_completion_stream_options(client: openai.AsyncOpenAI, async for chunk in stream: assert chunk.usage is None - # Test stream=True, stream_options={"include_usage": True} - stream = await client.chat.completions.create( - model=model_name, - messages=messages, - max_tokens=10, - temperature=0.0, - stream=True, - stream_options={"include_usage": True}) + # Test stream=True, stream_options={"include_usage": True, + # "continuous_usage_stats": False}} + stream = await client.chat.completions.create(model=model_name, + messages=messages, + max_tokens=10, + temperature=0.0, + stream=True, + stream_options={ + "include_usage": + True, + "continuous_usage_stats": + False + }) async for chunk in stream: if chunk.choices[0].finish_reason is None: @@ -338,6 +343,25 @@ async def test_chat_completion_stream_options(client: openai.AsyncOpenAI, stream=False, stream_options={"include_usage": True}) + # Test stream=True, stream_options={"include_usage": True, + # "continuous_usage_stats": True} + stream = await client.chat.completions.create( + model=model_name, + messages=messages, + max_tokens=10, + temperature=0.0, + stream=True, + stream_options={ + "include_usage": True, + "continuous_usage_stats": True + }, + ) + async for chunk in stream: + assert chunk.usage.prompt_tokens >= 0 + assert chunk.usage.completion_tokens >= 0 + assert chunk.usage.total_tokens == (chunk.usage.prompt_tokens + + chunk.usage.completion_tokens) + # NOTE: Not sure why, but when I place this after `test_guided_regex_chat` # (i.e. using the same ordering as in the Completions API tests), the test diff --git a/vllm/entrypoints/openai/serving_chat.py b/vllm/entrypoints/openai/serving_chat.py index b21c2bc513186..3899509ef3ff4 100644 --- a/vllm/entrypoints/openai/serving_chat.py +++ b/vllm/entrypoints/openai/serving_chat.py @@ -247,7 +247,15 @@ async def chat_completion_stream_generator( model=model_name) if (request.stream_options and request.stream_options.include_usage): - chunk.usage = None + if (request.stream_options.continuous_usage_stats): + prompt_tokens = len(res.prompt_token_ids) + usage = UsageInfo(prompt_tokens=prompt_tokens, + completion_tokens=0, + total_tokens=prompt_tokens) + chunk.usage = usage + else: + chunk.usage = None + data = chunk.model_dump_json(exclude_unset=True) yield f"data: {data}\n\n" @@ -277,7 +285,18 @@ async def chat_completion_stream_generator( model=model_name) if (request.stream_options and request.stream_options.include_usage): - chunk.usage = None + if (request.stream_options. 
+ continuous_usage_stats): + prompt_tokens = len( + res.prompt_token_ids) + usage = UsageInfo( + prompt_tokens=prompt_tokens, + completion_tokens=0, + total_tokens=prompt_tokens) + chunk.usage = usage + else: + chunk.usage = None + data = chunk.model_dump_json( exclude_unset=True) yield f"data: {data}\n\n" @@ -336,7 +355,19 @@ async def chat_completion_stream_generator( model=model_name) if (request.stream_options and request.stream_options.include_usage): - chunk.usage = None + if (request.stream_options.continuous_usage_stats): + prompt_tokens = len(res.prompt_token_ids) + completion_tokens = len(output.token_ids) + usage = UsageInfo( + prompt_tokens=prompt_tokens, + completion_tokens=completion_tokens, + total_tokens=prompt_tokens + + completion_tokens, + ) + chunk.usage = usage + else: + chunk.usage = None + data = chunk.model_dump_json(exclude_unset=True) yield f"data: {data}\n\n" else: @@ -356,7 +387,18 @@ async def chat_completion_stream_generator( model=model_name) if (request.stream_options and request.stream_options.include_usage): - chunk.usage = None + if (request.stream_options.continuous_usage_stats): + prompt_tokens = len(res.prompt_token_ids) + completion_tokens = len(output.token_ids) + usage = UsageInfo( + prompt_tokens=prompt_tokens, + completion_tokens=completion_tokens, + total_tokens=prompt_tokens + + completion_tokens, + ) + chunk.usage = usage + else: + chunk.usage = None data = chunk.model_dump_json(exclude_unset=True) yield f"data: {data}\n\n" finish_reason_sent[i] = True From 507ef787d85dec24490069ffceacbd6b161f4f72 Mon Sep 17 00:00:00 2001 From: Travis Johnson Date: Tue, 23 Jul 2024 13:22:09 -0600 Subject: [PATCH 003/167] [Model] Pipeline Parallel Support for DeepSeek v2 (#6519) Signed-off-by: Travis Johnson --- vllm/config.py | 1 + vllm/model_executor/models/deepseek_v2.py | 153 ++++++++++++++++------ 2 files changed, 115 insertions(+), 39 deletions(-) diff --git a/vllm/config.py b/vllm/config.py index c27d26c098b59..6e0283f8379a2 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -31,6 +31,7 @@ _PP_SUPPORTED_MODELS = [ "AquilaModel", "AquilaForCausalLM", + "DeepseekV2ForCausalLM", "InternLMForCausalLM", "LlamaForCausalLM", "LLaMAForCausalLM", diff --git a/vllm/model_executor/models/deepseek_v2.py b/vllm/model_executor/models/deepseek_v2.py index 2d12ceb7f3dbf..2e3e9b6f2792e 100644 --- a/vllm/model_executor/models/deepseek_v2.py +++ b/vllm/model_executor/models/deepseek_v2.py @@ -29,7 +29,8 @@ from vllm.attention import Attention, AttentionMetadata from vllm.config import CacheConfig -from vllm.distributed import (get_tensor_model_parallel_world_size, +from vllm.distributed import (get_pp_group, + get_tensor_model_parallel_world_size, tensor_model_parallel_all_reduce) from vllm.model_executor.layers.activation import SiluAndMul from vllm.model_executor.layers.fused_moe import FusedMoE @@ -49,6 +50,8 @@ from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.sequence import IntermediateTensors, SamplerOutput +from .utils import PPMissingLayer, is_pp_missing_parameter, make_layers + class DeepseekV2MLP(nn.Module): @@ -59,17 +62,20 @@ def __init__( hidden_act: str, quant_config: Optional[QuantizationConfig] = None, reduce_results: bool = True, + prefix: str = "", ) -> None: super().__init__() self.gate_up_proj = MergedColumnParallelLinear( hidden_size, [intermediate_size] * 2, bias=False, - quant_config=quant_config) + quant_config=quant_config, + prefix=f"{prefix}.gate_up_proj") self.down_proj = RowParallelLinear(intermediate_size, 
hidden_size, bias=False, quant_config=quant_config, - reduce_results=reduce_results) + reduce_results=reduce_results, + prefix=f"{prefix}.down_proj") if hidden_act != "silu": raise ValueError(f"Unsupported activation: {hidden_act}. " "Only silu is supported for now.") @@ -88,6 +94,7 @@ def __init__( self, config: PretrainedConfig, quant_config: Optional[QuantizationConfig] = None, + prefix: str = "", ): super().__init__() self.tp_size = get_tensor_model_parallel_world_size() @@ -112,12 +119,14 @@ def __init__( quant_config=quant_config, use_grouped_topk=True, num_expert_group=config.n_group, - topk_group=config.topk_group) + topk_group=config.topk_group, + prefix=f"{prefix}.experts") self.gate = ReplicatedLinear(config.hidden_size, config.n_routed_experts, bias=False, - quant_config=None) + quant_config=None, + prefix=f"{prefix}.gate") if config.n_shared_experts is not None: intermediate_size = (config.moe_intermediate_size * config.n_shared_experts) @@ -172,10 +181,9 @@ def __init__( max_position_embeddings: int = 8192, cache_config: Optional[CacheConfig] = None, quant_config: Optional[QuantizationConfig] = None, - layer_idx=None, + prefix: str = "", ) -> None: super().__init__() - self.layer_idx = layer_idx self.hidden_size = hidden_size self.qk_nope_head_dim = qk_nope_head_dim self.qk_rope_head_dim = qk_rope_head_dim @@ -195,38 +203,44 @@ def __init__( self.q_a_proj = ReplicatedLinear(self.hidden_size, self.q_lora_rank, bias=False, - quant_config=quant_config) + quant_config=quant_config, + prefix=f"{prefix}.q_a_proj") self.q_a_layernorm = RMSNorm(self.q_lora_rank, eps=config.rms_norm_eps) self.q_b_proj = ColumnParallelLinear(q_lora_rank, self.num_heads * self.qk_head_dim, bias=False, - quant_config=quant_config) + quant_config=quant_config, + prefix=f"{prefix}.q_b_proj") else: self.q_proj = ColumnParallelLinear(self.hidden_size, self.num_heads * self.qk_head_dim, bias=False, - quant_config=quant_config) + quant_config=quant_config, + prefix=f"{prefix}.q_proj") - self.kv_a_proj_with_mqa = ReplicatedLinear(self.hidden_size, - self.kv_lora_rank + - self.qk_rope_head_dim, - bias=False, - quant_config=quant_config) + self.kv_a_proj_with_mqa = ReplicatedLinear( + self.hidden_size, + self.kv_lora_rank + self.qk_rope_head_dim, + bias=False, + quant_config=quant_config, + prefix=f"{prefix}.kv_a_proj_with_mqa") self.kv_a_layernorm = RMSNorm(self.kv_lora_rank, eps=config.rms_norm_eps) self.kv_b_proj = ColumnParallelLinear( self.kv_lora_rank, self.num_heads * (self.qk_nope_head_dim + self.v_head_dim), bias=False, - quant_config=quant_config) + quant_config=quant_config, + prefix=f"{prefix}.kv_b_proj") # O projection. self.o_proj = RowParallelLinear(self.num_heads * self.v_head_dim, self.hidden_size, bias=False, - quant_config=quant_config) + quant_config=quant_config, + prefix=f"{prefix}.o_proj") rope_scaling['type'] = 'deepseek_yarn' self.rotary_emb = get_rope(qk_rope_head_dim, rotary_dim=qk_rope_head_dim, @@ -308,7 +322,7 @@ class DeepseekV2DecoderLayer(nn.Module): def __init__( self, config: PretrainedConfig, - layer_idx: int, + prefix: str, cache_config: Optional[CacheConfig] = None, quant_config: Optional[QuantizationConfig] = None, ) -> None: @@ -318,6 +332,9 @@ def __init__( rope_scaling = getattr(config, "rope_scaling", None) max_position_embeddings = getattr(config, "max_position_embeddings", 8192) + # DecoderLayers are created with `make_layers` which passes the prefix + # with the layer's index. 
+ layer_idx = int(prefix.split(sep='.')[-1]) self.self_attn = DeepseekV2Attention( config=config, hidden_size=self.hidden_size, @@ -333,18 +350,23 @@ def __init__( max_position_embeddings=max_position_embeddings, cache_config=cache_config, quant_config=quant_config, - layer_idx=layer_idx, + prefix=f"{prefix}.self_attn", ) if (config.n_routed_experts is not None and layer_idx >= config.first_k_dense_replace and layer_idx % config.moe_layer_freq == 0): - self.mlp = DeepseekV2MoE(config=config, quant_config=quant_config) + self.mlp = DeepseekV2MoE( + config=config, + quant_config=quant_config, + prefix=f"{prefix}.mlp", + ) else: self.mlp = DeepseekV2MLP( hidden_size=config.hidden_size, intermediate_size=config.intermediate_size, hidden_act=config.hidden_act, quant_config=quant_config, + prefix=f"{prefix}.mlp", ) self.input_layernorm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps) @@ -389,23 +411,34 @@ def __init__( config: PretrainedConfig, cache_config: Optional[CacheConfig] = None, quant_config: Optional[QuantizationConfig] = None, + prefix: str = "", ) -> None: super().__init__() self.padding_idx = config.pad_token_id self.vocab_size = config.vocab_size - self.embed_tokens = VocabParallelEmbedding( - config.vocab_size, - config.hidden_size, - ) - self.layers = nn.ModuleList([ - DeepseekV2DecoderLayer(config, - layer_idx, - cache_config=cache_config, - quant_config=quant_config) - for layer_idx in range(config.num_hidden_layers) - ]) - self.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps) + if get_pp_group().is_first_rank: + self.embed_tokens = VocabParallelEmbedding( + config.vocab_size, + config.hidden_size, + ) + else: + self.embed_tokens = PPMissingLayer() + + self.start_layer, self.end_layer, self.layers = make_layers( + config.num_hidden_layers, + lambda prefix: DeepseekV2DecoderLayer( + config, + prefix, + cache_config=cache_config, + quant_config=quant_config, + ), + prefix=f"{prefix}.layers") + + if get_pp_group().is_last_rank: + self.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps) + else: + self.norm = PPMissingLayer() def forward( self, @@ -413,14 +446,28 @@ def forward( positions: torch.Tensor, kv_caches: List[torch.Tensor], attn_metadata: AttentionMetadata, + intermediate_tensors: Optional[IntermediateTensors], ) -> torch.Tensor: - hidden_states = self.embed_tokens(input_ids) - residual = None - for i in range(len(self.layers)): + if get_pp_group().is_first_rank: + hidden_states = self.embed_tokens(input_ids) + residual = None + else: + assert intermediate_tensors is not None + hidden_states = intermediate_tensors["hidden_states"] + residual = intermediate_tensors["residual"] + + for i in range(self.start_layer, self.end_layer): layer = self.layers[i] hidden_states, residual = layer(positions, hidden_states, - kv_caches[i], attn_metadata, - residual) + kv_caches[i - self.start_layer], + attn_metadata, residual) + + if not get_pp_group().is_last_rank: + return IntermediateTensors({ + "hidden_states": hidden_states, + "residual": residual + }) + hidden_states, _ = self.norm(hidden_states, residual) return hidden_states @@ -436,7 +483,10 @@ def __init__( super().__init__() self.config = config self.quant_config = quant_config - self.model = DeepseekV2Model(config, cache_config, quant_config) + self.model = DeepseekV2Model(config, + cache_config, + quant_config, + prefix="model") self.lm_head = ParallelLMHead(config.vocab_size, config.hidden_size, quant_config=quant_config) @@ -452,7 +502,7 @@ def forward( intermediate_tensors: 
Optional[IntermediateTensors] = None, ) -> torch.Tensor: hidden_states = self.model(input_ids, positions, kv_caches, - attn_metadata) + attn_metadata, intermediate_tensors) return hidden_states def compute_logits(self, hidden_states: torch.Tensor, @@ -469,6 +519,20 @@ def sample( next_tokens = self.sampler(logits, sampling_metadata) return next_tokens + def make_empty_intermediate_tensors( + self, batch_size: int, dtype: torch.dtype, + device: torch.device) -> IntermediateTensors: + return IntermediateTensors({ + "hidden_states": + torch.zeros((batch_size, self.config.hidden_size), + dtype=dtype, + device=device), + "residual": + torch.zeros((batch_size, self.config.hidden_size), + dtype=dtype, + device=device), + }) + def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): stacked_params_mapping = [ # (param_name, shard_name, shard_id) @@ -504,6 +568,10 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): # Skip loading extra bias for GPTQ models. if name.endswith(".bias") and name not in params_dict: continue + + if is_pp_missing_parameter(name, self): + continue + param = params_dict[name] weight_loader = param.weight_loader weight_loader(param, loaded_weight, shard_id) @@ -514,6 +582,10 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): if weight_name not in name: continue name = name.replace(weight_name, param_name) + + if is_pp_missing_parameter(name, self): + continue + param = params_dict[name] weight_loader = param.weight_loader weight_loader(param, @@ -527,6 +599,9 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): if name.endswith(".bias") and name not in params_dict: continue + if is_pp_missing_parameter(name, self): + continue + param = params_dict[name] weight_loader = getattr(param, "weight_loader", default_weight_loader) From 1bedf210e35f5a11df5e9dd51e82b0663f854cf4 Mon Sep 17 00:00:00 2001 From: Roger Wang <136131678+ywang96@users.noreply.github.com> Date: Tue, 23 Jul 2024 13:47:48 -0700 Subject: [PATCH 004/167] Bump `transformers` version for Llama 3.1 hotfix and patch Chameleon (#6690) --- requirements-common.txt | 2 +- tests/test_config.py | 53 +++---- vllm/model_executor/models/__init__.py | 2 - vllm/model_executor/models/chameleon.py | 3 +- vllm/transformers_utils/config.py | 9 +- vllm/transformers_utils/configs/__init__.py | 4 - vllm/transformers_utils/configs/chameleon.py | 138 ------------------- 7 files changed, 33 insertions(+), 178 deletions(-) delete mode 100644 vllm/transformers_utils/configs/chameleon.py diff --git a/requirements-common.txt b/requirements-common.txt index 29643cfce161b..940740722c7e6 100644 --- a/requirements-common.txt +++ b/requirements-common.txt @@ -6,7 +6,7 @@ numpy < 2.0.0 requests tqdm py-cpuinfo -transformers >= 4.42.4 # Required for Gemma 2 and for additional chat template parameters. +transformers >= 4.43.1 # Required for Chameleon and Llama 3.1 hotfox. tokenizers >= 0.19.1 # Required for Llama 3. 
fastapi aiohttp diff --git a/tests/test_config.py b/tests/test_config.py index 6c8af9d7966b4..9f7d85e39ad67 100644 --- a/tests/test_config.py +++ b/tests/test_config.py @@ -64,9 +64,8 @@ def test_get_sliding_window(): def test_rope_customization(): - TEST_ROPE_SCALING = {"type": "dynamic", "factor": 2.0} + TEST_ROPE_SCALING = {"rope_type": "dynamic", "factor": 2.0} TEST_ROPE_THETA = 16_000_000.0 - LONGCHAT_ROPE_SCALING = {"type": "linear", "factor": 8.0} llama_model_config = ModelConfig( "meta-llama/Meta-Llama-3-8B-Instruct", @@ -96,27 +95,29 @@ def test_rope_customization(): None) == TEST_ROPE_THETA assert llama_model_config.max_model_len == 16384 - longchat_model_config = ModelConfig( - "lmsys/longchat-13b-16k", - "lmsys/longchat-13b-16k", - tokenizer_mode="auto", - trust_remote_code=False, - dtype="float16", - seed=0, - ) - assert getattr(longchat_model_config.hf_config, "rope_scaling", - None) == LONGCHAT_ROPE_SCALING - assert longchat_model_config.max_model_len == 16384 - - longchat_model_config = ModelConfig( - "lmsys/longchat-13b-16k", - "lmsys/longchat-13b-16k", - tokenizer_mode="auto", - trust_remote_code=False, - dtype="float16", - seed=0, - rope_scaling=TEST_ROPE_SCALING, - ) - assert getattr(longchat_model_config.hf_config, "rope_scaling", - None) == TEST_ROPE_SCALING - assert longchat_model_config.max_model_len == 4096 + # TODO: add these back when the rope configs are fixed + # LONGCHAT_ROPE_SCALING = {"rope_type": "linear", "factor": 8.0} + # longchat_model_config = ModelConfig( + # "lmsys/longchat-13b-16k", + # "lmsys/longchat-13b-16k", + # tokenizer_mode="auto", + # trust_remote_code=False, + # dtype="float16", + # seed=0, + # ) + # assert getattr(longchat_model_config.hf_config, "rope_scaling", + # None) == LONGCHAT_ROPE_SCALING + # assert longchat_model_config.max_model_len == 16384 + + # longchat_model_config = ModelConfig( + # "lmsys/longchat-13b-16k", + # "lmsys/longchat-13b-16k", + # tokenizer_mode="auto", + # trust_remote_code=False, + # dtype="float16", + # seed=0, + # rope_scaling=TEST_ROPE_SCALING, + # ) + # assert getattr(longchat_model_config.hf_config, "rope_scaling", + # None) == TEST_ROPE_SCALING + # assert longchat_model_config.max_model_len == 4096 diff --git a/vllm/model_executor/models/__init__.py b/vllm/model_executor/models/__init__.py index 31370aebba599..55a039a88d535 100644 --- a/vllm/model_executor/models/__init__.py +++ b/vllm/model_executor/models/__init__.py @@ -16,8 +16,6 @@ "BaiChuanForCausalLM": ("baichuan", "BaiChuanForCausalLM"), # baichuan-7b "BaichuanForCausalLM": ("baichuan", "BaichuanForCausalLM"), # baichuan-13b "BloomForCausalLM": ("bloom", "BloomForCausalLM"), - #TODO(ywang96): remove this when huggingface fixes the model repo - "ChameleonForCausalLM": ("chameleon", "ChameleonForConditionalGeneration"), "ChameleonForConditionalGeneration": ("chameleon", "ChameleonForConditionalGeneration"), "ChatGLMModel": ("chatglm", "ChatGLMForCausalLM"), diff --git a/vllm/model_executor/models/chameleon.py b/vllm/model_executor/models/chameleon.py index d06eb0504079f..6ece95495a026 100644 --- a/vllm/model_executor/models/chameleon.py +++ b/vllm/model_executor/models/chameleon.py @@ -6,6 +6,7 @@ import torch.nn.functional as F from PIL import Image from torch import nn +from transformers import ChameleonConfig, ChameleonVQVAEConfig from vllm.attention import Attention, AttentionMetadata from vllm.config import CacheConfig, MultiModalConfig @@ -30,8 +31,6 @@ from vllm.multimodal.image import (cached_get_tokenizer, repeat_and_pad_image_tokens) from 
vllm.sequence import IntermediateTensors, SamplerOutput, SequenceData -from vllm.transformers_utils.configs import (ChameleonConfig, - ChameleonVQVAEConfig) from vllm.utils import print_warning_once from .interfaces import SupportsVision diff --git a/vllm/transformers_utils/config.py b/vllm/transformers_utils/config.py index f99bea356da88..652505a892142 100644 --- a/vllm/transformers_utils/config.py +++ b/vllm/transformers_utils/config.py @@ -5,10 +5,10 @@ from vllm.envs import VLLM_USE_MODELSCOPE from vllm.logger import init_logger -from vllm.transformers_utils.configs import (ChameleonConfig, ChatGLMConfig, - DbrxConfig, JAISConfig, - MedusaConfig, MLPSpeculatorConfig, - MPTConfig, RWConfig) +from vllm.transformers_utils.configs import (ChatGLMConfig, DbrxConfig, + JAISConfig, MedusaConfig, + MLPSpeculatorConfig, MPTConfig, + RWConfig) if VLLM_USE_MODELSCOPE: from modelscope import AutoConfig @@ -18,7 +18,6 @@ logger = init_logger(__name__) _CONFIG_REGISTRY: Dict[str, Type[PretrainedConfig]] = { - "chameleon": ChameleonConfig, "chatglm": ChatGLMConfig, "dbrx": DbrxConfig, "mpt": MPTConfig, diff --git a/vllm/transformers_utils/configs/__init__.py b/vllm/transformers_utils/configs/__init__.py index 080c0777ebdcc..51de11ca3e42a 100644 --- a/vllm/transformers_utils/configs/__init__.py +++ b/vllm/transformers_utils/configs/__init__.py @@ -1,5 +1,3 @@ -from vllm.transformers_utils.configs.chameleon import (ChameleonConfig, - ChameleonVQVAEConfig) from vllm.transformers_utils.configs.chatglm import ChatGLMConfig from vllm.transformers_utils.configs.dbrx import DbrxConfig # RWConfig is for the original tiiuae/falcon-40b(-instruct) and @@ -12,8 +10,6 @@ from vllm.transformers_utils.configs.mpt import MPTConfig __all__ = [ - "ChameleonConfig", - "ChameleonVQVAEConfig", "ChatGLMConfig", "DbrxConfig", "MPTConfig", diff --git a/vllm/transformers_utils/configs/chameleon.py b/vllm/transformers_utils/configs/chameleon.py deleted file mode 100644 index c1ac1182e14c4..0000000000000 --- a/vllm/transformers_utils/configs/chameleon.py +++ /dev/null @@ -1,138 +0,0 @@ -from typing import List, Optional - -from transformers import PretrainedConfig - - -#TODO (ywang96): Remove this file and import it from -# transformers once the new release with Chameleon support -# is available. 
-class ChameleonConfig(PretrainedConfig): - model_type = "chameleon" - keys_to_ignore_at_inference = ["past_key_values"] - - def __init__( - self, - vocab_size=65536, - hidden_size=4096, - intermediate_size=11008, - num_hidden_layers=32, - num_attention_heads=32, - num_key_value_heads=32, - hidden_act="silu", - max_position_embeddings=4096, - initializer_range=0.02, - rms_norm_eps=1e-05, - use_cache=True, - pad_token_id=None, - bos_token_id=1, - eos_token_id=2, - tie_word_embeddings=False, - rope_theta=10000.0, - rope_scaling=None, - attention_bias=False, - attention_dropout=0.0, - model_parallel_size=1, - swin_norm=False, - vq_config=None, - vocabulary_map=None, - mlp_bias=False, - **kwargs, - ): - self.vocab_size = vocab_size - self.max_position_embeddings = max_position_embeddings - self.hidden_size = hidden_size - self.intermediate_size = intermediate_size - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.mlp_bias = mlp_bias - - self.num_key_value_heads = num_key_value_heads - self.hidden_act = hidden_act - self.initializer_range = initializer_range - self.rms_norm_eps = rms_norm_eps - self.use_cache = use_cache - self.rope_theta = rope_theta - self.rope_scaling = rope_scaling - self._rope_scaling_validation() - self.attention_bias = attention_bias - self.attention_dropout = attention_dropout - self.model_parallel_size = model_parallel_size - self.swin_norm = swin_norm - - if vq_config is None: - vq_config = {} - - self.vq_config = ChameleonVQVAEConfig(**vq_config) - - self.vocabulary_map = vocabulary_map - - super().__init__( - pad_token_id=pad_token_id, - bos_token_id=bos_token_id, - eos_token_id=eos_token_id, - tie_word_embeddings=tie_word_embeddings, - **kwargs, - ) - - def _rope_scaling_validation(self): - """ - Validate the `rope_scaling` configuration. 
- """ - if self.rope_scaling is None: - return - - if not isinstance(self.rope_scaling, - dict) or len(self.rope_scaling) != 2: - raise ValueError( - "`rope_scaling` must be a dictionary with with two fields, " - f"`type` and `factor`, got {self.rope_scaling}") - rope_scaling_type = self.rope_scaling.get("type", None) - rope_scaling_factor = self.rope_scaling.get("factor", None) - if rope_scaling_type is None or rope_scaling_type not in [ - "linear", "dynamic" - ]: - raise ValueError( - "`rope_scaling`'s type field must be one of ['linear', " - f"'dynamic'], got {rope_scaling_type}") - if rope_scaling_factor is None or not isinstance( - rope_scaling_factor, float) or rope_scaling_factor <= 1.0: - raise ValueError( - "`rope_scaling`'s factor field must be a float > 1, " - f"got {rope_scaling_factor}") - - -class ChameleonVQVAEConfig(PretrainedConfig): - - model_type = "chameleon_vqgan" - - def __init__( - self, - embed_dim: int = 256, - num_embeddings: int = 8192, - double_latent: bool = False, - latent_channels: int = 256, - resolution: int = 512, - in_channels: int = 3, - base_channels: int = 128, - channel_multiplier: List[int] = [1, 1, 2, 2, 4], #noqa - num_res_blocks: int = 2, - attn_resolutions: Optional[List[int]] = None, - dropout: float = 0.0, - attn_type: str = "vanilla", - initializer_range=0.02, - **kwargs, - ): - super().__init__(**kwargs) - self.embed_dim = embed_dim - self.num_embeddings = num_embeddings - self.double_latent = double_latent - self.latent_channels = latent_channels - self.resolution = resolution - self.in_channels = in_channels - self.base_channels = base_channels - self.channel_multiplier = channel_multiplier - self.num_res_blocks = num_res_blocks - self.attn_resolutions = attn_resolutions - self.dropout = dropout - self.attn_type = attn_type - self.initializer_range = initializer_range From 72fc7048032a9a12f42c76e60c0d8ca3673f0692 Mon Sep 17 00:00:00 2001 From: youkaichao Date: Tue, 23 Jul 2024 14:03:49 -0700 Subject: [PATCH 005/167] [build] relax wheel size limit (#6704) --- .buildkite/check-wheel-size.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.buildkite/check-wheel-size.py b/.buildkite/check-wheel-size.py index 75ad094fa1382..b39dce2659a54 100644 --- a/.buildkite/check-wheel-size.py +++ b/.buildkite/check-wheel-size.py @@ -1,7 +1,7 @@ import os import zipfile -MAX_SIZE_MB = 200 +MAX_SIZE_MB = 250 def print_top_10_largest_files(zip_file): From 01c16ede6b4fc2e07b6eb5a4f60a64a1f365e460 Mon Sep 17 00:00:00 2001 From: Michael Goin Date: Tue, 23 Jul 2024 18:45:12 -0400 Subject: [PATCH 006/167] [CI] Add smoke test for non-uniform AutoFP8 quantization (#6702) --- tests/quantization/test_fp8.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/quantization/test_fp8.py b/tests/quantization/test_fp8.py index 82dc775f8d812..0602fedf0b8e3 100644 --- a/tests/quantization/test_fp8.py +++ b/tests/quantization/test_fp8.py @@ -13,6 +13,7 @@ MODELS = [ "neuralmagic/Meta-Llama-3-8B-Instruct-FP8-KV", "nm-testing/Phi-3-mini-128k-instruct-FP8", + "nm-testing/Qwen2-0.5B-Instruct-FP8-SkipQKV", ] From 2f808e69ab16a8775cd58849b15b465d4f11b92e Mon Sep 17 00:00:00 2001 From: Thomas Parnell Date: Wed, 24 Jul 2024 01:05:05 +0200 Subject: [PATCH 007/167] [Bugfix] StatLoggers: cache spec decode metrics when they get collected. 
(#6645) Signed-off-by: Thomas Parnell --- tests/metrics/test_metrics.py | 91 +++++++++++++++++++++++++++++++++++ vllm/engine/metrics.py | 47 ++++++++++++------ 2 files changed, 122 insertions(+), 16 deletions(-) diff --git a/tests/metrics/test_metrics.py b/tests/metrics/test_metrics.py index 42b15cd6c458e..23a7a85580a0a 100644 --- a/tests/metrics/test_metrics.py +++ b/tests/metrics/test_metrics.py @@ -1,3 +1,4 @@ +import time from typing import List import pytest @@ -10,6 +11,8 @@ from vllm.engine.metrics import RayPrometheusStatLogger from vllm.sampling_params import SamplingParams +from ..conftest import cleanup + MODELS = [ "facebook/opt-125m", ] @@ -219,6 +222,94 @@ def test_metric_spec_decode( "does not meet expectation") +@pytest.mark.parametrize("model", MODELS) +@pytest.mark.parametrize("dtype", ["half"]) +@pytest.mark.parametrize("max_tokens", [10]) +@pytest.mark.parametrize("log_interval", [1, 3, 5, 7]) +def test_metric_spec_decode_interval( + vllm_runner, + example_prompts, + model: str, + dtype: str, + max_tokens: int, + log_interval: int, +) -> None: + k = 5 + + engine_args = EngineArgs(model=model, + dtype=dtype, + disable_log_stats=False, + gpu_memory_utilization=0.4, + speculative_model=model, + num_speculative_tokens=k, + use_v2_block_manager=True, + enforce_eager=True) + + engine = LLMEngine.from_engine_args(engine_args) + + try: + + engine.add_request( + "request-id-0", + example_prompts[0], + SamplingParams(max_tokens=max_tokens), + ) + + # set log internal + stat_logger = engine.stat_loggers['prometheus'] + stat_logger.local_interval = log_interval + + # prefill + engine.step() + + # wait for 5 seconds to ensure that spec decode metrics + # get triggered in first decode step + time.sleep(5) + + # first decode step should trigger async collection of metrics + engine.step() + + # wait one second to allow H2D transfer to finish + time.sleep(1) + + # second decode step should now be able to collect the spec + # decode stats and the request should also be finished + engine.step() + + # must have finisehd now + assert not engine.has_unfinished_requests() + + # wait to ensure logging occurs + time.sleep(log_interval) + + # force logging + engine.step() + + # Note that the purpose of this test is to verify spec decode + # metrics instead of functional correctness, so the expected values + # are intended to be loose. 
+ metric_name_to_expected_fn = { + "gauge_spec_decode_draft_acceptance_rate": lambda v: 0 <= v <= 1, + "gauge_spec_decode_efficiency": lambda v: 0 <= v <= 1, + "counter_spec_decode_num_accepted_tokens": lambda v: 0 <= v <= k, + "counter_spec_decode_num_draft_tokens": lambda v: v == k, + "counter_spec_decode_num_emitted_tokens": + lambda v: 0 <= v <= k + 1, + } + + for metric_name, is_expected in metric_name_to_expected_fn.items(): + metric_val = getattr( + stat_logger.metrics, + metric_name).labels(**stat_logger.labels)._value.get() + assert is_expected(metric_val), ( + f"the value of metric {metric_name} ({metric_val}) " + "does not meet expectation") + + finally: + del engine + cleanup() + + def assert_metrics(engine: LLMEngine, disable_log_stats: bool, num_requests: int) -> None: if disable_log_stats: diff --git a/vllm/engine/metrics.py b/vllm/engine/metrics.py index 4ed7da2377111..2f105b9cd2fb6 100644 --- a/vllm/engine/metrics.py +++ b/vllm/engine/metrics.py @@ -355,6 +355,7 @@ def __init__(self, local_interval: float) -> None: self.num_generation_tokens: List[int] = [] self.last_local_log = time.time() self.local_interval = local_interval + self.spec_decode_metrics: Optional["SpecDecodeWorkerMetrics"] = None @abstractmethod def info(self, type: str, obj: SupportsMetricsInfo) -> None: @@ -364,6 +365,12 @@ def info(self, type: str, obj: SupportsMetricsInfo) -> None: def log(self, stats: Stats) -> None: raise NotImplementedError + def maybe_update_spec_decode_metrics(self, stats: Stats): + """Save spec decode metrics (since they are unlikely + to be emitted at same time as log interval).""" + if stats.spec_decode_metrics is not None: + self.spec_decode_metrics = stats.spec_decode_metrics + class LoggingStatLogger(StatLoggerBase): """LoggingStatLogger is used in LLMEngine to log to Stdout.""" @@ -379,6 +386,9 @@ def log(self, stats: Stats) -> None: self.num_prompt_tokens.append(stats.num_prompt_tokens_iter) self.num_generation_tokens.append(stats.num_generation_tokens_iter) + # Update spec decode metrics + self.maybe_update_spec_decode_metrics(stats) + # Log locally every local_interval seconds. if local_interval_elapsed(stats.now, self.last_local_log, self.local_interval): @@ -408,15 +418,16 @@ def log(self, stats: Stats) -> None: stats.cpu_cache_usage_sys * 100, ) + if self.spec_decode_metrics is not None: + logger.info( + self._format_spec_decode_metrics_str( + self.spec_decode_metrics)) + # Reset tracked stats for next interval. self.num_prompt_tokens = [] self.num_generation_tokens = [] self.last_local_log = stats.now - - if stats.spec_decode_metrics is not None: - logger.info( - self._format_spec_decode_metrics_str( - stats.spec_decode_metrics)) + self.spec_decode_metrics = None def _format_spec_decode_metrics_str( self, metrics: "SpecDecodeWorkerMetrics") -> str: @@ -533,6 +544,9 @@ def log(self, stats: Stats): self.num_prompt_tokens.append(stats.num_prompt_tokens_iter) self.num_generation_tokens.append(stats.num_generation_tokens_iter) + # Update spec decode metrics + self.maybe_update_spec_decode_metrics(stats) + # Log locally every local_interval seconds. if local_interval_elapsed(stats.now, self.last_local_log, self.local_interval): @@ -550,26 +564,27 @@ def log(self, stats: Stats): prompt_throughput=prompt_throughput, generation_throughput=generation_throughput) - # Reset tracked stats for next interval. 
- self.num_prompt_tokens = [] - self.num_generation_tokens = [] - self.last_local_log = stats.now - - if stats.spec_decode_metrics is not None: + if self.spec_decode_metrics is not None: self._log_gauge( self.metrics.gauge_spec_decode_draft_acceptance_rate, - stats.spec_decode_metrics.draft_acceptance_rate) + self.spec_decode_metrics.draft_acceptance_rate) self._log_gauge(self.metrics.gauge_spec_decode_efficiency, - stats.spec_decode_metrics.system_efficiency) + self.spec_decode_metrics.system_efficiency) self._log_counter( self.metrics.counter_spec_decode_num_accepted_tokens, - stats.spec_decode_metrics.accepted_tokens) + self.spec_decode_metrics.accepted_tokens) self._log_counter( self.metrics.counter_spec_decode_num_draft_tokens, - stats.spec_decode_metrics.draft_tokens) + self.spec_decode_metrics.draft_tokens) self._log_counter( self.metrics.counter_spec_decode_num_emitted_tokens, - stats.spec_decode_metrics.emitted_tokens) + self.spec_decode_metrics.emitted_tokens) + + # Reset tracked stats for next interval. + self.num_prompt_tokens = [] + self.num_generation_tokens = [] + self.last_local_log = stats.now + self.spec_decode_metrics = None class RayPrometheusStatLogger(PrometheusStatLogger): From 87525fab925edf549611a1a74a40699b0b5e316e Mon Sep 17 00:00:00 2001 From: dongmao zhang Date: Tue, 23 Jul 2024 16:45:09 -0700 Subject: [PATCH 008/167] [bitsandbytes]: support read bnb pre-quantized model (#5753) Co-authored-by: Michael Goin --- docs/source/index.rst | 1 + docs/source/quantization/bnb.rst | 43 +++++++++ tests/quantization/test_bitsandbytes.py | 18 +++- vllm/config.py | 2 + vllm/engine/arg_utils.py | 4 +- .../layers/quantization/bitsandbytes.py | 25 +----- vllm/model_executor/model_loader/loader.py | 88 ++++++++++++++++--- .../model_loader/weight_utils.py | 1 + 8 files changed, 143 insertions(+), 39 deletions(-) create mode 100644 docs/source/quantization/bnb.rst diff --git a/docs/source/index.rst b/docs/source/index.rst index 2691805ed97a4..ded9a424ee68c 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -105,6 +105,7 @@ Documentation quantization/supported_hardware quantization/auto_awq + quantization/bnb quantization/fp8 quantization/fp8_e5m2_kvcache quantization/fp8_e4m3_kvcache diff --git a/docs/source/quantization/bnb.rst b/docs/source/quantization/bnb.rst new file mode 100644 index 0000000000000..aefb54a8acb65 --- /dev/null +++ b/docs/source/quantization/bnb.rst @@ -0,0 +1,43 @@ +.. _bits_and_bytes: + +BitsAndBytes +================== + +vLLM now supports `BitsAndBytes `_ for more efficient model inference. +BitsAndBytes quantizes models to reduce memory usage and enhance performance without significantly sacrificing accuracy. +Compared to other quantization methods, BitsAndBytes eliminates the need for calibrating the quantized model with input data. + +Below are the steps to utilize BitsAndBytes with vLLM. + +.. code-block:: console + + $ pip install bitsandbytes>=0.42.0 + +vLLM reads the model's config file and supports both in-flight quantization and pre-quantized checkpoint. + +You can find bitsandbytes quantized models on https://huggingface.co/models?other=bitsandbytes. +And usually, these repositories have a config.json file that includes a quantization_config section. + +Read quantized checkpoint. +-------------------------- + +.. code-block:: python + + from vllm import LLM + import torch + # unsloth/tinyllama-bnb-4bit is a pre-quantized checkpoint. 
+ model_id = "unsloth/tinyllama-bnb-4bit" + llm = LLM(model=model_id, dtype=torch.bfloat16, trust_remote_code=True, \ + quantization="bitsandbytes", load_format="bitsandbytes") + +Inflight quantization: load as 4bit quantization +------------------------------------------------ + +.. code-block:: python + + from vllm import LLM + import torch + model_id = "huggyllama/llama-7b" + llm = LLM(model=model_id, dtype=torch.bfloat16, trust_remote_code=True, \ + quantization="bitsandbytes", load_format="bitsandbytes") + diff --git a/tests/quantization/test_bitsandbytes.py b/tests/quantization/test_bitsandbytes.py index 953fd9ba939c8..b760e9ccb6b74 100644 --- a/tests/quantization/test_bitsandbytes.py +++ b/tests/quantization/test_bitsandbytes.py @@ -8,15 +8,20 @@ from tests.quantization.utils import is_quant_method_supported from vllm import SamplingParams +models_to_test = [ + ('huggyllama/llama-7b', 'quantize model inflight'), + ('lllyasviel/omost-llama-3-8b-4bits', 'read pre-quantized model'), +] + @pytest.mark.skipif(not is_quant_method_supported("bitsandbytes"), reason='bitsandbytes is not supported on this GPU type.') -def test_load_bnb_model(vllm_runner) -> None: - with vllm_runner('huggyllama/llama-7b', +@pytest.mark.parametrize("model_name, description", models_to_test) +def test_load_bnb_model(vllm_runner, model_name, description) -> None: + with vllm_runner(model_name, quantization='bitsandbytes', load_format='bitsandbytes', enforce_eager=True) as llm: - model = llm.model.llm_engine.model_executor.driver_worker.model_runner.model # noqa: E501 # check the weights in MLP & SelfAttention are quantized to torch.uint8 @@ -65,12 +70,17 @@ def test_load_bnb_model(vllm_runner) -> None: 'To be or not to be, that is the question.' ] outputs = llm.generate(prompts, sampling_params=sampling_params) - assert len(outputs) == len(prompts) for index in range(len(outputs)): # compare the first line of the output actual_output = outputs[index][1][0].split('\n', 1)[0] expected_output = expected_outputs[index].split('\n', 1)[0] + + assert len(actual_output) >= len(expected_output), ( + f'Actual {actual_output} should be larger than or equal to ' + f'expected {expected_output}') + actual_output = actual_output[:len(expected_output)] + assert actual_output == expected_output, ( f'Expected: {expected_output}, but got: {actual_output}') diff --git a/vllm/config.py b/vllm/config.py index 6e0283f8379a2..6403a53f86281 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -591,9 +591,11 @@ class LoadConfig: mainly for profiling. "tensorizer" will use CoreWeave's tensorizer library for fast weight loading. + "bitsandbytes" will load nf4 type weights. ignore_patterns: The list of patterns to ignore when loading the model. Default to "original/**/*" to avoid repeated loading of llama's checkpoints. 
+ """ load_format: Union[str, LoadFormat, "BaseModelLoader"] = LoadFormat.AUTO diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index 05bfe7c24f978..cd64d3345b830 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -676,8 +676,8 @@ def create_engine_config(self, ) -> EngineConfig: # bitsandbytes quantization needs a specific model loader # so we make sure the quant method and the load format are consistent if (self.quantization == "bitsandbytes" or - self.qlora_adapter_name_or_path is not None) and \ - self.load_format != "bitsandbytes": + self.qlora_adapter_name_or_path is not None) and \ + self.load_format != "bitsandbytes": raise ValueError( "BitsAndBytes quantization and QLoRA adapter only support " f"'bitsandbytes' load format, but got {self.load_format}") diff --git a/vllm/model_executor/layers/quantization/bitsandbytes.py b/vllm/model_executor/layers/quantization/bitsandbytes.py index 4a68da5a2323e..c143d1a8f2bc7 100644 --- a/vllm/model_executor/layers/quantization/bitsandbytes.py +++ b/vllm/model_executor/layers/quantization/bitsandbytes.py @@ -15,19 +15,11 @@ class BitsAndBytesConfig(QuantizationConfig): Reference: https://arxiv.org/abs/2305.14314 """ - def __init__( - self, - adapter_name_or_path: str, - target_modules: List[str], - ) -> None: - - self.adapter_name_or_path = adapter_name_or_path - self.target_modules = target_modules + def __init__(self, ) -> None: + pass def __repr__(self) -> str: - return ( - f"BitsAndBytesConfig(adapter_name_or_path={self.adapter_name_or_path}" - ) + return "BitsAndBytesConfig" @classmethod def get_name(self) -> str: @@ -49,16 +41,7 @@ def get_config_filenames() -> List[str]: @classmethod def from_config(cls, config: Dict[str, Any]) -> "BitsAndBytesConfig": - adapter_name = cls.get_from_keys(config, ["adapter_name_or_path"]) - default_target_modules = [ - "gate_proj", "down_proj", "up_proj", "q_proj", "k_proj", "v_proj", - "o_proj" - ] - if adapter_name == "": - target_modules = default_target_modules - else: - target_modules = cls.get_from_keys(config, ["target_modules"]) - return cls(adapter_name, target_modules) + return cls() def get_quant_method(self, layer: torch.nn.Module, prefix: str) -> Optional["BitsAndBytesLinearMethod"]: diff --git a/vllm/model_executor/model_loader/loader.py b/vllm/model_executor/model_loader/loader.py index 88f16918b0119..fe501b9b22da1 100644 --- a/vllm/model_executor/model_loader/loader.py +++ b/vllm/model_executor/model_loader/loader.py @@ -702,8 +702,14 @@ def _prepare_weights(self, model_name_or_path: str, return hf_weights_files, matched_pattern == "*.safetensors" + def _hf_weight_iter(self, hf_weights_files, use_safetensors: bool): + if use_safetensors: + return safetensors_weights_iterator(hf_weights_files) + else: + return pt_weights_iterator(hf_weights_files) + def _get_quantized_weights_iterator( - self, model_name_or_path: str, revision: Optional[str] + self, model_name_or_path: str, revision: Optional[str], pre_quant: bool ) -> Tuple[Generator[Tuple[str, torch.Tensor], None, None], Dict[str, Any]]: """Get an iterator to the model weights with bitsandbytes quantization, @@ -712,6 +718,7 @@ def _get_quantized_weights_iterator( # only load the bitsandbytes module when needed try: import bitsandbytes + from bitsandbytes.functional import QuantState if bitsandbytes.__version__ < "0.42.0": raise ImportError("bitsandbytes version is wrong. 
Please " "install bitsandbytes>=0.42.0.") @@ -725,17 +732,63 @@ def _get_quantized_weights_iterator( model_name_or_path, revision) quant_state_dict = {} - if use_safetensors: - weight_iterator = safetensors_weights_iterator(hf_weights_files) - else: - weight_iterator = pt_weights_iterator(hf_weights_files) - def generator(): + def quantized_checkpoint() -> Generator: + # First iterate over all quant state weights + weight_iterator = self._hf_weight_iter(hf_weights_files, + use_safetensors) + temp_state_dict = {} for weight_name, weight_tensor in weight_iterator: + if weight_name.endswith(".weight"): + continue + # TODO: only nf4 quantization is supported for now + if weight_name.endswith(".quant_state.bitsandbytes__fp4"): + raise NotImplementedError( + "Only bitsandbytes_nf4 quantization" + f"is supported for now. {weight_name} is fp4 quantized" + ) + temp_state_dict[weight_name] = weight_tensor + + # Closure to parse quant_state for each prequant weight + def _parse_quant_state(param_name: str, + temp_state_dict: Dict) -> QuantState: + quant_state = {} + for k in temp_state_dict: + if param_name + "." in k: + quant_state[k] = temp_state_dict[k] + # bitsandbytes library requires + # weight.quant_state.bitsandbytes__nf4 in CPU + quant_state[param_name + + ".quant_state.bitsandbytes__nf4"] = quant_state[ + param_name + + ".quant_state.bitsandbytes__nf4"].cpu().data + return QuantState.from_dict(quant_state, device="cuda") + + # Second iterate over all prequant and normal weights + # pre quantized weights would have a quant_state + for weight_name, weight_tensor in self._hf_weight_iter( + hf_weights_files, use_safetensors): + # Filter out all weights whose suffix is not ".weight" + if not weight_name.endswith(".weight"): + continue + if weight_name + ".quant_state.bitsandbytes__nf4" \ + in temp_state_dict: + quant_state = _parse_quant_state(weight_name, + temp_state_dict) + weight_name = weight_name.replace(".weight", ".qweight") + quant_state_dict[weight_name] = quant_state + yield weight_name.replace(".weight", + ".qweight"), weight_tensor + else: + yield weight_name, weight_tensor + + def generator() -> Generator: + for weight_name, weight_tensor in self._hf_weight_iter( + hf_weights_files, use_safetensors): if any(target_module in weight_name for target_module in self.target_modules): weight_name = weight_name.replace(".weight", ".qweight") - # bitsandbytes requires data in GPU + # bitsandbytes requires data in GPU loaded_weight = weight_tensor.cuda().data with set_default_torch_dtype(torch.float32): processed_weight, quant_state = quantize_4bit( @@ -749,6 +802,8 @@ def generator(): yield weight_name, processed_weight + if pre_quant: + return quantized_checkpoint(), quant_state_dict return generator(), quant_state_dict def _load_weights(self, model_config: ModelConfig, @@ -766,12 +821,21 @@ def _load_weights(self, model_config: ModelConfig, logger.info("Loading weights with BitsAndBytes quantization. 
" " May take a while ...") - qweight_iterator, quant_state_dict = ( - self._get_quantized_weights_iterator(model_config.model, - model_config.revision)) + is_quantized_checkpoint = False + quant_config = getattr(model_config.hf_config, "quantization_config", + None) + if quant_config is not None and quant_config.get( + 'quant_method') == "bitsandbytes": + is_quantized_checkpoint = True + + qweight_iterator, quant_state_dict = \ + self._get_quantized_weights_iterator( + model_config.model, model_config.revision, is_quantized_checkpoint) model.load_weights(qweight_iterator) + torch.cuda.empty_cache() + param_dict = dict(model.named_parameters()) stacked_quant_state_dict: Dict[str, Dict[int, Any]] = {} for quant_param_name in quant_state_dict: @@ -809,9 +873,9 @@ def _load_weights(self, model_config: ModelConfig, f"pack_factor not set for parameter {param_name}.") num_elements = [0] * len(quant_states) - for seq, quant_state in enumerate(quant_states.items()): + for seq, quant_state in quant_states.items(): num_elements[seq] = math.prod( - quant_state[1].shape) // pack_ratio + quant_state.shape) // pack_ratio offsets = np.concatenate(([0], np.cumsum(num_elements))) set_weight_attrs(param, {"bnb_shard_offsets": offsets}) diff --git a/vllm/model_executor/model_loader/weight_utils.py b/vllm/model_executor/model_loader/weight_utils.py index dbba6ea358346..942215da01af4 100644 --- a/vllm/model_executor/model_loader/weight_utils.py +++ b/vllm/model_executor/model_loader/weight_utils.py @@ -118,6 +118,7 @@ def convert_bin_to_safetensor_file( # TODO(woosuk): Move this to other place. def get_quant_config(model_config: ModelConfig, load_config: LoadConfig) -> QuantizationConfig: + quant_cls = get_quantization_config(model_config.quantization) # Read the quantization config from the HF model config, if available. 
hf_quant_config = getattr(model_config.hf_config, "quantization_config", From 5e8ca973ebd5584582923b8ed1d3d823769a80a5 Mon Sep 17 00:00:00 2001 From: William Lin Date: Tue, 23 Jul 2024 18:49:44 -0700 Subject: [PATCH 009/167] [Bugfix] fix flashinfer cudagraph capture for PP (#6708) --- tests/distributed/test_pipeline_parallel.py | 24 +++++++++++++++++++++ vllm/worker/model_runner.py | 14 ++++++------ 2 files changed, 31 insertions(+), 7 deletions(-) diff --git a/tests/distributed/test_pipeline_parallel.py b/tests/distributed/test_pipeline_parallel.py index 7f555ed9168a4..d666b8a1d44bd 100644 --- a/tests/distributed/test_pipeline_parallel.py +++ b/tests/distributed/test_pipeline_parallel.py @@ -61,3 +61,27 @@ def test_compare_tp(TP_SIZE, PP_SIZE, EAGER_MODE, CHUNKED_PREFILL, MODEL_NAME, tp_args.append("--enforce-eager") compare_two_settings(MODEL_NAME, pp_args, tp_args) + + +@pytest.mark.parametrize("PP_SIZE, MODEL_NAME", [ + (2, "JackFram/llama-160m"), +]) +@pytest.mark.parametrize("ATTN_BACKEND", [ + "FLASH_ATTN", + "FLASHINFER", +]) +def test_pp_cudagraph(PP_SIZE, MODEL_NAME, ATTN_BACKEND): + cudagraph_args = [ + # use half precision for speed and memory savings in CI environment + "--dtype", + "float16", + "--pipeline-parallel-size", + str(PP_SIZE), + "--distributed-executor-backend", + "ray", + ] + os.environ["VLLM_ATTENTION_BACKEND"] = ATTN_BACKEND + + eager_args = cudagraph_args + ["--enforce-eager"] + + compare_two_settings(MODEL_NAME, eager_args, cudagraph_args) diff --git a/vllm/worker/model_runner.py b/vllm/worker/model_runner.py index e63be184af16a..073c5a73f739b 100644 --- a/vllm/worker/model_runner.py +++ b/vllm/worker/model_runner.py @@ -1040,9 +1040,9 @@ def capture_model(self, kv_caches: List[List[torch.Tensor]]) -> None: self.parallel_config.pipeline_parallel_size): for batch_size in reversed(batch_size_capture_list): if self.attn_backend.get_name() == "flashinfer": - indptr_buffer = indptr_buffer[:batch_size + 1] - last_page_len_buffer = last_page_len_buffer[: - batch_size] + _indptr_buffer = indptr_buffer[:batch_size + 1] + _last_page_len_buffer = last_page_len_buffer[: + batch_size] num_qo_heads = ( self.model_config.get_num_attention_heads( @@ -1055,8 +1055,8 @@ def capture_model(self, kv_caches: List[List[torch.Tensor]]) -> None: use_tensor_cores = False decode_wrapper = \ CUDAGraphBatchDecodeWithPagedKVCacheWrapper( - decode_workspace_buffer, indptr_buffer, - indices_buffer, last_page_len_buffer, "NHD", + decode_workspace_buffer, _indptr_buffer, + indices_buffer, _last_page_len_buffer, "NHD", use_tensor_cores) kv_cache_dtype = get_kv_cache_torch_dtype( self.kv_cache_dtype, self.model_config.dtype) @@ -1131,10 +1131,10 @@ def capture_model(self, kv_caches: List[List[torch.Tensor]]) -> None: self.model, self.attn_backend.get_name()) if self.attn_backend.get_name() == "flashinfer": - graph_runner.flashinfer_indptr_buffer = indptr_buffer + graph_runner.flashinfer_indptr_buffer = _indptr_buffer graph_runner.flashinfer_indices_buffer = indices_buffer graph_runner.flashinfer_last_page_len_buffer = \ - last_page_len_buffer + _last_page_len_buffer graph_runner.flashinfer_decode_workspace_buffer = \ decode_workspace_buffer graph_runner.flashinfer_decode_wrapper = \ From c882a7f5b3ce5c98efb52c911ea15ca565d10cd7 Mon Sep 17 00:00:00 2001 From: Nick Hill Date: Wed, 24 Jul 2024 00:34:22 -0700 Subject: [PATCH 010/167] [SpecDecoding] Update MLPSpeculator CI tests to use smaller model (#6714) --- tests/spec_decode/e2e/test_mlp_correctness.py | 6 +++--- 1 file changed, 3 insertions(+), 
3 deletions(-) diff --git a/tests/spec_decode/e2e/test_mlp_correctness.py b/tests/spec_decode/e2e/test_mlp_correctness.py index dd67a7735a647..e310941afacf3 100644 --- a/tests/spec_decode/e2e/test_mlp_correctness.py +++ b/tests/spec_decode/e2e/test_mlp_correctness.py @@ -24,14 +24,14 @@ from .conftest import run_greedy_equality_correctness_test # main model -MAIN_MODEL = "ibm-granite/granite-3b-code-instruct" +MAIN_MODEL = "JackFram/llama-160m" # speculative model -SPEC_MODEL = "ibm-granite/granite-3b-code-instruct-accelerator" +SPEC_MODEL = "ibm-fms/llama-160m-accelerator" # max. number of speculative tokens: this corresponds to # n_predict in the config.json of the speculator model. -MAX_SPEC_TOKENS = 5 +MAX_SPEC_TOKENS = 3 # precision PRECISION = "float32" From 0a740a11ba055d6a2ae4786db6510684bfb7a887 Mon Sep 17 00:00:00 2001 From: Roger Wang <136131678+ywang96@users.noreply.github.com> Date: Wed, 24 Jul 2024 01:05:09 -0700 Subject: [PATCH 011/167] [Bugfix] Fix token padding for chameleon (#6724) --- vllm/model_executor/models/chameleon.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/vllm/model_executor/models/chameleon.py b/vllm/model_executor/models/chameleon.py index 6ece95495a026..7659f598bab94 100644 --- a/vllm/model_executor/models/chameleon.py +++ b/vllm/model_executor/models/chameleon.py @@ -125,7 +125,8 @@ def input_processor_for_chameleon(ctx: InputContext, llm_inputs: LLMInputs): # Appending sep token for chat mode to follow default processor # behavior - new_prompt += tokenizer.sep_token + if new_prompt is not None: + new_prompt += tokenizer.sep_token new_token_ids += [CHAMELEON_SEP_TOKEN_ID] # NOTE: Create a defensive copy of the original inputs From ccc4a73257b61d5f1249999f6a804d60c2d1a518 Mon Sep 17 00:00:00 2001 From: Woosuk Kwon Date: Wed, 24 Jul 2024 01:07:23 -0700 Subject: [PATCH 012/167] [Docs][ROCm] Detailed instructions to build from source (#6680) --- .../getting_started/amd-installation.rst | 29 +++++++++++++++++++ 1 file changed, 29 insertions(+) diff --git a/docs/source/getting_started/amd-installation.rst b/docs/source/getting_started/amd-installation.rst index 61efad2013b2a..71d7527a3e706 100644 --- a/docs/source/getting_started/amd-installation.rst +++ b/docs/source/getting_started/amd-installation.rst @@ -107,6 +107,35 @@ Alternatively, wheels intended for vLLM use can be accessed under the releases. $ python setup.py develop # This may take 5-10 minutes. Currently, `pip install .`` does not work for ROCm installation +.. tip:: + + For example, vLLM v0.5.3 on ROCM 6.1 can be built with the following steps: + + .. code-block:: console + + $ pip install --upgrade pip + + $ # Install PyTorch + $ pip uninstall torch -y + $ pip install --no-cache-dir --pre torch==2.5.0.dev20240710 --index-url https://download.pytorch.org/whl/nightly/rocm6.1 + + $ # Build & install AMD SMI + $ pip install /opt/rocm/share/amd_smi + + $ # Install dependencies + $ pip install --upgrade numba scipy huggingface-hub[cli] + $ pip install "numpy<2" + $ pip install -r requirements-rocm.txt + + $ # Apply the patch to ROCM 6.1 (requires root permission) + $ wget -N https://github.com/ROCm/vllm/raw/fa78403/rocm_patch/libamdhip64.so.6 -P /opt/rocm/lib + $ rm -f "$(python3 -c 'import torch; print(torch.__path__[0])')"/lib/libamdhip64.so* + + $ # Build vLLM for MI210/MI250/MI300. + $ export PYTORCH_ROCM_ARCH="gfx90a;gfx942" + $ python3 setup.py develop + + .. tip:: - Triton flash attention is used by default. 
For benchmarking purposes, it is recommended to run a warm up step before collecting perf numbers. From b5708117060460fcc6ba6f58e0669d3c46d6339e Mon Sep 17 00:00:00 2001 From: Alexei-V-Ivanov-AMD <156011006+Alexei-V-Ivanov-AMD@users.noreply.github.com> Date: Wed, 24 Jul 2024 07:01:14 -0500 Subject: [PATCH 013/167] [Build/CI] Update run-amd-test.sh. Enable Docker Hub login. (#6711) --- .buildkite/run-amd-test.sh | 1 + 1 file changed, 1 insertion(+) diff --git a/.buildkite/run-amd-test.sh b/.buildkite/run-amd-test.sh index 618d712b0279b..77e451354caf6 100644 --- a/.buildkite/run-amd-test.sh +++ b/.buildkite/run-amd-test.sh @@ -55,6 +55,7 @@ while true; do done echo "--- Pulling container" +docker login registry-1.docker.io -u alexeivivanovamd -p ${DH_TOKEN} image_name="rocmshared/vllm-ci:${BUILDKITE_COMMIT}" container_name="rocm_${BUILDKITE_COMMIT}_$(tr -dc A-Za-z0-9 < /dev/urandom | head -c 10; echo)" docker pull ${image_name} From f4f8a9d892a357e341b90bc47a8d72ece62323d5 Mon Sep 17 00:00:00 2001 From: liuyhwangyh Date: Wed, 24 Jul 2024 20:04:46 +0800 Subject: [PATCH 014/167] [Bugfix]fix modelscope compatible issue (#6730) --- vllm/model_executor/model_loader/loader.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/model_executor/model_loader/loader.py b/vllm/model_executor/model_loader/loader.py index fe501b9b22da1..bbe49655020da 100644 --- a/vllm/model_executor/model_loader/loader.py +++ b/vllm/model_executor/model_loader/loader.py @@ -161,7 +161,7 @@ def _maybe_download_from_modelscope( cache_dir=self.load_config.download_dir, local_files_only=huggingface_hub.constants.HF_HUB_OFFLINE, revision=revision, - ignore_patterns=self.load_config.ignore_patterns, + ignore_file_pattern=self.load_config.ignore_patterns, ) else: model_path = model From 545146349c3c28282cbb60d503f06df473adc932 Mon Sep 17 00:00:00 2001 From: LF Marques Date: Wed, 24 Jul 2024 16:55:53 +0100 Subject: [PATCH 015/167] Adding f-string to validation error which is missing (#6748) --- vllm/entrypoints/chat_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/entrypoints/chat_utils.py b/vllm/entrypoints/chat_utils.py index 83abc40888137..dca4523d1a27d 100644 --- a/vllm/entrypoints/chat_utils.py +++ b/vllm/entrypoints/chat_utils.py @@ -107,7 +107,7 @@ def _image_token_str(model_config: ModelConfig, return tokenizer.decode(model_config.hf_config.image_token_index) if model_type == "chameleon": return "" - raise TypeError("Unknown model type: {model_type}") + raise TypeError(f"Unknown model type: {model_type}") # TODO: Let user specify how to insert image tokens into prompt From 2cf0df3381b74f93dce3215bf5a043b5c47c55f4 Mon Sep 17 00:00:00 2001 From: Nick Hill Date: Wed, 24 Jul 2024 08:58:31 -0700 Subject: [PATCH 016/167] [Bugfix] Fix speculative decode seeded test (#6743) --- tests/spec_decode/e2e/conftest.py | 3 ++- tests/spec_decode/e2e/test_seed.py | 22 +++++++++++++++++----- 2 files changed, 19 insertions(+), 6 deletions(-) diff --git a/tests/spec_decode/e2e/conftest.py b/tests/spec_decode/e2e/conftest.py index bd1ea43f0b101..f9f246436c0f7 100644 --- a/tests/spec_decode/e2e/conftest.py +++ b/tests/spec_decode/e2e/conftest.py @@ -191,7 +191,8 @@ def generator_inner(): and llm.llm_engine.log_stats): for sate_logger in llm.llm_engine.stat_loggers.values(): sate_logger.local_interval = 0 - set_random_seed(seed) + if seed is not None: + set_random_seed(seed) yield llm del llm diff --git a/tests/spec_decode/e2e/test_seed.py b/tests/spec_decode/e2e/test_seed.py index 
792d7cba0f270..394a53f03ed46 100644 --- a/tests/spec_decode/e2e/test_seed.py +++ b/tests/spec_decode/e2e/test_seed.py @@ -21,7 +21,8 @@ "num_speculative_tokens": 3, }]) @pytest.mark.parametrize("per_test_common_llm_kwargs", [{}]) -@pytest.mark.parametrize("baseline_llm_kwargs", [{}]) +@pytest.mark.parametrize("baseline_llm_kwargs", [{"seed": 1}]) +@pytest.mark.parametrize("test_llm_kwargs", [{"seed": 5}]) @pytest.mark.parametrize("batch_size", [1, 8, 32]) @pytest.mark.parametrize("temperature", [0.1, 1.0]) @pytest.mark.parametrize( @@ -30,15 +31,26 @@ # Use smaller output len for fast test. 10, ]) -@pytest.mark.parametrize("seed", [1]) -def test_seeded_consistency(baseline_llm_generator, batch_size: int, - temperature: float, output_len: int): +@pytest.mark.parametrize("seed", [None]) +def test_seeded_consistency(baseline_llm_generator, test_llm_generator, + batch_size: int, temperature: float, + output_len: int): """Verify outputs are consistent across multiple runs with same seed """ run_equality_correctness_test(baseline_llm_generator, - baseline_llm_generator, + test_llm_generator, batch_size, max_output_len=output_len, temperature=temperature, seeded=True, force_output_len=True) + + # Ensure this same test does fail if we _don't_ include per-request seeds + with pytest.raises(AssertionError): + run_equality_correctness_test(baseline_llm_generator, + test_llm_generator, + batch_size, + max_output_len=output_len, + temperature=temperature, + seeded=False, + force_output_len=True) From 40468b13faa1ebde366e7002c5752b59e1368d10 Mon Sep 17 00:00:00 2001 From: "Allen.Dou" Date: Wed, 24 Jul 2024 23:58:42 +0800 Subject: [PATCH 017/167] [Bugfix] Miscalculated latency lead to time_to_first_token_seconds inaccurate. (#6686) --- vllm/engine/llm_engine.py | 3 ++- vllm/spec_decode/spec_decode_worker.py | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py index eabe3b23a9d58..48d5305892219 100644 --- a/vllm/engine/llm_engine.py +++ b/vllm/engine/llm_engine.py @@ -949,8 +949,9 @@ def do_log_stats( model_output: Optional[List[SamplerOutput]] = None) -> None: """Forced log when no requests active.""" if self.log_stats: + stats = self._get_stats(scheduler_outputs, model_output) for logger in self.stat_loggers.values(): - logger.log(self._get_stats(scheduler_outputs, model_output)) + logger.log(stats) def _get_stats( self, diff --git a/vllm/spec_decode/spec_decode_worker.py b/vllm/spec_decode/spec_decode_worker.py index 8cf0aa5b8981a..98960b88f719f 100644 --- a/vllm/spec_decode/spec_decode_worker.py +++ b/vllm/spec_decode/spec_decode_worker.py @@ -484,7 +484,7 @@ def _run_non_driver_rank(self) -> bool: for both speculation cases (num_lookahead_slots>0) and non-speculation cases (e.g. prefill). - Returns True iff there are remaining sequences to process. + Returns True if there are remaining sequences to process. 
""" assert self.rank != self._driver_rank From ee812580f7ae3969cf2550cc4dab31490bfea950 Mon Sep 17 00:00:00 2001 From: Daniele <36171005+dtrifiro@users.noreply.github.com> Date: Wed, 24 Jul 2024 19:36:04 +0200 Subject: [PATCH 018/167] [Frontend] split run_server into build_server and run_server (#6740) --- vllm/entrypoints/openai/api_server.py | 77 +++++++++++++++++---------- vllm/scripts.py | 3 +- 2 files changed, 52 insertions(+), 28 deletions(-) diff --git a/vllm/entrypoints/openai/api_server.py b/vllm/entrypoints/openai/api_server.py index 931063d90566c..add5c91900b23 100644 --- a/vllm/entrypoints/openai/api_server.py +++ b/vllm/entrypoints/openai/api_server.py @@ -2,6 +2,7 @@ import importlib import inspect import re +import signal from contextlib import asynccontextmanager from http import HTTPStatus from typing import Optional, Set @@ -213,12 +214,13 @@ async def authentication(request: Request, call_next): return app -def run_server(args, llm_engine=None): +async def build_server( + args, + llm_engine: Optional[AsyncLLMEngine] = None, + **uvicorn_kwargs, +) -> uvicorn.Server: app = build_app(args) - logger.info("vLLM API server version %s", VLLM_VERSION) - logger.info("args: %s", args) - if args.served_model_name is not None: served_model_names = args.served_model_name else: @@ -231,19 +233,7 @@ def run_server(args, llm_engine=None): if llm_engine is not None else AsyncLLMEngine.from_engine_args( engine_args, usage_context=UsageContext.OPENAI_API_SERVER)) - event_loop: Optional[asyncio.AbstractEventLoop] - try: - event_loop = asyncio.get_running_loop() - except RuntimeError: - event_loop = None - - if event_loop is not None and event_loop.is_running(): - # If the current is instanced by Ray Serve, - # there is already a running event loop - model_config = event_loop.run_until_complete(engine.get_model_config()) - else: - # When using single vLLM without engine_use_ray - model_config = asyncio.run(engine.get_model_config()) + model_config = await engine.get_model_config() if args.disable_log_requests: request_logger = None @@ -296,15 +286,48 @@ def run_server(args, llm_engine=None): methods = ', '.join(route.methods) logger.info("Route: %s, Methods: %s", route.path, methods) - uvicorn.run(app, - host=args.host, - port=args.port, - log_level=args.uvicorn_log_level, - timeout_keep_alive=TIMEOUT_KEEP_ALIVE, - ssl_keyfile=args.ssl_keyfile, - ssl_certfile=args.ssl_certfile, - ssl_ca_certs=args.ssl_ca_certs, - ssl_cert_reqs=args.ssl_cert_reqs) + config = uvicorn.Config( + app, + host=args.host, + port=args.port, + log_level=args.uvicorn_log_level, + timeout_keep_alive=TIMEOUT_KEEP_ALIVE, + ssl_keyfile=args.ssl_keyfile, + ssl_certfile=args.ssl_certfile, + ssl_ca_certs=args.ssl_ca_certs, + ssl_cert_reqs=args.ssl_cert_reqs, + **uvicorn_kwargs, + ) + + return uvicorn.Server(config) + + +async def run_server(args, llm_engine=None, **uvicorn_kwargs) -> None: + logger.info("vLLM API server version %s", VLLM_VERSION) + logger.info("args: %s", args) + + server = await build_server( + args, + llm_engine, + **uvicorn_kwargs, + ) + + loop = asyncio.get_running_loop() + + server_task = loop.create_task(server.serve()) + + def signal_handler() -> None: + # prevents the uvicorn signal handler to exit early + server_task.cancel() + + loop.add_signal_handler(signal.SIGINT, signal_handler) + loop.add_signal_handler(signal.SIGTERM, signal_handler) + + try: + await server_task + except asyncio.CancelledError: + print("Gracefully stopping http server") + await server.shutdown() if __name__ == "__main__": @@ 
-314,4 +337,4 @@ def run_server(args, llm_engine=None): description="vLLM OpenAI-Compatible RESTful API server.") parser = make_arg_parser(parser) args = parser.parse_args() - run_server(args) + asyncio.run(run_server(args)) diff --git a/vllm/scripts.py b/vllm/scripts.py index 3f334be925ee8..aefa5cec93a57 100644 --- a/vllm/scripts.py +++ b/vllm/scripts.py @@ -1,5 +1,6 @@ # The CLI entrypoint to vLLM. import argparse +import asyncio import os import signal import sys @@ -25,7 +26,7 @@ def serve(args: argparse.Namespace) -> None: # EngineArgs expects the model name to be passed as --model. args.model = args.model_tag - run_server(args) + asyncio.run(run_server(args)) def interactive_cli(args: argparse.Namespace) -> None: From 0e63494cf334497148ee4203b59e34b2dd53e50e Mon Sep 17 00:00:00 2001 From: Antoni Baum Date: Wed, 24 Jul 2024 11:36:52 -0700 Subject: [PATCH 019/167] Add fp8 support to `reshape_and_cache_flash` (#6667) --- csrc/cache.h | 3 +- csrc/cache_kernels.cu | 75 ++++++++++++++++----------- csrc/torch_bindings.cpp | 3 +- tests/kernels/test_cache.py | 42 ++++++++++++--- vllm/_custom_ops.py | 5 +- vllm/attention/backends/flash_attn.py | 2 + vllm/attention/backends/flashinfer.py | 2 + vllm/utils.py | 9 +++- 8 files changed, 98 insertions(+), 43 deletions(-) diff --git a/csrc/cache.h b/csrc/cache.h index 52177e8901a89..11c4c5001daaa 100644 --- a/csrc/cache.h +++ b/csrc/cache.h @@ -25,7 +25,8 @@ void reshape_and_cache_flash(torch::Tensor& key, torch::Tensor& value, torch::Tensor& key_cache, torch::Tensor& value_cache, torch::Tensor& slot_mapping, - const std::string& kv_cache_dtype); + const std::string& kv_cache_dtype, + const double k_scale, const double v_scale); // Just for unittest void convert_fp8(torch::Tensor& dst_cache, torch::Tensor& src_cache, diff --git a/csrc/cache_kernels.cu b/csrc/cache_kernels.cu index caef7f5e18630..1be806bbfa43c 100644 --- a/csrc/cache_kernels.cu +++ b/csrc/cache_kernels.cu @@ -203,17 +203,18 @@ __global__ void reshape_and_cache_kernel( } } -template +template __global__ void reshape_and_cache_flash_kernel( const scalar_t* __restrict__ key, // [num_tokens, num_heads, head_size] const scalar_t* __restrict__ value, // [num_tokens, num_heads, head_size] - scalar_t* __restrict__ k_cache, // [num_blocks, block_size, num_heads, + cache_t* __restrict__ key_cache, // [num_blocks, block_size, num_heads, // head_size] - scalar_t* __restrict__ v_cache, // [num_blocks, block_size, num_heads, + cache_t* __restrict__ value_cache, // [num_blocks, block_size, num_heads, // head_size] const int64_t* __restrict__ slot_mapping, // [num_tokens] const int block_stride, const int key_stride, const int value_stride, - const int num_heads, const int head_size, const int block_size) { + const int num_heads, const int head_size, const int block_size, + const float k_scale, const float v_scale) { const int64_t token_idx = blockIdx.x; const int64_t slot_idx = slot_mapping[token_idx]; // NOTE: slot_idx can be -1 if the token is padded @@ -228,11 +229,20 @@ __global__ void reshape_and_cache_flash_kernel( const int64_t src_value_idx = token_idx * value_stride + i; const int head_idx = i / head_size; const int head_offset = i % head_size; - const int64_t tgt_value_idx = block_idx * block_stride + - block_offset * num_heads * head_size + - head_idx * head_size + head_offset; - k_cache[tgt_value_idx] = key[src_key_idx]; - v_cache[tgt_value_idx] = value[src_value_idx]; + const int64_t tgt_key_value_idx = block_idx * block_stride + + block_offset * num_heads * head_size + + head_idx * 
head_size + head_offset; + scalar_t tgt_key = key[src_key_idx]; + scalar_t tgt_value = value[src_value_idx]; + if constexpr (kv_dt == Fp8KVCacheDataType::kAuto) { + key_cache[tgt_key_value_idx] = tgt_key; + value_cache[tgt_key_value_idx] = tgt_value; + } else { + key_cache[tgt_key_value_idx] = + fp8::scaled_convert(tgt_key, k_scale); + value_cache[tgt_key_value_idx] = + fp8::scaled_convert(tgt_value, v_scale); + } } } } // namespace vllm @@ -278,40 +288,45 @@ void reshape_and_cache( CALL_RESHAPE_AND_CACHE) } +// KV_T is the stored data type of kv-cache. +// CACHE_T is the data type of key and value tensors. +// KV_DTYPE is the real data type of kv-cache. +#define CALL_RESHAPE_AND_CACHE_FLASH(KV_T, CACHE_T, KV_DTYPE) \ + vllm::reshape_and_cache_flash_kernel \ + <<>>( \ + reinterpret_cast(key.data_ptr()), \ + reinterpret_cast(value.data_ptr()), \ + reinterpret_cast(key_cache.data_ptr()), \ + reinterpret_cast(value_cache.data_ptr()), \ + slot_mapping.data_ptr(), block_stride, key_stride, \ + value_stride, num_heads, head_size, block_size, k_scale, v_scale); + void reshape_and_cache_flash( - torch::Tensor& key, // [num_tokens, num_heads, head_size] - torch::Tensor& value, // [num_tokens, num_heads, head_size] - torch::Tensor& k_cache, // [num_blocks, block_size, num_heads, head_size] - torch::Tensor& v_cache, // [num_blocks, block_size, num_heads, head_size] + torch::Tensor& key, // [num_tokens, num_heads, head_size] + torch::Tensor& value, // [num_tokens, num_heads, head_size] + torch::Tensor& key_cache, // [num_blocks, block_size, num_heads, head_size] + torch::Tensor& + value_cache, // [num_blocks, block_size, num_heads, head_size] torch::Tensor& slot_mapping, // [num_tokens] - const std::string& kv_cache_dtype) { - // FIXME: only support auto datatype, does not support fp8 - if (kv_cache_dtype != "auto") { - TORCH_CHECK(false, "Unsupported data type of kv cache: ", kv_cache_dtype); - } + const std::string& kv_cache_dtype, const double k_scale, + const double v_scale) { int num_tokens = key.size(0); int num_heads = key.size(1); int head_size = key.size(2); - int block_size = k_cache.size(1); + int block_size = key_cache.size(1); int key_stride = key.stride(0); int value_stride = value.stride(0); - int block_stride = k_cache.stride(0); - TORCH_CHECK(k_cache.stride(0) == v_cache.stride(0)); + int block_stride = key_cache.stride(0); + TORCH_CHECK(key_cache.stride(0) == value_cache.stride(0)); dim3 grid(num_tokens); dim3 block(std::min(num_heads * head_size, 512)); const at::cuda::OptionalCUDAGuard device_guard(device_of(key)); const cudaStream_t stream = at::cuda::getCurrentCUDAStream(); - VLLM_DISPATCH_FLOATING_TYPES( - key.scalar_type(), "reshape_and_cache_flash", [&] { - vllm::reshape_and_cache_flash_kernel - <<>>( - key.data_ptr(), value.data_ptr(), - k_cache.data_ptr(), v_cache.data_ptr(), - slot_mapping.data_ptr(), block_stride, key_stride, - value_stride, num_heads, head_size, block_size); - }); + + DISPATCH_BY_KV_CACHE_DTYPE(key.dtype(), kv_cache_dtype, + CALL_RESHAPE_AND_CACHE_FLASH); } namespace vllm { diff --git a/csrc/torch_bindings.cpp b/csrc/torch_bindings.cpp index 0df9bdb75018f..3027b63ba2b33 100644 --- a/csrc/torch_bindings.cpp +++ b/csrc/torch_bindings.cpp @@ -248,7 +248,8 @@ TORCH_LIBRARY_EXPAND(CONCAT(TORCH_EXTENSION_NAME, _cache_ops), cache_ops) { " Tensor! key_cache," " Tensor! 
value_cache," " Tensor slot_mapping," - " str kv_cache_dtype) -> ()"); + " str kv_cache_dtype," + " float k_scale, float v_scale) -> ()"); cache_ops.impl("reshape_and_cache_flash", torch::kCUDA, &reshape_and_cache_flash); diff --git a/tests/kernels/test_cache.py b/tests/kernels/test_cache.py index 70ae3d0c6e0c3..f9a609464abfc 100644 --- a/tests/kernels/test_cache.py +++ b/tests/kernels/test_cache.py @@ -215,8 +215,6 @@ def test_reshape_and_cache_flash( device: str, kv_cache_dtype: str, ) -> None: - if kv_cache_dtype == "fp8": - pytest.skip() random.seed(seed) torch.random.manual_seed(seed) torch.cuda.manual_seed(seed) @@ -248,15 +246,33 @@ def test_reshape_and_cache_flash( dtype, device=device, ) - key_cache, value_cache = key_caches[0], value_caches[0] + key_cache, value_cache = key_caches[0].contiguous( + ), value_caches[0].contiguous() + del key_caches + del value_caches # Clone the KV caches. - cloned_key_cache = key_cache.clone() - cloned_value_cache = value_cache.clone() + if kv_cache_dtype == "fp8": + cloned_key_cache = torch.empty_like(key_cache, dtype=torch.float16) + ops.convert_fp8(cloned_key_cache, key_cache) + cloned_value_cache = torch.empty_like(value_cache, dtype=torch.float16) + ops.convert_fp8(cloned_value_cache, value_cache) + else: + cloned_key_cache = key_cache.clone() + cloned_value_cache = value_cache.clone() + + # Using default kv_scale + k_scale = v_scale = 1.0 # Call the reshape_and_cache kernel. ops.reshape_and_cache_flash(key, value, key_cache, value_cache, - slot_mapping, kv_cache_dtype) + slot_mapping, kv_cache_dtype, k_scale, v_scale) + + if kv_cache_dtype == "fp8": + result_key_cache = torch.empty_like(key_cache, dtype=torch.float16) + ops.convert_fp8(result_key_cache, key_cache) + result_value_cache = torch.empty_like(value_cache, dtype=torch.float16) + ops.convert_fp8(result_value_cache, value_cache) # Run the reference implementation. 
block_indicies = torch.div(slot_mapping, block_size, rounding_mode="floor") @@ -269,8 +285,18 @@ def test_reshape_and_cache_flash( cloned_key_cache[block_idx, block_offset, :, :] = key[i] cloned_value_cache[block_idx, block_offset, :, :] = value[i] - assert torch.allclose(key_cache, cloned_key_cache) - assert torch.allclose(value_cache, cloned_value_cache) + if kv_cache_dtype == "fp8": + assert torch.allclose(result_key_cache, + cloned_key_cache, + atol=0.001, + rtol=0.1) + assert torch.allclose(result_value_cache, + cloned_value_cache, + atol=0.001, + rtol=0.1) + else: + assert torch.allclose(key_cache, cloned_key_cache) + assert torch.allclose(value_cache, cloned_value_cache) @pytest.mark.parametrize("direction", COPYING_DIRECTION) diff --git a/vllm/_custom_ops.py b/vllm/_custom_ops.py index e5151c070f2f7..0186594656cc1 100644 --- a/vllm/_custom_ops.py +++ b/vllm/_custom_ops.py @@ -426,10 +426,13 @@ def reshape_and_cache_flash( value_cache: torch.Tensor, slot_mapping: torch.Tensor, kv_cache_dtype: str, + k_scale: float, + v_scale: float, ) -> None: torch.ops._C_cache_ops.reshape_and_cache_flash(key, value, key_cache, value_cache, slot_mapping, - kv_cache_dtype) + kv_cache_dtype, k_scale, + v_scale) def copy_blocks(key_caches: List[torch.Tensor], diff --git a/vllm/attention/backends/flash_attn.py b/vllm/attention/backends/flash_attn.py index b16a204c8f44e..949bd973cf3c4 100644 --- a/vllm/attention/backends/flash_attn.py +++ b/vllm/attention/backends/flash_attn.py @@ -478,6 +478,8 @@ def forward( value_cache, attn_metadata.slot_mapping.flatten(), self.kv_cache_dtype, + k_scale, + v_scale, ) num_prefill_tokens = attn_metadata.num_prefill_tokens diff --git a/vllm/attention/backends/flashinfer.py b/vllm/attention/backends/flashinfer.py index 9dac12d3b906d..2a4900489df35 100644 --- a/vllm/attention/backends/flashinfer.py +++ b/vllm/attention/backends/flashinfer.py @@ -489,6 +489,8 @@ def forward( kv_cache[:, 1], attn_metadata.slot_mapping.flatten(), self.kv_cache_dtype, + k_scale, + v_scale, ) query = query.contiguous( diff --git a/vllm/utils.py b/vllm/utils.py index 83605631b5bd6..876c3bf90b02c 100644 --- a/vllm/utils.py +++ b/vllm/utils.py @@ -491,7 +491,6 @@ def create_kv_caches_with_random_flash( seed: int = 0, device: Optional[str] = "cuda", ) -> Tuple[List[torch.Tensor], List[torch.Tensor]]: - assert cache_dtype != "fp8" torch.random.manual_seed(seed) if torch.cuda.is_available(): torch.cuda.manual_seed(seed) @@ -507,7 +506,13 @@ def create_kv_caches_with_random_flash( key_value_cache = torch.empty(size=key_value_cache_shape, dtype=torch_dtype, device=device) - key_value_cache.uniform_(-scale, scale) + if cache_dtype in ["auto", "half", "bfloat16", "float"]: + key_value_cache.uniform_(-scale, scale) + elif cache_dtype == 'fp8': + _generate_random_fp8(key_value_cache, -scale, scale) + else: + raise ValueError( + f"Does not support key cache of type {cache_dtype}") key_caches.append(key_value_cache[:, 0]) value_caches.append(key_value_cache[:, 1]) return key_caches, value_caches From 5448f67635570cee6fc23c7cd166e9d8f7595009 Mon Sep 17 00:00:00 2001 From: Antoni Baum Date: Wed, 24 Jul 2024 12:17:12 -0700 Subject: [PATCH 020/167] [Core] Tweaks to model runner/input builder developer APIs (#6712) --- vllm/attention/backends/flashinfer.py | 35 ++++--- vllm/worker/embedding_model_runner.py | 4 +- vllm/worker/model_runner.py | 134 +++++++++++++++++--------- 3 files changed, 109 insertions(+), 64 deletions(-) diff --git a/vllm/attention/backends/flashinfer.py 
b/vllm/attention/backends/flashinfer.py index 2a4900489df35..9746304347d6e 100644 --- a/vllm/attention/backends/flashinfer.py +++ b/vllm/attention/backends/flashinfer.py @@ -297,23 +297,26 @@ def _add_seq_group( if is_profile_run: return - # Get the number of valid blocks based on sequence length. - # If seq_len = 16, block_size = 16, - # block_table_bound is 1 with 1 valid block. - # If seq_len = 15, block_size = 16, - # block_table_bound is 0 + 1 with 1 valid block. - block_table_bound = seq_len // self.block_size + 1 \ - if seq_len % self.block_size != 0 \ - else seq_len // self.block_size block_table = block_tables[seq_id] - self.paged_kv_indices.extend(block_table[:block_table_bound]) - self.paged_kv_indptr.append(self.paged_kv_indptr[-1] + - block_table_bound) - - last_page_len = seq_len % self.block_size - if last_page_len == 0: - last_page_len = self.block_size - self.paged_kv_last_page_len.append(last_page_len) + self._update_paged_kv_tensors(block_table, seq_len) + + def _update_paged_kv_tensors(self, block_table: List[int], seq_len: int): + # Get the number of valid blocks based on sequence length. + # If seq_len = 16, block_size = 16, + # block_table_bound is 1 with 1 valid block. + # If seq_len = 15, block_size = 16, + # block_table_bound is 0 + 1 with 1 valid block. + block_table_bound = seq_len // self.block_size + 1 \ + if seq_len % self.block_size != 0 \ + else seq_len // self.block_size + self.paged_kv_indices.extend(block_table[:block_table_bound]) + self.paged_kv_indptr.append(self.paged_kv_indptr[-1] + + block_table_bound) + + last_page_len = seq_len % self.block_size + if last_page_len == 0: + last_page_len = self.block_size + self.paged_kv_last_page_len.append(last_page_len) def build(self, seq_lens: List[int], query_lens: List[int], cuda_graph_pad_size: int, batch_size: int): diff --git a/vllm/worker/embedding_model_runner.py b/vllm/worker/embedding_model_runner.py index a333e6634a41f..e919dbd18d9df 100644 --- a/vllm/worker/embedding_model_runner.py +++ b/vllm/worker/embedding_model_runner.py @@ -11,7 +11,8 @@ from vllm.pooling_params import PoolingParams from vllm.sequence import (IntermediateTensors, PoolerOutput, SequenceData, SequenceGroupMetadata) -from vllm.worker.model_runner import GPUModelRunnerBase, ModelInputForGPU +from vllm.worker.model_runner import (GPUModelRunnerBase, ModelInputForGPU, + ModelInputForGPUBuilder) logger = init_logger(__name__) @@ -28,6 +29,7 @@ class EmbeddingModelRunner( GPUModelRunnerBase[ModelInputForGPUWithPoolingMetadata]): _model_input_cls: Type[ModelInputForGPUWithPoolingMetadata] = ( ModelInputForGPUWithPoolingMetadata) + _builder_cls: Type[ModelInputForGPUBuilder] = ModelInputForGPUBuilder def __init__( self, diff --git a/vllm/worker/model_runner.py b/vllm/worker/model_runner.py index 073c5a73f739b..86d26b4a84c36 100644 --- a/vllm/worker/model_runner.py +++ b/vllm/worker/model_runner.py @@ -3,7 +3,7 @@ import time import warnings import weakref -from dataclasses import dataclass, field +from dataclasses import dataclass from typing import (TYPE_CHECKING, Any, Dict, List, Mapping, Optional, Set, Tuple, Type, TypeVar, Union) @@ -171,48 +171,83 @@ def from_broadcasted_tensor_dict( class ModelInputForGPUBuilder(ModelRunnerInputBuilderBase[ModelInputForGPU]): """Build ModelInputForGPU from SequenceGroupMetadata.""" - @dataclass + # Note: ideally we would be using a dataclass(kw_only=True) + # here, so that this can be subclassed easily, + # but kw_only is not supported in python<3.10. 
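# --- Editor's note: illustrative sketch, not part of this patch -----------
# The note above refers to dataclasses.dataclass(kw_only=True), which only
# exists on Python >= 3.10; the explicit keyword-only __init__ introduced by
# this patch emulates the same behaviour on older interpreters. A minimal,
# self-contained comparison (class and field names here are hypothetical):
from dataclasses import dataclass, field
from typing import List, Optional


@dataclass(kw_only=True)  # requires Python >= 3.10
class ExampleInterData:
    request_id: str
    seq_ids: List[int]
    input_tokens: List[List[int]] = field(default_factory=list)


class ExampleInterDataCompat:
    """Same keyword-only construction, but works on Python < 3.10."""

    def __init__(self, *, request_id: str, seq_ids: List[int],
                 input_tokens: Optional[List[List[int]]] = None) -> None:
        self.request_id = request_id
        self.seq_ids = seq_ids
        # A fresh list per instance, mirroring field(default_factory=list).
        self.input_tokens = input_tokens or []
# ---------------------------------------------------------------------------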
class InterDataForSeqGroup: """Intermediate data for the current sequence group.""" - # From sequence group metadata. - request_id: str - seq_ids: List[int] - is_prompt: bool - block_tables: Optional[Dict[int, List[int]]] - computed_block_nums: List[int] - n_seqs: int = 0 - - # Input tokens and positions. - input_tokens: List[List[int]] = field(default_factory=list) - input_positions: List[List[int]] = field(default_factory=list) - - # The sequence length (may be capped to the sliding window). - seq_lens: List[int] = field(default_factory=list) - # The original sequence length (before applying sliding window). - # This is used to compute slot mapping. - orig_seq_lens: List[int] = field(default_factory=list) - # The query length. - query_lens: List[int] = field(default_factory=list) - # The number of tokens that are already computed. - context_lens: List[int] = field(default_factory=list) - # The current sliding window block. - curr_sliding_window_blocks: List[int] = field(default_factory=list) - - # LoRA inputs. - lora_index_mapping: List[List[int]] = field(default_factory=list) - lora_prompt_mapping: List[List[int]] = field(default_factory=list) - lora_requests: Set[LoRARequest] = field(default_factory=set) - - # Prompt adapter inputs. - prompt_adapter_index_mapping: List[int] = field(default_factory=list) - prompt_adapter_prompt_mapping: List[int] = field(default_factory=list) - prompt_adapter_request: Optional[PromptAdapterRequest] = None - - # Multi-modal inputs. - multi_modal_inputs: Optional[MultiModalInputs] = None - - # Whether the prefix cache is hit (prefill only). - prefix_cache_hit: bool = False + + def __init__( + self, + *, + # From sequence group metadata. + request_id: str, + seq_ids: List[int], + is_prompt: bool, + block_tables: Optional[Dict[int, List[int]]], + computed_block_nums: List[int], + n_seqs: int = 0, + + # Input tokens and positions. + input_tokens: Optional[List[List[int]]] = None, + input_positions: Optional[List[List[int]]] = None, + + # The sequence length (may be capped to the sliding window). + seq_lens: Optional[List[int]] = None, + # The original sequence length (before applying sliding window). + # This is used to compute slot mapping. + orig_seq_lens: Optional[List[int]] = None, + # The query length. + query_lens: Optional[List[int]] = None, + # The number of tokens that are already computed. + context_lens: Optional[List[int]] = None, + # The current sliding window block. + curr_sliding_window_blocks: Optional[List[int]] = None, + + # LoRA inputs. + lora_index_mapping: Optional[List[List[int]]] = None, + lora_prompt_mapping: Optional[List[List[int]]] = None, + lora_requests: Optional[Set[LoRARequest]] = None, + + # Prompt adapter inputs. + prompt_adapter_index_mapping: Optional[List[int]] = None, + prompt_adapter_prompt_mapping: Optional[List[int]] = None, + prompt_adapter_request: Optional[PromptAdapterRequest] = None, + + # Multi-modal inputs. + multi_modal_inputs: Optional[MultiModalInputs] = None, + + # Whether the prefix cache is hit (prefill only). 
+ prefix_cache_hit: bool = False, + ): + self.request_id = request_id + self.seq_ids = seq_ids + self.is_prompt = is_prompt + self.block_tables = block_tables + self.computed_block_nums = computed_block_nums + self.n_seqs = n_seqs + self.input_tokens = input_tokens or [] + self.input_positions = input_positions or [] + self.seq_lens = seq_lens or [] + self.orig_seq_lens = orig_seq_lens or [] + self.query_lens = query_lens or [] + self.context_lens = context_lens or [] + self.curr_sliding_window_blocks = curr_sliding_window_blocks or [] + + self.lora_index_mapping = lora_index_mapping or [] + self.lora_prompt_mapping = lora_prompt_mapping or [] + self.lora_requests = lora_requests or set() + + self.prompt_adapter_index_mapping = (prompt_adapter_index_mapping + or []) + self.prompt_adapter_prompt_mapping = (prompt_adapter_prompt_mapping + or []) + self.prompt_adapter_request = prompt_adapter_request + + self.multi_modal_inputs = multi_modal_inputs + self.prefix_cache_hit = prefix_cache_hit + + self.__post_init__() def __post_init__(self): self.n_seqs = len(self.seq_ids) @@ -457,6 +492,12 @@ def add_seq_group(self, seq_group_metadata: SequenceGroupMetadata): for per_seq_group_fn in self.per_seq_group_compute_fns: per_seq_group_fn(inter_data, seq_group_metadata) + def _use_captured_graph(self, batch_size: int, + max_decode_seq_len: int) -> bool: + return (self.decode_only and not self.runner.model_config.enforce_eager + and batch_size <= _BATCH_SIZES_TO_CAPTURE[-1] + and max_decode_seq_len <= self.runner.max_seq_len_to_capture) + def build(self) -> ModelInputForGPU: """Finalize the builder intermediate data and create on-device tensors. @@ -491,10 +532,8 @@ def build(self) -> ModelInputForGPU: } batch_size = len(input_tokens) - use_captured_graph = ( - self.decode_only and not self.runner.model_config.enforce_eager - and batch_size <= _BATCH_SIZES_TO_CAPTURE[-1] - and max_decode_seq_len <= self.runner.max_seq_len_to_capture) + use_captured_graph = self._use_captured_graph(batch_size, + max_decode_seq_len) # If cuda graph can be used, pad tensors accordingly. # See `capture_model` API for more details. @@ -592,6 +631,7 @@ class GPUModelRunnerBase(ModelRunnerBase[TModelInputForGPU]): Helper class for shared methods between GPU model runners. """ _model_input_cls: Type[TModelInputForGPU] + _builder_cls: Type[ModelInputForGPUBuilder] def __init__( self, @@ -794,8 +834,7 @@ def _prepare_model_input_tensors( If cuda graph is required, this API automatically pads inputs. 
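# --- Editor's note: illustrative sketch, not part of this patch -----------
# The padding referred to above makes a decode batch fit one of the batch
# sizes for which a CUDA graph was captured, so the graph's fixed shapes can
# be reused. Conceptually it is a round-up to the nearest captured size; the
# helper and capture list below are placeholders, the real values live in
# vllm/worker/model_runner.py (_BATCH_SIZES_TO_CAPTURE):
from typing import List


def pad_to_captured_batch_size(batch_size: int,
                               capture_sizes: List[int]) -> int:
    """Return the smallest captured batch size that can hold `batch_size`."""
    for captured in sorted(capture_sizes):
        if captured >= batch_size:
            return captured
    # Larger than anything captured: the caller falls back to eager mode.
    return batch_size


assert pad_to_captured_batch_size(3, [1, 2, 4, 8, 16]) == 4
assert pad_to_captured_batch_size(12, [1, 2, 4, 8, 16]) == 16
# ---------------------------------------------------------------------------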
""" - builder = ModelInputForGPUBuilder(weakref.proxy(self), - finished_requests_ids) + builder = self._builder_cls(weakref.proxy(self), finished_requests_ids) for seq_group_metadata in seq_group_metadata_list: builder.add_seq_group(seq_group_metadata) return builder.build() # type: ignore @@ -1191,6 +1230,7 @@ class ModelRunner(GPUModelRunnerBase[ModelInputForGPUWithSamplingMetadata]): """ _model_input_cls: Type[ModelInputForGPUWithSamplingMetadata] = ( ModelInputForGPUWithSamplingMetadata) + _builder_cls: Type[ModelInputForGPUBuilder] = ModelInputForGPUBuilder def make_model_input_from_broadcasted_tensor_dict( self, From 421e218b37bd98b52bb3737c5aacc5a60fd460c0 Mon Sep 17 00:00:00 2001 From: Michael Goin Date: Wed, 24 Jul 2024 16:22:16 -0400 Subject: [PATCH 021/167] [Bugfix] Bump transformers to 4.43.2 (#6752) --- requirements-common.txt | 2 +- tests/test_config.py | 55 +++++++++++++++++++++-------------------- 2 files changed, 29 insertions(+), 28 deletions(-) diff --git a/requirements-common.txt b/requirements-common.txt index 940740722c7e6..3b8d473c1fe7a 100644 --- a/requirements-common.txt +++ b/requirements-common.txt @@ -6,7 +6,7 @@ numpy < 2.0.0 requests tqdm py-cpuinfo -transformers >= 4.43.1 # Required for Chameleon and Llama 3.1 hotfox. +transformers >= 4.43.2 # Required for Chameleon and Llama 3.1 hotfox. tokenizers >= 0.19.1 # Required for Llama 3. fastapi aiohttp diff --git a/tests/test_config.py b/tests/test_config.py index 9f7d85e39ad67..225d71c0bc0ea 100644 --- a/tests/test_config.py +++ b/tests/test_config.py @@ -64,8 +64,9 @@ def test_get_sliding_window(): def test_rope_customization(): - TEST_ROPE_SCALING = {"rope_type": "dynamic", "factor": 2.0} + TEST_ROPE_SCALING = {"type": "dynamic", "factor": 2.0} TEST_ROPE_THETA = 16_000_000.0 + LONGCHAT_ROPE_SCALING = {"type": "linear", "factor": 8.0} llama_model_config = ModelConfig( "meta-llama/Meta-Llama-3-8B-Instruct", @@ -95,29 +96,29 @@ def test_rope_customization(): None) == TEST_ROPE_THETA assert llama_model_config.max_model_len == 16384 - # TODO: add these back when the rope configs are fixed - # LONGCHAT_ROPE_SCALING = {"rope_type": "linear", "factor": 8.0} - # longchat_model_config = ModelConfig( - # "lmsys/longchat-13b-16k", - # "lmsys/longchat-13b-16k", - # tokenizer_mode="auto", - # trust_remote_code=False, - # dtype="float16", - # seed=0, - # ) - # assert getattr(longchat_model_config.hf_config, "rope_scaling", - # None) == LONGCHAT_ROPE_SCALING - # assert longchat_model_config.max_model_len == 16384 - - # longchat_model_config = ModelConfig( - # "lmsys/longchat-13b-16k", - # "lmsys/longchat-13b-16k", - # tokenizer_mode="auto", - # trust_remote_code=False, - # dtype="float16", - # seed=0, - # rope_scaling=TEST_ROPE_SCALING, - # ) - # assert getattr(longchat_model_config.hf_config, "rope_scaling", - # None) == TEST_ROPE_SCALING - # assert longchat_model_config.max_model_len == 4096 + longchat_model_config = ModelConfig( + "lmsys/longchat-13b-16k", + "lmsys/longchat-13b-16k", + tokenizer_mode="auto", + trust_remote_code=False, + dtype="float16", + seed=0, + ) + # Check if LONGCHAT_ROPE_SCALING entries are in longchat_model_config + assert all( + longchat_model_config.hf_config.rope_scaling.get(key) == value + for key, value in LONGCHAT_ROPE_SCALING.items()) + assert longchat_model_config.max_model_len == 16384 + + longchat_model_config = ModelConfig( + "lmsys/longchat-13b-16k", + "lmsys/longchat-13b-16k", + tokenizer_mode="auto", + trust_remote_code=False, + dtype="float16", + seed=0, + 
rope_scaling=TEST_ROPE_SCALING, + ) + assert getattr(longchat_model_config.hf_config, "rope_scaling", + None) == TEST_ROPE_SCALING + assert longchat_model_config.max_model_len == 4096 From d88c458f44f5bc0d01215310f8abb5d63fa106d4 Mon Sep 17 00:00:00 2001 From: Hongxia Yang <62075498+hongxiayang@users.noreply.github.com> Date: Wed, 24 Jul 2024 17:32:57 -0400 Subject: [PATCH 022/167] [Doc][AMD][ROCm]Added tips to refer to mi300x tuning guide for mi300x users (#6754) --- docs/source/getting_started/amd-installation.rst | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/docs/source/getting_started/amd-installation.rst b/docs/source/getting_started/amd-installation.rst index 71d7527a3e706..1c7d274b7c47e 100644 --- a/docs/source/getting_started/amd-installation.rst +++ b/docs/source/getting_started/amd-installation.rst @@ -142,3 +142,10 @@ Alternatively, wheels intended for vLLM use can be accessed under the releases. - Triton flash attention does not currently support sliding window attention. If using half precision, please use CK flash-attention for sliding window support. - To use CK flash-attention or PyTorch naive attention, please use this flag ``export VLLM_USE_TRITON_FLASH_ATTN=0`` to turn off triton flash attention. - The ROCm version of PyTorch, ideally, should match the ROCm driver version. + + +.. tip:: + - For MI300x (gfx942) users, to achieve optimal performance, please refer to `MI300x tuning guide `_ for performance optimization and tuning tips on system and workflow level. + For vLLM, please refer to `vLLM performance optimization `_. + + From 740374d456a638df98ffbc7d9dab328752330e62 Mon Sep 17 00:00:00 2001 From: youkaichao Date: Wed, 24 Jul 2024 17:37:12 -0700 Subject: [PATCH 023/167] [core][distributed] fix zmq hang (#6759) --- vllm/connections.py | 4 +- .../device_communicators/shm_broadcast.py | 60 +++++++------------ 2 files changed, 23 insertions(+), 41 deletions(-) diff --git a/vllm/connections.py b/vllm/connections.py index 65d44176e2464..e785a0b3ebd74 100644 --- a/vllm/connections.py +++ b/vllm/connections.py @@ -1,5 +1,5 @@ from pathlib import Path -from typing import Mapping, Optional +from typing import Mapping, MutableMapping, Optional from urllib.parse import urlparse import aiohttp @@ -40,7 +40,7 @@ def _validate_http_url(self, url: str): raise ValueError("Invalid HTTP URL: A valid HTTP URL " "must have scheme 'http' or 'https'.") - def _headers(self, **extras: str) -> Mapping[str, str]: + def _headers(self, **extras: str) -> MutableMapping[str, str]: return {"User-Agent": f"vLLM/{VLLM_VERSION}", **extras} def get_response( diff --git a/vllm/distributed/device_communicators/shm_broadcast.py b/vllm/distributed/device_communicators/shm_broadcast.py index 75d84c7a71bc3..d4847542688c0 100644 --- a/vllm/distributed/device_communicators/shm_broadcast.py +++ b/vllm/distributed/device_communicators/shm_broadcast.py @@ -9,7 +9,7 @@ import torch import torch.distributed as dist from torch.distributed import ProcessGroup -from zmq import PUB, REP, REQ, SUB, SUBSCRIBE, Context # type: ignore +from zmq import SUB, SUBSCRIBE, XPUB, XPUB_VERBOSE, Context # type: ignore import vllm.envs as envs from vllm.logger import init_logger @@ -153,9 +153,7 @@ class Handle: buffer: Optional[ShmRingBuffer] = None local_subscribe_port: Optional[int] = None - local_sync_port: Optional[int] = None remote_subscribe_port: Optional[int] = None - remote_sync_port: Optional[int] = None class MessageQueue: @@ -189,38 +187,36 @@ def __init__( self.buffer = ShmRingBuffer(n_local_reader, 
max_chunk_bytes, max_chunks) - self.local_socket = context.socket(PUB) + # XPUB is very similar to PUB, + # except that it can receive subscription messages + # to confirm the number of subscribers + self.local_socket = context.socket(XPUB) + # set the verbose option so that we can receive every subscription + # message. otherwise, we will only receive the first subscription + # see http://api.zeromq.org/3-3:zmq-setsockopt for more details + self.local_socket.setsockopt(XPUB_VERBOSE, True) local_subscribe_port = get_open_port() self.local_socket.bind(f"tcp://*:{local_subscribe_port}") - self.local_sync_socket = context.socket(REP) - local_sync_port = get_open_port() - self.local_sync_socket.bind(f"tcp://*:{local_sync_port}") self.current_idx = 0 else: self.buffer = None # type: ignore local_subscribe_port = None - local_sync_port = None self.local_socket = None - self.local_sync_socket = None self.current_idx = -1 if n_remote_reader > 0: # for remote readers, we will: # create a publish-subscribe socket to communicate large data - self.remote_socket = context.socket(PUB) + self.remote_socket = context.socket(XPUB) + self.remote_socket.setsockopt(XPUB_VERBOSE, True) remote_subscribe_port = get_open_port() self.remote_socket.bind(f"tcp://*:{remote_subscribe_port}") - self.remote_sync_socket = context.socket(REP) - remote_sync_port = get_open_port() - self.remote_sync_socket.bind(f"tcp://*:{remote_sync_port}") else: remote_subscribe_port = None - remote_sync_port = None self.remote_socket = None - self.remote_sync_socket = None self._is_writer = True self._is_local_reader = False @@ -233,9 +229,7 @@ def __init__( local_reader_ranks=local_reader_ranks, buffer=self.buffer, local_subscribe_port=local_subscribe_port, - local_sync_port=local_sync_port, remote_subscribe_port=remote_subscribe_port, - remote_sync_port=remote_sync_port, ) logger.info("vLLM message queue communication handle: %s", self.handle) @@ -264,12 +258,7 @@ def create_from_handle(handle: Handle, rank) -> "MessageQueue": self.local_socket.connect( f"tcp://{handle.connect_ip}:{handle.local_subscribe_port}") - self.local_sync_socket = context.socket(REQ) - self.local_sync_socket.connect( - f"tcp://{handle.connect_ip}:{handle.local_sync_port}") - self.remote_socket = None - self.remote_sync_socket = None else: self.buffer = None # type: ignore self.current_idx = -1 @@ -278,17 +267,12 @@ def create_from_handle(handle: Handle, rank) -> "MessageQueue": self._is_remote_reader = True self.local_socket = None - self.local_sync_socket = None self.remote_socket = context.socket(SUB) self.remote_socket.setsockopt_string(SUBSCRIBE, "") self.remote_socket.connect( f"tcp://{handle.connect_ip}:{handle.remote_subscribe_port}") - self.remote_sync_socket = context.socket(REQ) - self.remote_sync_socket.connect( - f"tcp://{handle.connect_ip}:{handle.remote_sync_port}") - return self def wait_until_ready(self): @@ -300,29 +284,27 @@ def wait_until_ready(self): # local readers for i in range(self.n_local_reader): - recv = self.local_sync_socket.recv() - assert recv == b"READY" - self.local_sync_socket.send(b"READY") + # wait for subscription messages from all local readers + self.local_socket.recv() if self.n_local_reader > 0: + # send a message to all local readers + # to make sure the publish channel is working self.local_socket.send(b"READY") # remote readers for i in range(self.n_remote_reader): - recv = self.remote_sync_socket.recv() - assert recv == b"READY" - self.remote_sync_socket.send(b"READY") + # wait for subscription messages from all 
remote readers + self.remote_socket.recv() if self.n_remote_reader > 0: + # send a message to all remote readers + # to make sure the publish channel is working self.remote_socket.send(b"READY") elif self._is_local_reader: - self.local_sync_socket.send(b"READY") - recv = self.local_sync_socket.recv() - assert recv == b"READY" + # wait for the writer to send a message recv = self.local_socket.recv() assert recv == b"READY" elif self._is_remote_reader: - self.remote_sync_socket.send(b"READY") - recv = self.remote_sync_socket.recv() - assert recv == b"READY" + # wait for the writer to send a message recv = self.remote_socket.recv() assert recv == b"READY" From 5689e256baf0c45148a01ad147abf11ad82c9690 Mon Sep 17 00:00:00 2001 From: "Evan Z. Liu" Date: Wed, 24 Jul 2024 18:51:00 -0700 Subject: [PATCH 024/167] [Frontend] Represent tokens with identifiable strings (#6626) --- tests/entrypoints/openai/test_completion.py | 10 ++- .../openai/test_return_tokens_as_ids.py | 83 +++++++++++++++++++ vllm/entrypoints/openai/api_server.py | 2 + vllm/entrypoints/openai/cli_args.py | 6 ++ vllm/entrypoints/openai/serving_chat.py | 23 +++-- vllm/entrypoints/openai/serving_completion.py | 19 ++++- vllm/entrypoints/openai/serving_engine.py | 14 ++-- 7 files changed, 138 insertions(+), 19 deletions(-) create mode 100644 tests/entrypoints/openai/test_return_tokens_as_ids.py diff --git a/tests/entrypoints/openai/test_completion.py b/tests/entrypoints/openai/test_completion.py index 0896e337b5d24..fe00640c0021e 100644 --- a/tests/entrypoints/openai/test_completion.py +++ b/tests/entrypoints/openai/test_completion.py @@ -55,8 +55,9 @@ def zephyr_pa_files(): @pytest.fixture(scope="module") -def server(zephyr_lora_files, zephyr_lora_added_tokens_files, zephyr_pa_files): - args = [ +def default_server_args(zephyr_lora_files, zephyr_lora_added_tokens_files, + zephyr_pa_files): + return [ # use half precision for speed and memory savings in CI environment "--dtype", "bfloat16", @@ -85,7 +86,10 @@ def server(zephyr_lora_files, zephyr_lora_added_tokens_files, zephyr_pa_files): "128", ] - with RemoteOpenAIServer(MODEL_NAME, args) as remote_server: + +@pytest.fixture(scope="module") +def server(default_server_args): + with RemoteOpenAIServer(MODEL_NAME, default_server_args) as remote_server: yield remote_server diff --git a/tests/entrypoints/openai/test_return_tokens_as_ids.py b/tests/entrypoints/openai/test_return_tokens_as_ids.py new file mode 100644 index 0000000000000..abe413978e0e5 --- /dev/null +++ b/tests/entrypoints/openai/test_return_tokens_as_ids.py @@ -0,0 +1,83 @@ +# Separate these tests out from test_completion and test_chat, because they +# require launching a second server with a different flag. Running both servers +# at the same time on a single node will OOM. 
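# --- Editor's note: illustrative sketch, not part of this patch -----------
# With --return-tokens-as-token-ids enabled, the server reports every token
# as the string "token_id:<int>" rather than its decoded text, so tokens that
# are not cleanly JSON-encodable (e.g. a single character split across
# several tokens) stay identifiable. A client recovers the ids by stripping
# the prefix, as the tests below do; the ids used here are the two Zephyr
# tokens for the party-popper emoji mentioned in those tests.
token_strs = ["token_id:28705", "token_id:31862"]
token_ids = [int(t.removeprefix("token_id:")) for t in token_strs]
assert token_ids == [28705, 31862]
# Decoding token_ids with the served model's tokenizer
# (skip_special_tokens=True) then recovers the original text, which is
# exactly what the tests below verify.
# ---------------------------------------------------------------------------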
+ +import pytest + +from vllm.transformers_utils.tokenizer import get_tokenizer + +from ...utils import RemoteOpenAIServer +from .test_completion import default_server_args # noqa: F401 +from .test_completion import zephyr_lora_added_tokens_files # noqa: F401 +from .test_completion import zephyr_lora_files # noqa: F401 +from .test_completion import zephyr_pa_files # noqa: F401 +from .test_completion import MODEL_NAME + + +@pytest.fixture(scope="module") +def server_with_return_tokens_as_token_ids_flag( + default_server_args): # noqa: F811 + args_with_flag = default_server_args + ["--return-tokens-as-token-ids"] + with RemoteOpenAIServer(MODEL_NAME, args_with_flag) as remote_server: + yield remote_server + + +@pytest.mark.asyncio +async def test_completion_return_tokens_as_token_ids_completion( + server_with_return_tokens_as_token_ids_flag): + client = server_with_return_tokens_as_token_ids_flag.get_async_client() + + completion = await client.completions.create( + model=MODEL_NAME, + # Include Unicode characters to test for dividing a single + # character across multiple tokens: 🎉 is [28705, 31862] for the + # Zephyr tokenizer + prompt="Say 'Hello, world! 🎉'", + echo=True, + temperature=0, + max_tokens=10, + logprobs=1) + + text = completion.choices[0].text + token_strs = completion.choices[0].logprobs.tokens + tokenizer = get_tokenizer(tokenizer_name=MODEL_NAME) + # Check that the token representations are consistent between raw tokens + # and top_logprobs + # Slice off the first one, because there's no scoring associated with BOS + top_logprobs = completion.choices[0].logprobs.top_logprobs[1:] + top_logprob_keys = [ + next(iter(logprob_by_tokens)) for logprob_by_tokens in top_logprobs + ] + assert token_strs[1:] == top_logprob_keys + + # Check that decoding the tokens gives the expected text + tokens = [int(token.removeprefix("token_id:")) for token in token_strs] + assert text == tokenizer.decode(tokens, skip_special_tokens=True) + + +@pytest.mark.asyncio +async def test_chat_return_tokens_as_token_ids_completion( + server_with_return_tokens_as_token_ids_flag): + client = server_with_return_tokens_as_token_ids_flag.get_async_client() + response = await client.chat.completions.create( + model=MODEL_NAME, + # Include Unicode characters to test for dividing a single + # character across multiple tokens: 🎉 is [28705, 31862] for the + # Zephyr tokenizer + messages=[{ + "role": "system", + "content": "You like to respond in only emojis, like 🎉" + }, { + "role": "user", + "content": "Please write some emojis: 🐱🐶🎉" + }], + temperature=0, + max_tokens=8, + logprobs=True) + + text = response.choices[0].message.content + tokenizer = get_tokenizer(tokenizer_name=MODEL_NAME) + token_ids = [] + for logprob_content in response.choices[0].logprobs.content: + token_ids.append(int(logprob_content.token.removeprefix("token_id:"))) + assert tokenizer.decode(token_ids, skip_special_tokens=True) == text diff --git a/vllm/entrypoints/openai/api_server.py b/vllm/entrypoints/openai/api_server.py index add5c91900b23..0fe4dd245b5e6 100644 --- a/vllm/entrypoints/openai/api_server.py +++ b/vllm/entrypoints/openai/api_server.py @@ -254,6 +254,7 @@ async def build_server( prompt_adapters=args.prompt_adapters, request_logger=request_logger, chat_template=args.chat_template, + return_tokens_as_token_ids=args.return_tokens_as_token_ids, ) openai_serving_completion = OpenAIServingCompletion( engine, @@ -262,6 +263,7 @@ async def build_server( lora_modules=args.lora_modules, prompt_adapters=args.prompt_adapters, 
request_logger=request_logger, + return_tokens_as_token_ids=args.return_tokens_as_token_ids, ) openai_serving_embedding = OpenAIServingEmbedding( engine, diff --git a/vllm/entrypoints/openai/cli_args.py b/vllm/entrypoints/openai/cli_args.py index 64919c8be8642..a4192937980f7 100644 --- a/vllm/entrypoints/openai/cli_args.py +++ b/vllm/entrypoints/openai/cli_args.py @@ -128,6 +128,12 @@ def make_arg_parser(parser: FlexibleArgumentParser) -> FlexibleArgumentParser: "using @app.middleware('http'). " "If a class is provided, vLLM will add it to the server " "using app.add_middleware(). ") + parser.add_argument( + "--return-tokens-as-token-ids", + action="store_true", + help="When --max-logprobs is specified, represents single tokens as" + "strings of the form 'token_id:{token_id}' so that tokens that" + "are not JSON-encodable can be identified.") parser = AsyncEngineArgs.add_cli_args(parser) diff --git a/vllm/entrypoints/openai/serving_chat.py b/vllm/entrypoints/openai/serving_chat.py index 3899509ef3ff4..012f70e661100 100644 --- a/vllm/entrypoints/openai/serving_chat.py +++ b/vllm/entrypoints/openai/serving_chat.py @@ -50,13 +50,15 @@ def __init__( prompt_adapters: Optional[List[PromptAdapterPath]], request_logger: Optional[RequestLogger], chat_template: Optional[str], + return_tokens_as_token_ids: bool = False, ): super().__init__(engine=engine, model_config=model_config, served_model_names=served_model_names, lora_modules=lora_modules, prompt_adapters=prompt_adapters, - request_logger=request_logger) + request_logger=request_logger, + return_tokens_as_token_ids=return_tokens_as_token_ids) self.response_role = response_role @@ -522,11 +524,14 @@ def _get_top_logprobs( self, logprobs: Dict[int, Logprob], top_logprobs: Optional[int], tokenizer: PreTrainedTokenizer) -> List[ChatCompletionLogProb]: return [ - ChatCompletionLogProb( - token=(token := self._get_decoded_token(p[1], p[0], - tokenizer)), - logprob=max(p[1].logprob, -9999.0), - bytes=list(token.encode("utf-8", errors="replace"))) + ChatCompletionLogProb(token=(token := self._get_decoded_token( + p[1], + p[0], + tokenizer, + return_as_token_id=self.return_tokens_as_token_ids)), + logprob=max(p[1].logprob, -9999.0), + bytes=list( + token.encode("utf-8", errors="replace"))) for i, p in enumerate(logprobs.items()) if top_logprobs and i < top_logprobs ] @@ -546,6 +551,8 @@ def _create_chat_logprobs( step_top_logprobs = top_logprobs[i] if step_top_logprobs is None: token = tokenizer.decode(token_id) + if self.return_tokens_as_token_ids: + token = f"token_id:{token_id}" logprobs_content.append( ChatCompletionLogProbsContent( token=token, @@ -553,7 +560,9 @@ def _create_chat_logprobs( else: logprobs_content.append( ChatCompletionLogProbsContent( - token=step_top_logprobs[token_id].decoded_token, + token=self._get_decoded_token( + step_top_logprobs[token_id], token_id, tokenizer, + self.return_tokens_as_token_ids), logprob=max(step_top_logprobs[token_id].logprob, -9999.0), bytes=list( diff --git a/vllm/entrypoints/openai/serving_completion.py b/vllm/entrypoints/openai/serving_completion.py index 6aef4c9f96150..73e420141813e 100644 --- a/vllm/entrypoints/openai/serving_completion.py +++ b/vllm/entrypoints/openai/serving_completion.py @@ -51,13 +51,15 @@ def __init__( lora_modules: Optional[List[LoRAModulePath]], prompt_adapters: Optional[List[PromptAdapterPath]], request_logger: Optional[RequestLogger], + return_tokens_as_token_ids: bool = False, ): super().__init__(engine=engine, model_config=model_config, 
served_model_names=served_model_names, lora_modules=lora_modules, prompt_adapters=prompt_adapters, - request_logger=request_logger) + request_logger=request_logger, + return_tokens_as_token_ids=return_tokens_as_token_ids) async def create_completion(self, request: CompletionRequest, raw_request: Request): @@ -430,12 +432,17 @@ def _create_completion_logprobs( step_top_logprobs = top_logprobs[i] if step_top_logprobs is None: token = tokenizer.decode(token_id) + if self.return_tokens_as_token_ids: + token = f"token_id:{token_id}" out_tokens.append(token) out_token_logprobs.append(None) out_top_logprobs.append(None) else: - token = self._get_decoded_token(step_top_logprobs[token_id], - token_id, tokenizer) + token = self._get_decoded_token( + step_top_logprobs[token_id], + token_id, + tokenizer, + return_as_token_id=self.return_tokens_as_token_ids) token_logprob = max(step_top_logprobs[token_id].logprob, -9999.0) out_tokens.append(token) @@ -448,7 +455,11 @@ def _create_completion_logprobs( out_top_logprobs.append({ # Convert float("-inf") to the # JSON-serializable float that OpenAI uses - self._get_decoded_token(top_lp[1], top_lp[0], tokenizer): + self._get_decoded_token( + top_lp[1], + top_lp[0], + tokenizer, + return_as_token_id=self.return_tokens_as_token_ids): max(top_lp[1].logprob, -9999.0) for i, top_lp in enumerate(step_top_logprobs.items()) if num_output_top_logprobs >= i diff --git a/vllm/entrypoints/openai/serving_engine.py b/vllm/entrypoints/openai/serving_engine.py index 8c6bd10b9b4d4..321c9ac2c1d5f 100644 --- a/vllm/entrypoints/openai/serving_engine.py +++ b/vllm/entrypoints/openai/serving_engine.py @@ -68,6 +68,7 @@ def __init__( lora_modules: Optional[List[LoRAModulePath]], prompt_adapters: Optional[List[PromptAdapterPath]], request_logger: Optional[RequestLogger], + return_tokens_as_token_ids: bool = False, ): super().__init__() @@ -102,6 +103,7 @@ def __init__( prompt_adapter_num_virtual_tokens=num_virtual_tokens)) self.request_logger = request_logger + self.return_tokens_as_token_ids = return_tokens_as_token_ids async def show_available_models(self) -> ModelList: """Show available models. 
Right now we only have one model.""" @@ -384,11 +386,13 @@ def _log_inputs( ) @staticmethod - def _get_decoded_token( - logprob: Logprob, - token_id: int, - tokenizer: AnyTokenizer, - ) -> str: + def _get_decoded_token(logprob: Logprob, + token_id: int, + tokenizer: AnyTokenizer, + return_as_token_id: bool = False) -> str: + if return_as_token_id: + return f"token_id:{token_id}" + if logprob.decoded_token is not None: return logprob.decoded_token return tokenizer.decode(token_id) From 9e169a4c619c33ec4f9a14c5e971e3aa34bc4444 Mon Sep 17 00:00:00 2001 From: Alphi <52458637+HwwwwwwwH@users.noreply.github.com> Date: Thu, 25 Jul 2024 11:59:30 +0800 Subject: [PATCH 025/167] [Model] Adding support for MiniCPM-V (#4087) --- .../dev/multimodal/multimodal_index.rst | 2 + docs/source/models/supported_models.rst | 4 + examples/minicpmv_example.py | 53 ++ tests/conftest.py | 11 +- tests/models/test_minicpmv.py | 163 +++++ vllm/model_executor/models/__init__.py | 1 + vllm/model_executor/models/llama.py | 4 +- vllm/model_executor/models/minicpm.py | 3 +- vllm/model_executor/models/minicpmv.py | 682 ++++++++++++++++++ vllm/multimodal/__init__.py | 3 +- vllm/multimodal/base.py | 34 +- 11 files changed, 942 insertions(+), 18 deletions(-) create mode 100644 examples/minicpmv_example.py create mode 100644 tests/models/test_minicpmv.py create mode 100644 vllm/model_executor/models/minicpmv.py diff --git a/docs/source/dev/multimodal/multimodal_index.rst b/docs/source/dev/multimodal/multimodal_index.rst index 7cdbec2c9e3d4..9784f4cc2e088 100644 --- a/docs/source/dev/multimodal/multimodal_index.rst +++ b/docs/source/dev/multimodal/multimodal_index.rst @@ -40,6 +40,8 @@ Registry Base Classes ------------ +.. autodata:: vllm.multimodal.NestedTensors + .. autodata:: vllm.multimodal.BatchedTensors .. autoclass:: vllm.multimodal.MultiModalDataBuiltins diff --git a/docs/source/models/supported_models.rst b/docs/source/models/supported_models.rst index 068c00da39cd9..dc8bd6fb245df 100644 --- a/docs/source/models/supported_models.rst +++ b/docs/source/models/supported_models.rst @@ -206,6 +206,10 @@ Vision Language Models - Phi-3-Vision - :code:`microsoft/Phi-3-vision-128k-instruct`, etc. - + * - :code:`MiniCPM-V` + - MiniCPM-V + - :code:`openbmb/MiniCPM-V-2`, :code:`openbmb/MiniCPM-Llama3-V-2_5`, etc. + - If your model uses one of the above model architectures, you can seamlessly run your model with vLLM. Otherwise, please refer to :ref:`Adding a New Model ` and :ref:`Enabling Multimodal Inputs ` diff --git a/examples/minicpmv_example.py b/examples/minicpmv_example.py new file mode 100644 index 0000000000000..52366a7030ad0 --- /dev/null +++ b/examples/minicpmv_example.py @@ -0,0 +1,53 @@ +from transformers import AutoTokenizer + +from vllm import LLM, SamplingParams +from vllm.assets.image import ImageAsset + +# 2.0 +# MODEL_NAME = "HwwwH/MiniCPM-V-2" +# 2.5 +MODEL_NAME = "openbmb/MiniCPM-Llama3-V-2_5" + +image = ImageAsset("stop_sign").pil_image.convert("RGB") + +tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, trust_remote_code=True) +llm = LLM(model=MODEL_NAME, + gpu_memory_utilization=1, + trust_remote_code=True, + max_model_len=4096) + +messages = [{ + 'role': + 'user', + 'content': + '(./)\n' + "What's the content of the image?" 
+}] +prompt = tokenizer.apply_chat_template(messages, + tokenize=False, + add_generation_prompt=True) +# 2.0 +# stop_token_ids = [tokenizer.eos_id] +# 2.5 +stop_token_ids = [tokenizer.eos_id, tokenizer.eot_id] + +sampling_params = SamplingParams( + stop_token_ids=stop_token_ids, + # temperature=0.7, + # top_p=0.8, + # top_k=100, + # seed=3472, + max_tokens=1024, + # min_tokens=150, + temperature=0, + use_beam_search=True, + # length_penalty=1.2, + best_of=3) + +outputs = llm.generate({ + "prompt": prompt, + "multi_modal_data": { + "image": image + } +}, + sampling_params=sampling_params) +print(outputs[0].outputs[0].text) diff --git a/tests/conftest.py b/tests/conftest.py index 7f507310cd255..59510075b0063 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -11,7 +11,7 @@ import torch.nn.functional as F from PIL import Image from transformers import (AutoModelForCausalLM, AutoModelForVision2Seq, - AutoTokenizer, BatchEncoding) + AutoTokenizer, BatchEncoding, BatchFeature) from vllm import LLM, SamplingParams from vllm.assets.image import ImageAsset @@ -133,7 +133,7 @@ def image_assets() -> _ImageAssets: return IMAGE_ASSETS -_T = TypeVar("_T", nn.Module, torch.Tensor, BatchEncoding) +_T = TypeVar("_T", nn.Module, torch.Tensor, BatchEncoding, BatchFeature) class HfRunner: @@ -339,7 +339,6 @@ def generate_greedy_logprobs_limit( processor_kwargs["images"] = images[i] inputs = self.processor(**processor_kwargs) - input_ids = inputs.input_ids output = self.model.generate( **self.wrap_device(inputs), @@ -381,7 +380,7 @@ def generate_greedy_logprobs_limit( all_logprobs.append(seq_logprobs_lst) seq_ids = output.sequences[0] - output_len = seq_ids.shape[0] - input_ids.shape[1] + output_len = len(seq_logprobs_lst) output_ids = seq_ids[-output_len:] all_output_ids.append(output_ids.tolist()) all_output_strs.append(self.tokenizer.decode(output_ids)) @@ -514,10 +513,12 @@ def generate_greedy_logprobs( max_tokens: int, num_logprobs: int, images: Optional[List[Image.Image]] = None, + stop_token_ids: Optional[List[int]] = None, ) -> List[Tuple[List[int], str, Optional[SampleLogprobs]]]: greedy_logprobs_params = SamplingParams(temperature=0.0, max_tokens=max_tokens, - logprobs=num_logprobs) + logprobs=num_logprobs, + stop_token_ids=stop_token_ids) outputs = self.generate_w_logprobs(prompts, greedy_logprobs_params, images=images) diff --git a/tests/models/test_minicpmv.py b/tests/models/test_minicpmv.py new file mode 100644 index 0000000000000..9124fa7a6238c --- /dev/null +++ b/tests/models/test_minicpmv.py @@ -0,0 +1,163 @@ +from collections import UserDict +from typing import List, Optional, Tuple, Type + +import pytest +import torch +import torch.types +from transformers import BatchFeature + +from vllm.multimodal.utils import rescale_image_size +from vllm.sequence import SampleLogprobs + +from ..conftest import IMAGE_ASSETS, HfRunner, VllmRunner, _ImageAssets +from .utils import check_logprobs_close + +pytestmark = pytest.mark.vlm + +# The image token is placed before "user" on purpose so that the test can pass +HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts({ + "stop_sign": + "<|begin_of_text|><|start_header_id|>user<|end_header_id|>\n\n" \ + "(./)\nWhat's the content of the image?<|eot_id|>" \ + "<|start_header_id|>assistant<|end_header_id|>\n\n", # noqa: E501 + "cherry_blossom": + "<|begin_of_text|><|start_header_id|>user<|end_header_id|>\n\n" \ + "(./)\nWhat is the season?<|eot_id|>" \ + "<|start_header_id|>assistant<|end_header_id|>\n\n" +}) + +models = ["openbmb/MiniCPM-Llama3-V-2_5"] + + +def 
trunc_hf_output(hf_output: Tuple[List[int], str, + Optional[SampleLogprobs]]): + output_ids, output_str, out_logprobs = hf_output + if output_str.endswith("<|eot_id|>"): + output_str = output_str.split("<|eot_id|>")[0] + return output_ids, output_str, out_logprobs + + +target_dtype = "half" + + +def run_test( + hf_runner: Type[HfRunner], + vllm_runner: Type[VllmRunner], + image_assets: _ImageAssets, + model: str, + *, + size_factors: List[float], + dtype: str, + max_tokens: int, + num_logprobs: int, + tensor_parallel_size: int, + distributed_executor_backend: Optional[str] = None, +): + """Inference result should be the same between hf and vllm. + + All the image fixtures for the test is under tests/images. + For huggingface runner, we provide the PIL images as input. + For vllm runner, we provide MultiModalDataDict objects + and corresponding vision language config as input. + Note, the text input is also adjusted to abide by vllm contract. + The text output is sanitized to be able to compare with hf. + """ + images = [asset.pil_image for asset in image_assets] + + inputs_per_image = [( + [prompt for _ in size_factors], + [rescale_image_size(image, factor) for factor in size_factors], + ) for image, prompt in zip(images, HF_IMAGE_PROMPTS)] + + # NOTE: take care of the order. run vLLM first, and then run HF. + # vLLM needs a fresh new process without cuda initialization. + # if we run HF first, the cuda initialization will be done and it + # will hurt multiprocessing backend with fork method (the default method). + + # max_model_len should be greater than image_feature_size + with vllm_runner(model, + max_model_len=4096, + max_num_seqs=1, + dtype=dtype, + tensor_parallel_size=tensor_parallel_size, + distributed_executor_backend=distributed_executor_backend, + enforce_eager=True) as vllm_model: + tokenizer = vllm_model.model.get_tokenizer() + stop_token_ids = [tokenizer.eos_id, tokenizer.eot_id] + vllm_outputs_per_image = [ + vllm_model.generate_greedy_logprobs(prompts, + max_tokens, + num_logprobs=num_logprobs, + images=vllm_images, + stop_token_ids=stop_token_ids) + for prompts, vllm_images in inputs_per_image + ] + + with hf_runner(model, dtype=dtype) as hf_model, torch.no_grad(): + + class NestedInputs(UserDict): + + def __init__(self, model_inputs: BatchFeature): + super().__init__({"model_inputs": model_inputs}) + + self.model_inputs = model_inputs + + def to(self, device: torch.types.Device): + return NestedInputs(self.model_inputs.to(device)) + + hf_processor = hf_model.processor + hf_model.processor = lambda **kw: NestedInputs( + hf_processor(**kw) # type: ignore + ) + + hf_outputs_per_image = [ + hf_model.generate_greedy_logprobs_limit(prompts, + max_tokens, + num_logprobs=num_logprobs, + images=hf_images, + tokenizer=tokenizer) + for prompts, hf_images in inputs_per_image + ] + + for hf_outputs, vllm_outputs in zip(hf_outputs_per_image, + vllm_outputs_per_image): + check_logprobs_close( + outputs_0_lst=[ + trunc_hf_output(hf_output) for hf_output in hf_outputs + ], + outputs_1_lst=vllm_outputs, + name_0="hf", + name_1="vllm", + ) + + +@pytest.mark.parametrize("model", models) +@pytest.mark.parametrize( + "size_factors", + [ + # No image + [], + # Single-scale + [1.0], + # Single-scale, batched + [1.0, 1.0, 1.0], + # Multi-scale + [0.25, 0.5, 1.0], + ], +) +@pytest.mark.parametrize("dtype", [target_dtype]) +@pytest.mark.parametrize("max_tokens", [128]) +@pytest.mark.parametrize("num_logprobs", [5]) +def test_models(hf_runner, vllm_runner, image_assets, model, size_factors, + 
dtype: str, max_tokens: int, num_logprobs: int) -> None: + run_test( + hf_runner, + vllm_runner, + image_assets, + model, + size_factors=size_factors, + dtype=dtype, + max_tokens=max_tokens, + num_logprobs=num_logprobs, + tensor_parallel_size=1, + ) diff --git a/vllm/model_executor/models/__init__.py b/vllm/model_executor/models/__init__.py index 55a039a88d535..7df5b8fa64710 100644 --- a/vllm/model_executor/models/__init__.py +++ b/vllm/model_executor/models/__init__.py @@ -50,6 +50,7 @@ "MptForCausalLM": ("mpt", "MPTForCausalLM"), "MPTForCausalLM": ("mpt", "MPTForCausalLM"), "MiniCPMForCausalLM": ("minicpm", "MiniCPMForCausalLM"), + "MiniCPMV": ("minicpmv", "MiniCPMV"), "OlmoForCausalLM": ("olmo", "OlmoForCausalLM"), "OPTForCausalLM": ("opt", "OPTForCausalLM"), "OrionForCausalLM": ("orion", "OrionForCausalLM"), diff --git a/vllm/model_executor/models/llama.py b/vllm/model_executor/models/llama.py index 2052c443a8885..306d22e42ed1d 100644 --- a/vllm/model_executor/models/llama.py +++ b/vllm/model_executor/models/llama.py @@ -418,9 +418,11 @@ def forward( kv_caches: List[torch.Tensor], attn_metadata: AttentionMetadata, intermediate_tensors: Optional[IntermediateTensors] = None, + input_embeds: Optional[torch.Tensor] = None ) -> Union[torch.Tensor, IntermediateTensors]: model_output = self.model(input_ids, positions, kv_caches, - attn_metadata, intermediate_tensors) + attn_metadata, intermediate_tensors, + input_embeds) return model_output def compute_logits(self, hidden_states: torch.Tensor, diff --git a/vllm/model_executor/models/minicpm.py b/vllm/model_executor/models/minicpm.py index 4ccf1cf0fad76..7a8ac0bb1f949 100644 --- a/vllm/model_executor/models/minicpm.py +++ b/vllm/model_executor/models/minicpm.py @@ -463,10 +463,11 @@ def forward( positions: torch.Tensor, kv_caches: List[torch.Tensor], attn_metadata: AttentionMetadata, + input_embeds: Optional[torch.Tensor] = None, intermediate_tensors: Optional[IntermediateTensors] = None, ) -> torch.Tensor: hidden_states = self.model(input_ids, positions, kv_caches, - attn_metadata) + attn_metadata, input_embeds) return hidden_states def compute_logits(self, hidden_states: torch.Tensor, diff --git a/vllm/model_executor/models/minicpmv.py b/vllm/model_executor/models/minicpmv.py new file mode 100644 index 0000000000000..8563216d9c392 --- /dev/null +++ b/vllm/model_executor/models/minicpmv.py @@ -0,0 +1,682 @@ +# coding=utf-8 +# Adapted from +# https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/llama/modeling_llama.py +# Copyright 2023 The vLLM team. +# Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved. +# +# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX +# and OPT implementations in this library. It has been modified from its +# original forms to accommodate minor architectural differences compared +# to GPT-NeoX and OPT used by the Meta AI team that trained the model. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+"""Inference-only MiniCPM-V-2 model compatible with HuggingFace weights.""" +import math +import re +from functools import partial +from typing import Iterable, List, Optional, Tuple + +import numpy as np +import torch +import torch.nn.functional as F +from PIL import Image +from torch import nn +from torch.nn.init import trunc_normal_ +from transformers.configuration_utils import PretrainedConfig +from transformers.models.idefics2.modeling_idefics2 import ( + Idefics2VisionTransformer) + +from vllm.attention import AttentionMetadata +from vllm.config import CacheConfig, MultiModalConfig +from vllm.inputs import INPUT_REGISTRY, InputContext, LLMInputs +from vllm.model_executor.layers.quantization.base_config import ( + QuantizationConfig) +from vllm.model_executor.layers.sampler import Sampler +from vllm.model_executor.model_loader.weight_utils import default_weight_loader +from vllm.model_executor.models.interfaces import SupportsVision +from vllm.model_executor.models.llama import LlamaForCausalLM +from vllm.model_executor.models.minicpm import MiniCPMForCausalLM +from vllm.model_executor.sampling_metadata import SamplingMetadata +from vllm.multimodal import MULTIMODAL_REGISTRY +from vllm.multimodal.image import (cached_get_image_processor, + cached_get_tokenizer) +from vllm.sequence import IntermediateTensors, SamplerOutput, SequenceData + +_KEYS_TO_MODIFY_MAPPING = { + "language_model.lm_head": "lm_head", + "language_model.model": "language_model", +} + + +def get_abs_pos(abs_pos, tgt_size): + # abs_pos: L, C + # tgt_size: (H, W) + # return: M, C + src_size = int(math.sqrt(abs_pos.size(0))) + # tgt_size = int(math.sqrt(tgt_size)) + dtype = abs_pos.dtype + + return F.interpolate( + abs_pos.float().reshape(1, src_size, src_size, -1).permute(0, 3, 1, 2), + size=(tgt_size[0], tgt_size[1]), + mode="bicubic", + align_corners=False, + ).permute(0, 2, 3, 1).flatten(0, 2).to(dtype=dtype) + + +# https://github.com/facebookresearch/mae/blob/efb2a8062c206524e35e47d04501ed4f544c0ae8/util/pos_embed.py#L20 +def get_2d_sincos_pos_embed(embed_dim, + grid_size, + cls_token=False, + version=2.0): + """ + grid_size: int of the grid height and width + return: + pos_embed: [grid_size*grid_size, embed_dim] or + [1+grid_size*grid_size, embed_dim] (w/ or w/o cls_token) + """ + if isinstance(grid_size, int): + grid_h_size, grid_w_size = grid_size, grid_size + else: + grid_h_size, grid_w_size = grid_size[0], grid_size[1] + + grid_h = np.arange(grid_h_size, dtype=np.float32) + grid_w = np.arange(grid_w_size, dtype=np.float32) + grid = np.meshgrid(grid_w, grid_h) # here w goes first + grid = np.stack(grid, axis=0) + + if version == 2.0: + grid = grid.reshape([2, 1, grid_h_size, grid_w_size]) + pos_embed = get_2d_sincos_pos_embed_from_grid(embed_dim, grid, version) + if cls_token: + pos_embed = np.concatenate([np.zeros([1, embed_dim]), pos_embed], + axis=0) + else: + pos_embed = get_2d_sincos_pos_embed_from_grid(embed_dim, grid, version) + return pos_embed + + +def get_2d_sincos_pos_embed_from_grid(embed_dim, grid, version=2.0): + assert embed_dim % 2 == 0 + + # use half of dimensions to encode grid_h + emb_h = get_1d_sincos_pos_embed_from_grid( + embed_dim // 2, grid[0], version) # (H*W, D/2) or (H, W, D/2) + emb_w = get_1d_sincos_pos_embed_from_grid( + embed_dim // 2, grid[1], version) # (H*W, D/2) or (H, W, D/2) + + if version == 2.0: + emb = np.concatenate([emb_h, emb_w], axis=1) # (H*W, D) + else: + emb = np.concatenate([emb_h, emb_w], axis=-1) # (H, W, D) + return emb + + +def 
get_1d_sincos_pos_embed_from_grid(embed_dim, pos, version=2.0): + """ + embed_dim: output dimension for each position + pos: a list of positions to be encoded: size (M,) / (H, W) + out: (M, D) / (H, W, D) + """ + assert embed_dim % 2 == 0 + omega = np.arange(embed_dim // 2, dtype=np.float32) + omega /= embed_dim / 2. + omega = 1. / 10000**omega # (D/2,) + + if version == 2.0: + pos = pos.reshape(-1) # (M,) + out = np.einsum('m,d->md', pos, omega) # (M, D/2), outer product + emb_sin = np.sin(out) # (M, D/2) + emb_cos = np.cos(out) # (M, D/2) + emb = np.concatenate([emb_sin, emb_cos], axis=1) # (M, D) + else: + out = np.einsum('hw,d->hwd', pos, omega) # (H, W, D/2), outer product + emb_sin = np.sin(out) # (H, W, D/2) + emb_cos = np.cos(out) # (H, W, D/2) + emb = np.concatenate([emb_sin, emb_cos], axis=-1) # (H, W, D) + return emb + + +class Resampler(nn.Module): + """ + A 2D perceiver-resampler network with one cross attention layers by + (grid_size**2) learnable queries and 2d sincos pos_emb + Outputs: + A tensor with the shape of (grid_size**2, embed_dim) + """ + + default_norm_layer = partial(nn.LayerNorm, eps=1e-6) + + def __init__(self, + num_queries, + grid_size, + embed_dim, + num_heads, + kv_dim=None, + norm_layer=default_norm_layer, + adaptive=False, + max_size=(70, 70), + version=2.0): + super().__init__() + + self.version = version + if self.version == 2.0: + self.num_queries = grid_size**2 + else: + self.num_queries = num_queries + self.max_size = max_size + self.embed_dim = embed_dim + self.num_heads = num_heads + self.adaptive = adaptive + + self.query = nn.Parameter(torch.zeros(self.num_queries, embed_dim)) + trunc_normal_(self.query, std=.02) + + if kv_dim is not None and kv_dim != embed_dim: + self.kv_proj = nn.Linear(kv_dim, embed_dim, bias=False) + else: + self.kv_proj = nn.Identity() + + self.attn = nn.MultiheadAttention(embed_dim, num_heads) + self.ln_q = norm_layer(embed_dim) + self.ln_kv = norm_layer(embed_dim) + + self.ln_post = norm_layer(embed_dim) + self.proj = nn.Parameter( + (embed_dim**-0.5) * torch.randn(embed_dim, embed_dim)) + + if self.version == 2.0: + self.pos_embed = nn.Parameter( + torch.from_numpy( + get_2d_sincos_pos_embed( + embed_dim, grid_size, + version=self.version)).float()).requires_grad_(False) + else: + self._set_2d_pos_cache(self.max_size) + + self.apply(self._init_weights) + + def _set_2d_pos_cache(self, max_size, device='cpu'): + pos_embed = torch.from_numpy( + get_2d_sincos_pos_embed(self.embed_dim, + max_size, + version=self.version)).float().to(device) + self.register_buffer("pos_embed", pos_embed, persistent=False) + + def _adjust_pos_cache(self, tgt_sizes, device): + max_h = torch.max(tgt_sizes[:, 0]) + max_w = torch.max(tgt_sizes[:, 1]) + if max_h > self.max_size[0] or max_w > self.max_size[1]: + self.max_size = [ + max(max_h, self.max_size[0]), + max(max_w, self.max_size[1]) + ] + self._set_2d_pos_cache(self.max_size, device) + + def _init_weights(self, m): + if isinstance(m, nn.Linear): + trunc_normal_(m.weight, std=.02) + if isinstance(m, nn.Linear) and m.bias is not None: + nn.init.constant_(m.bias, 0) + elif isinstance(m, nn.LayerNorm): + nn.init.constant_(m.bias, 0) + nn.init.constant_(m.weight, 1.0) + + def forward_2_5(self, x, tgt_sizes=None): + assert x.shape[0] == tgt_sizes.shape[0] + bs = x.shape[0] + + device = x.device + dtype = x.dtype + + patch_len = tgt_sizes[:, 0] * tgt_sizes[:, 1] + + self._adjust_pos_cache(tgt_sizes, device=device) + + max_patch_len = torch.max(patch_len) + key_padding_mask = torch.zeros((bs, 
max_patch_len), + dtype=torch.bool, + device=device) + + pos_embed = [] + for i in range(bs): + tgt_h, tgt_w = tgt_sizes[i] + pos_embed.append(self.pos_embed[:tgt_h, :tgt_w, :].reshape( + (tgt_h * tgt_w, -1)).to(dtype)) # patches * D + key_padding_mask[i, patch_len[i]:] = True + + pos_embed = torch.nn.utils.rnn.pad_sequence(pos_embed, + batch_first=True, + padding_value=0.0).permute( + 1, 0, + 2) # BLD => L * B * D + + x = self.kv_proj(x) # B * L * D + x = self.ln_kv(x).permute(1, 0, 2) # L * B * D + + q = self.ln_q(self.query) # Q * D + + out = self.attn( + self._repeat(q, bs), # Q * B * D + x + pos_embed, # L * B * D + L * B * D + x, + key_padding_mask=key_padding_mask)[0] + # out: Q * B * D + x = out.permute(1, 0, 2) # B * Q * D + + x = self.ln_post(x) + x = x @ self.proj + return x + + def forward_2(self, x, tgt_sizes=None, attn_mask=None): + if self.adaptive: + pos_embed = torch.Tensor( + get_2d_sincos_pos_embed(self.embed_dim, + tgt_sizes)).float().to(device=x.device, + dtype=x.dtype) + else: + pos_embed = get_abs_pos(self.pos_embed, tgt_sizes) + + x = self.kv_proj(x) + x = self.ln_kv(x).permute(1, 0, 2) + + N = x.shape[1] + q = self.ln_q(self.query) + out = self.attn(self._repeat(q, N) + self.pos_embed.unsqueeze(1), + x + pos_embed.unsqueeze(1), + x, + attn_mask=attn_mask)[0] + x = out.permute(1, 0, 2) + + x = self.ln_post(x) + x = x @ self.proj + return x + + def forward(self, x, tgt_sizes=None, attn_mask=None): + if self.version == 2.0: + return self.forward_2(x, tgt_sizes=tgt_sizes, attn_mask=attn_mask) + else: + return self.forward_2_5(x, tgt_sizes=tgt_sizes) + + def _repeat(self, query, N: int): + return query.unsqueeze(1).repeat(1, N, 1) + + +def get_max_minicpmv_image_tokens(ctx: InputContext): + hf_config = ctx.get_hf_config(PretrainedConfig) + return getattr(hf_config, "query_num", 64) + + +def dummy_seq_data_for_minicpmv(seq_len: int): + token_ids = [0] * seq_len + return SequenceData(token_ids) + + +def dummy_image_for_minicpmv(hf_config): + width = height = hf_config.image_size + image = Image.new("RGB", (width, height), color=0) + return {"image": image} + + +def dummy_data_for_minicpmv(ctx: InputContext, seq_len: int): + hf_config = ctx.get_hf_config(PretrainedConfig) + + # image_feature_size = get_max_minicpmv_image_tokens(ctx) + + seq_data = dummy_seq_data_for_minicpmv(seq_len) + + mm_data = dummy_image_for_minicpmv(hf_config) + + return seq_data, mm_data + + +def input_processor_for_minicpmv(ctx: InputContext, llm_inputs: LLMInputs): + multi_modal_data = llm_inputs.get("multi_modal_data") + if multi_modal_data is None or "image" not in multi_modal_data: + return llm_inputs + + model_config = ctx.model_config + + tokenizer = cached_get_tokenizer(model_config.tokenizer, + trust_remote_code=True) + + prompt = llm_inputs.get("prompt") + if prompt is None: + token_ids = llm_inputs.get("prompt_token_ids") + prompt = tokenizer.decode(token_ids) + image_processor = cached_get_image_processor(model_config.tokenizer) + + pattern = "(./)" + image = multi_modal_data["image"] + image_tags = re.findall(pattern, prompt) + assert len(image_tags) <= 1 + text_chunks = prompt.split(pattern) + new_prompt = text_chunks[0] \ + + image_processor.get_slice_image_placeholder(image.size) \ + + text_chunks[1] + + new_token_ids = tokenizer.encode(new_prompt) + + llm_inputs = LLMInputs(prompt_token_ids=new_token_ids, + prompt=new_prompt, + multi_modal_data=multi_modal_data) + return llm_inputs + + +@MULTIMODAL_REGISTRY.register_image_input_mapper() 
+@MULTIMODAL_REGISTRY.register_max_image_tokens(get_max_minicpmv_image_tokens) +@INPUT_REGISTRY.register_dummy_data(dummy_data_for_minicpmv) +@INPUT_REGISTRY.register_input_processor(input_processor_for_minicpmv) +class MiniCPMV(nn.Module, SupportsVision): + + def __init__( + self, + config, + multimodal_config: MultiModalConfig, + cache_config: Optional[CacheConfig] = None, + quant_config: Optional[QuantizationConfig] = None, + ): + super().__init__() + self.config = config + self.multimodal_config = multimodal_config + + self.version = float(self.config.version) + self.llm = self.init_llm(config, cache_config, quant_config) + self.vpm = self.init_vision_module() + param_dtype = torch.get_default_dtype() + self.vpm.to(dtype=param_dtype) + self.vision_dim = self.vpm.embed_dim if self.version == 2.0 \ + else self.vpm.embeddings.embed_dim + self.embed_dim = self.llm.config.hidden_size + self.resampler = self.init_resampler(self.embed_dim, self.vision_dim) + self.resampler.to(device="cuda", dtype=param_dtype) + self.sampler = Sampler() + + def init_llm(self, config, cache_config, quant_config): + if self.version == 2.0: + return MiniCPMForCausalLM(config, + cache_config=cache_config, + quant_config=quant_config) + else: + return LlamaForCausalLM(config, + cache_config=cache_config, + quant_config=quant_config) + + def init_vision_module(self): + if self.version == 2.0: + try: + import timm + except ImportError: + raise ImportError( + 'Please install timm==0.9.10') from ImportError + default_dtype = torch.get_default_dtype() + torch.set_default_dtype(torch.float16) + model = timm.create_model('vit_so400m_patch14_siglip_384.webli', + pretrained=False, + num_classes=0, + dynamic_img_size=True, + dynamic_img_pad=True) + torch.set_default_dtype(default_dtype) + if isinstance(model, timm.models.VisionTransformer + ) and model.attn_pool is not None: + model.attn_pool = torch.nn.Identity() + + if self.config.drop_vision_last_layer: + model.blocks = model.blocks[:-1] + else: + model = Idefics2VisionTransformer(self.config.vision_config) + if self.config.drop_vision_last_layer: + model.encoder.layers = model.encoder.layers[:-1] + return model + + def init_resampler(self, embed_dim, vision_dim): + default_dtype = torch.get_default_dtype() + torch.set_default_dtype(torch.float16) + if self.version == 2.0: + resampler = Resampler(grid_size=int( + math.sqrt(self.config.query_num)), + num_queries=None, + embed_dim=embed_dim, + num_heads=embed_dim // 128, + kv_dim=vision_dim, + adaptive=True, + version=self.version) + else: + resampler = Resampler(num_queries=self.config.query_num, + grid_size=None, + embed_dim=embed_dim, + num_heads=embed_dim // 128, + kv_dim=vision_dim, + adaptive=True, + version=self.version) + torch.set_default_dtype(default_dtype) + return resampler + + def get_vision_embedding(self, + pixel_values, + patch_attn_mask=None, + tgt_sizes=None, + version=2.0): + if version == 2.0: + res = [] + dtype = self.vpm.pos_embed.data.dtype + for pixel_value in pixel_values: + # V2.0 start + H, W = pixel_value[0].shape[-2:] + tgt_size = (math.ceil(H / self.vpm.patch_embed.patch_size[0]), + math.ceil(W / self.vpm.patch_embed.patch_size[0])) + # V2.0 end + vision_embedding = self.vpm.forward_features( + pixel_value.unsqueeze(0).type(dtype)) + if hasattr(self.vpm, 'num_prefix_tokens' + ) and self.vpm.num_prefix_tokens > 0: + vision_embedding = vision_embedding[:, self.vpm. 
+ num_prefix_tokens:] + res.append(self.resampler(vision_embedding, tgt_size)) + return torch.vstack(res) + else: + vision_embedding = self.vpm( + pixel_values.type(dtype), + patch_attention_mask=patch_attn_mask).last_hidden_state + vision_embedding = self.resampler(vision_embedding, tgt_sizes) + + def get_image_bounds(self, input_ids): + tokenizer = cached_get_tokenizer(self.config._name_or_path, + trust_remote_code=True) + im_start_token_id = tokenizer.im_start_id + im_end_token_id = tokenizer.im_end_id + image_start_tokens = torch.where(input_ids == im_start_token_id)[0] + image_start_tokens += 1 + image_end_tokens = torch.where(input_ids == im_end_token_id)[0] + valid_image_nums = min(len(image_start_tokens), len(image_end_tokens)) + if valid_image_nums == 0: + return [] + image_bound = torch.hstack([ + image_start_tokens[:valid_image_nums].unsqueeze(-1), + image_end_tokens[:valid_image_nums].unsqueeze(-1), + ]) + + return image_bound + + def get_vision_hidden_states(self, data): + if "vision_hidden_states" not in data: + pixel_values = data["pixel_values"] + tgt_sizes = data["tgt_sizes"] + vision_hidden_states = [] + if self.version == 2.0: + if pixel_values is not None and len(pixel_values) > 0: + vision_hidden_states = self.get_vision_embedding( + pixel_values) + else: + vision_hidden_states = torch.tensor([]).to( + data["input_ids"].device) + else: + device = self.vpm.embeddings.position_embedding.weight.device + dtype = self.vpm.embeddings.position_embedding.weight.dtype + all_pixel_values = [ + i.flatten(end_dim=1).permute(1, 0) for i in pixel_values + ] + if all_pixel_values: + tgt_sizes = torch.vstack(tgt_sizes).type(torch.int32) + max_patches = torch.max(tgt_sizes[:, 0] * tgt_sizes[:, 1]) + all_pixel_values = torch.nn.utils.rnn.pad_sequence( + all_pixel_values, batch_first=True, padding_value=0.0) + B, L, _ = all_pixel_values.shape + all_pixel_values = all_pixel_values.permute( + 0, 2, 1).reshape(B, 3, -1, L) + + patch_attn_mask = torch.zeros((B, 1, max_patches), + dtype=torch.bool, + device=device) + for i in range(B): + patch_attn_mask[i, :tgt_sizes[i][0] * + tgt_sizes[i][1]] = True + + vision_embedding = self.vpm( + all_pixel_values.type(dtype), + patch_attention_mask=patch_attn_mask).last_hidden_state + vision_hidden_states = self.resampler( + vision_embedding, tgt_sizes) + + else: # no image + dummy_feature = [] + vision_hidden_states = dummy_feature + else: + vision_hidden_states = data["vision_hidden_states"] + + return vision_hidden_states + + def get_embedding(self, data): + input_ids = data["input_ids"] + + vision_hidden_states = self.get_vision_hidden_states(data) + if vision_hidden_states is not None and len(vision_hidden_states) > 0: + image_bounds = self.get_image_bounds(input_ids) + else: + image_bounds = [] + + if hasattr(self.llm.config, 'scale_emb'): + vlm_embedding = self.llm.model.embed_tokens( + input_ids) * self.llm.config.scale_emb + else: + vlm_embedding = self.llm.model.embed_tokens(input_ids) + vision_hidden_states = [ + i.type(vlm_embedding.dtype) if isinstance(i, torch.Tensor) else i + for i in vision_hidden_states + ] + + if len(vision_hidden_states) > 0 and len(image_bounds) > 0: + vision_hidden_states = torch.cat(vision_hidden_states, dim=0) + image_indices = torch.stack([ + torch.arange(r[0], r[1], dtype=torch.long) + for r in image_bounds + ]).to(vlm_embedding.device) + vlm_embedding.scatter_( + 0, + image_indices.view(-1, 1).repeat(1, vlm_embedding.shape[-1]), + vision_hidden_states.view(-1, vision_hidden_states.shape[-1])) + return 
vlm_embedding, vision_hidden_states + + def process_multimodal_inputs(self, inputs): + pixel_values = [] + tgt_sizes = [] + for b in range(len(inputs["pixel_values"])): + pixel_values += inputs["pixel_values"][b] + tgt_sizes += inputs["tgt_sizes"][b] + return { + "pixel_values": pixel_values, + "input_ids": inputs["input_ids"], + "tgt_sizes": tgt_sizes + } + + def forward( + self, + input_ids: torch.Tensor, + positions: torch.Tensor, + kv_caches: List[torch.Tensor], + attn_metadata: AttentionMetadata, + intermediate_tensors: Optional[IntermediateTensors] = None, + **kwargs: object, + ): + inputs = { + "pixel_values": kwargs.pop("pixel_values", []), + "input_ids": input_ids, + "tgt_sizes": kwargs.pop("tgt_sizes", None), + } + + inputs = self.process_multimodal_inputs(inputs) + + vlm_embeddings, vision_hidden_states = self.get_embedding(inputs) + + output = self.llm(input_ids=None, + positions=positions, + kv_caches=kv_caches, + attn_metadata=attn_metadata, + intermediate_tensors=intermediate_tensors, + input_embeds=vlm_embeddings) + return output + + def compute_logits(self, hidden_states: torch.Tensor, + sampling_metadata: SamplingMetadata) -> torch.Tensor: + return self.llm.compute_logits(hidden_states, sampling_metadata) + + def sample( + self, + logits: torch.Tensor, + sampling_metadata: SamplingMetadata, + ) -> Optional[SamplerOutput]: + next_tokens = self.llm.sample(logits, sampling_metadata) + return next_tokens + + def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): + stacked_params_mapping = [ + # (param_name, shard_name, shard_id) + ("qkv_proj", "q_proj", "q"), + ("qkv_proj", "k_proj", "k"), + ("qkv_proj", "v_proj", "v"), + ("gate_up_proj", "gate_proj", 0), + ("gate_up_proj", "up_proj", 1), + ] + params_dict = dict(self.named_parameters()) + for name, loaded_weight in weights: + # for key_to_modify, new_key in _KEYS_TO_MODIFY_MAPPING.items(): + # if key_to_modify in name: + # name = name.replace(key_to_modify, new_key) + if "rotary_emb.inv_freq" in name: + continue + if ("rotary_emb.cos_cached" in name + or "rotary_emb.sin_cached" in name): + # Models trained using ColossalAI may include these tensors in + # the checkpoint. Skip them. + continue + use_default_weight_loading = False + if "vpm" in name or 'resampler' in name: + # We only do sharding for language model and + # not vision model for now. 
+ use_default_weight_loading = True + else: + for (param_name, weight_name, + shard_id) in stacked_params_mapping: + if weight_name not in name: + continue + param = params_dict[name.replace(weight_name, param_name)] + weight_loader = param.weight_loader + weight_loader(param, loaded_weight, shard_id) + break + else: + use_default_weight_loading = True + if use_default_weight_loading: + param = params_dict[name] + weight_loader = getattr(param, "weight_loader", + default_weight_loader) + weight_loader(param, loaded_weight) diff --git a/vllm/multimodal/__init__.py b/vllm/multimodal/__init__.py index 503dceab5b168..0e3b35d425cb7 100644 --- a/vllm/multimodal/__init__.py +++ b/vllm/multimodal/__init__.py @@ -1,5 +1,5 @@ from .base import (BatchedTensors, MultiModalDataBuiltins, MultiModalDataDict, - MultiModalInputs, MultiModalPlugin) + MultiModalInputs, MultiModalPlugin, NestedTensors) from .registry import MultiModalRegistry MULTIMODAL_REGISTRY = MultiModalRegistry() @@ -17,6 +17,7 @@ "MultiModalDataDict", "MultiModalInputs", "MultiModalPlugin", + "NestedTensors", "MULTIMODAL_REGISTRY", "MultiModalRegistry", ] diff --git a/vllm/multimodal/base.py b/vllm/multimodal/base.py index 3ebc25c5930cf..0d435bd644e29 100644 --- a/vllm/multimodal/base.py +++ b/vllm/multimodal/base.py @@ -2,7 +2,7 @@ from abc import ABC, abstractmethod from collections import UserDict, defaultdict from typing import (Any, Callable, Dict, List, Optional, Type, TypedDict, - TypeVar, Union) + TypeVar, Union, cast) import torch import torch.types @@ -15,10 +15,17 @@ logger = init_logger(__name__) -BatchedTensors = Union[torch.Tensor, List[torch.Tensor]] +NestedTensors = Union[List[torch.Tensor], torch.Tensor] +""" +Use a list instead of a tensor if the dimensions of each element do not match. +Currently only supports up to singly nested list of tensors. +""" + +BatchedTensors = Union[List[NestedTensors], NestedTensors] """ If each input tensor in the batch has the same size, this is a single batched -tensor; otherwise, this is a list of tensors with one element per batch. +tensor; otherwise, this is a list of :class:`NestedTensors` with one element +per item in the batch. 
""" if sys.version_info < (3, 9): @@ -27,7 +34,7 @@ class _MultiModalInputsBase(UserDict): pass else: - class _MultiModalInputsBase(UserDict[str, torch.Tensor]): + class _MultiModalInputsBase(UserDict[str, NestedTensors]): pass @@ -39,19 +46,26 @@ class MultiModalInputs(_MultiModalInputsBase): @staticmethod def try_concat( - tensors: List[torch.Tensor], + tensors: List[NestedTensors], *, device: torch.types.Device, ) -> BatchedTensors: - unbatched_shape = tensors[0].shape[1:] + # may be list rather than tensors + if isinstance(tensors[0], list): + return [[t.to(device=device) for t in tensor[0]] + for tensor in tensors] + + tensors_ = cast(List[torch.Tensor], tensors) + + unbatched_shape = tensors_[0].shape[1:] - for tensor in tensors: + for tensor in tensors_: if tensor.shape[1:] != unbatched_shape: return [ - tensor.squeeze(0).to(device=device) for tensor in tensors + tensor.squeeze(0).to(device=device) for tensor in tensors_ ] - return torch.cat(tensors, dim=0).to(device=device) + return torch.cat(tensors_, dim=0).to(device=device) @staticmethod def batch( @@ -64,7 +78,7 @@ def batch( keys = inputs_list[0].keys() - item_lists: Dict[str, List[torch.Tensor]] = defaultdict(list) + item_lists: Dict[str, List[NestedTensors]] = defaultdict(list) for inputs in inputs_list: if inputs.keys() != keys: From 309aaef8255fb832bf674c6ed7d9d84211629421 Mon Sep 17 00:00:00 2001 From: Cody Yu Date: Wed, 24 Jul 2024 22:33:56 -0700 Subject: [PATCH 026/167] [Bugfix] Fix decode tokens w. CUDA graph (#6757) --- tests/worker/test_model_runner.py | 1 + vllm/attention/backends/flash_attn.py | 12 ++++++++++-- vllm/attention/backends/flashinfer.py | 11 ++++++++++- vllm/attention/backends/utils.py | 11 ++++++++++- 4 files changed, 31 insertions(+), 4 deletions(-) diff --git a/tests/worker/test_model_runner.py b/tests/worker/test_model_runner.py index b5742c4338616..4a0e2b4184936 100644 --- a/tests/worker/test_model_runner.py +++ b/tests/worker/test_model_runner.py @@ -193,6 +193,7 @@ def test_prepare_decode_cuda_graph(batch_size): for _ in range(expected_bs - len(seq_lens)): seq_lens.append(1) assert attn_metadata.seq_lens == seq_lens + assert attn_metadata.num_decode_tokens == len(seq_lens) start_idx = 0 start_loc = [start_idx] for _ in context_lens: diff --git a/vllm/attention/backends/flash_attn.py b/vllm/attention/backends/flash_attn.py index 949bd973cf3c4..7d7aff9dc3cdc 100644 --- a/vllm/attention/backends/flash_attn.py +++ b/vllm/attention/backends/flash_attn.py @@ -272,7 +272,15 @@ def _add_seq_group( def build(self, seq_lens: List[int], query_lens: List[int], cuda_graph_pad_size: int, batch_size: int): - """Build attention metadata with on-device tensors.""" + """Build attention metadata with on-device tensors. + + Args: + seq_lens: The maybe padded sequence lengths of the input sequences. + query_lens: The query lengths of the input sequences. + cuda_graph_pad_size: The padding size for cuda graph. + -1 if cuda graph is not used. + batch_size: The maybe padded batch size. + """ for inter_data in self.input_builder.inter_data_list: self._add_seq_group(inter_data, self.input_builder.chunked_prefill_enabled) @@ -297,7 +305,7 @@ def build(self, seq_lens: List[int], query_lens: List[int], if use_captured_graph: self.slot_mapping.extend([PAD_SLOT_ID] * cuda_graph_pad_size) self.block_tables.extend([] * cuda_graph_pad_size) - num_decode_tokens = batch_size + cuda_graph_pad_size + num_decode_tokens = batch_size # The shape of graph_block_tables is # [max batch size, max context len // block size]. 
diff --git a/vllm/attention/backends/flashinfer.py b/vllm/attention/backends/flashinfer.py index 9746304347d6e..83a420d76834b 100644 --- a/vllm/attention/backends/flashinfer.py +++ b/vllm/attention/backends/flashinfer.py @@ -320,6 +320,15 @@ def _update_paged_kv_tensors(self, block_table: List[int], seq_len: int): def build(self, seq_lens: List[int], query_lens: List[int], cuda_graph_pad_size: int, batch_size: int): + """Build attention metadata with on-device tensors. + + Args: + seq_lens: The maybe padded sequence lengths of the input sequences. + query_lens: The query lengths of the input sequences. + cuda_graph_pad_size: The padding size for cuda graph. + -1 if cuda graph is not used. + batch_size: The maybe padded batch size. + """ for inter_data in self.input_builder.inter_data_list: self._add_seq_group(inter_data, self.input_builder.chunked_prefill_enabled) @@ -334,7 +343,7 @@ def build(self, seq_lens: List[int], query_lens: List[int], if use_captured_graph: self.slot_mapping.extend([PAD_SLOT_ID] * cuda_graph_pad_size) self.block_tables.extend([] * cuda_graph_pad_size) - num_decode_tokens = batch_size + cuda_graph_pad_size + num_decode_tokens = batch_size # The shape of graph_block_tables is # [max batch size, max context len // block size]. diff --git a/vllm/attention/backends/utils.py b/vllm/attention/backends/utils.py index 5877712b9b7d3..dcd10ed410a79 100644 --- a/vllm/attention/backends/utils.py +++ b/vllm/attention/backends/utils.py @@ -149,6 +149,15 @@ def _add_seq_group( def build(self, seq_lens: List[int], query_lens: List[int], cuda_graph_pad_size: int, batch_size: int): + """Build attention metadata with on-device tensors. + + Args: + seq_lens: The maybe padded sequence lengths of the input sequences. + query_lens: The query lengths of the input sequences. + cuda_graph_pad_size: The padding size for cuda graph. + -1 if cuda graph is not used. + batch_size: The maybe padded batch size. + """ for inter_data in self.input_builder.inter_data_list: self._add_seq_group(inter_data, self.input_builder.chunked_prefill_enabled) @@ -173,7 +182,7 @@ def build(self, seq_lens: List[int], query_lens: List[int], if use_captured_graph: self.slot_mapping.extend([PAD_SLOT_ID] * cuda_graph_pad_size) self.block_tables.extend([] * cuda_graph_pad_size) - num_decode_tokens = batch_size + cuda_graph_pad_size + num_decode_tokens = batch_size # The shape of graph_block_tables is # [max batch size, max context len // block size]. 
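To make the fixed invariant concrete: after padding, the metadata should report exactly one decode token per (possibly padded) sequence, which is what the new assertion in `tests/worker/test_model_runner.py` pins down. A stripped-down, self-contained version of that check, using a hypothetical stand-in for the real attention metadata object:

```python
from dataclasses import dataclass
from typing import List


@dataclass
class FakeDecodeMetadata:
    # Stand-in for the builder output; the real metadata carries many more
    # fields (slot_mapping, block_tables, ...).
    seq_lens: List[int]
    num_decode_tokens: int


def check_decode_invariant(meta: FakeDecodeMetadata) -> None:
    # One decode token per sequence, padding slots included.
    assert meta.num_decode_tokens == len(meta.seq_lens)


# Three real sequences padded up to a captured batch size of 4.
check_decode_invariant(
    FakeDecodeMetadata(seq_lens=[17, 9, 33, 1], num_decode_tokens=4))
```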
From 0310029a2fc62171fae87155150326125e082a5a Mon Sep 17 00:00:00 2001 From: Alexander Matveev <59768536+alexm-neuralmagic@users.noreply.github.com> Date: Thu, 25 Jul 2024 01:34:11 -0400 Subject: [PATCH 027/167] [Bugfix] Fix awq_marlin and gptq_marlin flags (#6745) --- vllm/model_executor/layers/quantization/awq_marlin.py | 5 +++-- vllm/model_executor/layers/quantization/gptq_marlin.py | 3 ++- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/vllm/model_executor/layers/quantization/awq_marlin.py b/vllm/model_executor/layers/quantization/awq_marlin.py index 092f87b623e7f..5ffbb8e854e87 100644 --- a/vllm/model_executor/layers/quantization/awq_marlin.py +++ b/vllm/model_executor/layers/quantization/awq_marlin.py @@ -25,7 +25,7 @@ class AWQMarlinConfig(QuantizationConfig): def __init__(self, weight_bits: int, group_size: int, has_zp: bool, lm_head_quantized: bool) -> None: self.weight_bits = weight_bits - self.pack_factor = 32 // self.weight_bits # packed into int32 + self.pack_factor = 32 // self.weight_bits # packed into 32bits self.group_size = group_size self.has_zp = has_zp self.lm_head_quantized = lm_head_quantized @@ -69,7 +69,8 @@ def from_config(cls, config: Dict[str, Any]) -> "AWQMarlinConfig": def override_quantization_method(cls, hf_quant_cfg, user_quant) -> Optional[str]: can_convert = cls.is_awq_marlin_compatible(hf_quant_cfg) - is_valid_user_quant = (user_quant is None or user_quant == "marlin") + is_valid_user_quant = (user_quant is None or user_quant == "marlin" + or user_quant == "awq_marlin") if can_convert and is_valid_user_quant: msg = ("The model is convertible to {} during runtime." diff --git a/vllm/model_executor/layers/quantization/gptq_marlin.py b/vllm/model_executor/layers/quantization/gptq_marlin.py index 5b4d614ae2e74..bdcc9c3b4f0c5 100644 --- a/vllm/model_executor/layers/quantization/gptq_marlin.py +++ b/vllm/model_executor/layers/quantization/gptq_marlin.py @@ -79,7 +79,8 @@ def override_quantization_method(cls, hf_quant_cfg, user_quant) -> Optional[str]: can_convert = cls.is_gptq_marlin_compatible(hf_quant_cfg) - is_valid_user_quant = (user_quant is None or user_quant == "marlin") + is_valid_user_quant = (user_quant is None or user_quant == "marlin" + or user_quant == "gptq_marlin") if can_convert and is_valid_user_quant: msg = ("The model is convertible to {} during runtime." 
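The flag fix above boils down to widening the set of `--quantization` values the Marlin override accepts: previously an explicit `awq_marlin` or `gptq_marlin` request failed the `is_valid_user_quant` check even when the checkpoint was convertible. A simplified sketch of the relaxed predicate (the real code inlines this expression inside `override_quantization_method` and additionally requires the compatibility check to pass):

```python
from typing import Optional


def is_valid_user_quant(user_quant: Optional[str], backend_name: str) -> bool:
    # Accept: no explicit override, the generic "marlin" alias, or the
    # backend's own name ("awq_marlin" / "gptq_marlin").
    return user_quant is None or user_quant in ("marlin", backend_name)


assert is_valid_user_quant(None, "awq_marlin")
assert is_valid_user_quant("awq_marlin", "awq_marlin")
assert is_valid_user_quant("marlin", "gptq_marlin")
assert not is_valid_user_quant("awq", "awq_marlin")
```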
From 316a41ac1de4e6e46933cadb39b9b7af65b01abd Mon Sep 17 00:00:00 2001 From: Chang Su Date: Wed, 24 Jul 2024 22:48:07 -0700 Subject: [PATCH 028/167] [Bugfix] Fix encoding_format in examples/openai_embedding_client.py (#6755) --- examples/openai_embedding_client.py | 13 ++++++++----- tests/entrypoints/openai/test_embedding.py | 1 - 2 files changed, 8 insertions(+), 6 deletions(-) diff --git a/examples/openai_embedding_client.py b/examples/openai_embedding_client.py index b73360fe15a24..b4f4c7ad6beb2 100644 --- a/examples/openai_embedding_client.py +++ b/examples/openai_embedding_client.py @@ -13,11 +13,14 @@ models = client.models.list() model = models.data[0].id -responses = client.embeddings.create(input=[ - "Hello my name is", - "The best thing about vLLM is that it supports many different models" -], - model=model) +responses = client.embeddings.create( + input=[ + "Hello my name is", + "The best thing about vLLM is that it supports many different models" + ], + model=model, + encoding_format="float", +) for data in responses.data: print(data.embedding) # list of float of len 4096 diff --git a/tests/entrypoints/openai/test_embedding.py b/tests/entrypoints/openai/test_embedding.py index 2ca0c0d63c25c..c9747339bbf15 100644 --- a/tests/entrypoints/openai/test_embedding.py +++ b/tests/entrypoints/openai/test_embedding.py @@ -18,7 +18,6 @@ def embedding_server(): "--enforce-eager", "--max-model-len", "8192", - "--enforce-eager", ] with RemoteOpenAIServer(EMBEDDING_MODEL_NAME, args) as remote_server: From b75e314fff29bdc94b2fb1dd78519e92f9520e65 Mon Sep 17 00:00:00 2001 From: Alphi <52458637+HwwwwwwwH@users.noreply.github.com> Date: Fri, 26 Jul 2024 00:42:49 +0800 Subject: [PATCH 029/167] [Bugfix] Add image placeholder for OpenAI Compatible Server of MiniCPM-V (#6787) Co-authored-by: hezhihui Co-authored-by: Cyrus Leung --- examples/minicpmv_example.py | 2 ++ vllm/entrypoints/chat_utils.py | 4 +++- 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/examples/minicpmv_example.py b/examples/minicpmv_example.py index 52366a7030ad0..bf20a7ea04ad4 100644 --- a/examples/minicpmv_example.py +++ b/examples/minicpmv_example.py @@ -4,6 +4,8 @@ from vllm.assets.image import ImageAsset # 2.0 +# The official repo doesn't work yet, so we need to use a fork for now +# For more details, please see: See: https://github.com/vllm-project/vllm/pull/4087#issuecomment-2250397630 # MODEL_NAME = "HwwwH/MiniCPM-V-2" # 2.5 MODEL_NAME = "openbmb/MiniCPM-Llama3-V-2_5" diff --git a/vllm/entrypoints/chat_utils.py b/vllm/entrypoints/chat_utils.py index dca4523d1a27d..1f6d77b828459 100644 --- a/vllm/entrypoints/chat_utils.py +++ b/vllm/entrypoints/chat_utils.py @@ -100,7 +100,9 @@ def _image_token_str(model_config: ModelConfig, if model_type == "phi3_v": # Workaround since this token is not defined in the tokenizer return "<|image_1|>" - if model_type in ("blip-2", "chatglm", "fuyu", "minicpmv", "paligemma"): + if model_type == "minicpmv": + return "(./)" + if model_type in ("blip-2", "chatglm", "fuyu", "paligemma"): # These models do not use image tokens in the prompt return None if model_type.startswith("llava"): From 889da130e747b1382268ed428352f2e73e51a30b Mon Sep 17 00:00:00 2001 From: Robert Shaw <114415538+robertgshaw2-neuralmagic@users.noreply.github.com> Date: Thu, 25 Jul 2024 09:46:04 -0700 Subject: [PATCH 030/167] [ Misc ] `fp8-marlin` channelwise via `compressed-tensors` (#6524) Co-authored-by: mgoin --- .../configs/Qwen2-1.5B-Instruct-FP8W8.yaml | 11 ++ .../lm-eval-harness/configs/models-small.txt | 1 
+ .../compressed_tensors/compressed_tensors.py | 61 ++++++++-- .../compressed_tensors/schemes/__init__.py | 2 + .../schemes/compressed_tensors_scheme.py | 3 +- .../schemes/compressed_tensors_unquantized.py | 3 +- .../schemes/compressed_tensors_w4a16_24.py | 3 +- .../schemes/compressed_tensors_w8a16_fp8.py | 105 ++++++++++++++++++ .../schemes/compressed_tensors_w8a8_fp8.py | 10 +- .../schemes/compressed_tensors_w8a8_int8.py | 19 ++-- .../schemes/compressed_tensors_wNa16.py | 3 +- .../model_executor/layers/quantization/fp8.py | 33 ++++-- .../quantization/utils/marlin_utils_fp8.py | 14 +-- 13 files changed, 219 insertions(+), 49 deletions(-) create mode 100644 .buildkite/lm-eval-harness/configs/Qwen2-1.5B-Instruct-FP8W8.yaml create mode 100644 vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a16_fp8.py diff --git a/.buildkite/lm-eval-harness/configs/Qwen2-1.5B-Instruct-FP8W8.yaml b/.buildkite/lm-eval-harness/configs/Qwen2-1.5B-Instruct-FP8W8.yaml new file mode 100644 index 0000000000000..42936fbfbe7d4 --- /dev/null +++ b/.buildkite/lm-eval-harness/configs/Qwen2-1.5B-Instruct-FP8W8.yaml @@ -0,0 +1,11 @@ +# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Qwen2-1.5B-Instruct-FP8W8 -b auto -l 1000 -f 5 -t 1 +model_name: "nm-testing/Qwen2-1.5B-Instruct-FP8W8" +tasks: +- name: "gsm8k" + metrics: + - name: "exact_match,strict-match" + value: 0.578 + - name: "exact_match,flexible-extract" + value: 0.585 +limit: 1000 +num_fewshot: 5 diff --git a/.buildkite/lm-eval-harness/configs/models-small.txt b/.buildkite/lm-eval-harness/configs/models-small.txt index 1d1b0ed38671d..109692395acf6 100644 --- a/.buildkite/lm-eval-harness/configs/models-small.txt +++ b/.buildkite/lm-eval-harness/configs/models-small.txt @@ -5,3 +5,4 @@ Meta-Llama-3-8B-Instruct-INT8-compressed-tensors.yaml Meta-Llama-3-8B-Instruct-nonuniform-compressed-tensors.yaml Meta-Llama-3-8B-Instruct-Channelwise-compressed-tensors.yaml Qwen2-1.5B-Instruct-INT8-compressed-tensors.yaml +Qwen2-1.5B-Instruct-FP8W8.yaml diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py index c4d0c9cb981da..39d00bd5733ff 100644 --- a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py +++ b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py @@ -10,7 +10,8 @@ W4A16SPARSE24_SUPPORTED_BITS, WNA16_SUPPORTED_BITS, CompressedTensorsScheme, CompressedTensorsUnquantized, CompressedTensorsW4A16Sparse24, CompressedTensorsW8A8Fp8, - CompressedTensorsW8A8Int8, CompressedTensorsWNA16) + CompressedTensorsW8A8Int8, CompressedTensorsW8A16Fp8, + CompressedTensorsWNA16) from vllm.model_executor.layers.quantization.compressed_tensors.utils import ( CompressionFormat, QuantizationArgs, QuantizationStrategy, QuantizationType, find_matched_target, is_activation_quantization_format, @@ -100,14 +101,18 @@ def from_config(cls, config: Dict[str, Any]) -> "CompressedTensorsConfig": def get_config_filenames(cls) -> List[str]: return [] - def _check_scheme_supported(self, min_capability: int): + def _check_scheme_supported(self, + min_capability: int, + error: bool = True) -> bool: capability = current_platform.get_device_capability() capability = capability[0] * 10 + capability[1] - if capability < min_capability: + supported = capability >= min_capability + if error and not supported: raise RuntimeError( "Quantization scheme is not supported 
for ", f"the current GPU. Min capability: {min_capability}. ", f"Current capability: {capability}.") + return supported def _is_static_tensor_w8a8(self, weight_quant: BaseModel, input_quant: BaseModel) -> bool: @@ -170,6 +175,29 @@ def _is_fp8_w8a8(self, weight_quant: BaseModel, # All conditions satisfied. return True + def _is_fp8_w8a16(self, weight_quant: BaseModel, + input_quant: BaseModel) -> bool: + # Confirm weights quantized. + if weight_quant is None: + return False + + # Confirm we have floating points. + if weight_quant.type != QuantizationType.FLOAT: + return False + + # Confirm weight scheme is supported. + is_symmetric_weight = weight_quant.symmetric + is_static_weight = not weight_quant.dynamic + is_per_tensor_or_channel_weight = (weight_quant.strategy in [ + QuantizationStrategy.TENSOR, QuantizationStrategy.CHANNEL + ]) + if not (is_symmetric_weight and is_static_weight + and is_per_tensor_or_channel_weight): + return False + + # All conditions satisfied. + return True + def _is_wNa16_group_channel(self, weight_quant: BaseModel, input_quant: BaseModel) -> bool: input_quant_none = input_quant is None @@ -204,9 +232,23 @@ def _get_scheme_from_parts( # Detect If Activation Quantization. if is_activation_quantization_format(self.quant_format): if self._is_fp8_w8a8(weight_quant, input_quant): - return CompressedTensorsW8A8Fp8( + is_fp8_w8a8_supported = self._check_scheme_supported( + CompressedTensorsW8A8Fp8.get_min_capability(), error=False) + if is_fp8_w8a8_supported: + return CompressedTensorsW8A8Fp8( + strategy=weight_quant.strategy, + is_static_input_scheme=(not input_quant.dynamic)) + else: + return CompressedTensorsW8A16Fp8( + strategy=weight_quant.strategy, + is_static_input_scheme=(input_quant + and not input_quant.dynamic)) + + if self._is_fp8_w8a16(weight_quant, input_quant): + return CompressedTensorsW8A16Fp8( strategy=weight_quant.strategy, - is_static_input_scheme=(not input_quant.dynamic)) + is_static_input_scheme=(input_quant + and not input_quant.dynamic)) if self._is_static_tensor_w8a8(weight_quant, input_quant): return CompressedTensorsW8A8Int8( @@ -257,11 +299,10 @@ def get_scheme( targets=self.target_scheme_map.keys()) # Find the quant_scheme - scheme = self.target_scheme_map[matched_target] - - return self._get_scheme_from_parts( - weight_quant=scheme["weights"], - input_quant=scheme["input_activations"]) + scheme_dict = self.target_scheme_map[matched_target] + scheme = self._get_scheme_from_parts( + weight_quant=scheme_dict["weights"], + input_quant=scheme_dict["input_activations"]) # Raise error if device does not support the scheme # (e.g. 
fp8 needs ada lovelace) diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/__init__.py b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/__init__.py index dd94c49827f62..ca9e286ce5b2d 100644 --- a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/__init__.py +++ b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/__init__.py @@ -4,6 +4,7 @@ CompressedTensorsW4A16Sparse24) from .compressed_tensors_w8a8_fp8 import CompressedTensorsW8A8Fp8 from .compressed_tensors_w8a8_int8 import CompressedTensorsW8A8Int8 +from .compressed_tensors_w8a16_fp8 import CompressedTensorsW8A16Fp8 from .compressed_tensors_wNa16 import (WNA16_SUPPORTED_BITS, CompressedTensorsWNA16) @@ -11,6 +12,7 @@ "CompressedTensorsScheme", "CompressedTensorsUnquantized", "CompressedTensorsWNA16", + "CompressedTensorsW8A16Fp8", "CompressedTensorsW4A16Sparse24", "CompressedTensorsW8A8Int8", "CompressedTensorsW8A8Fp8", diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_scheme.py b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_scheme.py index d5f37b47bb87e..b4bab33e1fb1d 100644 --- a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_scheme.py +++ b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_scheme.py @@ -12,8 +12,9 @@ class CompressedTensorsScheme(ABC): of different quantization schemes supported by CompressedTensors. """ + @classmethod @abstractmethod - def get_min_capability(self) -> int: + def get_min_capability(cls) -> int: """ Get minimum device capability. """ diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_unquantized.py b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_unquantized.py index 6203f02d25e90..b7ba29ddc9840 100644 --- a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_unquantized.py +++ b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_unquantized.py @@ -18,7 +18,8 @@ class CompressedTensorsUnquantized(CompressedTensorsScheme): in a linear transformation. 
""" - def get_min_capability(self) -> int: + @classmethod + def get_min_capability(cls) -> int: # volta and up return 70 diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a16_24.py b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a16_24.py index eec523d00372c..b8ffb22d7a89d 100644 --- a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a16_24.py +++ b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a16_24.py @@ -29,7 +29,8 @@ def __init__(self, raise ValueError( "group_size must be given when using strategy group") - def get_min_capability(self) -> int: + @classmethod + def get_min_capability(cls) -> int: # ampere + up return 80 diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a16_fp8.py b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a16_fp8.py new file mode 100644 index 0000000000000..eeb7c042e1d1f --- /dev/null +++ b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a16_fp8.py @@ -0,0 +1,105 @@ +from typing import Callable, List, Optional + +import torch + +from vllm.model_executor.layers.quantization.compressed_tensors.schemes import ( + CompressedTensorsScheme) +from vllm.model_executor.layers.quantization.compressed_tensors.utils import ( + QuantizationStrategy) +from vllm.model_executor.layers.quantization.utils.marlin_utils_fp8 import ( + apply_fp8_marlin_linear, prepare_fp8_layer_for_marlin) +from vllm.model_executor.layers.quantization.utils.w8a8_utils import ( + convert_to_channelwise, create_per_channel_scale_param, + create_per_tensor_scale_param) +from vllm.model_executor.utils import set_weight_attrs + +__all__ = ["CompressedTensorsW8A16Fp8"] + +SUPPORTED_STRATEGIES = [ + QuantizationStrategy.CHANNEL, QuantizationStrategy.TENSOR +] + + +class CompressedTensorsW8A16Fp8(CompressedTensorsScheme): + + def __init__(self, strategy: str, is_static_input_scheme: bool): + self.strategy = strategy + self.is_static_input_scheme = is_static_input_scheme + + @classmethod + def get_min_capability(cls) -> int: + # ampere and up + return 80 + + # W8A8-Fp8 kernels support only per-tensor and per-channel cases. + # So if we have a fused module (QKV, MLP) with per tensor scales, + # we expand each scale to its shard's channels. 
+ def process_weights_after_loading(self, layer) -> None: + if self.strategy == QuantizationStrategy.TENSOR: + ws_channelwise = convert_to_channelwise(layer.weight_scale, + layer.logical_widths) + layer.weight_scale = torch.nn.Parameter(ws_channelwise, + requires_grad=False) + + # Weights must be transposed for marlin + layer.weight = torch.nn.Parameter(layer.weight.t(), + requires_grad=False) + + prepare_fp8_layer_for_marlin(layer, strategy="channel") + + def create_weights(self, layer: torch.nn.Module, input_size: int, + output_partition_sizes: List[int], + input_size_per_partition: int, + params_dtype: torch.dtype, weight_loader: Callable, + **kwargs): + + output_size_per_partition = sum(output_partition_sizes) + layer.logical_widths = output_partition_sizes + layer.input_size_per_partition = input_size_per_partition + layer.output_size_per_partition = output_size_per_partition + layer.orig_dtype = params_dtype + + # WEIGHT + weight = torch.nn.Parameter(torch.empty(output_size_per_partition, + input_size_per_partition, + dtype=torch.float8_e4m3fn), + requires_grad=False) + layer.register_parameter("weight", weight) + set_weight_attrs(weight, { + "input_dim": 1, + "output_dim": 0, + "weight_loader": weight_loader, + }) + + # WEIGHT SCALE + layer_kwargs = {"weight_loader": weight_loader} + if self.strategy == QuantizationStrategy.CHANNEL: + weight_scale = create_per_channel_scale_param( + output_partition_sizes, **layer_kwargs) + elif self.strategy == QuantizationStrategy.TENSOR: + weight_scale = create_per_tensor_scale_param( + output_partition_sizes, **layer_kwargs) + else: + raise ValueError( + f"Unsupported weight strategy={self.strategy}, " + f"supported strategies are {SUPPORTED_STRATEGIES}") + layer.register_parameter("weight_scale", weight_scale) + + # INPUT SCALE (to deal with converted checkpoints) + if self.is_static_input_scheme: + input_scale = create_per_tensor_scale_param( + output_partition_sizes, **layer_kwargs) + layer.register_parameter("input_scale", input_scale) + + def apply_weights(self, + layer: torch.nn.Module, + x: torch.Tensor, + bias: Optional[torch.Tensor] = None) -> torch.Tensor: + + return apply_fp8_marlin_linear(input=x, + weight=layer.weight, + weight_scale=layer.weight_scale, + workspace=layer.workspace, + size_n=layer.output_size_per_partition, + size_k=layer.input_size_per_partition, + bias=bias) diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py index 51156a3bc07af..cc9d71db140c2 100644 --- a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py +++ b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py @@ -23,7 +23,8 @@ def __init__(self, strategy: str, is_static_input_scheme: bool): self.is_static_input_scheme = is_static_input_scheme self.cutlass_fp8_supported = cutlass_fp8_supported() - def get_min_capability(self) -> int: + @classmethod + def get_min_capability(cls) -> int: # lovelace and up return 89 @@ -77,19 +78,20 @@ def create_weights(self, layer: torch.nn.Module, }) # WEIGHT SCALE + layer_kwargs = {"weight_loader": weight_loader} if self.strategy == QuantizationStrategy.CHANNEL: weight_scale = create_per_channel_scale_param( - output_partition_sizes, weight_loader=weight_loader) + output_partition_sizes, **layer_kwargs) else: assert self.strategy == QuantizationStrategy.TENSOR weight_scale = 
create_per_tensor_scale_param( - output_partition_sizes, weight_loader=weight_loader) + output_partition_sizes, **layer_kwargs) layer.register_parameter("weight_scale", weight_scale) # INPUT SCALE if self.is_static_input_scheme: input_scale = create_per_tensor_scale_param( - output_partition_sizes, weight_loader=weight_loader) + output_partition_sizes, **layer_kwargs) layer.register_parameter("input_scale", input_scale) def apply_weights(self, diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_int8.py b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_int8.py index e81496c89ac7f..3a80863d3abbe 100644 --- a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_int8.py +++ b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_int8.py @@ -19,7 +19,8 @@ def __init__(self, strategy: str, is_static_input_scheme: bool): self.strategy = strategy self.is_static_input_scheme = is_static_input_scheme - def get_min_capability(self) -> int: + @classmethod + def get_min_capability(cls) -> int: # turing and up return 75 @@ -68,19 +69,19 @@ def create_weights(self, layer: torch.nn.Module, # WEIGHT SCALE layer_kwargs = {"weight_loader": weight_loader} if self.strategy == QuantizationStrategy.CHANNEL: - scale = create_per_channel_scale_param(output_partition_sizes, - **layer_kwargs) + weight_scale = create_per_channel_scale_param( + output_partition_sizes, **layer_kwargs) else: assert self.strategy == QuantizationStrategy.TENSOR - scale = create_per_tensor_scale_param(output_partition_sizes, - **layer_kwargs) - layer.register_parameter("weight_scale", scale) + weight_scale = create_per_tensor_scale_param( + output_partition_sizes, **layer_kwargs) + layer.register_parameter("weight_scale", weight_scale) # INPUT SCALE if self.is_static_input_scheme: - scale = create_per_tensor_scale_param(output_partition_sizes, - **layer_kwargs) - layer.register_parameter("input_scale", scale) + input_scale = create_per_tensor_scale_param( + output_partition_sizes, **layer_kwargs) + layer.register_parameter("input_scale", input_scale) def apply_weights(self, layer: torch.nn.Module, x: torch.Tensor, bias: Optional[torch.Tensor]) -> torch.Tensor: diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_wNa16.py b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_wNa16.py index e4cf0c0b5d95b..996cba315c556 100644 --- a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_wNa16.py +++ b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_wNa16.py @@ -42,7 +42,8 @@ def __init__(self, group_size=self.group_size, is_sym=True) - def get_min_capability(self) -> int: + @classmethod + def get_min_capability(cls) -> int: # ampere and up return 80 diff --git a/vllm/model_executor/layers/quantization/fp8.py b/vllm/model_executor/layers/quantization/fp8.py index 3a4f2a49a3497..6649b317ca838 100644 --- a/vllm/model_executor/layers/quantization/fp8.py +++ b/vllm/model_executor/layers/quantization/fp8.py @@ -18,8 +18,9 @@ from vllm.model_executor.layers.quantization.utils.quant_utils import ( is_layer_skipped) from vllm.model_executor.layers.quantization.utils.w8a8_utils import ( - all_close_1d, apply_fp8_linear, create_per_tensor_scale_param, - cutlass_fp8_supported, per_tensor_dequantize, requantize_with_max_scale) + all_close_1d, 
apply_fp8_linear, convert_to_channelwise, + create_per_tensor_scale_param, cutlass_fp8_supported, + per_tensor_dequantize, requantize_with_max_scale) from vllm.model_executor.utils import set_weight_attrs from vllm.platforms import current_platform from vllm.utils import print_warning_once @@ -179,19 +180,29 @@ def process_weights_after_loading(self, layer: Module) -> None: layer.weight_scale = Parameter(weight_scale, requires_grad=False) layer.input_scale = None - # If checkpoint is fp8, requantize the separately quantized logical - # weights into a single fp8 weight with a single weight scale. + # If checkpoint is fp8, handle that there are N scales for N + # shards in a fused module else: - # Dequant -> Quant with max scale. - max_w_scale, weight = requantize_with_max_scale( - weight=layer.weight, - weight_scale=layer.weight_scale, - logical_widths=layer.logical_widths, - ) + # If using marlin (w8a16), kernel uses channelwise weights, + # so extend the weight scales to be channelwise. + if self.use_marlin: + weight = layer.weight + weight_scale = convert_to_channelwise(layer.weight_scale, + layer.logical_widths) + + # If using w8a8, torch._scaled_mm needs per tensor, so + # requantize the logical shards as a single weight. + else: + # Dequant -> Quant with max scale so we can run per tensor. + weight_scale, weight = requantize_with_max_scale( + weight=layer.weight, + weight_scale=layer.weight_scale, + logical_widths=layer.logical_widths, + ) # Update layer with new values. layer.weight = Parameter(weight.t(), requires_grad=False) - layer.weight_scale = Parameter(max_w_scale, requires_grad=False) + layer.weight_scale = Parameter(weight_scale, requires_grad=False) if self.quant_config.activation_scheme == "static": layer.input_scale = Parameter(layer.input_scale.max(), requires_grad=False) diff --git a/vllm/model_executor/layers/quantization/utils/marlin_utils_fp8.py b/vllm/model_executor/layers/quantization/utils/marlin_utils_fp8.py index c878939580f10..5f9d8658a342f 100644 --- a/vllm/model_executor/layers/quantization/utils/marlin_utils_fp8.py +++ b/vllm/model_executor/layers/quantization/utils/marlin_utils_fp8.py @@ -46,7 +46,8 @@ def apply_fp8_marlin_linear( return output.reshape(out_shape) -def prepare_fp8_layer_for_marlin(layer: torch.nn.Module) -> None: +def prepare_fp8_layer_for_marlin(layer: torch.nn.Module, + strategy: str = "tensor") -> None: print_warning_once( "Your GPU does not have native support for FP8 computation but " "FP8 quantization is being used. 
Weight-only FP8 compression will " @@ -74,16 +75,7 @@ def prepare_fp8_layer_for_marlin(layer: torch.nn.Module) -> None: layer.weight = torch.nn.Parameter(marlin_qweight, requires_grad=False) # WEIGHT SCALES - # Currently Marlin doesn't support per-tensor scales, so we - # expand it to channelwise - is_channelwise = (len(layer.weight_scale.shape) > 0 - and layer.weight_scale.shape[0] == part_size_n) - if is_channelwise: - scales = layer.weight_scale - else: - scales = layer.weight_scale.repeat(1, part_size_n) - scales = scales.to(layer.orig_dtype).to(device) - + scales = layer.weight_scale.to(layer.orig_dtype) # Permute scales marlin_scales = marlin_permute_scales(s=scales, size_k=part_size_k, From 65b1f121c885f169da210946eddb0d52524677f1 Mon Sep 17 00:00:00 2001 From: Michael Goin Date: Thu, 25 Jul 2024 12:46:15 -0400 Subject: [PATCH 031/167] [Bugfix] Fix `kv_cache_dtype=fp8` without scales for FP8 checkpoints (#6761) --- tests/quantization/test_fp8.py | 12 ++++++++++-- vllm/model_executor/layers/quantization/kv_cache.py | 6 ++---- 2 files changed, 12 insertions(+), 6 deletions(-) diff --git a/tests/quantization/test_fp8.py b/tests/quantization/test_fp8.py index 0602fedf0b8e3..ad92f1f189f65 100644 --- a/tests/quantization/test_fp8.py +++ b/tests/quantization/test_fp8.py @@ -60,12 +60,20 @@ def test_kv_cache_model_load_and_run(vllm_runner, model_id: str): @pytest.mark.skipif(not is_quant_method_supported("fp8"), reason="FP8 is not supported on this GPU type.") -def test_load_fp16_model(vllm_runner) -> None: - with vllm_runner("facebook/opt-125m", quantization="fp8") as llm: +@pytest.mark.parametrize("kv_cache_dtype", ["auto", "fp8"]) +def test_load_fp16_model(vllm_runner, kv_cache_dtype: str) -> None: + with vllm_runner("facebook/opt-125m", + quantization="fp8", + kv_cache_dtype=kv_cache_dtype) as llm: model = llm.model.llm_engine.model_executor.driver_worker.model_runner.model # noqa: E501 fc1 = model.model.decoder.layers[0].fc1 assert isinstance(fc1.quant_method, Fp8LinearMethod) + if kv_cache_dtype == "fp8": + attn = model.model.decoder.layers[0].self_attn.attn + assert isinstance(attn.quant_method, Fp8KVCacheMethod) + assert attn._k_scale == 1.0 + assert attn._v_scale == 1.0 capability = torch.cuda.get_device_capability() capability = capability[0] * 10 + capability[1] diff --git a/vllm/model_executor/layers/quantization/kv_cache.py b/vllm/model_executor/layers/quantization/kv_cache.py index c1495711447fa..d79536d196b92 100644 --- a/vllm/model_executor/layers/quantization/kv_cache.py +++ b/vllm/model_executor/layers/quantization/kv_cache.py @@ -46,10 +46,8 @@ def process_weights_after_loading(self, layer: torch.nn.Module) -> None: elif layer.k_scale < 0.0 and layer.v_scale < 0.0: # If no scales were loaded (both scales are invalid negative # values), use the default value of 1.0 - k_scale = torch.nn.Parameter(torch.tensor(1.0), - requires_grad=False) - v_scale = torch.nn.Parameter(torch.tensor(1.0), - requires_grad=False) + k_scale = 1.0 + v_scale = 1.0 else: # If we find a single kv_scale in the checkpoint, we remap # kv_scale to k_scale during weight loading, and duplicate From 95db75de64bec34f4d80acff92c62d1cdfa94688 Mon Sep 17 00:00:00 2001 From: Tyler Michael Smith Date: Thu, 25 Jul 2024 13:40:01 -0400 Subject: [PATCH 032/167] [Bugfix] Add synchronize to prevent possible data race (#6788) Co-authored-by: Lucas Wilkinson --- vllm/distributed/parallel_state.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/vllm/distributed/parallel_state.py 
b/vllm/distributed/parallel_state.py index 128096c88a8b1..e9c6fc3a255e4 100644 --- a/vllm/distributed/parallel_state.py +++ b/vllm/distributed/parallel_state.py @@ -243,6 +243,13 @@ def graph_capture( ca_comm = self.ca_comm maybe_ca_context = nullcontext( ) if ca_comm is None else ca_comm.capture() + + # ensure all initialization operations complete before attempting to + # capture the graph on another stream + curr_stream = torch.cuda.current_stream() + if curr_stream != stream: + stream.wait_stream(curr_stream) + with torch.cuda.stream(stream), maybe_ca_context: # In graph mode, we have to be very careful about the collective # operations. The current status is: From 6a1e25b1514a25d3da96d0d78c4568f6e581e242 Mon Sep 17 00:00:00 2001 From: Kuntai Du Date: Thu, 25 Jul 2024 11:57:16 -0700 Subject: [PATCH 033/167] [Doc] Add documentations for nightly benchmarks (#6412) --- .buildkite/nightly-benchmarks/README.md | 80 +++++++++++++++---- README.md | 2 +- docs/source/index.rst | 6 ++ .../performance_benchmark/benchmarks.rst | 23 ++++++ 4 files changed, 94 insertions(+), 17 deletions(-) create mode 100644 docs/source/performance_benchmark/benchmarks.rst diff --git a/.buildkite/nightly-benchmarks/README.md b/.buildkite/nightly-benchmarks/README.md index c84e150934306..c1aebaf5b3bbe 100644 --- a/.buildkite/nightly-benchmarks/README.md +++ b/.buildkite/nightly-benchmarks/README.md @@ -3,30 +3,51 @@ ## Introduction -This directory contains the performance benchmarking CI for vllm. -The goal is to help developers know the impact of their PRs on the performance of vllm. +This directory contains two sets of benchmark for vllm. +- Performance benchmark: benchmark vllm's performance under various workload, for **developers** to gain clarity on whether their PR improves/degrades vllm's performance +- Nightly benchmark: compare vllm's performance against alternatives (tgi, trt-llm and lmdeploy), for **the public** to know when to choose vllm. -This benchmark will be *triggered* upon: + +See [vLLM performance dashboard](https://perf.vllm.ai) for the latest performance benchmark results and [vLLM GitHub README](https://github.com/vllm-project/vllm/blob/main/README.md) for latest nightly benchmark results. + + +## Performance benchmark quick overview + +**Benchmarking Coverage**: latency, throughput and fix-qps serving on A100 (the support for FP8 benchmark on H100 is coming!), with different models. + +**Benchmarking Duration**: about 1hr. + +**For benchmarking developers**: please try your best to constraint the duration of benchmarking to about 1 hr so that it won't take forever to run. + + +## Nightly benchmark quick overview + +**Benchmarking Coverage**: Fix-qps serving on A100 (the support for FP8 benchmark on H100 is coming!) on Llama-3 8B, 70B and Mixtral 8x7B. + +**Benchmarking engines**: vllm, TGI, trt-llm and lmdeploy. + +**Benchmarking Duration**: about 3.5hrs. + + + +## Trigger the benchmark + +Performance benchmark will be triggered when: - A PR being merged into vllm. - Every commit for those PRs with `perf-benchmarks` label. -**Benchmarking Coverage**: latency, throughput and fix-qps serving on A100 (the support for more GPUs is comming later), with different models. +Nightly benchmark will be triggered when: +- Every commit for those PRs with `nightly-benchmarks` label. -**Benchmarking Duration**: about 1hr. -**For benchmarking developers**: please try your best to constraint the duration of benchmarking to less than 1.5 hr so that it won't take forever to run. 
-## Configuring the workload +## Performance benchmark details -The benchmarking workload contains three parts: -- Latency tests in `latency-tests.json`. -- Throughput tests in `throughput-tests.json`. -- Serving tests in `serving-tests.json`. +See [descriptions.md](tests/descriptions.md) for detailed descriptions, and use `tests/latency-tests.json`, `tests/throughput-tests.json`, `tests/serving-tests.json` to configure the test cases. -See [descriptions.md](tests/descriptions.md) for detailed descriptions. -### Latency test +#### Latency test Here is an example of one test inside `latency-tests.json`: @@ -54,12 +75,12 @@ Note that the performance numbers are highly sensitive to the value of the param WARNING: The benchmarking script will save json results by itself, so please do not configure `--output-json` parameter in the json file. -### Throughput test +#### Throughput test The tests are specified in `throughput-tests.json`. The syntax is similar to `latency-tests.json`, except for that the parameters will be fed forward to `benchmark_throughput.py`. The number of this test is also stable -- a slight change on the value of this number might vary the performance numbers by a lot. -### Serving test +#### Serving test We test the throughput by using `benchmark_serving.py` with request rate = inf to cover the online serving overhead. The corresponding parameters are in `serving-tests.json`, and here is an example: ``` @@ -96,9 +117,36 @@ The number of this test is less stable compared to the delay and latency benchma WARNING: The benchmarking script will save json results by itself, so please do not configure `--save-results` or other results-saving-related parameters in `serving-tests.json`. -## Visualizing the results +#### Visualizing the results The `convert-results-json-to-markdown.py` helps you put the benchmarking results inside a markdown table, by formatting [descriptions.md](tests/descriptions.md) with real benchmarking results. You can find the result presented as a table inside the `buildkite/performance-benchmark` job page. If you do not see the table, please wait till the benchmark finish running. The json version of the table (together with the json version of the benchmark) will be also attached to the markdown file. The raw benchmarking results (in the format of json files) are in the `Artifacts` tab of the benchmarking. + + + +## Nightly test details + +See [nightly-descriptions.md](nightly-descriptions.md) for the detailed description on test workload, models and docker containers of benchmarking other llm engines. + + +#### Workflow + +- The [nightly-pipeline.yaml](nightly-pipeline.yaml) specifies the docker containers for different LLM serving engines. +- Inside each container, we run [run-nightly-suite.sh](run-nightly-suite.sh), which will probe the serving engine of the current container. +- The `run-nightly-suite.sh` will redirect the request to `tests/run-[llm serving engine name]-nightly.sh`, which parses the workload described in [nightly-tests.json](tests/nightly-tests.json) and performs the benchmark. +- At last, we run [scripts/plot-nightly-results.py](scripts/plot-nightly-results.py) to collect and plot the final benchmarking results, and update the results to buildkite. + +#### Nightly tests + +In [nightly-tests.json](tests/nightly-tests.json), we include the command line arguments for benchmarking commands, together with the benchmarking test cases. The format is highly similar to performance benchmark. 
+ +#### Docker containers + +The docker containers for benchmarking are specified in `nightly-pipeline.yaml`. + +WARNING: the docker versions are HARD-CODED and SHOULD BE ALIGNED WITH `nightly-descriptions.md`. The docker versions need to be hard-coded as there are several version-specific bug fixes inside `tests/run-[llm serving engine name]-nightly.sh`. + +WARNING: populating `trt-llm` to latest version is not easy, as it requires updating several protobuf files in [tensorrt-demo](https://github.com/neuralmagic/tensorrt-demo.git). + diff --git a/README.md b/README.md index 8e508195cdceb..a9215f4c7e1c5 100644 --- a/README.md +++ b/README.md @@ -47,7 +47,7 @@ vLLM is fast with: - Quantization: [GPTQ](https://arxiv.org/abs/2210.17323), [AWQ](https://arxiv.org/abs/2306.00978), [SqueezeLLM](https://arxiv.org/abs/2306.07629), FP8 KV Cache - Optimized CUDA kernels -**Performance benchmark**: We include a [performance benchmark](https://buildkite.com/vllm/performance-benchmark/builds/3924) that compares the performance of vllm against other LLM serving engines ([TensorRT-LLM](https://github.com/NVIDIA/TensorRT-LLM), [text-generation-inference](https://github.com/huggingface/text-generation-inference) and [lmdeploy](https://github.com/InternLM/lmdeploy)). +**Performance benchmark**: We include a [performance benchmark](https://buildkite.com/vllm/performance-benchmark/builds/4068) that compares the performance of vllm against other LLM serving engines ([TensorRT-LLM](https://github.com/NVIDIA/TensorRT-LLM), [text-generation-inference](https://github.com/huggingface/text-generation-inference) and [lmdeploy](https://github.com/InternLM/lmdeploy)). vLLM is flexible and easy to use with: diff --git a/docs/source/index.rst b/docs/source/index.rst index ded9a424ee68c..8f06f2f2e5469 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -117,6 +117,12 @@ Documentation automatic_prefix_caching/apc automatic_prefix_caching/details +.. toctree:: + :maxdepth: 1 + :caption: Performance benchmarks + + performance_benchmark/benchmarks + .. toctree:: :maxdepth: 2 :caption: Developer Documentation diff --git a/docs/source/performance_benchmark/benchmarks.rst b/docs/source/performance_benchmark/benchmarks.rst new file mode 100644 index 0000000000000..9a23aab10d03d --- /dev/null +++ b/docs/source/performance_benchmark/benchmarks.rst @@ -0,0 +1,23 @@ +.. _benchmarks: + +Benchmark suites of vLLM +======================== + + + +vLLM contains two sets of benchmarks: + ++ **Performance benchmarks**: benchmark vLLM's performance under various workloads at a high frequency (when a pull request (PR for short) of vLLM is being merged). See `vLLM performance dashboard `_ for the latest performance results. + ++ **Nightly benchmarks**: compare vLLM's performance against alternatives (tgi, trt-llm, and lmdeploy) when there are major updates of vLLM (e.g., bumping up to a new version). The latest results are available in the `vLLM GitHub README `_. + + +Trigger a benchmark +------------------- + +The performance benchmarks and nightly benchmarks can be triggered by submitting a PR to vLLM, and label the PR with `perf-benchmarks` and `nightly-benchmarks`. + + +.. note:: + + Please refer to `vLLM performance benchmark descriptions `_ and `vLLM nightly benchmark descriptions `_ for detailed descriptions on benchmark environment, workload and metrics. 
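Before the next patch, a note on the scale handling that the earlier FP8 Marlin changes in this series depend on: both `fp8.py` and the new `compressed_tensors_w8a16_fp8.py` scheme call `convert_to_channelwise(layer.weight_scale, layer.logical_widths)` so that a fused module (QKV, MLP) quantized with per-tensor scales hands the Marlin kernel one scale per output channel. The sketch below is illustrative only and is not the vLLM helper itself: the name `expand_scales_to_channelwise`, the 1-D output layout, and the example shard widths are assumptions; only the inputs (one scale per logical shard, plus the shard widths) come from the diffs above.

from typing import List

import torch


def expand_scales_to_channelwise(weight_scale: torch.Tensor,
                                 logical_widths: List[int]) -> torch.Tensor:
    # Hypothetical stand-in for convert_to_channelwise: one per-tensor scale
    # per logical shard (e.g. the q/k/v shards of a fused qkv_proj) is
    # repeated across that shard's output channels.
    assert weight_scale.numel() == len(logical_widths)
    per_channel = [
        weight_scale[i:i + 1].expand(width)
        for i, width in enumerate(logical_widths)
    ]
    return torch.cat(per_channel)


# Three shards with different per-tensor scales -> one scale per output row.
scales = expand_scales_to_channelwise(torch.tensor([0.02, 0.01, 0.04]),
                                      [4096, 1024, 1024])
print(scales.shape)  # torch.Size([6144])

Seen this way, the change to `prepare_fp8_layer_for_marlin` above also reads naturally: once callers always pass channelwise scales, the helper can drop its own per-tensor broadcast and simply cast and permute whatever scales it receives.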
From cd7edc4e8726d4b87e121f9ec671ecb6dd0c45d6 Mon Sep 17 00:00:00 2001 From: Lucas Wilkinson Date: Thu, 25 Jul 2024 18:05:09 -0400 Subject: [PATCH 034/167] [Bugfix] Fix empty (nullptr) channelwise scales when loading wNa16 using compressed tensors (#6798) --- .../schemes/compressed_tensors_wNa16.py | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_wNa16.py b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_wNa16.py index 996cba315c556..a41962ccd66d8 100644 --- a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_wNa16.py +++ b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_wNa16.py @@ -55,7 +55,12 @@ def create_weights(self, layer: torch.nn.Module, input_size: int, output_size_per_partition = sum(output_partition_sizes) # If group_size is -1, we are in channelwise case. - group_size = input_size if self.group_size == -1 else self.group_size + channelwise = (self.group_size == -1) + group_size = input_size if channelwise else self.group_size + row_parallel = (input_size != input_size_per_partition) + # In the case of channelwise quantization, we need to replicate the + # scales across all gpus. + partition_scales = (row_parallel and not channelwise) verify_marlin_supports_shape( output_size_per_partition=output_size_per_partition, @@ -66,8 +71,8 @@ def create_weights(self, layer: torch.nn.Module, input_size: int, weight_scale_dim = None scales_and_zp_size = input_size // group_size - if (input_size != input_size_per_partition - and self.group_size is not None): + if partition_scales: + assert input_size_per_partition % group_size == 0 weight_scale_dim = 1 scales_and_zp_size = input_size_per_partition // group_size From f3ff63c3f45974986f13f60647a258b09913c420 Mon Sep 17 00:00:00 2001 From: youkaichao Date: Thu, 25 Jul 2024 15:38:32 -0700 Subject: [PATCH 035/167] [doc][distributed] improve multinode serving doc (#6804) --- docs/source/serving/distributed_serving.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/source/serving/distributed_serving.rst b/docs/source/serving/distributed_serving.rst index 4fc36a680084c..5f14fd2b0ee0a 100644 --- a/docs/source/serving/distributed_serving.rst +++ b/docs/source/serving/distributed_serving.rst @@ -79,7 +79,7 @@ On the rest of the worker nodes, run the following command: $ --worker \ $ /path/to/the/huggingface/home/in/this/node -Then you get a ray cluster of containers. Note that you need to keep the shells running these commands alive to hold the cluster. Any shell disconnect will terminate the cluster. +Then you get a ray cluster of containers. Note that you need to keep the shells running these commands alive to hold the cluster. Any shell disconnect will terminate the cluster. In addition, please note that the argument ``ip_of_head_node`` should be the IP address of the head node, which is accessible by all the worker nodes. A common misunderstanding is to use the IP address of the worker node, which is not correct. Then, on any node, use ``docker exec -it node /bin/bash`` to enter the container, execute ``ray status`` to check the status of the Ray cluster. You should see the right number of nodes and GPUs. 
@@ -101,7 +101,7 @@ You can also use tensor parallel without pipeline parallel, just set the tensor To make tensor parallel performant, you should make sure the communication between nodes is efficient, e.g. using high-speed network cards like Infiniband. To correctly set up the cluster to use Infiniband, append additional arguments like ``--privileged -e NCCL_IB_HCA=mlx5`` to the ``run_cluster.sh`` script. Please contact your system administrator for more information on how to set up the flags. One way to confirm if the Infiniband is working is to run vLLM with ``NCCL_DEBUG=TRACE`` environment variable set, e.g. ``NCCL_DEBUG=TRACE vllm serve ...`` and check the logs for the NCCL version and the network used. If you find ``[send] via NET/Socket`` in the logs, it means NCCL uses raw TCP Socket, which is not efficient for cross-node tensor parallel. If you find ``[send] via NET/IB/GDRDMA`` in the logs, it means NCCL uses Infiniband with GPU-Direct RDMA, which is efficient. .. warning:: - After you start the Ray cluster, you'd better also check the GPU-GPU communication between nodes. It can be non-trivial to set up. Please refer to the `sanity check script `_ for more information. + After you start the Ray cluster, you'd better also check the GPU-GPU communication between nodes. It can be non-trivial to set up. Please refer to the `sanity check script `_ for more information. If you need to set some environment variables for the communication configuration, you can append them to the ``run_cluster.sh`` script, e.g. ``-e NCCL_SOCKET_IFNAME=eth0``. Note that setting environment variables in the shell (e.g. ``NCCL_SOCKET_IFNAME=eth0 vllm serve ...``) only works for the processes in the same node, not for the processes in the other nodes. Setting environment variables when you create the cluster is the recommended way. See the `discussion `_ for more information. .. warning:: From b7215de2c5fcdf8af96cf941556d63934ea8f353 Mon Sep 17 00:00:00 2001 From: Woosuk Kwon Date: Thu, 25 Jul 2024 16:47:55 -0700 Subject: [PATCH 036/167] [Docs] Publish 5th meetup slides (#6799) --- README.md | 10 +--------- docs/source/community/meetups.rst | 1 + 2 files changed, 2 insertions(+), 9 deletions(-) diff --git a/README.md b/README.md index a9215f4c7e1c5..5f23f0813f606 100644 --- a/README.md +++ b/README.md @@ -16,16 +16,8 @@ Easy, fast, and cheap LLM serving for everyone --- -**The Fifth vLLM Bay Area Meetup (July 24th 5pm-8pm PT)** - -We are excited to announce our fifth vLLM Meetup! -Join us to hear the vLLM's recent updates and the upcoming roadmap. -Additionally, our collaborators from AWS will be presenting their insights and experiences in deploying vLLM. -Register now [here](https://lu.ma/lp0gyjqr) and be part of the event! - ---- - *Latest News* 🔥 +- [2024/07] We hosted [the fifth vLLM meetup](https://lu.ma/lp0gyjqr) with AWS! Please find the meetup slides [here](https://docs.google.com/presentation/d/1RgUD8aCfcHocghoP3zmXzck9vX3RCI9yfUAB2Bbcl4Y/edit?usp=sharing). - [2024/07] In partnership with Meta, vLLM officially supports Llama 3.1 with FP8 quantization and pipeline parallelism! Please check out our blog post [here](https://blog.vllm.ai/2024/07/23/llama31.html). - [2024/06] We hosted [the fourth vLLM meetup](https://lu.ma/agivllm) with Cloudflare and BentoML! Please find the meetup slides [here](https://docs.google.com/presentation/d/1iJ8o7V2bQEi0BFEljLTwc5G1S10_Rhv3beed5oB0NJ4/edit?usp=sharing). 
- [2024/04] We hosted [the third vLLM meetup](https://robloxandvllmmeetup2024.splashthat.com/) with Roblox! Please find the meetup slides [here](https://docs.google.com/presentation/d/1A--47JAK4BJ39t954HyTkvtfwn0fkqtsL8NGFuslReM/edit?usp=sharing). diff --git a/docs/source/community/meetups.rst b/docs/source/community/meetups.rst index 0fde31ef9b059..3b01b109ebf2c 100644 --- a/docs/source/community/meetups.rst +++ b/docs/source/community/meetups.rst @@ -5,6 +5,7 @@ vLLM Meetups We host regular meetups in San Francisco Bay Area every 2 months. We will share the project updates from the vLLM team and have guest speakers from the industry to share their experience and insights. Please find the materials of our previous meetups below: +- `The fifth vLLM meetup `__, with AWS, July 24th 2024. `[Slides] `__ - `The fourth vLLM meetup `__, with Cloudflare and BentoML, June 11th 2024. `[Slides] `__ - `The third vLLM meetup `__, with Roblox, April 2nd 2024. `[Slides] `__ - `The second vLLM meetup `__, with IBM Research, January 31st 2024. `[Slides] `__ `[Video (vLLM Update)] `__ `[Video (IBM Research & torch.compile)] `__ From 1adddb14bf0e1a603581bca49e8d29e8bfb337dc Mon Sep 17 00:00:00 2001 From: SangBin Cho Date: Thu, 25 Jul 2024 16:53:25 -0700 Subject: [PATCH 037/167] [Core] Fix ray forward_dag error mssg (#6792) --- vllm/executor/ray_gpu_executor.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/vllm/executor/ray_gpu_executor.py b/vllm/executor/ray_gpu_executor.py index e4aaeaa24c1bc..564fa79acfd40 100644 --- a/vllm/executor/ray_gpu_executor.py +++ b/vllm/executor/ray_gpu_executor.py @@ -29,6 +29,7 @@ class RayGPUExecutor(DistributedGPUExecutor): uses_ray: bool = True def _init_executor(self) -> None: + self.forward_dag: Optional["ray.dag.CompiledDAG"] = None # If the env var is set, it uses the Ray's compiled DAG API # which optimizes the control plane overhead. # Run vLLM with VLLM_USE_RAY_COMPILED_DAG=1 to enable it. @@ -60,8 +61,6 @@ def _init_executor(self) -> None: # Create the parallel GPU workers. self._init_workers_ray(placement_group) - self.forward_dag: Optional["ray.dag.CompiledDAG"] = None - def _configure_ray_workers_use_nsight(self, ray_remote_kwargs) -> Dict[str, Any]: # If nsight profiling is enabled, we need to set the profiling From 443c7cf4cf891e6957d4b31655e58cabceb5a2a7 Mon Sep 17 00:00:00 2001 From: youkaichao Date: Thu, 25 Jul 2024 17:44:09 -0700 Subject: [PATCH 038/167] [ci][distributed] fix flaky tests (#6806) --- tests/distributed/test_pipeline_parallel.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/tests/distributed/test_pipeline_parallel.py b/tests/distributed/test_pipeline_parallel.py index d666b8a1d44bd..5ff39ddfbf996 100644 --- a/tests/distributed/test_pipeline_parallel.py +++ b/tests/distributed/test_pipeline_parallel.py @@ -1,3 +1,10 @@ +""" +WARNING: This test runs in both single-node (4 GPUs) and multi-node + (2 node with 2 GPUs each) modes. If the test only uses 2 GPUs, it is + important to set the distributed backend to "mp" to avoid Ray scheduling + all workers in a node other than the head node, which can cause the test + to fail. +""" import os import pytest @@ -78,7 +85,7 @@ def test_pp_cudagraph(PP_SIZE, MODEL_NAME, ATTN_BACKEND): "--pipeline-parallel-size", str(PP_SIZE), "--distributed-executor-backend", - "ray", + "mp", ] os.environ["VLLM_ATTENTION_BACKEND"] = ATTN_BACKEND From 2eb9f4ff262bb39859baebf8d2109abcdadee860 Mon Sep 17 00:00:00 2001 From: "Kevin H. 
Luu" Date: Thu, 25 Jul 2024 18:08:33 -0700 Subject: [PATCH 039/167] [ci] Mark tensorizer as soft fail and separate from grouped test (#6810) [ci] Mark tensorizer test as soft fail and separate it from grouped test in fast check (#6810) Signed-off-by: kevin --- .buildkite/test-pipeline.yaml | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index e7dd1fdb2e660..633bc5ca95bf9 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -17,11 +17,10 @@ steps: - pytest -v -s test_utils.py # Utils - pytest -v -s worker # Worker -- label: Tensorizer, Metrics, Tracing Test +- label: Metrics, Tracing Test fast_check: true fast_check_only: true commands: - - apt-get install -y curl libsodium23 && pytest -v -s tensorizer_loader # Tensorizer - pytest -v -s metrics # Metrics - "pip install \ opentelemetry-sdk \ @@ -221,6 +220,8 @@ steps: - label: Tensorizer Test #mirror_hardwares: [amd] + soft_fail: true + fast_check: true commands: - apt-get install -y curl libsodium23 - export VLLM_WORKER_MULTIPROC_METHOD=spawn From 062a1d0fab111723ab768f94bdd48a6adc054007 Mon Sep 17 00:00:00 2001 From: QQSong Date: Thu, 25 Jul 2024 19:24:58 -0700 Subject: [PATCH 040/167] Fix ReplicatedLinear weight loading (#6793) --- vllm/model_executor/layers/linear.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/vllm/model_executor/layers/linear.py b/vllm/model_executor/layers/linear.py index 0e0a2b72f93d4..b6e280ae65049 100644 --- a/vllm/model_executor/layers/linear.py +++ b/vllm/model_executor/layers/linear.py @@ -199,12 +199,16 @@ def __init__(self, self.input_size, self.output_size, self.params_dtype, + weight_loader=self.weight_loader, prefix=prefix) if bias: self.bias = Parameter( torch.empty(self.output_size, dtype=self.params_dtype)) - set_weight_attrs(self.bias, {"output_dim": 0}) + set_weight_attrs(self.bias, { + "output_dim": 0, + "weight_loader": self.weight_loader, + }) else: self.register_parameter("bias", None) From 084a01fd3544557990f8af8af6fd3c1185bae848 Mon Sep 17 00:00:00 2001 From: Anthony Platanios Date: Fri, 26 Jul 2024 00:25:35 -0400 Subject: [PATCH 041/167] [Bugfix] [Easy] Fixed a bug in the multiprocessing GPU executor. 
(#6770) --- vllm/executor/multiproc_gpu_executor.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/vllm/executor/multiproc_gpu_executor.py b/vllm/executor/multiproc_gpu_executor.py index 9811fc2a55199..19f7a497cdd9f 100644 --- a/vllm/executor/multiproc_gpu_executor.py +++ b/vllm/executor/multiproc_gpu_executor.py @@ -1,6 +1,7 @@ import asyncio import os import signal +import threading import weakref from functools import partial from typing import Any, List, Optional @@ -115,8 +116,9 @@ def shutdown(signum, frame): if executor := ref(): executor.shutdown() - signal.signal(signal.SIGINT, shutdown) - signal.signal(signal.SIGTERM, shutdown) + if threading.current_thread() is threading.main_thread(): + signal.signal(signal.SIGINT, shutdown) + signal.signal(signal.SIGTERM, shutdown) self.driver_worker = self._create_worker( distributed_init_method=distributed_init_method) From 89a84b0bb7b30706a02836234a94493ea8f780bf Mon Sep 17 00:00:00 2001 From: Peng Guanwen Date: Fri, 26 Jul 2024 12:31:31 +0800 Subject: [PATCH 042/167] [Core] Use array to speedup padding (#6779) --- vllm/model_executor/layers/sampler.py | 2 +- vllm/model_executor/sampling_metadata.py | 21 ++++++++++++--------- vllm/sequence.py | 23 ++++++++++++++++------- 3 files changed, 29 insertions(+), 17 deletions(-) diff --git a/vllm/model_executor/layers/sampler.py b/vllm/model_executor/layers/sampler.py index 5c376797a054f..121458f8156a1 100644 --- a/vllm/model_executor/layers/sampler.py +++ b/vllm/model_executor/layers/sampler.py @@ -220,7 +220,7 @@ def _apply_min_tokens_penalty( seqs_to_penalize: List[int] = [] for j, seq_id in enumerate(seq_ids): seq_data = seq_group.seq_data[seq_id] - if len(seq_data.output_token_ids) < min_tokens: + if len(seq_data.output_token_ids_array) < min_tokens: seqs_to_penalize.append(j) if seqs_to_penalize: diff --git a/vllm/model_executor/sampling_metadata.py b/vllm/model_executor/sampling_metadata.py index 390b5d173ebcd..27b37a9d53470 100644 --- a/vllm/model_executor/sampling_metadata.py +++ b/vllm/model_executor/sampling_metadata.py @@ -1,4 +1,5 @@ import random +from array import array from dataclasses import dataclass from typing import Dict, List, Optional, Tuple @@ -329,8 +330,8 @@ def from_sampling_metadata( user-defined seed for each sequence. extra_entropy: extra entropy to use when generating seeds. 
""" - prompt_tokens: List[List[int]] = [] - output_tokens: List[List[int]] = [] + prompt_tokens: List[array] = [] + output_tokens: List[array] = [] top_ks: List[int] = [] temperatures: List[float] = [] top_ps: List[float] = [] @@ -432,13 +433,15 @@ def from_sampling_metadata( if (seq_group.is_prompt and sampling_params.prompt_logprobs is not None): prefill_len = len(seq_group.prompt_logprob_indices) - prompt_tokens.extend([] for _ in range(prefill_len)) - output_tokens.extend([] for _ in range(prefill_len)) + prompt_tokens.extend( + array('l') for _ in range(prefill_len)) + output_tokens.extend( + array('l') for _ in range(prefill_len)) if seq_group.do_sample: for seq_id in seq_ids: seq_data = seq_group.seq_data[seq_id] - prompt_tokens.append(list(seq_data.prompt_token_ids)) - output_tokens.append(list(seq_data.output_token_ids)) + prompt_tokens.append(seq_data.prompt_token_ids_array) + output_tokens.append(seq_data.output_token_ids_array) sampling_tensors = SamplingTensors.from_lists( temperatures, top_ps, top_ks, min_ps, presence_penalties, @@ -454,9 +457,9 @@ def from_lists(cls, temperatures: List[float], top_ps: List[float], frequency_penalties: List[float], repetition_penalties: List[float], sampling_seeds: List[int], sample_indices: List[int], - prompt_tokens: List[List[int]], - output_tokens: List[List[int]], vocab_size: int, - extra_seeds_to_generate: int, device: torch.device, + prompt_tokens: List[array], output_tokens: List[array], + vocab_size: int, extra_seeds_to_generate: int, + device: torch.device, dtype: torch.dtype) -> "SamplingTensors": # Note that the performance will be very bad without # pinned memory. diff --git a/vllm/sequence.py b/vllm/sequence.py index 0cd4c7e71d78d..72821ecea0f47 100644 --- a/vllm/sequence.py +++ b/vllm/sequence.py @@ -3,6 +3,7 @@ import enum import math from abc import ABC, abstractmethod +from array import array from collections import defaultdict from dataclasses import dataclass, field from typing import (TYPE_CHECKING, Dict, List, Mapping, Optional, Set, Tuple, @@ -119,10 +120,10 @@ def __init__( prompt_token_ids: List[int], output_token_ids: Optional[List[int]] = None, ) -> None: - self._prompt_token_ids: List[int] = list(prompt_token_ids) + self._prompt_token_ids = array('l', prompt_token_ids) self._prompt_token_ids_tuple: Tuple[int, ...] = tuple(prompt_token_ids) - self._output_token_ids: List[int] = ( - list(output_token_ids) if output_token_ids is not None else []) + self._output_token_ids = array( + 'l', output_token_ids if output_token_ids is not None else []) self.cumulative_logprob = 0.0 # The number of tokens that are computed (that run against the model). 
@@ -132,8 +133,8 @@ def __init__( self._update_cached_all_tokens() def _update_cached_all_tokens(self): - self._cached_all_token_ids: List[int] = (self._prompt_token_ids + - self._output_token_ids) + self._cached_all_token_ids: List[int] = list(self._prompt_token_ids + + self._output_token_ids) @property def prompt_token_ids(self) -> Tuple[int, ...]: @@ -141,19 +142,27 @@ def prompt_token_ids(self) -> Tuple[int, ...]: @prompt_token_ids.setter def prompt_token_ids(self, new_prompt_token_ids) -> None: - self._prompt_token_ids = list(new_prompt_token_ids) + self._prompt_token_ids = array('l', new_prompt_token_ids) self._prompt_token_ids_tuple = tuple(new_prompt_token_ids) self._update_cached_all_tokens() + @property + def prompt_token_ids_array(self) -> array: + return self._prompt_token_ids + @property def output_token_ids(self) -> Tuple[int, ...]: return tuple(self._output_token_ids) @output_token_ids.setter def output_token_ids(self, new_output_token_ids) -> None: - self._output_token_ids = list(new_output_token_ids) + self._output_token_ids = array('l', new_output_token_ids) self._update_cached_all_tokens() + @property + def output_token_ids_array(self) -> array: + return self._output_token_ids + def append_token_id(self, token_id: int, logprob: float) -> None: self._output_token_ids.append(token_id) self._cached_all_token_ids.append(token_id) From 85ad7e2d012edd87de9e84e93ed3204c80599695 Mon Sep 17 00:00:00 2001 From: youkaichao Date: Thu, 25 Jul 2024 21:48:05 -0700 Subject: [PATCH 043/167] [doc][debugging] add known issues for hangs (#6816) --- docs/source/getting_started/debugging.rst | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/docs/source/getting_started/debugging.rst b/docs/source/getting_started/debugging.rst index 2aa52e79888a3..d7066f2325b3a 100644 --- a/docs/source/getting_started/debugging.rst +++ b/docs/source/getting_started/debugging.rst @@ -65,6 +65,10 @@ Here are some common issues that can cause hangs: If the problem persists, feel free to `open an issue on GitHub `_, with a detailed description of the issue, your environment, and the logs. +Some known issues: + +- In ``v0.5.2``, ``v0.5.3``, and ``v0.5.3.post1``, there is a bug caused by `zmq `_ , which can cause hangs at a low probability (once in about 20 times, depending on the machine configuration). The solution is to upgrade to the latest version of ``vllm`` to include the `fix `_ . + .. warning:: After you find the root cause and solve the issue, remember to turn off all the debugging environment variables defined above, or simply start a new shell to avoid being affected by the debugging settings. If you don't do this, the system might be slow because many debugging functionalities are turned on. 
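One note on the `array`-based change a couple of patches back ([Core] Use array to speedup padding, #6779): storing token ids as `array('l', ...)` keeps them in a contiguous C buffer instead of a list of boxed Python ints, which is what makes the padded sampling tensors cheaper to assemble. The snippet below is only an illustration of that property, not code from the patch; it assumes a 64-bit platform where the `'l'` typecode is 8 bytes and therefore matches `torch.int64`.

from array import array

import torch

# Token ids stored the way SequenceData keeps them after the patch.
output_token_ids = array('l', range(1024))

# Padding is a C-level buffer copy rather than Python-level list building.
padded = output_token_ids + array('l', [0]) * 16

# The buffer protocol lets torch view the data without touching each element.
fast = torch.frombuffer(padded, dtype=torch.int64)

# The list-based equivalent walks and unboxes every Python int.
slow = torch.tensor(list(padded), dtype=torch.int64)

assert torch.equal(fast, slow)

The actual win in the patch comes from `SequenceData` and `SamplingTensors.from_sampling_metadata` no longer copying every sequence's token ids into a fresh Python list on each scheduling step; the buffer view above is just the simplest way to see why the `array` representation is cheaper to turn into tensors.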
From 07278c37ddd898d842bbddc382e4f67ac08dae35 Mon Sep 17 00:00:00 2001 From: Michael Goin Date: Fri, 26 Jul 2024 14:33:42 -0400 Subject: [PATCH 044/167] [Model] Support Nemotron models (Nemotron-3, Nemotron-4, Minitron) (#6611) --- .../configs/Minitron-4B-Base.yaml | 11 + .../lm-eval-harness/configs/models-small.txt | 1 + vllm/model_executor/layers/activation.py | 16 + .../model_executor/layers/rotary_embedding.py | 3 + vllm/model_executor/models/__init__.py | 1 + vllm/model_executor/models/nemotron.py | 531 ++++++++++++++++++ vllm/transformers_utils/config.py | 3 +- vllm/transformers_utils/configs/__init__.py | 2 + vllm/transformers_utils/configs/nemotron.py | 209 +++++++ 9 files changed, 776 insertions(+), 1 deletion(-) create mode 100644 .buildkite/lm-eval-harness/configs/Minitron-4B-Base.yaml create mode 100644 vllm/model_executor/models/nemotron.py create mode 100644 vllm/transformers_utils/configs/nemotron.py diff --git a/.buildkite/lm-eval-harness/configs/Minitron-4B-Base.yaml b/.buildkite/lm-eval-harness/configs/Minitron-4B-Base.yaml new file mode 100644 index 0000000000000..a0466748ea71e --- /dev/null +++ b/.buildkite/lm-eval-harness/configs/Minitron-4B-Base.yaml @@ -0,0 +1,11 @@ +# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nvidia/Minitron-4B-Base -b auto -l 1000 -f 5 -t 1 +model_name: "nvidia/Minitron-4B-Base" +tasks: +- name: "gsm8k" + metrics: + - name: "exact_match,strict-match" + value: 0.252 + - name: "exact_match,flexible-extract" + value: 0.252 +limit: 1000 +num_fewshot: 5 diff --git a/.buildkite/lm-eval-harness/configs/models-small.txt b/.buildkite/lm-eval-harness/configs/models-small.txt index 109692395acf6..e4df4b547aa5e 100644 --- a/.buildkite/lm-eval-harness/configs/models-small.txt +++ b/.buildkite/lm-eval-harness/configs/models-small.txt @@ -4,5 +4,6 @@ Meta-Llama-3-8B-Instruct-FP8-compressed-tensors.yaml Meta-Llama-3-8B-Instruct-INT8-compressed-tensors.yaml Meta-Llama-3-8B-Instruct-nonuniform-compressed-tensors.yaml Meta-Llama-3-8B-Instruct-Channelwise-compressed-tensors.yaml +Minitron-4B-Base.yaml Qwen2-1.5B-Instruct-INT8-compressed-tensors.yaml Qwen2-1.5B-Instruct-FP8W8.yaml diff --git a/vllm/model_executor/layers/activation.py b/vllm/model_executor/layers/activation.py index 5bfdba67b443d..6578193a31597 100644 --- a/vllm/model_executor/layers/activation.py +++ b/vllm/model_executor/layers/activation.py @@ -159,6 +159,21 @@ def forward_cuda(self, x: torch.Tensor) -> torch.Tensor: # def forward_xpu(self, x: torch.Tensor) -> torch.Tensor: +class ReLUSquaredActivation(CustomOp): + """ + Applies the relu^2 activation introduced in https://arxiv.org/abs/2109.08668v2 + """ + + def forward_native(self, x: torch.Tensor) -> torch.Tensor: + """PyTorch-native implementation equivalent to forward().""" + relu_applied = nn.functional.relu(x) + squared = torch.square(relu_applied) + return squared + + def forward_cuda(self, x: torch.Tensor) -> torch.Tensor: + return self.forward_native(x) + + class ScaledActivation(nn.Module): """An activation function with post-scale parameters. 
@@ -207,6 +222,7 @@ def weight_loader(self, param: nn.Parameter, loaded_weight: torch.Tensor): "gelu_new": NewGELU(), "gelu_pytorch_tanh": nn.GELU(approximate="tanh"), "relu": nn.ReLU(), + "relu2": ReLUSquaredActivation(), "quick_gelu": QuickGELU(), } diff --git a/vllm/model_executor/layers/rotary_embedding.py b/vllm/model_executor/layers/rotary_embedding.py index 60ba4623edc38..aecba0ae74911 100644 --- a/vllm/model_executor/layers/rotary_embedding.py +++ b/vllm/model_executor/layers/rotary_embedding.py @@ -774,6 +774,7 @@ def get_rope( is_neox_style: bool = True, rope_scaling: Optional[Dict[str, Any]] = None, dtype: Optional[torch.dtype] = None, + rotary_percent: float = 1.0, ) -> RotaryEmbedding: if dtype is None: dtype = torch.get_default_dtype() @@ -786,6 +787,8 @@ def get_rope( rope_scaling_args = tuple(rope_scaling_tuple.items()) else: rope_scaling_args = None + if rotary_percent < 1.0: + rotary_dim = int(rotary_dim * rotary_percent) key = (head_size, rotary_dim, max_position, base, is_neox_style, rope_scaling_args, dtype) if key in _ROPE_DICT: diff --git a/vllm/model_executor/models/__init__.py b/vllm/model_executor/models/__init__.py index 7df5b8fa64710..ead64c0e92553 100644 --- a/vllm/model_executor/models/__init__.py +++ b/vllm/model_executor/models/__init__.py @@ -51,6 +51,7 @@ "MPTForCausalLM": ("mpt", "MPTForCausalLM"), "MiniCPMForCausalLM": ("minicpm", "MiniCPMForCausalLM"), "MiniCPMV": ("minicpmv", "MiniCPMV"), + "NemotronForCausalLM": ("nemotron", "NemotronForCausalLM"), "OlmoForCausalLM": ("olmo", "OlmoForCausalLM"), "OPTForCausalLM": ("opt", "OPTForCausalLM"), "OrionForCausalLM": ("orion", "OrionForCausalLM"), diff --git a/vllm/model_executor/models/nemotron.py b/vllm/model_executor/models/nemotron.py new file mode 100644 index 0000000000000..bb85f20ab9802 --- /dev/null +++ b/vllm/model_executor/models/nemotron.py @@ -0,0 +1,531 @@ +# coding=utf-8 +# Adapted from +# https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/llama/modeling_llama.py +# Copyright 2023 The vLLM team. +# Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved. +# +# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX +# and OPT implementations in this library. It has been modified from its +# original forms to accommodate minor architectural differences compared +# to GPT-NeoX and OPT used by the Meta AI team that trained the model. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+"""Inference-only Nemotron model compatible with HuggingFace weights.""" +from typing import Any, Dict, Iterable, List, Optional, Tuple, Union + +import torch +from torch import nn + +from vllm.attention import Attention, AttentionMetadata +from vllm.config import CacheConfig, LoRAConfig +from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size +from vllm.model_executor.layers.activation import get_act_fn +from vllm.model_executor.layers.linear import (ColumnParallelLinear, + QKVParallelLinear, + RowParallelLinear) +from vllm.model_executor.layers.logits_processor import LogitsProcessor +from vllm.model_executor.layers.quantization.base_config import ( + QuantizationConfig) +from vllm.model_executor.layers.rotary_embedding import get_rope +from vllm.model_executor.layers.sampler import Sampler +from vllm.model_executor.layers.vocab_parallel_embedding import ( + DEFAULT_VOCAB_PADDING_SIZE, ParallelLMHead, VocabParallelEmbedding) +from vllm.model_executor.model_loader.weight_utils import ( + default_weight_loader, maybe_remap_kv_scale_name) +from vllm.model_executor.sampling_metadata import SamplingMetadata +from vllm.sequence import IntermediateTensors, SamplerOutput +from vllm.transformers_utils.configs import NemotronConfig + +from .interfaces import SupportsLoRA +from .utils import PPMissingLayer, is_pp_missing_parameter, make_layers + +# The architecture is pretty similar to Llama, with these changes: +# - There is no gate_proj, just up_proj +# - Normal LayerNorm (with a +1 to the weights) instead of RMSNorm +# - Squared ReLU instead of SwiGLU +# - Adds a rotary_percent to RoPE + + +def _cast_if_autocast_enabled(*args): + if not torch.is_autocast_enabled(): + return args + else: + return torch.cuda.amp.autocast_mode._cast( + args, torch.get_autocast_gpu_dtype()) + + +class NemotronLayerNorm1P(nn.LayerNorm): + + def __init__(self, + normalized_shape: Union[int, List[int], torch.Size], + eps: float = 1e-5, + elementwise_affine: bool = True, + bias: bool = True, + device=None, + dtype=None): + super().__init__(normalized_shape, eps, elementwise_affine, bias, + device, dtype) + + def forward( + self, + x: torch.Tensor, + residual: Optional[torch.Tensor] = None, + ) -> torch.Tensor: + if residual is not None: + x = x + residual + residual = x + args = _cast_if_autocast_enabled(x, self.normalized_shape, + self.weight + 1, self.bias, self.eps) + with torch.cuda.amp.autocast(enabled=False): + x = torch.nn.functional.layer_norm(*args) + return x if residual is None else (x, residual) + + +class NemotronMLP(nn.Module): + + def __init__( + self, + hidden_size: int, + intermediate_size: int, + hidden_act: str, + quant_config: Optional[QuantizationConfig] = None, + bias: bool = False, + prefix: str = "", + ) -> None: + super().__init__() + self.up_proj = ColumnParallelLinear(input_size=hidden_size, + output_size=intermediate_size, + bias=bias, + quant_config=quant_config, + prefix=f"{prefix}.up_proj") + self.down_proj = RowParallelLinear(input_size=intermediate_size, + output_size=hidden_size, + bias=bias, + quant_config=quant_config, + prefix=f"{prefix}.down_proj") + self.act_fn = get_act_fn(hidden_act) + + def forward(self, x): + up, _ = self.up_proj(x) + x = self.act_fn(up) + x, _ = self.down_proj(x) + return x + + +class NemotronAttention(nn.Module): + + def __init__( + self, + config: NemotronConfig, + hidden_size: int, + num_heads: int, + num_kv_heads: int, + rope_theta: float = 10000, + rope_scaling: Optional[Dict[str, Any]] = None, + max_position_embeddings: int = 
8192, + quant_config: Optional[QuantizationConfig] = None, + bias: bool = False, + cache_config: Optional[CacheConfig] = None, + prefix: str = "", + ) -> None: + super().__init__() + self.hidden_size = hidden_size + tp_size = get_tensor_model_parallel_world_size() + self.total_num_heads = num_heads + assert self.total_num_heads % tp_size == 0 + self.num_heads = self.total_num_heads // tp_size + self.total_num_kv_heads = num_kv_heads + if self.total_num_kv_heads >= tp_size: + # Number of KV heads is greater than TP size, so we partition + # the KV heads across multiple tensor parallel GPUs. + assert self.total_num_kv_heads % tp_size == 0 + else: + # Number of KV heads is less than TP size, so we replicate + # the KV heads across multiple tensor parallel GPUs. + assert tp_size % self.total_num_kv_heads == 0 + self.num_kv_heads = max(1, self.total_num_kv_heads // tp_size) + # MistralConfig has an optional head_dim introduced by Mistral-Nemo + self.head_dim = getattr(config, "head_dim", + self.hidden_size // self.total_num_heads) + self.q_size = self.num_heads * self.head_dim + self.kv_size = self.num_kv_heads * self.head_dim + self.scaling = self.head_dim**-0.5 + self.rope_theta = rope_theta + self.rotary_percent = config.rope_percent + self.max_position_embeddings = max_position_embeddings + + self.qkv_proj = QKVParallelLinear( + hidden_size=hidden_size, + head_size=self.head_dim, + total_num_heads=self.total_num_heads, + total_num_kv_heads=self.total_num_kv_heads, + bias=bias, + quant_config=quant_config, + prefix=f"{prefix}.qkv_proj", + ) + self.o_proj = RowParallelLinear( + input_size=self.total_num_heads * self.head_dim, + output_size=hidden_size, + bias=bias, + quant_config=quant_config, + prefix=f"{prefix}.o_proj", + ) + + self.rotary_emb = get_rope( + self.head_dim, + rotary_dim=self.head_dim, + max_position=max_position_embeddings, + base=rope_theta, + rope_scaling=rope_scaling, + rotary_percent=self.rotary_percent, + ) + self.attn = Attention(self.num_heads, + self.head_dim, + self.scaling, + num_kv_heads=self.num_kv_heads, + cache_config=cache_config, + quant_config=quant_config) + + def forward( + self, + positions: torch.Tensor, + hidden_states: torch.Tensor, + kv_cache: torch.Tensor, + attn_metadata: AttentionMetadata, + ) -> torch.Tensor: + qkv, _ = self.qkv_proj(hidden_states) + q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1) + q, k = self.rotary_emb(positions, q, k) + attn_output = self.attn(q, k, v, kv_cache, attn_metadata) + output, _ = self.o_proj(attn_output) + return output + + +class NemotronDecoderLayer(nn.Module): + + def __init__( + self, + config: NemotronConfig, + cache_config: Optional[CacheConfig] = None, + quant_config: Optional[QuantizationConfig] = None, + prefix: str = "", + ) -> None: + super().__init__() + self.hidden_size = config.hidden_size + rope_theta = getattr(config, "rope_theta", 10000) + rope_scaling = getattr(config, "rope_scaling", None) + if rope_scaling is not None and getattr( + config, "original_max_position_embeddings", None): + rope_scaling["original_max_position_embeddings"] = ( + config.original_max_position_embeddings) + max_position_embeddings = getattr(config, "max_position_embeddings", + 8192) + # Support abacusai/Smaug-72B-v0.1 with attention_bias + # Support internlm/internlm-7b with bias + attention_bias = getattr(config, "attention_bias", False) or getattr( + config, "bias", False) + self.self_attn = NemotronAttention( + config=config, + hidden_size=self.hidden_size, + num_heads=config.num_attention_heads, + 
num_kv_heads=getattr(config, "num_key_value_heads", + config.num_attention_heads), + rope_theta=rope_theta, + rope_scaling=rope_scaling, + max_position_embeddings=max_position_embeddings, + quant_config=quant_config, + bias=attention_bias, + cache_config=cache_config, + prefix=f"{prefix}.self_attn", + ) + self.mlp = NemotronMLP( + hidden_size=self.hidden_size, + intermediate_size=config.intermediate_size, + hidden_act=config.hidden_act, + quant_config=quant_config, + bias=getattr(config, "mlp_bias", False), + prefix=f"{prefix}.mlp", + ) + self.input_layernorm = NemotronLayerNorm1P(config.hidden_size, + eps=config.norm_eps) + self.post_attention_layernorm = NemotronLayerNorm1P( + config.hidden_size, eps=config.norm_eps) + + def forward( + self, + positions: torch.Tensor, + hidden_states: torch.Tensor, + kv_cache: torch.Tensor, + attn_metadata: AttentionMetadata, + residual: Optional[torch.Tensor], + ) -> Tuple[torch.Tensor, torch.Tensor]: + # Self Attention + if residual is None: + residual = hidden_states + hidden_states = self.input_layernorm(hidden_states) + else: + hidden_states, residual = self.input_layernorm( + hidden_states, residual) + hidden_states = self.self_attn( + positions=positions, + hidden_states=hidden_states, + kv_cache=kv_cache, + attn_metadata=attn_metadata, + ) + + # Fully Connected + hidden_states, residual = self.post_attention_layernorm( + hidden_states, residual) + hidden_states = self.mlp(hidden_states) + return hidden_states, residual + + +class NemotronModel(nn.Module): + + def __init__( + self, + config: NemotronConfig, + cache_config: Optional[CacheConfig] = None, + quant_config: Optional[QuantizationConfig] = None, + lora_config: Optional[LoRAConfig] = None, + prefix: str = "", + ) -> None: + super().__init__() + self.config = config + self.padding_idx = config.pad_token_id + lora_vocab = (lora_config.lora_extra_vocab_size * + (lora_config.max_loras or 1)) if lora_config else 0 + self.vocab_size = config.vocab_size + lora_vocab + self.org_vocab_size = config.vocab_size + if get_pp_group().is_first_rank or (config.tie_word_embeddings + and get_pp_group().is_last_rank): + self.embed_tokens = VocabParallelEmbedding( + self.vocab_size, + config.hidden_size, + org_num_embeddings=config.vocab_size, + ) + else: + self.embed_tokens = PPMissingLayer() + self.start_layer, self.end_layer, self.layers = make_layers( + config.num_hidden_layers, + lambda prefix: NemotronDecoderLayer(config=config, + cache_config=cache_config, + quant_config=quant_config, + prefix=prefix), + prefix=f"{prefix}.layers") + if get_pp_group().is_last_rank: + self.norm = NemotronLayerNorm1P(config.hidden_size, + eps=config.norm_eps) + else: + self.norm = PPMissingLayer() + + def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor: + return self.embed_tokens(input_ids) + + def forward( + self, + input_ids: Optional[torch.Tensor], + positions: torch.Tensor, + kv_caches: List[torch.Tensor], + attn_metadata: AttentionMetadata, + intermediate_tensors: Optional[IntermediateTensors], + inputs_embeds: Optional[torch.Tensor] = None, + ) -> Union[torch.Tensor, IntermediateTensors]: + if get_pp_group().is_first_rank: + if inputs_embeds is not None: + hidden_states = inputs_embeds + else: + hidden_states = self.get_input_embeddings(input_ids) + residual = None + else: + assert intermediate_tensors is not None + hidden_states = intermediate_tensors["hidden_states"] + residual = intermediate_tensors["residual"] + + for i in range(self.start_layer, self.end_layer): + layer = self.layers[i] + 
hidden_states, residual = layer( + positions, + hidden_states, + kv_caches[i - self.start_layer], + attn_metadata, + residual, + ) + + if not get_pp_group().is_last_rank: + return IntermediateTensors({ + "hidden_states": hidden_states, + "residual": residual + }) + + hidden_states, _ = self.norm(hidden_states, residual) + return hidden_states + + +class NemotronForCausalLM(nn.Module, SupportsLoRA): + packed_modules_mapping = { + "qkv_proj": [ + "q_proj", + "k_proj", + "v_proj", + ], + } + + # LoRA specific attributes + supported_lora_modules = [ + "qkv_proj", "o_proj", "up_proj", "down_proj", "embed_tokens", "lm_head" + ] + embedding_modules = { + "embed_tokens": "input_embeddings", + "lm_head": "output_embeddings", + } + embedding_padding_modules = ["lm_head"] + bitsandbytes_stacked_params_mapping = { + # shard_name, weight_name, index + "q_proj": ("qkv_proj", 0), + "k_proj": ("qkv_proj", 1), + "v_proj": ("qkv_proj", 2), + } + + def __init__( + self, + config: NemotronConfig, + cache_config: Optional[CacheConfig] = None, + quant_config: Optional[QuantizationConfig] = None, + lora_config: Optional[LoRAConfig] = None, + ) -> None: + super().__init__() + + assert isinstance(config, NemotronConfig) + + self.config = config + self.lora_config = lora_config + + self.model = NemotronModel(config, + cache_config, + quant_config, + lora_config=lora_config, + prefix="model") + if get_pp_group().is_last_rank: + self.unpadded_vocab_size = config.vocab_size + if lora_config: + self.unpadded_vocab_size += lora_config.lora_extra_vocab_size + self.lm_head = ParallelLMHead( + self.unpadded_vocab_size, + config.hidden_size, + org_num_embeddings=config.vocab_size, + padding_size=DEFAULT_VOCAB_PADDING_SIZE + # We need bigger padding if using lora for kernel + # compatibility + if not lora_config else lora_config.lora_vocab_padding_size, + quant_config=quant_config, + ) + if config.tie_word_embeddings: + self.lm_head.weight = self.model.embed_tokens.weight + + logit_scale = getattr(config, "logit_scale", 1.0) + self.logits_processor = LogitsProcessor(self.unpadded_vocab_size, + config.vocab_size, + logit_scale) + self.sampler = Sampler() + else: + self.lm_head = PPMissingLayer() + + def forward( + self, + input_ids: torch.Tensor, + positions: torch.Tensor, + kv_caches: List[torch.Tensor], + attn_metadata: AttentionMetadata, + intermediate_tensors: Optional[IntermediateTensors] = None, + ) -> Union[torch.Tensor, IntermediateTensors]: + model_output = self.model(input_ids, positions, kv_caches, + attn_metadata, intermediate_tensors) + return model_output + + def compute_logits(self, hidden_states: torch.Tensor, + sampling_metadata: SamplingMetadata) -> torch.Tensor: + logits = self.logits_processor(self.lm_head, hidden_states, + sampling_metadata) + return logits + + def sample( + self, + logits: torch.Tensor, + sampling_metadata: SamplingMetadata, + ) -> Optional[SamplerOutput]: + next_tokens = self.sampler(logits, sampling_metadata) + return next_tokens + + def make_empty_intermediate_tensors( + self, batch_size: int, dtype: torch.dtype, + device: torch.device) -> IntermediateTensors: + return IntermediateTensors({ + "hidden_states": + torch.zeros((batch_size, self.config.hidden_size), + dtype=dtype, + device=device), + "residual": + torch.zeros((batch_size, self.config.hidden_size), + dtype=dtype, + device=device), + }) + + def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): + stacked_params_mapping = [ + # (param_name, shard_name, shard_id) + (".qkv_proj", ".q_proj", "q"), + (".qkv_proj", 
".k_proj", "k"), + (".qkv_proj", ".v_proj", "v"), + ] + params_dict = dict(self.named_parameters()) + for name, loaded_weight in weights: + if "rotary_emb.inv_freq" in name: + continue + if ("rotary_emb.cos_cached" in name + or "rotary_emb.sin_cached" in name): + # Models trained using ColossalAI may include these tensors in + # the checkpoint. Skip them. + continue + for (param_name, weight_name, shard_id) in stacked_params_mapping: + if weight_name not in name: + continue + name = name.replace(weight_name, param_name) + # Skip loading extra bias for GPTQ models. + if name.endswith(".bias") and name not in params_dict: + continue + + if is_pp_missing_parameter(name, self): + continue + + param = params_dict[name] + weight_loader = param.weight_loader + weight_loader(param, loaded_weight, shard_id) + + break + else: + # Skip loading extra bias for GPTQ models. + if name.endswith(".bias") and name not in params_dict: + continue + # Remapping the name of FP8 kv-scale. + name = maybe_remap_kv_scale_name(name, params_dict) + if name is None: + continue + + if is_pp_missing_parameter(name, self): + continue + + param = params_dict[name] + weight_loader = getattr(param, "weight_loader", + default_weight_loader) + weight_loader(param, loaded_weight) diff --git a/vllm/transformers_utils/config.py b/vllm/transformers_utils/config.py index 652505a892142..3ba2e01985598 100644 --- a/vllm/transformers_utils/config.py +++ b/vllm/transformers_utils/config.py @@ -8,7 +8,7 @@ from vllm.transformers_utils.configs import (ChatGLMConfig, DbrxConfig, JAISConfig, MedusaConfig, MLPSpeculatorConfig, MPTConfig, - RWConfig) + NemotronConfig, RWConfig) if VLLM_USE_MODELSCOPE: from modelscope import AutoConfig @@ -26,6 +26,7 @@ "jais": JAISConfig, "mlp_speculator": MLPSpeculatorConfig, "medusa": MedusaConfig, + "nemotron": NemotronConfig, } for name, cls in _CONFIG_REGISTRY.items(): diff --git a/vllm/transformers_utils/configs/__init__.py b/vllm/transformers_utils/configs/__init__.py index 51de11ca3e42a..1750950b3c38b 100644 --- a/vllm/transformers_utils/configs/__init__.py +++ b/vllm/transformers_utils/configs/__init__.py @@ -8,6 +8,7 @@ from vllm.transformers_utils.configs.medusa import MedusaConfig from vllm.transformers_utils.configs.mlp_speculator import MLPSpeculatorConfig from vllm.transformers_utils.configs.mpt import MPTConfig +from vllm.transformers_utils.configs.nemotron import NemotronConfig __all__ = [ "ChatGLMConfig", @@ -17,4 +18,5 @@ "JAISConfig", "MedusaConfig", "MLPSpeculatorConfig", + "NemotronConfig", ] diff --git a/vllm/transformers_utils/configs/nemotron.py b/vllm/transformers_utils/configs/nemotron.py new file mode 100644 index 0000000000000..a22a9f475dda9 --- /dev/null +++ b/vllm/transformers_utils/configs/nemotron.py @@ -0,0 +1,209 @@ +# coding=utf-8 +# Copyright 2024 HuggingFace Inc. team. All rights reserved. +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+"""Nemotron model configuration""" + +from transformers import PretrainedConfig +from transformers.utils import logging + +logger = logging.get_logger(__name__) + + +class NemotronConfig(PretrainedConfig): + r""" + This is the configuration class to store the configuration of a + [`NemotronModel`]. It is used to instantiate an Nemotron model + according to the specified arguments, defining the model architecture. + Instantiating a configuration with the defaults will yield a similar + configuration to that of the Nemotron-8B. + + Configuration objects inherit from [`PretrainedConfig`] and can be + used to control the model outputs. Read the documentation from + [`PretrainedConfig`] for more information. + + + Args: + vocab_size (`int`, *optional*, defaults to 32000): + Vocabulary size of the Nemotron model. Defines the number of + different tokens that can be represented by the + `inputs_ids` passed when calling [`NemotronModel`] + hidden_size (`int`, *optional*, defaults to 4096): + Dimension of the hidden representations. + intermediate_size (`int`, *optional*, defaults to 11008): + Dimension of the MLP representations. + num_hidden_layers (`int`, *optional*, defaults to 32): + Number of hidden layers in the Transformer decoder. + num_attention_heads (`int`, *optional*, defaults to 32): + Number of attention heads for each attention layer in the + Transformer decoder. + head_dim (`int`, *optional*, defaults to None): + Projection weights dimension in multi-head attention. Set to + hidden_size // num_attention_heads if None + num_key_value_heads (`int`, *optional*): + This is the number of key_value heads that should be used to + implement Grouped Query Attention. If + `num_key_value_heads=num_attention_heads`, the model will use + Multi Head Attention (MHA), if + `num_key_value_heads=1 the model will use Multi Query Attention + (MQA) otherwise GQA is used. When converting a multi-head + checkpoint to a GQA checkpoint, each group key and value + head should be constructed by meanpooling all the original + heads within that group. For more details checkout + [this paper](https://arxiv.org/pdf/2305.13245.pdf). If it + is not specified, will default to `num_attention_heads`. + hidden_act (`str` or `function`, *optional*, defaults to `"silu"`): + The non-linear activation function (function or string) in the + decoder. + max_position_embeddings (`int`, *optional*, defaults to 2048): + The maximum sequence length that this model might ever be used + with. + initializer_range (`float`, *optional*, defaults to 0.02): + The standard deviation of the truncated_normal_initializer for + initializing all weight matrices. + norm_eps (`float`, *optional*, defaults to 1e-06): + The epsilon used by the normalization layers. + use_cache (`bool`, *optional*, defaults to `True`): + Whether or not the model should return the last key/values + attentions (not used by all models). Only relevant if + `config.is_decoder=True`. + pad_token_id (`int`, *optional*): + Padding token id. + bos_token_id (`int`, *optional*, defaults to 1): + Beginning of stream token id. + eos_token_id (`int`, *optional*, defaults to 2): + End of stream token id. + tie_word_embeddings (`bool`, *optional*, defaults to `False`): + Whether to tie weight embeddings + rope_theta (`float`, *optional*, defaults to 10000.0): + The base period of the RoPE embeddings. + rope_scaling (`Dict`, *optional*): + Dictionary containing the scaling configuration for the RoPE + embeddings. Currently supports two scaling strategies: linear + and dynamic. 
Their scaling factor must be a float greater than 1. + The expected format is `{"type": strategy name, + "factor": scaling factor}`. When using this flag, don't update + `max_position_embeddings` to the expected new maximum. + attention_bias (`bool`, *optional*, defaults to `False`): + Whether to use a bias in the query, key, value and output + projection layers during self-attention. + attention_dropout (`float`, *optional*, defaults to 0.0): + The dropout ratio for the attention probabilities. + mlp_bias (`bool`, *optional*, defaults to `False`): + Whether to use a bias in up_proj and down_proj layers in the MLP + layers. + + ```python + >>> from transformers import NemotronModel, NemotronConfig + + >>> # Initializing a Nemotron nemotron-15b style configuration + >>> configuration = NemotronConfig() + + >>> # Initializing a model from the nemotron-15b style configuration + >>> model = NemotronModel(configuration) + + >>> # Accessing the model configuration + >>> configuration = model.config + ```""" + + model_type = "nemotron" + keys_to_ignore_at_inference = ["past_key_values"] + + def __init__( + self, + vocab_size=256000, + hidden_size=6144, + intermediate_size=24576, + num_hidden_layers=32, + num_attention_heads=48, + head_dim=None, + num_key_value_heads=None, + hidden_act="relu2", + max_position_embeddings=4096, + initializer_range=0.0134, + norm_eps=1e-5, + use_cache=True, + pad_token_id=None, + bos_token_id=2, + eos_token_id=3, + tie_word_embeddings=False, + rope_theta=10000.0, + rope_scaling=None, + rope_percent=0.5, + attention_bias=False, + attention_dropout=0.0, + mlp_bias=False, + **kwargs, + ): + self.vocab_size = vocab_size + self.max_position_embeddings = max_position_embeddings + self.hidden_size = hidden_size + self.intermediate_size = intermediate_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + head_dim = head_dim or kwargs.get("kv_channels", None) + self.head_dim = head_dim if head_dim is not None else ( + hidden_size // num_attention_heads) + + # for backward compatibility + if num_key_value_heads is None: + num_key_value_heads = num_attention_heads + + self.num_key_value_heads = num_key_value_heads + self.hidden_act = hidden_act + self.initializer_range = initializer_range + self.norm_eps = norm_eps + self.use_cache = use_cache + self.rope_theta = rope_theta + self.rope_scaling = rope_scaling + rope_percent = rope_percent or kwargs.get("rope_percentage", None) + self.rope_percent = rope_percent + self._rope_scaling_validation() + self.attention_bias = attention_bias + self.attention_dropout = attention_dropout + self.mlp_bias = mlp_bias + + super().__init__( + pad_token_id=pad_token_id, + bos_token_id=bos_token_id, + eos_token_id=eos_token_id, + tie_word_embeddings=tie_word_embeddings, + **kwargs, + ) + + def _rope_scaling_validation(self): + """ + Validate the `rope_scaling` configuration. 
+ """ + if self.rope_scaling is None: + return + + if not isinstance(self.rope_scaling, + dict) or len(self.rope_scaling) != 2: + raise ValueError( + "`rope_scaling` must be a dictionary with two fields, " + f"`type` and `factor`, got {self.rope_scaling}") + rope_scaling_type = self.rope_scaling.get("type", None) + rope_scaling_factor = self.rope_scaling.get("factor", None) + if rope_scaling_type is None or rope_scaling_type not in [ + "linear", "dynamic" + ]: + raise ValueError( + "`rope_scaling`'s type field must be one of ['linear', " + f"'dynamic'], got {rope_scaling_type}") + if rope_scaling_factor is None or not isinstance( + rope_scaling_factor, float) or rope_scaling_factor <= 1.0: + raise ValueError( + "`rope_scaling`'s factor field must be a float > 1, got " + f"{rope_scaling_factor}") From 50704f52c4643777fb0e5dc99f6c048dd9f54f2d Mon Sep 17 00:00:00 2001 From: Tyler Michael Smith Date: Fri, 26 Jul 2024 14:41:04 -0400 Subject: [PATCH 045/167] [Bugfix][Kernel] Promote another index to int64_t (#6838) --- csrc/quantization/fp8/common.cu | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/csrc/quantization/fp8/common.cu b/csrc/quantization/fp8/common.cu index 090f95d1bda71..6dae32b25f9c4 100644 --- a/csrc/quantization/fp8/common.cu +++ b/csrc/quantization/fp8/common.cu @@ -48,7 +48,7 @@ __global__ void segmented_max_reduction(float* __restrict__ scale, const scalar_t* __restrict__ input, int64_t num_elems) { __shared__ float cache[1024]; - int i = blockDim.x * blockIdx.x + threadIdx.x; + int64_t i = blockDim.x * blockIdx.x + threadIdx.x; // First store maximum for all values processes by // the current thread in cache[threadIdx.x] From 71734f1bf263ed4877e928d7d9c4522d12e9c61f Mon Sep 17 00:00:00 2001 From: Woosuk Kwon Date: Fri, 26 Jul 2024 12:28:32 -0700 Subject: [PATCH 046/167] [Build/CI][ROCm] Minor simplification to Dockerfile.rocm (#6811) --- Dockerfile.rocm | 11 ++--------- 1 file changed, 2 insertions(+), 9 deletions(-) diff --git a/Dockerfile.rocm b/Dockerfile.rocm index ff39791456398..7b4c0166a04bd 100644 --- a/Dockerfile.rocm +++ b/Dockerfile.rocm @@ -53,9 +53,9 @@ RUN apt-get purge -y sccache; python3 -m pip uninstall -y sccache; rm -f "$(whic # Install torch == 2.5.0 on ROCm RUN case "$(ls /opt | grep -Po 'rocm-[0-9]\.[0-9]')" in \ *"rocm-6.1"*) \ - python3 -m pip uninstall -y torch torchaudio torchvision \ + python3 -m pip uninstall -y torch torchvision \ && python3 -m pip install --no-cache-dir --pre \ - torch==2.5.0.dev20240710 torchaudio==2.4.0.dev20240710 \ + torch==2.5.0.dev20240710 \ torchvision==0.20.0.dev20240710 \ --index-url https://download.pytorch.org/whl/nightly/rocm6.1;; \ *) ;; esac @@ -127,13 +127,6 @@ FROM base AS final # Import the vLLM development directory from the build context COPY . . -# Error related to odd state for numpy 1.20.3 where there is no METADATA etc, but an extra LICENSES_bundled.txt. 
-# Manually remove it so that later steps of numpy upgrade can continue -RUN case "$(which python3)" in \ - *"/opt/conda/envs/py_3.9"*) \ - rm -rf /opt/conda/envs/py_3.9/lib/python3.9/site-packages/numpy-1.20.3.dist-info/;; \ - *) ;; esac - # Package upgrades for useful functionality or to avoid dependency issues RUN --mount=type=cache,target=/root/.cache/pip \ python3 -m pip install --upgrade numba scipy huggingface-hub[cli] From aa4867791ecd73a5f55b7bad4d9372954e661fe4 Mon Sep 17 00:00:00 2001 From: Woosuk Kwon Date: Fri, 26 Jul 2024 12:39:49 -0700 Subject: [PATCH 047/167] [Misc][TPU] Support TPU in initialize_ray_cluster (#6812) --- vllm/executor/ray_utils.py | 36 +++++++++++++++++++++--------------- 1 file changed, 21 insertions(+), 15 deletions(-) diff --git a/vllm/executor/ray_utils.py b/vllm/executor/ray_utils.py index fcbfa30d7a38a..58b864070f727 100644 --- a/vllm/executor/ray_utils.py +++ b/vllm/executor/ray_utils.py @@ -3,7 +3,7 @@ from vllm.config import ParallelConfig from vllm.logger import init_logger from vllm.sequence import ExecuteModelRequest -from vllm.utils import get_ip, is_hip, is_xpu +from vllm.utils import get_ip, is_hip, is_tpu, is_xpu from vllm.worker.worker_base import WorkerWrapperBase logger = init_logger(__name__) @@ -93,32 +93,38 @@ def initialize_ray_cluster( # Placement group is already set. return + device_str = "GPU" if not is_tpu() else "TPU" # Create placement group for worker processes current_placement_group = ray.util.get_current_placement_group() if current_placement_group: # We are in a placement group bundles = current_placement_group.bundle_specs # Verify that we can use the placement group. - gpu_bundles = 0 + device_bundles = 0 for bundle in bundles: - bundle_gpus = bundle.get("GPU", 0) - if bundle_gpus > 1: + bundle_devices = bundle.get(device_str, 0) + if bundle_devices > 1: raise ValueError( - "Placement group bundle cannot have more than 1 GPU.") - if bundle_gpus: - gpu_bundles += 1 - if parallel_config.world_size > gpu_bundles: + "Placement group bundle cannot have more than 1 " + f"{device_str}.") + if bundle_devices: + device_bundles += 1 + if parallel_config.world_size > device_bundles: raise ValueError( - "The number of required GPUs exceeds the total number of " - "available GPUs in the placement group.") + f"The number of required {device_str}s exceeds the total " + f"number of available {device_str}s in the placement group." + f"Required number of devices: {parallel_config.world_size}. 
" + f"Total number of devices: {device_bundles}.") else: - num_gpus_in_cluster = ray.cluster_resources().get("GPU", 0) - if parallel_config.world_size > num_gpus_in_cluster: + num_devices_in_cluster = ray.cluster_resources().get(device_str, 0) + if parallel_config.world_size > num_devices_in_cluster: raise ValueError( - "The number of required GPUs exceeds the total number of " - "available GPUs in the cluster.") + f"The number of required {device_str}s exceeds the total " + f"number of available {device_str}s in the placement group.") # Create a new placement group - placement_group_specs = ([{"GPU": 1}] * parallel_config.world_size) + placement_group_specs = ([{ + device_str: 1 + }] * parallel_config.world_size) current_placement_group = ray.util.placement_group( placement_group_specs) # Wait until PG is ready - this will block until all From 3bbb4936dc5aa7737750410ab4b4647817dcf9a3 Mon Sep 17 00:00:00 2001 From: "Li, Jiang" Date: Sat, 27 Jul 2024 04:50:10 +0800 Subject: [PATCH 048/167] [Hardware] [Intel] Enable Multiprocessing and tensor parallel in CPU backend and update documentation (#6125) --- .buildkite/run-cpu-test.sh | 28 +- Dockerfile.cpu | 9 +- cmake/cpu_extension.cmake | 4 + csrc/cpu/torch_bindings.cpp | 7 + csrc/cpu/utils.cpp | 65 +++++ .../getting_started/cpu-installation.rst | 55 +++- requirements-cpu.txt | 4 +- vllm/distributed/parallel_state.py | 3 + vllm/engine/async_llm_engine.py | 2 - vllm/envs.py | 8 +- vllm/executor/cpu_executor.py | 264 +++++++++++++++--- vllm/utils.py | 21 -- vllm/worker/cpu_model_runner.py | 7 +- vllm/worker/cpu_worker.py | 17 +- 14 files changed, 404 insertions(+), 90 deletions(-) create mode 100644 csrc/cpu/utils.cpp diff --git a/.buildkite/run-cpu-test.sh b/.buildkite/run-cpu-test.sh index a7678aae54644..21deec2bba973 100644 --- a/.buildkite/run-cpu-test.sh +++ b/.buildkite/run-cpu-test.sh @@ -3,26 +3,38 @@ set -ex # Try building the docker image -docker build -t cpu-test -f Dockerfile.cpu . -docker build --build-arg VLLM_CPU_DISABLE_AVX512="true" -t cpu-test-avx2 -f Dockerfile.cpu . +numactl -C 48-95 -N 1 docker build -t cpu-test -f Dockerfile.cpu . +numactl -C 48-95 -N 1 docker build --build-arg VLLM_CPU_DISABLE_AVX512="true" -t cpu-test-avx2 -f Dockerfile.cpu . # Setup cleanup remove_docker_container() { docker rm -f cpu-test cpu-test-avx2 || true; } trap remove_docker_container EXIT remove_docker_container -# Run the image +# Run the image, setting --shm-size=4g for tensor parallel. 
docker run -itd --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --cpuset-cpus=48-95 \ - --cpuset-mems=1 --network host -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --name cpu-test cpu-test + --cpuset-mems=1 --privileged=true --network host -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --shm-size=4g --name cpu-test cpu-test docker run -itd --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --cpuset-cpus=48-95 \ - --cpuset-mems=1 --network host -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --name cpu-test-avx2 cpu-test-avx2 + --cpuset-mems=1 --privileged=true --network host -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --shm-size=4g --name cpu-test-avx2 cpu-test-avx2 # offline inference -docker exec cpu-test bash -c "python3 examples/offline_inference.py" docker exec cpu-test-avx2 bash -c "python3 examples/offline_inference.py" # Run basic model test -docker exec cpu-test bash -c "cd tests; +docker exec cpu-test bash -c " pip install pytest Pillow protobuf - cd ../ pytest -v -s tests/models -m \"not vlm\" --ignore=tests/models/test_embedding.py --ignore=tests/models/test_registry.py --ignore=tests/models/test_jamba.py" # Mamba on CPU is not supported + +# online inference +docker exec cpu-test bash -c " + export VLLM_CPU_KVCACHE_SPACE=10 + export VLLM_CPU_OMP_THREADS_BIND=48-92 + python3 -m vllm.entrypoints.openai.api_server --model facebook/opt-125m & + timeout 600 bash -c 'until curl localhost:8000/v1/models; do sleep 1; done' || exit 1 + python3 benchmarks/benchmark_serving.py \ + --backend vllm \ + --dataset-name random \ + --model facebook/opt-125m \ + --num-prompts 20 \ + --endpoint /v1/completions \ + --tokenizer facebook/opt-125m" diff --git a/Dockerfile.cpu b/Dockerfile.cpu index f95d748f1e4be..c473ba431e680 100644 --- a/Dockerfile.cpu +++ b/Dockerfile.cpu @@ -2,8 +2,8 @@ FROM ubuntu:22.04 AS cpu-test-1 -RUN apt-get update -y \ - && apt-get install -y git wget vim numactl gcc-12 g++-12 python3 python3-pip libtcmalloc-minimal4 \ +RUN apt-get update -y \ + && apt-get install -y curl git wget vim numactl gcc-12 g++-12 python3 python3-pip libtcmalloc-minimal4 libnuma-dev \ && update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-12 10 --slave /usr/bin/g++ g++ /usr/bin/g++-12 # https://intel.github.io/intel-extension-for-pytorch/cpu/latest/tutorials/performance_tuning/tuning_guide.html @@ -13,8 +13,9 @@ RUN pip install intel-openmp ENV LD_PRELOAD="/usr/lib/x86_64-linux-gnu/libtcmalloc_minimal.so.4:/usr/local/lib/libiomp5.so:$LD_PRELOAD" +RUN echo 'ulimit -c 0' >> ~/.bashrc -RUN pip install https://intel-extension-for-pytorch.s3.amazonaws.com/ipex_dev/cpu/intel_extension_for_pytorch-2.3.100%2Bgit0eb3473-cp310-cp310-linux_x86_64.whl +RUN pip install https://intel-extension-for-pytorch.s3.amazonaws.com/ipex_dev/cpu/intel_extension_for_pytorch-2.4.0%2Bgitfbaa4bc-cp310-cp310-linux_x86_64.whl RUN pip install --upgrade pip \ && pip install wheel packaging ninja "setuptools>=49.4.0" numpy @@ -25,7 +26,7 @@ COPY ./ /workspace/vllm WORKDIR /workspace/vllm -RUN pip install -v -r requirements-cpu.txt --extra-index-url https://download.pytorch.org/whl/cpu +RUN pip install -v -r requirements-cpu.txt --extra-index-url https://download.pytorch.org/whl/test/cpu # Support for building with non-AVX512 vLLM: docker build --build-arg VLLM_CPU_DISABLE_AVX512="true" ... 
ARG VLLM_CPU_DISABLE_AVX512 diff --git a/cmake/cpu_extension.cmake b/cmake/cpu_extension.cmake index 690559ee265e9..118f9b28e0ae3 100644 --- a/cmake/cpu_extension.cmake +++ b/cmake/cpu_extension.cmake @@ -83,6 +83,8 @@ endif() message(STATUS "CPU extension compile flags: ${CXX_COMPILE_FLAGS}") +list(APPEND LIBS "numa") + # # Define extension targets @@ -95,6 +97,7 @@ set(VLLM_EXT_SRC "csrc/cpu/activation.cpp" "csrc/cpu/attention.cpp" "csrc/cpu/cache.cpp" + "csrc/cpu/utils.cpp" "csrc/cpu/layernorm.cpp" "csrc/cpu/pos_encoding.cpp" "csrc/cpu/torch_bindings.cpp") @@ -104,6 +107,7 @@ define_gpu_extension_target( DESTINATION vllm LANGUAGE CXX SOURCES ${VLLM_EXT_SRC} + LIBRARIES ${LIBS} COMPILE_FLAGS ${CXX_COMPILE_FLAGS} USE_SABI 3 WITH_SOABI diff --git a/csrc/cpu/torch_bindings.cpp b/csrc/cpu/torch_bindings.cpp index 5be0e9810b5b9..7d549e271a30d 100644 --- a/csrc/cpu/torch_bindings.cpp +++ b/csrc/cpu/torch_bindings.cpp @@ -4,6 +4,8 @@ #include +void init_cpu_threads_env(const std::string& cpu_ids); + TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) { // vLLM custom ops @@ -107,4 +109,9 @@ TORCH_LIBRARY_EXPAND(CONCAT(TORCH_EXTENSION_NAME, _cache_ops), cache_ops) { cache_ops.impl("reshape_and_cache", torch::kCPU, &reshape_and_cache); } +TORCH_LIBRARY_EXPAND(CONCAT(TORCH_EXTENSION_NAME, _utils), utils) { + // CPU utils + utils.def("init_cpu_threads_env(str cpu_ids) -> ()", &init_cpu_threads_env); +} + REGISTER_EXTENSION(TORCH_EXTENSION_NAME) diff --git a/csrc/cpu/utils.cpp b/csrc/cpu/utils.cpp new file mode 100644 index 0000000000000..5782580baa861 --- /dev/null +++ b/csrc/cpu/utils.cpp @@ -0,0 +1,65 @@ +#include +#include +#include +#include + +#include "cpu_types.hpp" + +void init_cpu_threads_env(const std::string& cpu_ids) { + bitmask* omp_cpu_mask = numa_parse_cpustring(cpu_ids.c_str()); + TORCH_CHECK(omp_cpu_mask->size > 0); + std::vector omp_cpu_ids; + omp_cpu_ids.reserve(omp_cpu_mask->size); + + constexpr int group_size = 8 * sizeof(*omp_cpu_mask->maskp); + + for (int offset = 0; offset < omp_cpu_mask->size; offset += group_size) { + unsigned long group_mask = omp_cpu_mask->maskp[offset / group_size]; + int i = 0; + while (group_mask) { + if (group_mask & 1) { + omp_cpu_ids.emplace_back(offset + i); + } + ++i; + group_mask >>= 1; + } + } + + // Memory node binding + if (numa_available() != -1) { + int mem_node_id = numa_node_of_cpu(omp_cpu_ids.front()); + bitmask* mask = numa_parse_nodestring(std::to_string(mem_node_id).c_str()); + bitmask* src_mask = numa_get_membind(); + + int pid = getpid(); + + // move all existing pages to the specified numa node. + *(src_mask->maskp) = *(src_mask->maskp) ^ *(mask->maskp); + int page_num = numa_migrate_pages(pid, src_mask, mask); + if (page_num == -1) { + TORCH_CHECK(false, + "numa_migrate_pages failed. errno: " + std::to_string(errno)); + } + + // restrict memory allocation node. 
+ numa_set_membind(mask); + numa_set_strict(1); + } + + // OMP threads binding + omp_set_num_threads((int)omp_cpu_ids.size()); + torch::set_num_threads((int)omp_cpu_ids.size()); + TORCH_CHECK_EQ(omp_cpu_ids.size(), torch::get_num_threads()); + TORCH_CHECK_EQ(omp_cpu_ids.size(), omp_get_max_threads()); +#pragma omp parallel for schedule(static, 1) + for (size_t i = 0; i < omp_cpu_ids.size(); ++i) { + cpu_set_t* mask = CPU_ALLOC(omp_cpu_mask->size); + size_t size = CPU_ALLOC_SIZE(omp_cpu_mask->size); + CPU_ZERO_S(size, mask); + CPU_SET_S(omp_cpu_ids[i], size, mask); + sched_setaffinity(0, sizeof(cpu_set_t), mask); + CPU_FREE(mask); + } + + numa_free_nodemask(omp_cpu_mask); +} diff --git a/docs/source/getting_started/cpu-installation.rst b/docs/source/getting_started/cpu-installation.rst index 1c97515dbecd9..7fc469e06844f 100644 --- a/docs/source/getting_started/cpu-installation.rst +++ b/docs/source/getting_started/cpu-installation.rst @@ -10,6 +10,7 @@ Table of contents: #. :ref:`Requirements ` #. :ref:`Quick start using Dockerfile ` #. :ref:`Build from source ` +#. :ref:`Related runtime environment variables ` #. :ref:`Intel Extension for PyTorch ` #. :ref:`Performance tips ` @@ -47,7 +48,7 @@ Build from source .. code-block:: console $ sudo apt-get update -y - $ sudo apt-get install -y gcc-12 g++-12 + $ sudo apt-get install -y gcc-12 g++-12 libnuma-dev $ sudo update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-12 10 --slave /usr/bin/g++ g++ /usr/bin/g++-12 - Second, install Python packages for vLLM CPU backend building: @@ -71,6 +72,15 @@ Build from source - If you want to force enable AVX512_BF16 for the cross-compilation, please set environment variable VLLM_CPU_AVX512BF16=1 before the building. +.. _env_intro: + +Related runtime environment variables +------------------------------------- + +- ``VLLM_CPU_KVCACHE_SPACE``: specify the KV Cache size (e.g, ``VLLM_CPU_KVCACHE_SPACE=40`` means 40 GB space for KV cache), larger setting will allow vLLM running more requests in parallel. This parameter should be set based on the hardware configuration and memory management pattern of users. + +- ``VLLM_CPU_OMP_THREADS_BIND``: specify the CPU cores dedicated to the OpenMP threads. For example, ``VLLM_CPU_OMP_THREADS_BIND=0-31`` means there will be 32 OpenMP threads bound on 0-31 CPU cores. ``VLLM_CPU_OMP_THREADS_BIND=0-31|32-63`` means there will be 2 tensor parallel processes, 32 OpenMP threads of rank0 are bound on 0-31 CPU cores, and the OpenMP threads of rank1 are bound on 32-63 CPU cores. + .. _ipex_guidance: Intel Extension for PyTorch @@ -78,15 +88,11 @@ Intel Extension for PyTorch - `Intel Extension for PyTorch (IPEX) `_ extends PyTorch with up-to-date features optimizations for an extra performance boost on Intel hardware. -- IPEX after the ``2.3.0`` can be enabled in the CPU backend by default if it is installed. - .. _cpu_backend_performance_tips: Performance tips ----------------- -- vLLM CPU backend uses environment variable ``VLLM_CPU_KVCACHE_SPACE`` to specify the KV Cache size (e.g, ``VLLM_CPU_KVCACHE_SPACE=40`` means 40 GB space for KV cache), larger setting will allow vLLM running more requests in parallel. This parameter should be set based on the hardware configuration and memory management pattern of users. - - We highly recommend to use TCMalloc for high performance memory allocation and better cache locality. For example, on Ubuntu 22.4, you can run: .. 
code-block:: console
@@ -96,11 +102,44 @@ Performance tips
 
    $ export LD_PRELOAD=/usr/lib/x86_64-linux-gnu/libtcmalloc_minimal.so.4:$LD_PRELOAD # prepend the library to LD_PRELOAD
    $ python examples/offline_inference.py # run vLLM
 
-- vLLM CPU backend uses OpenMP for thread-parallel computation. If you want the best performance on CPU, it will be very critical to isolate CPU cores for OpenMP threads with other thread pools (like web-service event-loop), to avoid CPU oversubscription.
+- When using online serving, it is recommended to reserve 1-2 CPU cores for the serving framework to avoid CPU oversubscription. For example, on a platform with 32 physical CPU cores, reserve CPU 30 and 31 for the framework and use CPU 0-29 for OpenMP:
 
-- If using vLLM CPU backend on a bare-metal machine, it is recommended to disable the hyper-threading.
+.. code-block:: console
+
+   $ export VLLM_CPU_KVCACHE_SPACE=40
+   $ export VLLM_CPU_OMP_THREADS_BIND=0-29
+   $ vllm serve facebook/opt-125m
+
+- If using vLLM CPU backend on a machine with hyper-threading, it is recommended to bind only one OpenMP thread on each physical CPU core using ``VLLM_CPU_OMP_THREADS_BIND``. On a hyper-threading enabled platform with 16 logical CPU cores / 8 physical CPU cores:
+
+.. code-block:: console
 
-- If using vLLM CPU backend on a multi-socket machine with NUMA, be aware to set CPU cores and memory nodes, to avoid the remote memory node access. ``numactl`` is an useful tool for CPU core and memory binding on NUMA platform. Besides, ``--cpuset-cpus`` and ``--cpuset-mems`` arguments of ``docker run`` are also useful.
+   $ lscpu -e # check the mapping between logical CPU cores and physical CPU cores
+
+   # The "CPU" column means the logical CPU core IDs, and the "CORE" column means the physical core IDs. On this platform, two logical cores are sharing one physical core.
+   CPU NODE SOCKET CORE L1d:L1i:L2:L3 ONLINE MAXMHZ    MINMHZ   MHZ
+   0   0    0      0    0:0:0:0       yes    2401.0000 800.0000 800.000
+   1   0    0      1    1:1:1:0       yes    2401.0000 800.0000 800.000
+   2   0    0      2    2:2:2:0       yes    2401.0000 800.0000 800.000
+   3   0    0      3    3:3:3:0       yes    2401.0000 800.0000 800.000
+   4   0    0      4    4:4:4:0       yes    2401.0000 800.0000 800.000
+   5   0    0      5    5:5:5:0       yes    2401.0000 800.0000 800.000
+   6   0    0      6    6:6:6:0       yes    2401.0000 800.0000 800.000
+   7   0    0      7    7:7:7:0       yes    2401.0000 800.0000 800.000
+   8   0    0      0    0:0:0:0       yes    2401.0000 800.0000 800.000
+   9   0    0      1    1:1:1:0       yes    2401.0000 800.0000 800.000
+   10  0    0      2    2:2:2:0       yes    2401.0000 800.0000 800.000
+   11  0    0      3    3:3:3:0       yes    2401.0000 800.0000 800.000
+   12  0    0      4    4:4:4:0       yes    2401.0000 800.0000 800.000
+   13  0    0      5    5:5:5:0       yes    2401.0000 800.0000 800.000
+   14  0    0      6    6:6:6:0       yes    2401.0000 800.0000 800.000
+   15  0    0      7    7:7:7:0       yes    2401.0000 800.0000 800.000
+
+   # On this platform, it is recommended to bind OpenMP threads only on logical CPU cores 0-7 or 8-15
+   $ export VLLM_CPU_OMP_THREADS_BIND=0-7
+   $ python examples/offline_inference.py
+
+- If using vLLM CPU backend on a multi-socket machine with NUMA, be sure to set CPU cores using ``VLLM_CPU_OMP_THREADS_BIND`` to avoid cross-NUMA-node memory access.
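
The ``|`` form of ``VLLM_CPU_OMP_THREADS_BIND`` introduced above is only described in prose; the following is a minimal sketch of a tensor parallel CPU deployment, assuming a two-socket machine whose physical cores are 0-31 and 32-63 (the core ranges and model are illustrative):

.. code-block:: console

   # Rank 0 OpenMP threads are bound to cores 0-31, rank 1 threads to cores 32-63.
   $ export VLLM_CPU_KVCACHE_SPACE=40
   $ export VLLM_CPU_OMP_THREADS_BIND="0-31|32-63"
   $ vllm serve facebook/opt-125m --tensor-parallel-size 2

Quoting the value matters here: an unquoted ``|`` would be interpreted by the shell as a pipe rather than as the rank separator.
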
diff --git a/requirements-cpu.txt b/requirements-cpu.txt index 754070df21c0a..a8ce104d83290 100644 --- a/requirements-cpu.txt +++ b/requirements-cpu.txt @@ -2,6 +2,6 @@ -r requirements-common.txt # Dependencies for x86_64 CPUs -torch == 2.3.1+cpu; platform_machine != "ppc64le" -torchvision == 0.18.1+cpu; platform_machine != "ppc64le" # required for the image processor of phi3v, this must be updated alongside torch +torch == 2.4.0; platform_machine != "ppc64le" +torchvision; platform_machine != "ppc64le" # required for the image processor of phi3v, this must be updated alongside torch triton >= 2.2.0 # FIXME(woosuk): This is a hack to avoid import error. diff --git a/vllm/distributed/parallel_state.py b/vllm/distributed/parallel_state.py index e9c6fc3a255e4..58cae46d9af27 100644 --- a/vllm/distributed/parallel_state.py +++ b/vllm/distributed/parallel_state.py @@ -296,6 +296,9 @@ def all_reduce(self, input_: torch.Tensor) -> torch.Tensor: pynccl_comm = self.pynccl_comm if (pynccl_comm is not None and not pynccl_comm.disabled): pynccl_comm.all_reduce(input_) + elif input_.is_cpu: + import intel_extension_for_pytorch as ipex + ipex.distributed.all_reduce(input_, group=self.device_group) else: torch.distributed.all_reduce(input_, group=self.device_group) return input_ diff --git a/vllm/engine/async_llm_engine.py b/vllm/engine/async_llm_engine.py index 16b7bc64a2849..93cc319f11c42 100644 --- a/vllm/engine/async_llm_engine.py +++ b/vllm/engine/async_llm_engine.py @@ -410,8 +410,6 @@ def _get_executor_cls( from vllm.executor.tpu_executor import TPUExecutorAsync executor_class = TPUExecutorAsync elif engine_config.device_config.device_type == "cpu": - assert distributed_executor_backend is None, ( - "Distributed execution is not supported with the CPU backend.") from vllm.executor.cpu_executor import CPUExecutorAsync executor_class = CPUExecutorAsync elif engine_config.device_config.device_type == "openvino": diff --git a/vllm/envs.py b/vllm/envs.py index 595992e51db87..f06b6d66ea6f4 100644 --- a/vllm/envs.py +++ b/vllm/envs.py @@ -29,6 +29,7 @@ VLLM_TRACE_FUNCTION: int = 0 VLLM_ATTENTION_BACKEND: Optional[str] = None VLLM_CPU_KVCACHE_SPACE: int = 0 + VLLM_CPU_OMP_THREADS_BIND: str = "" VLLM_OPENVINO_KVCACHE_SPACE: int = 0 VLLM_OPENVINO_CPU_KV_CACHE_PRECISION: Optional[str] = None VLLM_OPENVINO_ENABLE_QUANTIZED_WEIGHTS: bool = False @@ -241,11 +242,16 @@ def get_default_config_root(): "VLLM_ATTENTION_BACKEND": lambda: os.getenv("VLLM_ATTENTION_BACKEND", None), - # CPU key-value cache space + # (CPU backend only) CPU key-value cache space. # default is 4GB "VLLM_CPU_KVCACHE_SPACE": lambda: int(os.getenv("VLLM_CPU_KVCACHE_SPACE", "0")), + # (CPU backend only) CPU core ids bound by OpenMP threads, e.g., "0-31", + # "0,1,2", "0-31,33". CPU cores of different ranks are separated by '|'. 
+ "VLLM_CPU_OMP_THREADS_BIND": + lambda: os.getenv("VLLM_CPU_OMP_THREADS_BIND", "all"), + # OpenVINO key-value cache space # default is 4GB "VLLM_OPENVINO_KVCACHE_SPACE": diff --git a/vllm/executor/cpu_executor.py b/vllm/executor/cpu_executor.py index 23e429dac7232..3229e5ad20afa 100644 --- a/vllm/executor/cpu_executor.py +++ b/vllm/executor/cpu_executor.py @@ -1,16 +1,21 @@ -from typing import List, Set, Tuple +import os +from functools import partial +from typing import Any, Awaitable, List, Optional, Set, Tuple, Union import torch import vllm.envs as envs from vllm.config import CacheConfig, ModelConfig, SchedulerConfig from vllm.executor.executor_base import ExecutorAsyncBase, ExecutorBase +from vllm.executor.multiproc_worker_utils import (ProcessWorkerWrapper, + ResultHandler, WorkerMonitor) from vllm.logger import init_logger from vllm.lora.request import LoRARequest from vllm.prompt_adapter.request import PromptAdapterRequest from vllm.sequence import ExecuteModelRequest, SamplerOutput -from vllm.utils import (get_distributed_init_method, get_ip, get_open_port, - make_async) +from vllm.utils import (get_distributed_init_method, get_open_port, + get_vllm_instance_id, make_async) +from vllm.worker.worker_base import WorkerWrapperBase logger = init_logger(__name__) @@ -22,46 +27,173 @@ class CPUExecutor(ExecutorBase): def _init_executor(self) -> None: assert self.device_config.device_type == "cpu" assert self.lora_config is None, "cpu backend doesn't support LoRA" + + # + # Environment variables for CPU executor + # + + # Ensure that VLLM_INSTANCE_ID is set, to be inherited by workers + os.environ["VLLM_INSTANCE_ID"] = get_vllm_instance_id() + + # Disable torch async compiling which won't work with daemonic processes + os.environ["TORCHINDUCTOR_COMPILE_THREADS"] = "1" + + # Intel OpenMP setting + ld_prealod_str = os.getenv("LD_PRELOAD", "") + if "libiomp5.so" in ld_prealod_str: + # The time(milliseconds) that a thread should wait after + # completing the execution of a parallel region, before sleeping. + os.environ['KMP_BLOCKTIME'] = "1" + # Prevents the CPU to run into low performance state + os.environ['KMP_TPAUSE'] = "0" + # Provides fine granularity parallelism + os.environ['KMP_FORKJOIN_BARRIER_PATTERN'] = "dist,dist" + os.environ['KMP_PLAIN_BARRIER_PATTERN'] = "dist,dist" + os.environ['KMP_REDUCTION_BARRIER_PATTERN'] = "dist,dist" + + # To hint IPEX uses shared memory based AllReduce + os.environ["LOCAL_WORLD_SIZE"] = str( + self.parallel_config.tensor_parallel_size) + self.model_config = _verify_and_get_model_config(self.model_config) self.cache_config = _verify_and_get_cache_config(self.cache_config) self.scheduler_config = _verify_and_get_scheduler_config( self.scheduler_config) - # Instantiate the worker and load the model to CPU. - self._init_worker() - - def _init_worker(self): - from vllm.worker.cpu_worker import CPUWorker + # Multiprocessing-based executor does not support multi-node setting. + # Since it only works for single node, we can use the loopback address + # 127.0.0.1 for communication. 
+ ip = "127.0.0.1" + port = get_open_port() + self.distributed_init_method = get_distributed_init_method(ip, port) + + is_async = isinstance(self, CPUExecutorAsync) + + world_size = self.parallel_config.tensor_parallel_size + result_handler = ResultHandler() + self.parallel_worker_tasks: Optional[Union[Any, Awaitable[Any]]] = None + self.workers = [] + + if is_async: + self.workers = [ + ProcessWorkerWrapper( + result_handler, + partial( + self._create_worker, + rank=rank, + local_rank=rank, + )) for rank in range(0, world_size) + ] + self.driver_worker = self.workers[0] + self.workers = self.workers[1:] + self.driver_method_invoker = _async_driver_method_invoker + else: + self.driver_worker = self._create_worker() + self.driver_method_invoker = _driver_method_invoker + + if world_size != 1: + self.workers = [ + ProcessWorkerWrapper( + result_handler, + partial( + self._create_worker, + rank=rank, + local_rank=rank, + )) for rank in range(1, world_size) + ] + + if world_size != 1 or is_async: + if is_async: + async_worker_list = self.workers + [self.driver_worker] + else: + async_worker_list = self.workers + self.worker_monitor = WorkerMonitor(async_worker_list, + result_handler) + result_handler.start() + self.worker_monitor.start() + + self._run_workers("init_device") + self._run_workers("load_model") + + def _create_worker( + self, + local_rank: int = 0, + rank: int = 0, + ): + worker_module_name = "vllm.worker.cpu_worker" + worker_class_name = "CPUWorker" + + wrapper = WorkerWrapperBase( + worker_module_name=worker_module_name, + worker_class_name=worker_class_name, + ) - assert self.parallel_config.world_size == 1, ( - "CPUExecutor only supports single CPU socket currently.") + assert self.distributed_init_method is not None - distributed_init_method = get_distributed_init_method( - get_ip(), get_open_port()) - self.driver_worker = CPUWorker( + kwargs = dict( model_config=self.model_config, parallel_config=self.parallel_config, scheduler_config=self.scheduler_config, device_config=self.device_config, cache_config=self.cache_config, load_config=self.load_config, - local_rank=0, - rank=0, - distributed_init_method=distributed_init_method, + local_rank=local_rank, + rank=rank, + distributed_init_method=self.distributed_init_method, lora_config=self.lora_config, multimodal_config=self.multimodal_config, kv_cache_dtype=self.cache_config.cache_dtype, prompt_adapter_config=self.prompt_adapter_config, - is_driver_worker=True, + is_driver_worker=rank == 0, ) - self.driver_worker.init_device() - self.driver_worker.load_model() + wrapper.init_worker(**kwargs) + + return wrapper.worker + + def _run_workers( + self, + method: str, + *args, + async_run_remote_workers_only: bool = False, + max_concurrent_workers: Optional[int] = None, + **kwargs, + ) -> Any: + """Runs the given method on all workers. + + Args: + async_run_remote_workers_only: If True the method will be run only + in the remote workers, not the driver worker. It will also be + run asynchronously and return a list of futures rather than + blocking on the results. + """ + + if max_concurrent_workers: + raise NotImplementedError( + "max_concurrent_workers is not supported yet.") + + # Start the workers first. + worker_outputs = [ + worker.execute_method(method, *args, **kwargs) + for worker in self.workers + ] + + if async_run_remote_workers_only: + # Just return futures + return worker_outputs + + driver_worker_output = self.driver_method_invoker( + self.driver_worker, method, *args, **kwargs) + + # Get the results of the workers. 
+ return [driver_worker_output + ] + [output.get() for output in worker_outputs] def determine_num_available_blocks(self) -> Tuple[int, int]: """Determine the number of available KV blocks by invoking the underlying worker. """ - return self.driver_worker.determine_num_available_blocks() + return self.driver_method_invoker(self.driver_worker, + "determine_num_available_blocks") def initialize_cache(self, num_gpu_blocks: int, num_cpu_blocks: int) -> None: @@ -74,43 +206,95 @@ def initialize_cache(self, num_gpu_blocks: int, # referred as `gpu block`. Because we want to reuse the existing block # management procedure. logger.info("# CPU blocks: %d", num_gpu_blocks) - self.driver_worker.initialize_cache(num_gpu_blocks, num_cpu_blocks) + + self._run_workers("initialize_cache", + num_gpu_blocks=num_gpu_blocks, + num_cpu_blocks=num_cpu_blocks) def execute_model( self, execute_model_req: ExecuteModelRequest) -> List[SamplerOutput]: - output = self.driver_worker.execute_model(execute_model_req) + if (self.parallel_config.tensor_parallel_size > 1 + and self.parallel_worker_tasks is None): + self.parallel_worker_tasks = self._run_workers( + "start_worker_execution_loop", + async_run_remote_workers_only=True, + ) + output = self.driver_method_invoker(self.driver_worker, + "execute_model", execute_model_req) return output + def stop_remote_worker_execution_loop(self) -> None: + if self.parallel_worker_tasks is None: + return + """ + Passing None will cause the driver to stop the model execution + loop running in each of the remote workers. + """ + self.driver_method_invoker(self.driver_worker, "execute_model", None) + parallel_worker_tasks = self.parallel_worker_tasks + self.parallel_worker_tasks = None + # Ensure that workers exit model loop cleanly + # (this will raise otherwise) + self._wait_for_tasks_completion(parallel_worker_tasks) + def add_lora(self, lora_request: LoRARequest) -> bool: - return self.driver_worker.add_lora(lora_request) + return all(self._run_workers("add_lora", lora_request)) def remove_lora(self, lora_id: int) -> bool: - return self.driver_worker.remove_lora(lora_id) + return all(self._run_workers("remove_lora", lora_id)) def pin_lora(self, lora_id: int) -> bool: - return self.driver_worker.pin_lora(lora_id) + assert lora_id > 0, "lora_id must be greater than 0." + return all(self._run_workers( + "pin_lora", + lora_id=lora_id, + )) def list_loras(self) -> Set[int]: - return self.driver_worker.list_loras() + return self.driver_method_invoker(self.driver_worker, "list_loras") def add_prompt_adapter( self, prompt_adapter_request: PromptAdapterRequest) -> bool: - return self.driver_worker.add_prompt_adapter(prompt_adapter_request) + return all( + self._run_workers( + "add_prompt_adapter", + prompt_adapter_request, + )) def remove_prompt_adapter(self, prompt_adapter_id: int) -> bool: - return self.driver_worker.remove_prompt_adapter(prompt_adapter_id) + return all( + self._run_workers( + "remove_prompt_adapter", + prompt_adapter_id, + )) def list_prompt_adapters(self) -> Set[int]: - return self.driver_worker.list_prompt_adapters() + return self.driver_method_invoker(self.driver_worker, + "list_prompt_adapters") def pin_prompt_adapter(self, prompt_adapter_id: int) -> bool: - return self.driver_worker.pin_prompt_adapter(prompt_adapter_id) + return all(self._run_workers( + "pin_prompt_adapter", + prompt_adapter_id, + )) def check_health(self) -> None: - # CPUExecutor will always be healthy as long as - # it's running. 
- return + """Raises an error if engine is unhealthy.""" + if self.worker_monitor is not None and not self.worker_monitor.is_alive( + ): + raise RuntimeError("Worker processes are not running") + + def shutdown(self): + if (worker_monitor := getattr(self, "worker_monitor", + None)) is not None: + worker_monitor.close() + + def _wait_for_tasks_completion(self, parallel_worker_tasks: Any) -> None: + """Wait for futures returned from _run_workers() with + async_run_remote_workers_only to complete.""" + for result in parallel_worker_tasks: + result.get() class CPUExecutorAsync(CPUExecutor, ExecutorAsyncBase): @@ -118,14 +302,12 @@ class CPUExecutorAsync(CPUExecutor, ExecutorAsyncBase): async def execute_model_async( self, execute_model_req: ExecuteModelRequest) -> List[SamplerOutput]: - output = await make_async(self.driver_worker.execute_model + output = await make_async(self.execute_model )(execute_model_req=execute_model_req, ) return output async def check_health_async(self) -> None: - # CPUExecutor will always be healthy as long as - # it's running. - return + self.check_health() def _verify_and_get_model_config(config: ModelConfig) -> ModelConfig: @@ -170,3 +352,11 @@ def _verify_and_get_cache_config(config: CacheConfig) -> CacheConfig: f" {kv_cache_space}, expect a positive integer value.") return config + + +def _driver_method_invoker(driver, method: str, *args, **kwargs): + return getattr(driver, method)(*args, **kwargs) + + +def _async_driver_method_invoker(driver, method: str, *args, **kwargs): + return driver.execute_method(method, *args, **kwargs).get() diff --git a/vllm/utils.py b/vllm/utils.py index 876c3bf90b02c..90be09fc7b967 100644 --- a/vllm/utils.py +++ b/vllm/utils.py @@ -404,27 +404,6 @@ def update_environment_variables(envs: Dict[str, str]): os.environ[k] = v -def init_kmp_env(): - if not is_cpu(): - return - - ld_prealod_str = os.getenv("LD_PRELOAD", "") - if "libiomp5.so" not in ld_prealod_str: - return - - # The time(milliseconds) that a thread should wait after completing the - # execution of a parallel region, before sleeping. 
- os.environ['KMP_BLOCKTIME'] = "1" - # dump settings on start up - os.environ['KMP_SETTINGS'] = "1" - # Prevents the CPU to run into low performance state - os.environ['KMP_TPAUSE'] = "0" - # Provides fine granularity parallelism - os.environ['KMP_FORKJOIN_BARRIER_PATTERN'] = "dist,dist" - os.environ['KMP_PLAIN_BARRIER_PATTERN'] = "dist,dist" - os.environ['KMP_REDUCTION_BARRIER_PATTERN'] = "dist,dist" - - def chunk_list(lst: List[T], chunk_size: int): """Yield successive chunk_size chunks from lst.""" for i in range(0, len(lst), chunk_size): diff --git a/vllm/worker/cpu_model_runner.py b/vllm/worker/cpu_model_runner.py index 83f4ba69fb728..71763c08ec45f 100644 --- a/vllm/worker/cpu_model_runner.py +++ b/vllm/worker/cpu_model_runner.py @@ -42,6 +42,7 @@ class CPUModelInput(ModelRunnerInputBase): attn_metadata: Optional["AttentionMetadata"] = None sampling_metadata: Optional["SamplingMetadata"] = None multi_modal_kwargs: Optional[Mapping[str, BatchedTensors]] = None + virtual_engine: Optional[int] = None def as_broadcastable_tensor_dict( self) -> Dict[str, Union[int, torch.Tensor]]: @@ -204,8 +205,8 @@ def _prepare_prompt( attn_metadata = self.attn_backend.make_metadata( is_prompt=True, seq_lens=seq_lens, - seq_lens_tensor=None, - max_decode_seq_len=None, + seq_lens_tensor=torch.tensor([]), + max_decode_seq_len=0, num_prefills=len(seq_lens), num_prefill_tokens=num_prompt_tokens, num_decode_tokens=0, @@ -345,7 +346,7 @@ def prepare_model_input( multi_modal_kwargs=multi_modal_kwargs, ) - @torch.inference_mode() + @torch.no_grad() def execute_model( self, model_input: CPUModelInput, diff --git a/vllm/worker/cpu_worker.py b/vllm/worker/cpu_worker.py index 3c22c73267b7f..735d48c908d61 100644 --- a/vllm/worker/cpu_worker.py +++ b/vllm/worker/cpu_worker.py @@ -4,6 +4,7 @@ import torch import torch.distributed +import vllm.envs as envs from vllm.attention import get_attn_backend from vllm.config import (CacheConfig, DeviceConfig, LoadConfig, LoRAConfig, ModelConfig, MultiModalConfig, ParallelConfig, @@ -13,7 +14,7 @@ from vllm.logger import init_logger from vllm.model_executor import set_random_seed from vllm.sequence import ExecuteModelRequest -from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE, init_kmp_env +from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE from vllm.worker.cpu_model_runner import CPUModelRunner from vllm.worker.worker_base import (LocalOrDistributedWorkerBase, LoraNotSupportedWorkerBase, WorkerInput) @@ -152,13 +153,18 @@ def __init__( if self.is_driver_worker: assert self.rank == 0, "The driver worker must have rank 0." - # try to initialize intel openmp optimized tunings - init_kmp_env() - if self.model_config.trust_remote_code: # note: lazy import to avoid importing torch before initializing from vllm.utils import init_cached_hf_modules init_cached_hf_modules() + + # Setup OpenMP threads affinity. + omp_cpuids = envs.VLLM_CPU_OMP_THREADS_BIND + if omp_cpuids == "all": + self.local_omp_cpuid = "all" + else: + self.local_omp_cpuid = omp_cpuids.split("|")[rank] + self.model_runner: CPUModelRunner = CPUModelRunner( model_config, parallel_config, @@ -177,6 +183,9 @@ def __init__( self.cpu_cache: List[List[torch.Tensor]] def init_device(self) -> None: + if self.local_omp_cpuid != "all": + torch.ops._C_utils.init_cpu_threads_env(self.local_omp_cpuid) + self.init_distributed_environment() # Set random seed. 
set_random_seed(self.model_config.seed) From 281977bd6eccade50be461f5a22cc51b74006976 Mon Sep 17 00:00:00 2001 From: Michael Goin Date: Fri, 26 Jul 2024 17:32:44 -0400 Subject: [PATCH 049/167] [Doc] Add Nemotron to supported model docs (#6843) --- docs/source/models/supported_models.rst | 4 ++++ vllm/model_executor/layers/activation.py | 4 +--- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/docs/source/models/supported_models.rst b/docs/source/models/supported_models.rst index dc8bd6fb245df..483f552bba238 100644 --- a/docs/source/models/supported_models.rst +++ b/docs/source/models/supported_models.rst @@ -113,6 +113,10 @@ Decoder-only Language Models - MPT, MPT-Instruct, MPT-Chat, MPT-StoryWriter - :code:`mosaicml/mpt-7b`, :code:`mosaicml/mpt-7b-storywriter`, :code:`mosaicml/mpt-30b`, etc. - + * - :code:`NemotronForCausalLM` + - Nemotron-3, Nemotron-4, Minitron + - :code:`nvidia/Minitron-8B-Base`, :code:`mgoin/Nemotron-4-340B-Base-hf-FP8`, etc. + - ✅︎ * - :code:`OLMoForCausalLM` - OLMo - :code:`allenai/OLMo-1B-hf`, :code:`allenai/OLMo-7B-hf`, etc. diff --git a/vllm/model_executor/layers/activation.py b/vllm/model_executor/layers/activation.py index 6578193a31597..4c14fe476ee4a 100644 --- a/vllm/model_executor/layers/activation.py +++ b/vllm/model_executor/layers/activation.py @@ -166,9 +166,7 @@ class ReLUSquaredActivation(CustomOp): def forward_native(self, x: torch.Tensor) -> torch.Tensor: """PyTorch-native implementation equivalent to forward().""" - relu_applied = nn.functional.relu(x) - squared = torch.square(relu_applied) - return squared + return torch.square(F.relu(x)) def forward_cuda(self, x: torch.Tensor) -> torch.Tensor: return self.forward_native(x) From 150a1ffbfd3d0429d30fa5ab841f53903a0a8a62 Mon Sep 17 00:00:00 2001 From: Zhanghao Wu Date: Fri, 26 Jul 2024 17:39:10 -0400 Subject: [PATCH 050/167] [Doc] Update SkyPilot doc for wrong indents and instructions for update service (#4283) --- docs/source/serving/run_on_sky.rst | 430 ++++++++++++++++------------- 1 file changed, 243 insertions(+), 187 deletions(-) diff --git a/docs/source/serving/run_on_sky.rst b/docs/source/serving/run_on_sky.rst index bd33c76cec3de..674b14a879bc3 100644 --- a/docs/source/serving/run_on_sky.rst +++ b/docs/source/serving/run_on_sky.rst @@ -5,9 +5,9 @@ Deploying and scaling up with SkyPilot .. raw:: html -

-    [vLLM logo image]
+    [vLLM logo image]
vLLM can be **run and scaled to multiple service replicas on clouds and Kubernetes** with `SkyPilot `__, an open-source framework for running LLMs on any cloud. More examples for various open models, such as Llama-3, Mixtral, etc, can be found in `SkyPilot AI gallery `__. @@ -21,8 +21,8 @@ Prerequisites .. code-block:: console - pip install skypilot-nightly - sky check + pip install skypilot-nightly + sky check Run on a single instance @@ -32,64 +32,64 @@ See the vLLM SkyPilot YAML for serving, `serving.yaml # Change to your own huggingface token, or use --env to pass. - - setup: | - conda create -n vllm python=3.10 -y - conda activate vllm - - pip install vllm==0.4.0.post1 - # Install Gradio for web UI. - pip install gradio openai - pip install flash-attn==2.5.7 - - run: | - conda activate vllm - echo 'Starting vllm api server...' - python -u -m vllm.entrypoints.openai.api_server \ - --port 8081 \ - --model $MODEL_NAME \ - --trust-remote-code \ - --tensor-parallel-size $SKYPILOT_NUM_GPUS_PER_NODE \ - 2>&1 | tee api_server.log & - - echo 'Waiting for vllm api server to start...' - while ! `cat api_server.log | grep -q 'Uvicorn running on'`; do sleep 1; done - - echo 'Starting gradio server...' - git clone https://github.com/vllm-project/vllm.git || true - python vllm/examples/gradio_openai_chatbot_webserver.py \ - -m $MODEL_NAME \ - --port 8811 \ - --model-url http://localhost:8081/v1 \ - --stop-token-ids 128009,128001 + resources: + accelerators: {L4, A10g, A10, L40, A40, A100, A100-80GB} # We can use cheaper accelerators for 8B model. + use_spot: True + disk_size: 512 # Ensure model checkpoints can fit. + disk_tier: best + ports: 8081 # Expose to internet traffic. + + envs: + MODEL_NAME: meta-llama/Meta-Llama-3-8B-Instruct + HF_TOKEN: # Change to your own huggingface token, or use --env to pass. + + setup: | + conda create -n vllm python=3.10 -y + conda activate vllm + + pip install vllm==0.4.0.post1 + # Install Gradio for web UI. + pip install gradio openai + pip install flash-attn==2.5.7 + + run: | + conda activate vllm + echo 'Starting vllm api server...' + python -u -m vllm.entrypoints.openai.api_server \ + --port 8081 \ + --model $MODEL_NAME \ + --trust-remote-code \ + --tensor-parallel-size $SKYPILOT_NUM_GPUS_PER_NODE \ + 2>&1 | tee api_server.log & + + echo 'Waiting for vllm api server to start...' + while ! `cat api_server.log | grep -q 'Uvicorn running on'`; do sleep 1; done + + echo 'Starting gradio server...' + git clone https://github.com/vllm-project/vllm.git || true + python vllm/examples/gradio_openai_chatbot_webserver.py \ + -m $MODEL_NAME \ + --port 8811 \ + --model-url http://localhost:8081/v1 \ + --stop-token-ids 128009,128001 Start the serving the Llama-3 8B model on any of the candidate GPUs listed (L4, A10g, ...): .. code-block:: console - HF_TOKEN="your-huggingface-token" sky launch serving.yaml --env HF_TOKEN + HF_TOKEN="your-huggingface-token" sky launch serving.yaml --env HF_TOKEN Check the output of the command. There will be a shareable gradio link (like the last line of the following). Open it in your browser to use the LLaMA model to do the text completion. .. code-block:: console - (task, pid=7431) Running on public URL: https://.gradio.live + (task, pid=7431) Running on public URL: https://.gradio.live **Optional**: Serve the 70B model instead of the default 8B and use more GPU: .. 
code-block:: console - HF_TOKEN="your-huggingface-token" sky launch serving.yaml --gpus A100:8 --env HF_TOKEN --env MODEL_NAME=meta-llama/Meta-Llama-3-70B-Instruct + HF_TOKEN="your-huggingface-token" sky launch serving.yaml --gpus A100:8 --env HF_TOKEN --env MODEL_NAME=meta-llama/Meta-Llama-3-70B-Instruct Scale up to multiple replicas @@ -99,151 +99,212 @@ SkyPilot can scale up the service to multiple service replicas with built-in aut .. code-block:: yaml - service: - replicas: 2 - # An actual request for readiness probe. - readiness_probe: - path: /v1/chat/completions - post_data: - model: $MODEL_NAME - messages: - - role: user - content: Hello! What is your name? - max_tokens: 1 - + service: + replicas: 2 + # An actual request for readiness probe. + readiness_probe: + path: /v1/chat/completions + post_data: + model: $MODEL_NAME + messages: + - role: user + content: Hello! What is your name? + max_tokens: 1 + .. raw:: html -
- Click to see the full recipe YAML +
+ Click to see the full recipe YAML .. code-block:: yaml - service: - replicas: 2 - # An actual request for readiness probe. - readiness_probe: - path: /v1/chat/completions - post_data: - model: $MODEL_NAME - messages: - - role: user - content: Hello! What is your name? + service: + replicas: 2 + # An actual request for readiness probe. + readiness_probe: + path: /v1/chat/completions + post_data: + model: $MODEL_NAME + messages: + - role: user + content: Hello! What is your name? max_tokens: 1 - resources: - accelerators: {L4, A10g, A10, L40, A40, A100, A100-80GB} # We can use cheaper accelerators for 8B model. - use_spot: True - disk_size: 512 # Ensure model checkpoints can fit. - disk_tier: best - ports: 8081 # Expose to internet traffic. - - envs: - MODEL_NAME: meta-llama/Meta-Llama-3-8B-Instruct - HF_TOKEN: # Change to your own huggingface token, or use --env to pass. - - setup: | - conda create -n vllm python=3.10 -y - conda activate vllm - - pip install vllm==0.4.0.post1 - # Install Gradio for web UI. - pip install gradio openai - pip install flash-attn==2.5.7 - - run: | - conda activate vllm - echo 'Starting vllm api server...' - python -u -m vllm.entrypoints.openai.api_server \ - --port 8081 \ - --model $MODEL_NAME \ - --trust-remote-code \ - --tensor-parallel-size $SKYPILOT_NUM_GPUS_PER_NODE \ - 2>&1 | tee api_server.log & - - echo 'Waiting for vllm api server to start...' - while ! `cat api_server.log | grep -q 'Uvicorn running on'`; do sleep 1; done - - echo 'Starting gradio server...' - git clone https://github.com/vllm-project/vllm.git || true - python vllm/examples/gradio_openai_chatbot_webserver.py \ - -m $MODEL_NAME \ - --port 8811 \ - --model-url http://localhost:8081/v1 \ - --stop-token-ids 128009,128001 + resources: + accelerators: {L4, A10g, A10, L40, A40, A100, A100-80GB} # We can use cheaper accelerators for 8B model. + use_spot: True + disk_size: 512 # Ensure model checkpoints can fit. + disk_tier: best + ports: 8081 # Expose to internet traffic. + + envs: + MODEL_NAME: meta-llama/Meta-Llama-3-8B-Instruct + HF_TOKEN: # Change to your own huggingface token, or use --env to pass. + + setup: | + conda create -n vllm python=3.10 -y + conda activate vllm + + pip install vllm==0.4.0.post1 + # Install Gradio for web UI. + pip install gradio openai + pip install flash-attn==2.5.7 + + run: | + conda activate vllm + echo 'Starting vllm api server...' + python -u -m vllm.entrypoints.openai.api_server \ + --port 8081 \ + --model $MODEL_NAME \ + --trust-remote-code \ + --tensor-parallel-size $SKYPILOT_NUM_GPUS_PER_NODE \ + 2>&1 | tee api_server.log .. raw:: html -
+
Start the serving the Llama-3 8B model on multiple replicas: .. code-block:: console - HF_TOKEN="your-huggingface-token" sky serve up -n vllm serving.yaml --env HF_TOKEN + HF_TOKEN="your-huggingface-token" sky serve up -n vllm serving.yaml --env HF_TOKEN Wait until the service is ready: .. code-block:: console - watch -n10 sky serve status vllm + watch -n10 sky serve status vllm .. raw:: html -
- Example outputs: +
+ Example outputs: .. code-block:: console - Services - NAME VERSION UPTIME STATUS REPLICAS ENDPOINT - vllm 1 35s READY 2/2 xx.yy.zz.100:30001 + Services + NAME VERSION UPTIME STATUS REPLICAS ENDPOINT + vllm 1 35s READY 2/2 xx.yy.zz.100:30001 - Service Replicas - SERVICE_NAME ID VERSION IP LAUNCHED RESOURCES STATUS REGION - vllm 1 1 xx.yy.zz.121 18 mins ago 1x GCP({'L4': 1}) READY us-east4 - vllm 2 1 xx.yy.zz.245 18 mins ago 1x GCP({'L4': 1}) READY us-east4 + Service Replicas + SERVICE_NAME ID VERSION IP LAUNCHED RESOURCES STATUS REGION + vllm 1 1 xx.yy.zz.121 18 mins ago 1x GCP([Spot]{'L4': 1}) READY us-east4 + vllm 2 1 xx.yy.zz.245 18 mins ago 1x GCP([Spot]{'L4': 1}) READY us-east4 .. raw:: html - -
+ +
After the service is READY, you can find a single endpoint for the service and access the service with the endpoint: .. code-block:: console - ENDPOINT=$(sky serve status --endpoint 8081 vllm) - curl -L http://$ENDPOINT/v1/chat/completions \ - -H "Content-Type: application/json" \ - -d '{ - "model": "meta-llama/Meta-Llama-3-8B-Instruct", - "messages": [ - { - "role": "system", - "content": "You are a helpful assistant." - }, - { - "role": "user", - "content": "Who are you?" - } - ], - "stop_token_ids": [128009, 128001] - }' - -To enable autoscaling, you could specify additional configs in `services`: + ENDPOINT=$(sky serve status --endpoint 8081 vllm) + curl -L http://$ENDPOINT/v1/chat/completions \ + -H "Content-Type: application/json" \ + -d '{ + "model": "meta-llama/Meta-Llama-3-8B-Instruct", + "messages": [ + { + "role": "system", + "content": "You are a helpful assistant." + }, + { + "role": "user", + "content": "Who are you?" + } + ], + "stop_token_ids": [128009, 128001] + }' + +To enable autoscaling, you could replace the `replicas` with the following configs in `service`: .. code-block:: yaml - services: - replica_policy: - min_replicas: 0 - max_replicas: 3 - target_qps_per_replica: 2 + service: + replica_policy: + min_replicas: 2 + max_replicas: 4 + target_qps_per_replica: 2 This will scale the service up to when the QPS exceeds 2 for each replica. + +.. raw:: html + +
+ Click to see the full recipe YAML + + +.. code-block:: yaml + + service: + replica_policy: + min_replicas: 2 + max_replicas: 4 + target_qps_per_replica: 2 + # An actual request for readiness probe. + readiness_probe: + path: /v1/chat/completions + post_data: + model: $MODEL_NAME + messages: + - role: user + content: Hello! What is your name? + max_tokens: 1 + + resources: + accelerators: {L4, A10g, A10, L40, A40, A100, A100-80GB} # We can use cheaper accelerators for 8B model. + use_spot: True + disk_size: 512 # Ensure model checkpoints can fit. + disk_tier: best + ports: 8081 # Expose to internet traffic. + + envs: + MODEL_NAME: meta-llama/Meta-Llama-3-8B-Instruct + HF_TOKEN: # Change to your own huggingface token, or use --env to pass. + + setup: | + conda create -n vllm python=3.10 -y + conda activate vllm + + pip install vllm==0.4.0.post1 + # Install Gradio for web UI. + pip install gradio openai + pip install flash-attn==2.5.7 + + run: | + conda activate vllm + echo 'Starting vllm api server...' + python -u -m vllm.entrypoints.openai.api_server \ + --port 8081 \ + --model $MODEL_NAME \ + --trust-remote-code \ + --tensor-parallel-size $SKYPILOT_NUM_GPUS_PER_NODE \ + 2>&1 | tee api_server.log + + +.. raw:: html + +
+ +To update the service with the new config: + +.. code-block:: console + + HF_TOKEN="your-huggingface-token" sky serve update vllm serving.yaml --env HF_TOKEN + + +To stop the service: + +.. code-block:: console + + sky serve down vllm + **Optional**: Connect a GUI to the endpoint ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ @@ -253,58 +314,53 @@ It is also possible to access the Llama-3 service with a separate GUI frontend, .. raw:: html -
- Click to see the full GUI YAML +
+ Click to see the full GUI YAML .. code-block:: yaml - envs: - MODEL_NAME: meta-llama/Meta-Llama-3-70B-Instruct - ENDPOINT: x.x.x.x:3031 # Address of the API server running vllm. - - resources: - cpus: 2 - - setup: | - conda activate vllm - if [ $? -ne 0 ]; then - conda create -n vllm python=3.10 -y - conda activate vllm - fi - - # Install Gradio for web UI. - pip install gradio openai - - run: | - conda activate vllm - export PATH=$PATH:/sbin - WORKER_IP=$(hostname -I | cut -d' ' -f1) - CONTROLLER_PORT=21001 - WORKER_PORT=21002 - - echo 'Starting gradio server...' - git clone https://github.com/vllm-project/vllm.git || true - python vllm/examples/gradio_openai_chatbot_webserver.py \ - -m $MODEL_NAME \ - --port 8811 \ - --model-url http://$ENDPOINT/v1 \ - --stop-token-ids 128009,128001 | tee ~/gradio.log + envs: + MODEL_NAME: meta-llama/Meta-Llama-3-8B-Instruct + ENDPOINT: x.x.x.x:3031 # Address of the API server running vllm. + + resources: + cpus: 2 + + setup: | + conda create -n vllm python=3.10 -y + conda activate vllm + + # Install Gradio for web UI. + pip install gradio openai + + run: | + conda activate vllm + export PATH=$PATH:/sbin + + echo 'Starting gradio server...' + git clone https://github.com/vllm-project/vllm.git || true + python vllm/examples/gradio_openai_chatbot_webserver.py \ + -m $MODEL_NAME \ + --port 8811 \ + --model-url http://$ENDPOINT/v1 \ + --stop-token-ids 128009,128001 | tee ~/gradio.log + .. raw:: html - -
+ +
1. Start the chat web UI: .. code-block:: console - sky launch -c gui ./gui.yaml --env ENDPOINT=$(sky serve status --endpoint vllm) + sky launch -c gui ./gui.yaml --env ENDPOINT=$(sky serve status --endpoint vllm) 2. Then, we can access the GUI at the returned gradio link: .. code-block:: console - | INFO | stdout | Running on public URL: https://6141e84201ce0bb4ed.gradio.live + | INFO | stdout | Running on public URL: https://6141e84201ce0bb4ed.gradio.live From b5f49ee55beac7bb314fd8880bdb718ed117dacb Mon Sep 17 00:00:00 2001 From: Gurpreet Singh Dhami <143527450+gurpreet-dhami@users.noreply.github.com> Date: Fri, 26 Jul 2024 20:26:45 -0400 Subject: [PATCH 051/167] Update README.md (#6847) --- examples/fp8/quantizer/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/fp8/quantizer/README.md b/examples/fp8/quantizer/README.md index 8f89a74a6a367..0b6944f688b49 100644 --- a/examples/fp8/quantizer/README.md +++ b/examples/fp8/quantizer/README.md @@ -16,7 +16,7 @@ #### Run on H100 system for speed if FP8; number of GPUs depends on the model size #### Example: quantize Llama2-7b model from HF to FP8 with FP8 KV Cache: -`python quantize.py --model_dir ./ll2-7b --dtype float16 --qformat fp8 --kv_cache_dtype fp8 --output_dir ./ll2_7b_fp8 --calib_size 512 --tp_size 1` +`python quantize.py --model-dir ./ll2-7b --dtype float16 --qformat fp8 --kv-cache-dtype fp8 --output-dir ./ll2_7b_fp8 --calib-size 512 --tp-size 1` Outputs: model structure, quantized model & parameters (with scaling factors) are in JSON and Safetensors (npz is generated only for the reference) ``` From bb5494676f5f57f1cf7cf72598de5434a2a22865 Mon Sep 17 00:00:00 2001 From: chenqianfzh <51831990+chenqianfzh@users.noreply.github.com> Date: Fri, 26 Jul 2024 18:32:20 -0700 Subject: [PATCH 052/167] enforce eager mode with bnb quantization temporarily (#6846) --- vllm/config.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/vllm/config.py b/vllm/config.py index 6403a53f86281..92fde449b43fd 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -282,6 +282,10 @@ def verify_with_parallel_config( raise ValueError( "BitAndBytes quantization with TP or PP is not supported yet.") + if self.quantization == "bitsandbytes" and self.enforce_eager is False: + raise ValueError( + "BitAndBytes with enforce_eager = False is not supported yet.") + def get_hf_config_sliding_window(self) -> Optional[int]: """Get the sliding window size, or None if disabled.""" From d09b94ca588c6de1e627194264357e14460ae2eb Mon Sep 17 00:00:00 2001 From: Woosuk Kwon Date: Fri, 26 Jul 2024 18:45:57 -0700 Subject: [PATCH 053/167] [TPU] Support collective communications in XLA devices (#6813) --- .../device_communicators/tpu_communicator.py | 30 +++++++++++++++++++ vllm/distributed/parallel_state.py | 22 ++++++++++++++ vllm/lora/layers.py | 4 +++ .../model_executor/layers/logits_processor.py | 16 ++++++++-- 4 files changed, 70 insertions(+), 2 deletions(-) create mode 100644 vllm/distributed/device_communicators/tpu_communicator.py diff --git a/vllm/distributed/device_communicators/tpu_communicator.py b/vllm/distributed/device_communicators/tpu_communicator.py new file mode 100644 index 0000000000000..69a9a516f3ebe --- /dev/null +++ b/vllm/distributed/device_communicators/tpu_communicator.py @@ -0,0 +1,30 @@ +import torch +import torch.distributed as dist +from torch.distributed import ProcessGroup + +from vllm.platforms import current_platform + +if current_platform.is_tpu(): + import torch_xla.core.xla_model as xm + from 
torch_xla._internal import pjrt + + +class TpuCommunicator: + + def __init__(self, group: ProcessGroup): + if not current_platform.is_tpu(): + self.disabled = True + return + self.disabled = False + + local_rank = dist.get_rank(group) + world_size = dist.get_world_size(group) + pjrt.initialize_multiprocess(local_rank, world_size) + xm._init_world_size_ordinal() + + def all_reduce(self, x: torch.Tensor) -> torch.Tensor: + return xm.all_reduce(xm.REDUCE_SUM, x) + + def all_gather(self, x: torch.Tensor, dim: int = -1) -> torch.Tensor: + assert dim == -1, "TPUs only support dim=-1 for all-gather." + return xm.all_gather(x, dim=dim) diff --git a/vllm/distributed/parallel_state.py b/vllm/distributed/parallel_state.py index 58cae46d9af27..4116b1729d188 100644 --- a/vllm/distributed/parallel_state.py +++ b/vllm/distributed/parallel_state.py @@ -133,6 +133,7 @@ def __init__( torch_distributed_backend: Union[str, Backend], use_pynccl: bool, use_custom_allreduce: bool, + use_tpu_communicator: bool, use_message_queue_broadcaster: bool = False, ): @@ -164,6 +165,7 @@ def __init__( self.use_pynccl = use_pynccl self.use_custom_allreduce = use_custom_allreduce + self.use_tpu_communicator = use_tpu_communicator # lazy import to avoid documentation build error from vllm.distributed.device_communicators.custom_all_reduce import ( @@ -190,6 +192,12 @@ def __init__( else: self.ca_comm = None + from vllm.distributed.device_communicators.tpu_communicator import ( + TpuCommunicator) + self.tpu_communicator: Optional[TpuCommunicator] + if use_tpu_communicator and self.world_size > 1: + self.tpu_communicator = TpuCommunicator(group=self.cpu_group) + from vllm.distributed.device_communicators.shm_broadcast import ( MessageQueue) self.mq_broadcaster: Optional[MessageQueue] = None @@ -289,6 +297,12 @@ def all_reduce(self, input_: torch.Tensor) -> torch.Tensor: # Bypass the function if we are using only 1 GPU. if self.world_size == 1: return input_ + + # For TPUs, use TPU communicator. + tpu_comm = self.tpu_communicator + if tpu_comm is not None and not tpu_comm.disabled: + return tpu_comm.all_reduce(input_) + if ca_comm is not None: out = ca_comm.custom_all_reduce(input_) if out is not None: @@ -310,6 +324,12 @@ def all_gather(self, input_: torch.Tensor, dim: int = -1) -> torch.Tensor: return input_ assert -input_.dim() <= dim < input_.dim(), ( f"Invalid dim ({dim}) for input tensor with shape {input_.size()}") + + # For TPUs, use TPU communicator. + tpu_comm = self.tpu_communicator + if tpu_comm is not None and not tpu_comm.disabled: + return tpu_comm.all_gather(input_, dim) + if dim < 0: # Convert negative dim to positive. 
dim += input_.dim() @@ -727,6 +747,7 @@ def init_world_group(ranks: List[int], local_rank: int, torch_distributed_backend=backend, use_pynccl=False, use_custom_allreduce=False, + use_tpu_communicator=False, ) @@ -745,6 +766,7 @@ def init_model_parallel_group( torch_distributed_backend=backend, use_pynccl=True, use_custom_allreduce=use_custom_allreduce, + use_tpu_communicator=True, use_message_queue_broadcaster=use_message_queue_broadcaster, ) diff --git a/vllm/lora/layers.py b/vllm/lora/layers.py index 40de134c0a5ee..87de285a373a2 100644 --- a/vllm/lora/layers.py +++ b/vllm/lora/layers.py @@ -1067,6 +1067,10 @@ def scale(self): def soft_cap(self): return self.base_layer.soft_cap + @property + def use_gather(self): + return self.base_layer.use_gather + @property def org_vocab_size(self): return self.base_layer.org_vocab_size diff --git a/vllm/model_executor/layers/logits_processor.py b/vllm/model_executor/layers/logits_processor.py index f6fcf49ef464b..bd3e7e114204f 100644 --- a/vllm/model_executor/layers/logits_processor.py +++ b/vllm/model_executor/layers/logits_processor.py @@ -5,10 +5,12 @@ import torch import torch.nn as nn -from vllm.distributed import tensor_model_parallel_gather +from vllm.distributed import (tensor_model_parallel_all_gather, + tensor_model_parallel_gather) from vllm.model_executor.layers.vocab_parallel_embedding import ( VocabParallelEmbedding) from vllm.model_executor.sampling_metadata import SamplingMetadata +from vllm.platforms import current_platform class LogitsProcessor(nn.Module): @@ -39,6 +41,8 @@ def __init__(self, self.org_vocab_size = org_vocab_size or vocab_size # Soft cap the logits. Used in Gemma 2. self.soft_cap = soft_cap + # Whether to use gather or all-gather to gather the logits. + self.use_gather = not current_platform.is_tpu() def forward( self, @@ -76,7 +80,15 @@ def _get_logits(self, hidden_states: torch.Tensor, logits = lm_head.linear_method.apply(lm_head, hidden_states, bias=embedding_bias) - logits = tensor_model_parallel_gather(logits) + if self.use_gather: + logits = tensor_model_parallel_gather(logits) + else: + # Gather is not supported for some devices such as TPUs. + # Use all-gather instead. + # NOTE(woosuk): Here, the outputs of every device should not be None + # because XLA requires strict SPMD among all devices. Every device + # should execute the same operations after gathering the logits. + logits = tensor_model_parallel_all_gather(logits) # Remove paddings in vocab (if any). if logits is not None: logits = logits[:, :self.org_vocab_size] From 981b0d567355063d5453e382a85970cae083c615 Mon Sep 17 00:00:00 2001 From: Cyrus Leung Date: Sat, 27 Jul 2024 09:58:25 +0800 Subject: [PATCH 054/167] [Frontend] Factor out code for running uvicorn (#6828) --- vllm/entrypoints/api_server.py | 74 ++++++++++++++++++--------- vllm/entrypoints/openai/api_server.py | 72 ++++++++------------------ vllm/server/__init__.py | 3 ++ vllm/server/launch.py | 42 +++++++++++++++ 4 files changed, 116 insertions(+), 75 deletions(-) create mode 100644 vllm/server/__init__.py create mode 100644 vllm/server/launch.py diff --git a/vllm/entrypoints/api_server.py b/vllm/entrypoints/api_server.py index 66941442c8c9c..3476357658522 100644 --- a/vllm/entrypoints/api_server.py +++ b/vllm/entrypoints/api_server.py @@ -5,12 +5,12 @@ We are also not going to accept PRs modifying this file, please change `vllm/entrypoints/openai/api_server.py` instead. 
""" - +import asyncio import json import ssl -from typing import AsyncGenerator +from argparse import Namespace +from typing import Any, AsyncGenerator, Optional -import uvicorn from fastapi import FastAPI, Request from fastapi.responses import JSONResponse, Response, StreamingResponse @@ -18,8 +18,10 @@ from vllm.engine.async_llm_engine import AsyncLLMEngine from vllm.logger import init_logger from vllm.sampling_params import SamplingParams +from vllm.server import serve_http from vllm.usage.usage_lib import UsageContext from vllm.utils import FlexibleArgumentParser, random_uuid +from vllm.version import __version__ as VLLM_VERSION logger = init_logger("vllm.entrypoints.api_server") @@ -81,6 +83,50 @@ async def stream_results() -> AsyncGenerator[bytes, None]: return JSONResponse(ret) +def build_app(args: Namespace) -> FastAPI: + global app + + app.root_path = args.root_path + return app + + +async def init_app( + args: Namespace, + llm_engine: Optional[AsyncLLMEngine] = None, +) -> FastAPI: + app = build_app(args) + + global engine + + engine_args = AsyncEngineArgs.from_cli_args(args) + engine = (llm_engine + if llm_engine is not None else AsyncLLMEngine.from_engine_args( + engine_args, usage_context=UsageContext.API_SERVER)) + + return app + + +async def run_server(args: Namespace, + llm_engine: Optional[AsyncLLMEngine] = None, + **uvicorn_kwargs: Any) -> None: + logger.info("vLLM API server version %s", VLLM_VERSION) + logger.info("args: %s", args) + + app = await init_app(args, llm_engine) + await serve_http( + app, + host=args.host, + port=args.port, + log_level=args.log_level, + timeout_keep_alive=TIMEOUT_KEEP_ALIVE, + ssl_keyfile=args.ssl_keyfile, + ssl_certfile=args.ssl_certfile, + ssl_ca_certs=args.ssl_ca_certs, + ssl_cert_reqs=args.ssl_cert_reqs, + **uvicorn_kwargs, + ) + + if __name__ == "__main__": parser = FlexibleArgumentParser() parser.add_argument("--host", type=str, default=None) @@ -105,25 +151,5 @@ async def stream_results() -> AsyncGenerator[bytes, None]: parser.add_argument("--log-level", type=str, default="debug") parser = AsyncEngineArgs.add_cli_args(parser) args = parser.parse_args() - engine_args = AsyncEngineArgs.from_cli_args(args) - engine = AsyncLLMEngine.from_engine_args( - engine_args, usage_context=UsageContext.API_SERVER) - - app.root_path = args.root_path - logger.info("Available routes are:") - for route in app.routes: - if not hasattr(route, 'methods'): - continue - methods = ', '.join(route.methods) - logger.info("Route: %s, Methods: %s", route.path, methods) - - uvicorn.run(app, - host=args.host, - port=args.port, - log_level=args.log_level, - timeout_keep_alive=TIMEOUT_KEEP_ALIVE, - ssl_keyfile=args.ssl_keyfile, - ssl_certfile=args.ssl_certfile, - ssl_ca_certs=args.ssl_ca_certs, - ssl_cert_reqs=args.ssl_cert_reqs) + asyncio.run(run_server(args)) diff --git a/vllm/entrypoints/openai/api_server.py b/vllm/entrypoints/openai/api_server.py index 0fe4dd245b5e6..c1640a10a407d 100644 --- a/vllm/entrypoints/openai/api_server.py +++ b/vllm/entrypoints/openai/api_server.py @@ -2,14 +2,12 @@ import importlib import inspect import re -import signal +from argparse import Namespace from contextlib import asynccontextmanager from http import HTTPStatus -from typing import Optional, Set +from typing import Any, Optional, Set -import fastapi -import uvicorn -from fastapi import APIRouter, Request +from fastapi import APIRouter, FastAPI, Request from fastapi.exceptions import RequestValidationError from fastapi.middleware.cors import CORSMiddleware from 
fastapi.responses import JSONResponse, Response, StreamingResponse @@ -38,6 +36,7 @@ from vllm.entrypoints.openai.serving_tokenization import ( OpenAIServingTokenization) from vllm.logger import init_logger +from vllm.server import serve_http from vllm.usage.usage_lib import UsageContext from vllm.utils import FlexibleArgumentParser from vllm.version import __version__ as VLLM_VERSION @@ -57,7 +56,7 @@ @asynccontextmanager -async def lifespan(app: fastapi.FastAPI): +async def lifespan(app: FastAPI): async def _force_log(): while True: @@ -75,7 +74,7 @@ async def _force_log(): router = APIRouter() -def mount_metrics(app: fastapi.FastAPI): +def mount_metrics(app: FastAPI): # Add prometheus asgi middleware to route /metrics requests metrics_route = Mount("/metrics", make_asgi_app()) # Workaround for 307 Redirect for /metrics @@ -165,8 +164,8 @@ async def create_embedding(request: EmbeddingRequest, raw_request: Request): return JSONResponse(content=generator.model_dump()) -def build_app(args): - app = fastapi.FastAPI(lifespan=lifespan) +def build_app(args: Namespace) -> FastAPI: + app = FastAPI(lifespan=lifespan) app.include_router(router) app.root_path = args.root_path @@ -214,11 +213,8 @@ async def authentication(request: Request, call_next): return app -async def build_server( - args, - llm_engine: Optional[AsyncLLMEngine] = None, - **uvicorn_kwargs, -) -> uvicorn.Server: +async def init_app(args: Namespace, + llm_engine: Optional[AsyncLLMEngine] = None) -> FastAPI: app = build_app(args) if args.served_model_name is not None: @@ -281,14 +277,17 @@ async def build_server( ) app.root_path = args.root_path - logger.info("Available routes are:") - for route in app.routes: - if not hasattr(route, 'methods'): - continue - methods = ', '.join(route.methods) - logger.info("Route: %s, Methods: %s", route.path, methods) + return app + + +async def run_server(args: Namespace, + llm_engine: Optional[AsyncLLMEngine] = None, + **uvicorn_kwargs: Any) -> None: + logger.info("vLLM API server version %s", VLLM_VERSION) + logger.info("args: %s", args) - config = uvicorn.Config( + app = await init_app(args, llm_engine) + await serve_http( app, host=args.host, port=args.port, @@ -301,36 +300,6 @@ async def build_server( **uvicorn_kwargs, ) - return uvicorn.Server(config) - - -async def run_server(args, llm_engine=None, **uvicorn_kwargs) -> None: - logger.info("vLLM API server version %s", VLLM_VERSION) - logger.info("args: %s", args) - - server = await build_server( - args, - llm_engine, - **uvicorn_kwargs, - ) - - loop = asyncio.get_running_loop() - - server_task = loop.create_task(server.serve()) - - def signal_handler() -> None: - # prevents the uvicorn signal handler to exit early - server_task.cancel() - - loop.add_signal_handler(signal.SIGINT, signal_handler) - loop.add_signal_handler(signal.SIGTERM, signal_handler) - - try: - await server_task - except asyncio.CancelledError: - print("Gracefully stopping http server") - await server.shutdown() - if __name__ == "__main__": # NOTE(simon): @@ -339,4 +308,5 @@ def signal_handler() -> None: description="vLLM OpenAI-Compatible RESTful API server.") parser = make_arg_parser(parser) args = parser.parse_args() + asyncio.run(run_server(args)) diff --git a/vllm/server/__init__.py b/vllm/server/__init__.py new file mode 100644 index 0000000000000..17c98b4dad6c9 --- /dev/null +++ b/vllm/server/__init__.py @@ -0,0 +1,3 @@ +from .launch import serve_http + +__all__ = ["serve_http"] diff --git a/vllm/server/launch.py b/vllm/server/launch.py new file mode 100644 index 
0000000000000..1a8aeb7f1022b --- /dev/null +++ b/vllm/server/launch.py @@ -0,0 +1,42 @@ +import asyncio +import signal +from typing import Any + +import uvicorn +from fastapi import FastAPI + +from vllm.logger import init_logger + +logger = init_logger(__name__) + + +async def serve_http(app: FastAPI, **uvicorn_kwargs: Any) -> None: + logger.info("Available routes are:") + for route in app.routes: + methods = getattr(route, "methods", None) + path = getattr(route, "path", None) + + if methods is None or path is None: + continue + + logger.info("Route: %s, Methods: %s", path, ', '.join(methods)) + + config = uvicorn.Config(app, **uvicorn_kwargs) + server = uvicorn.Server(config) + + loop = asyncio.get_running_loop() + + server_task = loop.create_task(server.serve()) + + def signal_handler() -> None: + # prevents the uvicorn signal handler to exit early + server_task.cancel() + + loop.add_signal_handler(signal.SIGINT, signal_handler) + loop.add_signal_handler(signal.SIGTERM, signal_handler) + + try: + await server_task + except asyncio.CancelledError: + logger.info("Gracefully stopping http server") + await server.shutdown() From 55712941e57bcfd662db2905811d6e2807b9153f Mon Sep 17 00:00:00 2001 From: Lucas Wilkinson Date: Fri, 26 Jul 2024 22:27:44 -0400 Subject: [PATCH 055/167] [Bug Fix] Illegal memory access, FP8 Llama 3.1 405b (#6852) --- .../broadcast_load_epilogue_c3x.hpp | 46 +++++++++++++++---- 1 file changed, 37 insertions(+), 9 deletions(-) diff --git a/csrc/quantization/cutlass_w8a8/broadcast_load_epilogue_c3x.hpp b/csrc/quantization/cutlass_w8a8/broadcast_load_epilogue_c3x.hpp index 877a9f5b9e5de..e4bc9752ed7db 100644 --- a/csrc/quantization/cutlass_w8a8/broadcast_load_epilogue_c3x.hpp +++ b/csrc/quantization/cutlass_w8a8/broadcast_load_epilogue_c3x.hpp @@ -328,20 +328,36 @@ struct Sm90ColOrScalarBroadcast { return EmptyProducerLoadCallbacks{}; } - template + template struct ConsumerStoreCallbacks : EmptyConsumerStoreCallbacks { CUTLASS_DEVICE - ConsumerStoreCallbacks(GTensor&& tCgCol, RTensor&& tCrCol, Params const& params) - : tCgCol(cute::forward(tCgCol)), - tCrCol(cute::forward(tCrCol)), - params(params) {} + ConsumerStoreCallbacks( + GTensor&& tCgCol, + RTensor&& tCrCol, + CTensor&& tCcCol, + ProblemShape problem_shape, + Params const& params + ): + tCgCol(cute::forward(tCgCol)), + tCrCol(cute::forward(tCrCol)), + tCcCol(cute::forward(tCcCol)), + m(get<0>(problem_shape)), + params(params) {} GTensor tCgCol; // (CPY,CPY_M,CPY_N,EPI_M,EPI_N) - RTensor tCrCol; // (CPY,CPY_M,CPY_N,EPI_M,EPI_N) + RTensor tCrCol; + CTensor tCcCol; // (CPY,CPY_M,CPY_N,EPI_M,EPI_N) Params const& params; + int m; CUTLASS_DEVICE void begin() { + Tensor pred = make_tensor(shape(tCgCol)); + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < size(pred); ++i) { + pred(i) = get<0>(tCcCol(i)) < m; + } + if (!params.col_broadcast) { fill(tCrCol, *(params.ptr_col)); return; @@ -349,7 +365,7 @@ struct Sm90ColOrScalarBroadcast { // Filter so we don't issue redundant copies over stride-0 modes // (only works if 0-strides are in same location, which is by construction) - copy_aligned(filter(tCgCol), filter(tCrCol)); + copy_if(pred, filter(tCgCol), filter(tCrCol)); } template @@ -381,8 +397,20 @@ struct Sm90ColOrScalarBroadcast { mCol, args.tile_shape_mnk, args.tile_coord_mnkl, args.epi_tile, args.tiled_copy, args.thread_idx); Tensor tCrCol = make_tensor_like(tCgCol); // (CPY,CPY_M,CPY_N,EPI_M,EPI_N) - return ConsumerStoreCallbacks( - cute::move(tCgCol), cute::move(tCrCol), params); + // Generate an identity tensor 
matching the shape of the global tensor and + // partition the same way, this will be used to generate the predicate + // tensor for loading + Tensor cCol = make_identity_tensor(mCol.shape()); + Tensor tCcCol = sm90_partition_for_epilogue( // (CPY,CPY_M,CPY_N,EPI_M,EPI_N) + cCol, args.tile_shape_mnk, args.tile_coord_mnkl, args.epi_tile, args.tiled_copy, args.thread_idx); + + return ConsumerStoreCallbacks( + cute::move(tCgCol), + cute::move(tCrCol), + cute::move(tCcCol), + args.problem_shape_mnkl, + params + ); } }; From 969d03226514d99f43cf7d17d1e336231d91751a Mon Sep 17 00:00:00 2001 From: Sanger Steel Date: Fri, 26 Jul 2024 23:02:25 -0400 Subject: [PATCH 056/167] [Bugfix]: Fix Tensorizer test failures (#6835) --- .buildkite/test-pipeline.yaml | 1 - tests/tensorizer_loader/conftest.py | 45 ++++++++++++++++++++++ tests/tensorizer_loader/test_tensorizer.py | 7 +--- 3 files changed, 47 insertions(+), 6 deletions(-) create mode 100644 tests/tensorizer_loader/conftest.py diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index 633bc5ca95bf9..5b4a786305e1f 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -220,7 +220,6 @@ steps: - label: Tensorizer Test #mirror_hardwares: [amd] - soft_fail: true fast_check: true commands: - apt-get install -y curl libsodium23 diff --git a/tests/tensorizer_loader/conftest.py b/tests/tensorizer_loader/conftest.py new file mode 100644 index 0000000000000..c5c6fc1057d31 --- /dev/null +++ b/tests/tensorizer_loader/conftest.py @@ -0,0 +1,45 @@ +# isort: skip_file + +import contextlib +import gc + +import pytest +import ray +import torch + +from vllm.distributed import (destroy_distributed_environment, + destroy_model_parallel) +from vllm.model_executor.model_loader.tensorizer import TensorizerConfig + + +def cleanup(): + destroy_model_parallel() + destroy_distributed_environment() + with contextlib.suppress(AssertionError): + torch.distributed.destroy_process_group() + gc.collect() + torch.cuda.empty_cache() + ray.shutdown() + + +@pytest.fixture() +def should_do_global_cleanup_after_test(request) -> bool: + """Allow subdirectories to skip global cleanup by overriding this fixture. + This can provide a ~10x speedup for non-GPU unit tests since they don't need + to initialize torch. 
+ """ + + return True + + +@pytest.fixture(autouse=True) +def cleanup_fixture(should_do_global_cleanup_after_test: bool): + yield + if should_do_global_cleanup_after_test: + cleanup() + + +@pytest.fixture(autouse=True) +def tensorizer_config(): + config = TensorizerConfig(tensorizer_uri="vllm") + return config \ No newline at end of file diff --git a/tests/tensorizer_loader/test_tensorizer.py b/tests/tensorizer_loader/test_tensorizer.py index b7030e3cd6d42..2adeae8874bdb 100644 --- a/tests/tensorizer_loader/test_tensorizer.py +++ b/tests/tensorizer_loader/test_tensorizer.py @@ -40,7 +40,6 @@ tensorize_model_for_testing_script = os.path.join( os.path.dirname(__file__), "tensorize_vllm_model_for_testing.py") - def is_curl_installed(): try: subprocess.check_call(['curl', '--version']) @@ -63,10 +62,6 @@ def write_keyfile(keyfile_path: str): with open(keyfile_path, 'wb') as f: f.write(encryption_params.key) -@pytest.fixture(autouse=True) -def tensorizer_config(): - config = TensorizerConfig(tensorizer_uri="vllm") - return config @patch('vllm.model_executor.model_loader.tensorizer.TensorizerAgent') @@ -105,6 +100,7 @@ def test_can_deserialize_s3(vllm_runner): @pytest.mark.skipif(not is_curl_installed(), reason="cURL is not installed") def test_deserialized_encrypted_vllm_model_has_same_outputs( vllm_runner, tmp_path): + cleanup() with vllm_runner(model_ref) as vllm_model: model_path = tmp_path / (model_ref + ".tensors") key_path = tmp_path / (model_ref + ".key") @@ -316,6 +312,7 @@ def test_deserialized_encrypted_vllm_model_with_tp_has_same_outputs(vllm_runner, def test_vllm_tensorized_model_has_same_outputs(vllm_runner, tmp_path): + cleanup() model_ref = "facebook/opt-125m" model_path = tmp_path / (model_ref + ".tensors") config = TensorizerConfig(tensorizer_uri=str(model_path)) From ced36cd89b9c012eb066ef863b2d1ecf052f3e00 Mon Sep 17 00:00:00 2001 From: Woosuk Kwon Date: Fri, 26 Jul 2024 20:16:13 -0700 Subject: [PATCH 057/167] [ROCm] Upgrade PyTorch nightly version (#6845) --- Dockerfile.rocm | 4 ++-- docs/source/getting_started/amd-installation.rst | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/Dockerfile.rocm b/Dockerfile.rocm index 7b4c0166a04bd..64bc0f3c12c75 100644 --- a/Dockerfile.rocm +++ b/Dockerfile.rocm @@ -55,8 +55,8 @@ RUN case "$(ls /opt | grep -Po 'rocm-[0-9]\.[0-9]')" in \ *"rocm-6.1"*) \ python3 -m pip uninstall -y torch torchvision \ && python3 -m pip install --no-cache-dir --pre \ - torch==2.5.0.dev20240710 \ - torchvision==0.20.0.dev20240710 \ + torch==2.5.0.dev20240726 \ + torchvision==0.20.0.dev20240726 \ --index-url https://download.pytorch.org/whl/nightly/rocm6.1;; \ *) ;; esac diff --git a/docs/source/getting_started/amd-installation.rst b/docs/source/getting_started/amd-installation.rst index 1c7d274b7c47e..9648d07d2790c 100644 --- a/docs/source/getting_started/amd-installation.rst +++ b/docs/source/getting_started/amd-installation.rst @@ -117,7 +117,7 @@ Alternatively, wheels intended for vLLM use can be accessed under the releases. 
$ # Install PyTorch $ pip uninstall torch -y - $ pip install --no-cache-dir --pre torch==2.5.0.dev20240710 --index-url https://download.pytorch.org/whl/nightly/rocm6.1 + $ pip install --no-cache-dir --pre torch==2.5.0.dev20240726 --index-url https://download.pytorch.org/whl/nightly/rocm6.1 $ # Build & install AMD SMI $ pip install /opt/rocm/share/amd_smi From 3c3012398e4aecde9e40981d79a0576203158d24 Mon Sep 17 00:00:00 2001 From: omrishiv <327609+omrishiv@users.noreply.github.com> Date: Fri, 26 Jul 2024 20:20:16 -0700 Subject: [PATCH 058/167] [Doc] add VLLM_TARGET_DEVICE=neuron to documentation for neuron (#6844) Signed-off-by: omrishiv <327609+omrishiv@users.noreply.github.com> --- docs/source/getting_started/neuron-installation.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/getting_started/neuron-installation.rst b/docs/source/getting_started/neuron-installation.rst index 62bf779c339d5..0816524468cab 100644 --- a/docs/source/getting_started/neuron-installation.rst +++ b/docs/source/getting_started/neuron-installation.rst @@ -131,6 +131,6 @@ Once neuronx-cc and transformers-neuronx packages are installed, we will be able $ git clone https://github.com/vllm-project/vllm.git $ cd vllm $ pip install -U -r requirements-neuron.txt - $ pip install . + $ VLLM_TARGET_DEVICE="neuron" pip install . If neuron packages are detected correctly in the installation process, ``vllm-0.3.0+neuron212`` will be installed. From ed94e4f427bce8611e198d051dbd3b0097b448e8 Mon Sep 17 00:00:00 2001 From: tomeras91 <57313761+tomeras91@users.noreply.github.com> Date: Sat, 27 Jul 2024 06:45:31 +0300 Subject: [PATCH 059/167] [Bugfix][Model] Jamba assertions and no chunked prefill by default for Jamba (#6784) --- vllm/engine/arg_utils.py | 6 +++++- vllm/model_executor/models/jamba.py | 5 +++++ 2 files changed, 10 insertions(+), 1 deletion(-) diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index cd64d3345b830..bad5be4917216 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -754,10 +754,14 @@ def create_engine_config(self, ) -> EngineConfig: use_sliding_window = (model_config.get_sliding_window() is not None) use_spec_decode = self.speculative_model is not None + has_seqlen_agnostic_layers = ( + model_config.contains_seqlen_agnostic_layers( + parallel_config)) if (is_gpu and not use_sliding_window and not use_spec_decode and not self.enable_lora and not self.enable_prompt_adapter - and not self.enable_prefix_caching): + and not self.enable_prefix_caching + and not has_seqlen_agnostic_layers): self.enable_chunked_prefill = True logger.warning( "Chunked prefill is enabled by default for models with " diff --git a/vllm/model_executor/models/jamba.py b/vllm/model_executor/models/jamba.py index d4e4f0055aa2b..3444578227259 100644 --- a/vllm/model_executor/models/jamba.py +++ b/vllm/model_executor/models/jamba.py @@ -644,6 +644,11 @@ def __init__( lora_config: Optional[LoRAConfig] = None, scheduler_config: Optional[SchedulerConfig] = None, ) -> None: + assert not scheduler_config.chunked_prefill_enabled, \ + "Jamba currently does not support chunked prefill" + assert not cache_config.enable_prefix_caching, \ + "Jamba currently does not support prefix caching" + super().__init__() self.config = config self.scheduler_config = scheduler_config From 14dbd5a7674e5de2862c18adb711d9feecd35063 Mon Sep 17 00:00:00 2001 From: Joe Date: Fri, 26 Jul 2024 20:47:50 -0700 Subject: [PATCH 060/167] [Model] H2O Danube3-4b (#6451) --- .buildkite/run-cpu-test.sh | 2 +- 
.../kernels/benchmark_paged_attention.py | 2 +- benchmarks/kernels/benchmark_rope.py | 2 +- csrc/attention/attention_kernels.cu | 6 +++ tests/kernels/test_attention.py | 4 +- tests/kernels/test_cache.py | 8 ++- tests/kernels/test_pos_encoding.py | 2 +- tests/models/test_danube3_4b.py | 52 +++++++++++++++++++ vllm/attention/ops/paged_attn.py | 2 +- vllm/utils.py | 6 +++ 10 files changed, 79 insertions(+), 7 deletions(-) create mode 100644 tests/models/test_danube3_4b.py diff --git a/.buildkite/run-cpu-test.sh b/.buildkite/run-cpu-test.sh index 21deec2bba973..45bc8eb2f8477 100644 --- a/.buildkite/run-cpu-test.sh +++ b/.buildkite/run-cpu-test.sh @@ -23,7 +23,7 @@ docker exec cpu-test-avx2 bash -c "python3 examples/offline_inference.py" # Run basic model test docker exec cpu-test bash -c " pip install pytest Pillow protobuf - pytest -v -s tests/models -m \"not vlm\" --ignore=tests/models/test_embedding.py --ignore=tests/models/test_registry.py --ignore=tests/models/test_jamba.py" # Mamba on CPU is not supported + pytest -v -s tests/models -m \"not vlm\" --ignore=tests/models/test_embedding.py --ignore=tests/models/test_registry.py --ignore=tests/models/test_jamba.py --ignore=tests/models/test_danube3_4b.py" # Mamba and Danube3-4B on CPU is not supported # online inference docker exec cpu-test bash -c " diff --git a/benchmarks/kernels/benchmark_paged_attention.py b/benchmarks/kernels/benchmark_paged_attention.py index 78cac8a555d1b..a04433142da42 100644 --- a/benchmarks/kernels/benchmark_paged_attention.py +++ b/benchmarks/kernels/benchmark_paged_attention.py @@ -175,7 +175,7 @@ def run_cuda_benchmark(num_iters: int, profile: bool = False) -> float: parser.add_argument("--num-kv-heads", type=int, default=8) parser.add_argument("--head-size", type=int, - choices=[64, 80, 96, 112, 128, 192, 256], + choices=[64, 80, 96, 112, 120, 128, 192, 256], default=128) parser.add_argument("--block-size", type=int, choices=[16, 32], default=16) parser.add_argument("--use-alibi", action="store_true") diff --git a/benchmarks/kernels/benchmark_rope.py b/benchmarks/kernels/benchmark_rope.py index 78736c7a7ba6f..f542684a9a2a9 100644 --- a/benchmarks/kernels/benchmark_rope.py +++ b/benchmarks/kernels/benchmark_rope.py @@ -94,7 +94,7 @@ def benchmark_rope_kernels_multi_lora( parser.add_argument("--num-heads", type=int, default=8) parser.add_argument("--head-size", type=int, - choices=[64, 80, 96, 112, 128, 192, 256], + choices=[64, 80, 96, 112, 120, 128, 192, 256], default=128) parser.add_argument("--rotary-dim", type=int, choices=[16, 32], default=32) parser.add_argument("--dtype", diff --git a/csrc/attention/attention_kernels.cu b/csrc/attention/attention_kernels.cu index 350dbce1d7ba9..875570a1e894f 100644 --- a/csrc/attention/attention_kernels.cu +++ b/csrc/attention/attention_kernels.cu @@ -751,6 +751,9 @@ void paged_attention_v1_launcher( case 112: LAUNCH_PAGED_ATTENTION_V1(112); break; + case 120: + LAUNCH_PAGED_ATTENTION_V1(120); + break; case 128: LAUNCH_PAGED_ATTENTION_V1(128); break; @@ -912,6 +915,9 @@ void paged_attention_v2_launcher( case 112: LAUNCH_PAGED_ATTENTION_V2(112); break; + case 120: + LAUNCH_PAGED_ATTENTION_V2(120); + break; case 128: LAUNCH_PAGED_ATTENTION_V2(128); break; diff --git a/tests/kernels/test_attention.py b/tests/kernels/test_attention.py index 2e6412c28958e..c7c6707461c3e 100644 --- a/tests/kernels/test_attention.py +++ b/tests/kernels/test_attention.py @@ -28,7 +28,7 @@ # FlashAttention forward only supports head dimension at most 128 # 
https://github.com/ROCmSoftwarePlatform/flash-attention/blob/3d2b6f5d037782cc2c906909a46fb7e2e1b48b25/csrc/flash_attn_rocm/flash_api.cpp#L62 -HEAD_SIZES = [64, 80, 96, 112, 128, 192, 256 +HEAD_SIZES = [64, 80, 96, 112, 120, 128, 192, 256 ] if not is_hip() else [64, 80, 96, 112, 128] BLOCK_SIZES = [16, 32] @@ -134,6 +134,8 @@ def test_paged_attention( seed: int, device: str, ) -> None: + if kv_cache_dtype == "fp8" and head_size % 16: + pytest.skip() random.seed(seed) torch.random.manual_seed(seed) if torch.cuda.is_available(): diff --git a/tests/kernels/test_cache.py b/tests/kernels/test_cache.py index f9a609464abfc..3fb9b59be1701 100644 --- a/tests/kernels/test_cache.py +++ b/tests/kernels/test_cache.py @@ -11,7 +11,7 @@ NUM_TOKENS = [42] # Arbitrary values for testing NUM_LAYERS = [1] # Arbitrary values for testing NUM_HEADS = [8] # Arbitrary values for testing -HEAD_SIZES = [64, 80, 96, 112, 128, 192, 256] +HEAD_SIZES = [64, 80, 96, 112, 120, 128, 192, 256] BLOCK_SIZES = [8, 16, 32] # Arbitrary values for testing @@ -52,6 +52,8 @@ def test_copy_blocks( kv_cache_dtype: str, device: str, ) -> None: + if kv_cache_dtype == "fp8" and head_size % 16: + pytest.skip() random.seed(seed) torch.random.manual_seed(seed) if torch.cuda.is_available(): @@ -124,6 +126,8 @@ def test_reshape_and_cache( device: str, kv_cache_dtype: str, ) -> None: + if kv_cache_dtype == "fp8" and head_size % 16: + pytest.skip() random.seed(seed) torch.random.manual_seed(seed) if torch.cuda.is_available(): @@ -325,6 +329,8 @@ def test_swap_blocks( ) -> None: if kv_cache_dtype == "fp8" and "cpu" in direction: pytest.skip() + if kv_cache_dtype == "fp8" and head_size % 16: + pytest.skip() random.seed(seed) torch.random.manual_seed(seed) if torch.cuda.is_available(): diff --git a/tests/kernels/test_pos_encoding.py b/tests/kernels/test_pos_encoding.py index 4c83659929d41..4a7ad6e0fa21d 100644 --- a/tests/kernels/test_pos_encoding.py +++ b/tests/kernels/test_pos_encoding.py @@ -10,7 +10,7 @@ IS_NEOX_STYLE = [True, False] DTYPES = [torch.half, torch.bfloat16, torch.float] -HEAD_SIZES = [64, 80, 96, 112, 128, 192, 256] +HEAD_SIZES = [64, 80, 96, 112, 120, 128, 192, 256] ROTARY_DIMS = [None, 32] # None means rotary dim == head size NUM_HEADS = [7, 17] # Arbitrary values for testing BATCH_SIZES = [1, 5] # Arbitrary values for testing diff --git a/tests/models/test_danube3_4b.py b/tests/models/test_danube3_4b.py new file mode 100644 index 0000000000000..bfaa275f73c19 --- /dev/null +++ b/tests/models/test_danube3_4b.py @@ -0,0 +1,52 @@ +"""Compare the outputs of HF and vLLM when using greedy sampling. + +This tests danube3 separately because its head size isn't supported on CPU yet. + +Run `pytest tests/models/test_danube3_4b.py`. 
+""" +import pytest + +from .utils import check_outputs_equal + +MODELS = ["h2oai/h2o-danube3-4b-base"] + +target_dtype = "half" + + +@pytest.mark.parametrize("model", MODELS) +@pytest.mark.parametrize("dtype", [target_dtype]) +@pytest.mark.parametrize("max_tokens", [32]) +def test_models( + hf_runner, + vllm_runner, + example_prompts, + model: str, + dtype: str, + max_tokens: int, +) -> None: + with hf_runner(model, dtype=dtype) as hf_model: + hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens) + + with vllm_runner(model, dtype=dtype) as vllm_model: + vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens) + + check_outputs_equal( + outputs_0_lst=hf_outputs, + outputs_1_lst=vllm_outputs, + name_0="hf", + name_1="vllm", + ) + + +@pytest.mark.parametrize("model", MODELS) +@pytest.mark.parametrize("dtype", [target_dtype]) +def test_model_print( + vllm_runner, + model: str, + dtype: str, +) -> None: + with vllm_runner(model, dtype=dtype) as vllm_model: + # This test is for verifying whether the model's extra_repr + # can be printed correctly. + print(vllm_model.model.llm_engine.model_executor.driver_worker. + model_runner.model) diff --git a/vllm/attention/ops/paged_attn.py b/vllm/attention/ops/paged_attn.py index ce7b4d129779c..0f6d2f2d1ab3f 100644 --- a/vllm/attention/ops/paged_attn.py +++ b/vllm/attention/ops/paged_attn.py @@ -31,7 +31,7 @@ class PagedAttention: @staticmethod def get_supported_head_sizes() -> List[int]: - return [64, 80, 96, 112, 128, 192, 256] + return [64, 80, 96, 112, 120, 128, 192, 256] @staticmethod def get_kv_cache_shape( diff --git a/vllm/utils.py b/vllm/utils.py index 90be09fc7b967..1448316e66edb 100644 --- a/vllm/utils.py +++ b/vllm/utils.py @@ -508,6 +508,12 @@ def create_kv_caches_with_random( seed: int = 0, device: Optional[str] = "cuda", ) -> Tuple[List[torch.Tensor], List[torch.Tensor]]: + + if cache_dtype == "fp8" and head_size % 16: + raise ValueError( + f"Does not support key cache of type fp8 with head_size {head_size}" + ) + torch.random.manual_seed(seed) if torch.cuda.is_available(): torch.cuda.manual_seed(seed) From 52f07e3dec2b76045208f5cfea5670b85a719cc6 Mon Sep 17 00:00:00 2001 From: Woosuk Kwon Date: Fri, 26 Jul 2024 20:54:27 -0700 Subject: [PATCH 061/167] [Hardware][TPU] Implement tensor parallelism with Ray (#5871) --- requirements-tpu.txt | 1 + vllm/attention/backends/pallas.py | 4 +- vllm/engine/llm_engine.py | 10 +- vllm/executor/ray_tpu_executor.py | 313 ++++++++++++++++++++++++++++++ vllm/worker/tpu_model_runner.py | 42 ++-- vllm/worker/tpu_worker.py | 16 +- 6 files changed, 365 insertions(+), 21 deletions(-) create mode 100644 vllm/executor/ray_tpu_executor.py diff --git a/requirements-tpu.txt b/requirements-tpu.txt index 22487f5524dd7..c2140fbffec9f 100644 --- a/requirements-tpu.txt +++ b/requirements-tpu.txt @@ -4,4 +4,5 @@ # Dependencies for TPU # Currently, the TPU backend uses a nightly version of PyTorch XLA. # You can install the dependencies in Dockerfile.tpu. +ray triton # To avoid import errors diff --git a/vllm/attention/backends/pallas.py b/vllm/attention/backends/pallas.py index b83a83bb177d4..c53a2f91b89d7 100644 --- a/vllm/attention/backends/pallas.py +++ b/vllm/attention/backends/pallas.py @@ -55,8 +55,8 @@ class PallasMetadata(AttentionMetadata): # Currently, input sequences can only contain all prefills # or all decoding. 
- block_tables: Optional[torch.Tensor] - context_lens: Optional[torch.Tensor] + block_tables: Optional[torch.Tensor] = None + context_lens: Optional[torch.Tensor] = None @property def prefill_metadata(self) -> Optional["PallasMetadata"]: diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py index 48d5305892219..004348d4c49a3 100644 --- a/vllm/engine/llm_engine.py +++ b/vllm/engine/llm_engine.py @@ -394,8 +394,14 @@ def _get_executor_cls(cls, from vllm.executor.neuron_executor import NeuronExecutor executor_class = NeuronExecutor elif engine_config.device_config.device_type == "tpu": - from vllm.executor.tpu_executor import TPUExecutor - executor_class = TPUExecutor + if distributed_executor_backend == "ray": + initialize_ray_cluster(engine_config.parallel_config) + from vllm.executor.ray_tpu_executor import RayTPUExecutor + executor_class = RayTPUExecutor + else: + assert distributed_executor_backend is None + from vllm.executor.tpu_executor import TPUExecutor + executor_class = TPUExecutor elif engine_config.device_config.device_type == "cpu": from vllm.executor.cpu_executor import CPUExecutor executor_class = CPUExecutor diff --git a/vllm/executor/ray_tpu_executor.py b/vllm/executor/ray_tpu_executor.py new file mode 100644 index 0000000000000..7048d47980723 --- /dev/null +++ b/vllm/executor/ray_tpu_executor.py @@ -0,0 +1,313 @@ +import asyncio +import os +from collections import defaultdict +from itertools import islice, repeat +from typing import (TYPE_CHECKING, Any, Awaitable, Dict, List, Optional, Tuple, + Union) + +import vllm.envs as envs +from vllm.executor.executor_base import ExecutorAsyncBase +from vllm.executor.ray_utils import RayWorkerWrapper, ray +from vllm.executor.tpu_executor import TPUExecutor +from vllm.logger import init_logger +from vllm.sequence import ExecuteModelRequest, SamplerOutput +from vllm.utils import (get_distributed_init_method, get_ip, get_open_port, + get_vllm_instance_id, make_async) + +if ray is not None: + from ray.util.scheduling_strategies import PlacementGroupSchedulingStrategy + +if TYPE_CHECKING: + from ray.util.placement_group import PlacementGroup + +logger = init_logger(__name__) + + +class RayTPUExecutor(TPUExecutor): + + def __init__(self, *args, **kwargs): + # This is non-None when the execute model loop is running + # in the parallel workers. It's a coroutine in the AsyncLLMEngine case. + self.parallel_worker_tasks: Optional[Union[Any, Awaitable[Any]]] = None + # Updated by implementations that require additional args to be passed + # to the _run_workers execute_model call + self.extra_execute_model_run_workers_kwargs: Dict[str, Any] = {} + + super().__init__(*args, **kwargs) + + def _init_executor(self) -> None: + assert self.parallel_config.distributed_executor_backend == "ray" + placement_group = self.parallel_config.placement_group + + # Disable Ray usage stats collection. + ray_usage = os.environ.get("RAY_USAGE_STATS_ENABLED", "0") + if ray_usage != "1": + os.environ["RAY_USAGE_STATS_ENABLED"] = "0" + + # Create the parallel TPU workers. + self._init_workers_ray(placement_group) + + def _init_workers_ray(self, placement_group: "PlacementGroup", + **ray_remote_kwargs): + # The driver dummy worker does not actually use any resources. + # It holds the resource for the driver worker. + self.driver_dummy_worker: Optional[RayWorkerWrapper] = None + # The remaining workers are the actual ray actors. + self.workers: List[RayWorkerWrapper] = [] + + # Create the workers. 
+ driver_ip = get_ip() + for bundle_id, bundle in enumerate(placement_group.bundle_specs): + if not bundle.get("TPU", 0): + continue + scheduling_strategy = PlacementGroupSchedulingStrategy( + placement_group=placement_group, + placement_group_capture_child_tasks=True, + placement_group_bundle_index=bundle_id, + ) + + assert self.speculative_config is None + worker_module_name = "vllm.worker.tpu_worker" + worker_class_name = "TPUWorker" + + worker = ray.remote( + num_cpus=0, + resources={"TPU": 1}, + scheduling_strategy=scheduling_strategy, + **ray_remote_kwargs, + )(RayWorkerWrapper).remote( + worker_module_name=worker_module_name, + worker_class_name=worker_class_name, + trust_remote_code=self.model_config.trust_remote_code, + ) + + worker_ip = ray.get(worker.get_node_ip.remote()) + if worker_ip == driver_ip and self.driver_dummy_worker is None: + # If the worker is on the same node as the driver, we use it + # as the resource holder for the driver process. + self.driver_dummy_worker = worker + self.driver_worker = RayWorkerWrapper( + worker_module_name=worker_module_name, + worker_class_name=worker_class_name, + trust_remote_code=self.model_config.trust_remote_code, + ) + else: + # Else, added to the list of workers. + self.workers.append(worker) + + if self.driver_dummy_worker is None: + raise ValueError( + "Ray does not allocate any TPUs on the driver node. Consider " + "adjusting the Ray placement group or running the driver on a " + "TPU node.") + + # Get the set of TPU IDs used on each node. + worker_node_and_gpu_ids = self._run_workers("get_node_and_gpu_ids", + use_dummy_driver=True) + + node_workers = defaultdict(list) + for i, (node_id, _) in enumerate(worker_node_and_gpu_ids): + node_workers[node_id].append(i) + + VLLM_INSTANCE_ID = get_vllm_instance_id() + + # Set environment variables for the driver and workers. + all_args_to_update_environment_variables = [({ + "VLLM_INSTANCE_ID": + VLLM_INSTANCE_ID, + "VLLM_TRACE_FUNCTION": + str(envs.VLLM_TRACE_FUNCTION), + }, ) for _ in worker_node_and_gpu_ids] + self._run_workers("update_environment_variables", + all_args=all_args_to_update_environment_variables) + + if len(node_workers) == 1: + # in single node case, we don't need to get the IP address. + # the loopback address is sufficient + # NOTE: a node may have several IP addresses, one for each + # network interface. `get_ip()` might return any of them, + # while they might not work for communication inside the node + # if the network setup is complicated. Using the loopback address + # solves this issue, as it always works for communication inside + # the node. + driver_ip = "127.0.0.1" + distributed_init_method = get_distributed_init_method( + driver_ip, get_open_port()) + + # Initialize the actual workers inside worker wrapper. + init_worker_all_kwargs = [ + self._get_worker_kwargs( + local_rank=node_workers[node_id].index(rank), + rank=rank, + distributed_init_method=distributed_init_method, + ) for rank, (node_id, _) in enumerate(worker_node_and_gpu_ids) + ] + self._run_workers("init_worker", all_kwargs=init_worker_all_kwargs) + + self._run_workers("init_device") + self._run_workers("load_model", + max_concurrent_workers=self.parallel_config. + max_parallel_loading_workers) + + def _driver_execute_model( + self, + execute_model_req: Optional[ExecuteModelRequest] = None + ) -> List[SamplerOutput]: + """Run execute_model in the driver worker. + + Passing None will cause the driver to stop the model execution + loop running in each of the remote workers. 
+ """ + return self.driver_worker.execute_method("execute_model", + execute_model_req) + + def _run_workers( + self, + method: str, + *args, + async_run_remote_workers_only: bool = False, + all_args: Optional[List[Tuple[Any, ...]]] = None, + all_kwargs: Optional[List[Dict[str, Any]]] = None, + use_dummy_driver: bool = False, + max_concurrent_workers: Optional[int] = None, + use_ray_compiled_dag: bool = False, + **kwargs, + ) -> Any: + """Runs the given method on all workers. Can be used in the following + ways: + + - async_run_remote_workers_only: If True the method will be run only + in the remote workers, not the driver worker. It will also be + run asynchronously and return a list of futures rather than blocking + on the results. + - args/kwargs: All workers share the same args/kwargs + - all_args/all_kwargs: args/kwargs for each worker are specified + individually + """ + + if max_concurrent_workers: + raise NotImplementedError( + "max_concurrent_workers is not supported yet.") + + count = len(self.workers) + all_worker_args = repeat(args, count) if all_args is None \ + else islice(all_args, 1, None) + all_worker_kwargs = repeat(kwargs, count) if all_kwargs is None \ + else islice(all_kwargs, 1, None) + + # Start the ray workers first. + ray_worker_outputs = [ + worker.execute_method.remote(method, *worker_args, **worker_kwargs) + for (worker, worker_args, worker_kwargs + ) in zip(self.workers, all_worker_args, all_worker_kwargs) + ] + + if async_run_remote_workers_only: + # Just return futures + return ray_worker_outputs + + driver_args = args if all_args is None else all_args[0] + driver_kwargs = kwargs if all_kwargs is None else all_kwargs[0] + + # Start the driver worker after all the ray workers. + if not use_dummy_driver: + driver_worker_output = self.driver_worker.execute_method( + method, *driver_args, **driver_kwargs) + else: + assert self.driver_dummy_worker is not None + driver_worker_output = ray.get( + self.driver_dummy_worker.execute_method.remote( + method, *driver_args, **driver_kwargs)) + # Get the results of the ray workers. + if self.workers: + ray_worker_outputs = ray.get(ray_worker_outputs) + + return [driver_worker_output] + ray_worker_outputs + + def _wait_for_tasks_completion(self, parallel_worker_tasks: Any) -> None: + """Wait for futures returned from _run_workers() with + async_run_remote_workers_only to complete.""" + ray.get(parallel_worker_tasks) + + def determine_num_available_blocks(self) -> Tuple[int, int]: + num_blocks = self._run_workers("determine_num_available_blocks", ) + num_tpu_blocks = min(b[0] for b in num_blocks) + num_cpu_blocks = min(b[1] for b in num_blocks) + return num_tpu_blocks, num_cpu_blocks + + def initialize_cache(self, num_gpu_blocks: int, + num_cpu_blocks: int) -> None: + logger.info("# TPU blocks: %d, # CPU blocks: %d", num_gpu_blocks, + num_cpu_blocks) + self.cache_config.num_gpu_blocks = num_gpu_blocks + self.cache_config.num_cpu_blocks = num_cpu_blocks + self._run_workers("initialize_cache", + num_gpu_blocks=num_gpu_blocks, + num_cpu_blocks=num_cpu_blocks) + + def execute_model( + self, + execute_model_req: ExecuteModelRequest, + ) -> List[SamplerOutput]: + if self.parallel_worker_tasks is None: + self.parallel_worker_tasks = self._run_workers( + "start_worker_execution_loop", + async_run_remote_workers_only=True, + **self.extra_execute_model_run_workers_kwargs) + + # Only the driver worker returns the sampling results. 
+ return self._driver_execute_model(execute_model_req) + + def stop_remote_worker_execution_loop(self) -> None: + if self.parallel_worker_tasks is None: + return + + self._driver_execute_model() + parallel_worker_tasks = self.parallel_worker_tasks + self.parallel_worker_tasks = None + # Ensure that workers exit model loop cleanly + # (this will raise otherwise) + self._wait_for_tasks_completion(parallel_worker_tasks) + + +class RayTPUExecutorAsync(RayTPUExecutor, ExecutorAsyncBase): + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + self.driver_exec_method = make_async(self.driver_worker.execute_method) + + async def execute_model_async( + self, + execute_model_req: ExecuteModelRequest) -> List[SamplerOutput]: + if self.parallel_worker_tasks is None: + # Start model execution loop running in the parallel workers + self.parallel_worker_tasks = asyncio.create_task( + self._start_worker_execution_loop()) + + # Only the driver worker returns the sampling results. + return await self._driver_execute_model_async(execute_model_req) + + async def stop_remote_worker_execution_loop_async(self) -> None: + if self.parallel_worker_tasks is None: + return + + await self._driver_execute_model_async() + parallel_worker_tasks = self.parallel_worker_tasks + self.parallel_worker_tasks = None + # Ensure that workers exit model loop cleanly + # (this will raise otherwise) + await parallel_worker_tasks + + async def _driver_execute_model_async( + self, + execute_model_req: Optional[ExecuteModelRequest] = None + ) -> List[SamplerOutput]: + return await self.driver_exec_method("execute_model", + execute_model_req) + + async def _start_worker_execution_loop(self): + coros = [ + worker.execute_method.remote("start_worker_execution_loop") + for worker in self.workers + ] + return await asyncio.gather(*coros) diff --git a/vllm/worker/tpu_model_runner.py b/vllm/worker/tpu_model_runner.py index 8a8b412db6731..e5bb101fc7df4 100644 --- a/vllm/worker/tpu_model_runner.py +++ b/vllm/worker/tpu_model_runner.py @@ -1,6 +1,7 @@ import time from dataclasses import dataclass from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple, Type, Union +from unittest.mock import patch import numpy as np import torch @@ -45,6 +46,7 @@ class ModelInputForTPU(ModelRunnerInputBase): num_samples: int best_of: List[int] seq_groups: List[List[int]] + virtual_engine: int = 0 def as_broadcastable_tensor_dict( self) -> Dict[str, Union[int, torch.Tensor]]: @@ -55,6 +57,9 @@ def as_broadcastable_tensor_dict( "t": self.t, "p": self.p, "num_samples": self.num_samples, + "best_of": self.best_of, + "seq_groups": self.seq_groups, + "virtual_engine": self.virtual_engine, } _add_attn_metadata_broadcastable_dict(tensor_dict, self.attn_metadata) return tensor_dict @@ -113,16 +118,30 @@ def __init__( def load_model(self) -> None: self.device = self.device_config.device - model = get_model( - model_config=self.model_config, - load_config=self.load_config, - device_config=self.device_config, - parallel_config=self.parallel_config, - cache_config=self.cache_config, - scheduler_config=self.scheduler_config, - multimodal_config=self.multimodal_config, - lora_config=None, - ) + # NOTE(woosuk): While the executor assigns the TP ranks to the worker + # process, the ranks can be different from the ranks internally assigned + # by the xm runtime. Therefore, there is a mismatch in the rank + # assignment between the gloo (cpu) runtime and the xm (tpu) runtime. 
+ # This is not a problem in linear layers because all-reduce is + # rank-agnostic. However, it matters for all-gather as the ranks + # determine the order of concatenating the output tensors. + # As a workaround, we use the xm's rank assignment only when loading + # the embedding weights. + xm_tp_rank = xm.get_ordinal() + with patch( + "vllm.model_executor.layers.vocab_parallel_embedding." + "get_tensor_model_parallel_rank", + return_value=xm_tp_rank): + model = get_model( + model_config=self.model_config, + load_config=self.load_config, + device_config=self.device_config, + parallel_config=self.parallel_config, + cache_config=self.cache_config, + scheduler_config=self.scheduler_config, + multimodal_config=self.multimodal_config, + lora_config=None, + ) model = model.eval() xm.wait_device_ops() @@ -463,10 +482,11 @@ def make_model_input_from_broadcasted_tensor_dict( tensor_dict, attn_backend=self.attn_backend) return model_input + @torch.no_grad() def execute_model( self, model_input: ModelInputForTPU, - kv_caches: List[Tuple[torch.Tensor, torch.Tensor]], + kv_caches: Optional[List[Any]], intermediate_tensors: Optional[IntermediateTensors] = None, num_steps: int = 1, ) -> List[SamplerOutput]: diff --git a/vllm/worker/tpu_worker.py b/vllm/worker/tpu_worker.py index 03011e03058d8..c88aba7ae08cd 100644 --- a/vllm/worker/tpu_worker.py +++ b/vllm/worker/tpu_worker.py @@ -70,13 +70,13 @@ def __init__( def init_device(self) -> None: os.environ["PJRT_DEVICE"] = "TPU" - self.device = xm.xla_device() - self.device_config.device = self.device torch.set_grad_enabled(False) torch.set_default_dtype(self.model_config.dtype) - # NOTE(woosuk): This is just a hack to initialize the TP group. - # This cannot perform the actual communication ops. + # NOTE(woosuk): This is just to initialize the TP group and broadcast + # the input objects on CPU. The all-reduce and all-gather ops on TPU + # are invoked by `xm.all_reduce` and `xm.all_gather` which use their + # own context. init_distributed_environment( world_size=self.parallel_config.world_size, rank=self.rank, @@ -88,6 +88,11 @@ def init_device(self) -> None: self.parallel_config.tensor_parallel_size, self.parallel_config.pipeline_parallel_size) + # Device initialization should happen after initializing the distributed + # runtime. + self.device = xm.xla_device() + self.device_config.device = self.device + # Set random seed. set_random_seed(self.model_config.seed) xm.set_rng_state(self.model_config.seed, self.device) @@ -200,8 +205,7 @@ def get_cache_block_size_bytes(self) -> int: @property def do_metadata_broadcast(self) -> bool: - # TODO(woosuk): Support TP. 
- return False + return self.parallel_config.tensor_parallel_size > 1 @property def kv_cache(self) -> Optional[List[List[torch.Tensor]]]: From c53041ae3b8ded4ac4c3fc745be6bc695b9f0c78 Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Sat, 27 Jul 2024 05:47:33 +0100 Subject: [PATCH 062/167] [Doc] Add missing mock import to docs `conf.py` (#6834) --- .readthedocs.yaml | 1 + docs/source/conf.py | 2 ++ 2 files changed, 3 insertions(+) diff --git a/.readthedocs.yaml b/.readthedocs.yaml index 428e199088589..f1959ad2743f3 100644 --- a/.readthedocs.yaml +++ b/.readthedocs.yaml @@ -10,6 +10,7 @@ build: sphinx: configuration: docs/source/conf.py + fail_on_warning: true # If using Sphinx, optionally build your docs in additional formats such as PDF formats: diff --git a/docs/source/conf.py b/docs/source/conf.py index f4cec05663fcd..b867bfd89dc17 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -94,6 +94,7 @@ def setup(app): # Mock out external dependencies here, otherwise the autodoc pages may be blank. autodoc_mock_imports = [ + "aiohttp", "cpuinfo", "torch", "transformers", @@ -141,5 +142,6 @@ def add_line(self, line: str, source: str, *lineno: int) -> None: } autodoc_preserve_defaults = True +autodoc_warningiserror = True navigation_with_keys = False From 593e79e7337f7fd9e92b7554dabdff96769dbf15 Mon Sep 17 00:00:00 2001 From: Travis Johnson Date: Fri, 26 Jul 2024 23:15:20 -0600 Subject: [PATCH 063/167] [Bugfix] torch.set_num_threads() in multiproc_gpu_executor (#6802) [Bugfix] Use torch.set_num_threads() to configure parallelism in multiproc_gpu_executor (#6802) Signed-off-by: Travis Johnson --- vllm/executor/multiproc_gpu_executor.py | 23 +++++++++++++++++++---- 1 file changed, 19 insertions(+), 4 deletions(-) diff --git a/vllm/executor/multiproc_gpu_executor.py b/vllm/executor/multiproc_gpu_executor.py index 19f7a497cdd9f..e1e92958e667c 100644 --- a/vllm/executor/multiproc_gpu_executor.py +++ b/vllm/executor/multiproc_gpu_executor.py @@ -6,6 +6,8 @@ from functools import partial from typing import Any, List, Optional +import torch + from vllm.executor.distributed_gpu_executor import ( # yapf: disable DistributedGPUExecutor, DistributedGPUExecutorAsync) from vllm.executor.gpu_executor import create_worker @@ -45,10 +47,23 @@ def _init_executor(self) -> None: # Disable torch async compiling which won't work with daemonic processes os.environ["TORCHINDUCTOR_COMPILE_THREADS"] = "1" - # Set OMP_NUM_THREADS to 1 if it is not set explicitly, avoids CPU - # contention amongst the shards - if "OMP_NUM_THREADS" not in os.environ: - os.environ["OMP_NUM_THREADS"] = "1" + # Configure thread parallelism if OMP_NUM_THREADS isn't set + # + # Helps to avoid CPU contention. The default of spawning a thread per + # core combined with multiprocessing for each GPU can have a negative + # impact on performance. The contention is amplified when running in a + # container where CPU limits can cause throttling. + default_omp_num_threads = 1 + if "OMP_NUM_THREADS" not in os.environ and ( + current_parallelism := + torch.get_num_threads()) > default_omp_num_threads: + logger.warning( + "Reducing Torch parallelism from %d threads to %d to avoid " + "unnecessary CPU contention. 
Set OMP_NUM_THREADS in the " + "external environment to tune this value as needed.", + current_parallelism, default_omp_num_threads) + os.environ["OMP_NUM_THREADS"] = str(default_omp_num_threads) + torch.set_num_threads(default_omp_num_threads) # workaround for https://github.com/vllm-project/vllm/issues/6103 if world_size > 1: From aa46953a20685377fc51dcde172114ddd7ffdc68 Mon Sep 17 00:00:00 2001 From: Roger Wang <136131678+ywang96@users.noreply.github.com> Date: Fri, 26 Jul 2024 22:44:13 -0700 Subject: [PATCH 064/167] [Misc][VLM][Doc] Consolidate offline examples for vision language models (#6858) Co-authored-by: Cyrus Leung --- examples/fuyu_example.py | 31 ---- examples/llava_example.py | 25 --- examples/llava_next_example.py | 36 ---- examples/minicpmv_example.py | 55 ------ examples/offline_inference_vision_language.py | 174 ++++++++++++++++++ examples/paligemma_example.py | 25 --- examples/phi3v_example.py | 40 ---- 7 files changed, 174 insertions(+), 212 deletions(-) delete mode 100644 examples/fuyu_example.py delete mode 100644 examples/llava_example.py delete mode 100644 examples/llava_next_example.py delete mode 100644 examples/minicpmv_example.py create mode 100644 examples/offline_inference_vision_language.py delete mode 100644 examples/paligemma_example.py delete mode 100644 examples/phi3v_example.py diff --git a/examples/fuyu_example.py b/examples/fuyu_example.py deleted file mode 100644 index c92b8fb4bc286..0000000000000 --- a/examples/fuyu_example.py +++ /dev/null @@ -1,31 +0,0 @@ -import requests -from PIL import Image - -from vllm import LLM, SamplingParams - - -def run_fuyu(): - llm = LLM(model="adept/fuyu-8b", max_model_len=4096) - - # single-image prompt - prompt = "What is the highest life expectancy at of male?\n" - url = "https://huggingface.co/adept/fuyu-8b/resolve/main/chart.png" - image = Image.open(requests.get(url, stream=True).raw) - sampling_params = SamplingParams(temperature=0, max_tokens=64) - - outputs = llm.generate( - { - "prompt": prompt, - "multi_modal_data": { - "image": image - }, - }, - sampling_params=sampling_params) - - for o in outputs: - generated_text = o.outputs[0].text - print(generated_text) - - -if __name__ == "__main__": - run_fuyu() diff --git a/examples/llava_example.py b/examples/llava_example.py deleted file mode 100644 index 4c9eabd261e5c..0000000000000 --- a/examples/llava_example.py +++ /dev/null @@ -1,25 +0,0 @@ -from vllm import LLM -from vllm.assets.image import ImageAsset - - -def run_llava(): - llm = LLM(model="llava-hf/llava-1.5-7b-hf") - - prompt = "USER: \nWhat is the content of this image?\nASSISTANT:" - - image = ImageAsset("stop_sign").pil_image - - outputs = llm.generate({ - "prompt": prompt, - "multi_modal_data": { - "image": image - }, - }) - - for o in outputs: - generated_text = o.outputs[0].text - print(generated_text) - - -if __name__ == "__main__": - run_llava() diff --git a/examples/llava_next_example.py b/examples/llava_next_example.py deleted file mode 100644 index fd53a6def1a13..0000000000000 --- a/examples/llava_next_example.py +++ /dev/null @@ -1,36 +0,0 @@ -from io import BytesIO - -import requests -from PIL import Image - -from vllm import LLM, SamplingParams - - -def run_llava_next(): - llm = LLM(model="llava-hf/llava-v1.6-mistral-7b-hf", max_model_len=4096) - - prompt = "[INST] \nWhat is shown in this image? 
[/INST]" - url = "https://h2o-release.s3.amazonaws.com/h2ogpt/bigben.jpg" - image = Image.open(BytesIO(requests.get(url).content)) - sampling_params = SamplingParams(temperature=0.8, - top_p=0.95, - max_tokens=100) - - outputs = llm.generate( - { - "prompt": prompt, - "multi_modal_data": { - "image": image - } - }, - sampling_params=sampling_params) - - generated_text = "" - for o in outputs: - generated_text += o.outputs[0].text - - print(f"LLM output:{generated_text}") - - -if __name__ == "__main__": - run_llava_next() diff --git a/examples/minicpmv_example.py b/examples/minicpmv_example.py deleted file mode 100644 index bf20a7ea04ad4..0000000000000 --- a/examples/minicpmv_example.py +++ /dev/null @@ -1,55 +0,0 @@ -from transformers import AutoTokenizer - -from vllm import LLM, SamplingParams -from vllm.assets.image import ImageAsset - -# 2.0 -# The official repo doesn't work yet, so we need to use a fork for now -# For more details, please see: See: https://github.com/vllm-project/vllm/pull/4087#issuecomment-2250397630 -# MODEL_NAME = "HwwwH/MiniCPM-V-2" -# 2.5 -MODEL_NAME = "openbmb/MiniCPM-Llama3-V-2_5" - -image = ImageAsset("stop_sign").pil_image.convert("RGB") - -tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, trust_remote_code=True) -llm = LLM(model=MODEL_NAME, - gpu_memory_utilization=1, - trust_remote_code=True, - max_model_len=4096) - -messages = [{ - 'role': - 'user', - 'content': - '(./)\n' + "What's the content of the image?" -}] -prompt = tokenizer.apply_chat_template(messages, - tokenize=False, - add_generation_prompt=True) -# 2.0 -# stop_token_ids = [tokenizer.eos_id] -# 2.5 -stop_token_ids = [tokenizer.eos_id, tokenizer.eot_id] - -sampling_params = SamplingParams( - stop_token_ids=stop_token_ids, - # temperature=0.7, - # top_p=0.8, - # top_k=100, - # seed=3472, - max_tokens=1024, - # min_tokens=150, - temperature=0, - use_beam_search=True, - # length_penalty=1.2, - best_of=3) - -outputs = llm.generate({ - "prompt": prompt, - "multi_modal_data": { - "image": image - } -}, - sampling_params=sampling_params) -print(outputs[0].outputs[0].text) diff --git a/examples/offline_inference_vision_language.py b/examples/offline_inference_vision_language.py new file mode 100644 index 0000000000000..4a09f77ca59f8 --- /dev/null +++ b/examples/offline_inference_vision_language.py @@ -0,0 +1,174 @@ +""" +This example shows how to use vLLM for running offline inference +with the correct prompt format on vision language models. + +For most models, the prompt format should follow corresponding examples +on HuggingFace model repository. +""" +from transformers import AutoTokenizer + +from vllm import LLM, SamplingParams +from vllm.assets.image import ImageAsset +from vllm.utils import FlexibleArgumentParser + +# Input image and question +image = ImageAsset("cherry_blossom").pil_image.convert("RGB") +question = "What is the content of this image?" 
+ + +# LLaVA-1.5 +def run_llava(question): + + prompt = f"USER: \n{question}\nASSISTANT:" + + llm = LLM(model="llava-hf/llava-1.5-7b-hf") + + return llm, prompt + + +# LLaVA-1.6/LLaVA-NeXT +def run_llava_next(question): + + prompt = f"[INST] \n{question} [/INST]" + llm = LLM(model="llava-hf/llava-v1.6-mistral-7b-hf") + + return llm, prompt + + +# Fuyu +def run_fuyu(question): + + prompt = f"{question}\n" + llm = LLM(model="adept/fuyu-8b") + + return llm, prompt + + +# Phi-3-Vision +def run_phi3v(question): + + prompt = f"<|user|>\n<|image_1|>\n{question}<|end|>\n<|assistant|>\n" # noqa: E501 + # Note: The default setting of max_num_seqs (256) and + # max_model_len (128k) for this model may cause OOM. + # You may lower either to run this example on lower-end GPUs. + + # In this example, we override max_num_seqs to 5 while + # keeping the original context length of 128k. + llm = LLM( + model="microsoft/Phi-3-vision-128k-instruct", + trust_remote_code=True, + max_num_seqs=5, + ) + return llm, prompt + + +# PaliGemma +def run_paligemma(question): + + prompt = question + llm = LLM(model="google/paligemma-3b-mix-224") + + return llm, prompt + + +# Chameleon +def run_chameleon(question): + + prompt = f"{question}" + llm = LLM(model="facebook/chameleon-7b") + return llm, prompt + + +# MiniCPM-V +def run_minicpmv(question): + + # 2.0 + # The official repo doesn't work yet, so we need to use a fork for now + # For more details, please see: See: https://github.com/vllm-project/vllm/pull/4087#issuecomment-2250397630 # noqa + # model_name = "HwwwH/MiniCPM-V-2" + + # 2.5 + model_name = "openbmb/MiniCPM-Llama3-V-2_5" + tokenizer = AutoTokenizer.from_pretrained(model_name, + trust_remote_code=True) + llm = LLM( + model=model_name, + trust_remote_code=True, + ) + + messages = [{ + 'role': 'user', + 'content': f'(./)\n{question}' + }] + prompt = tokenizer.apply_chat_template(messages, + tokenize=False, + add_generation_prompt=True) + return llm, prompt + + +model_example_map = { + "llava": run_llava, + "llava-next": run_llava_next, + "fuyu": run_fuyu, + "phi3_v": run_phi3v, + "paligemma": run_paligemma, + "chameleon": run_chameleon, + "minicpmv": run_minicpmv, +} + + +def main(args): + model = args.model_type + if model not in model_example_map: + raise ValueError(f"Model type {model} is not supported.") + + llm, prompt = model_example_map[model](question) + + # We set temperature to 0.2 so that outputs can be different + # even when all prompts are identical when running batch inference. 
+ sampling_params = SamplingParams(temperature=0.2, max_tokens=64) + + assert args.num_prompts > 0 + if args.num_prompts == 1: + # Single inference + inputs = { + "prompt": prompt, + "multi_modal_data": { + "image": image + }, + } + + else: + # Batch inference + inputs = [{ + "prompt": prompt, + "multi_modal_data": { + "image": image + }, + } for _ in range(args.num_prompts)] + + outputs = llm.generate(inputs, sampling_params=sampling_params) + + for o in outputs: + generated_text = o.outputs[0].text + print(generated_text) + + +if __name__ == "__main__": + parser = FlexibleArgumentParser( + description='Demo on using vLLM for offline inference with ' + 'vision language models') + args = parser.parse_args() + parser.add_argument('--model-type', + '-m', + type=str, + default="llava", + choices=model_example_map.keys(), + help='Huggingface "model_type".') + parser.add_argument('--num-prompts', + type=int, + default=1, + help='Number of prompts to run.') + + args = parser.parse_args() + main(args) diff --git a/examples/paligemma_example.py b/examples/paligemma_example.py deleted file mode 100644 index 92a3cb3ac4129..0000000000000 --- a/examples/paligemma_example.py +++ /dev/null @@ -1,25 +0,0 @@ -from vllm import LLM -from vllm.assets.image import ImageAsset - - -def run_paligemma(): - llm = LLM(model="google/paligemma-3b-mix-224") - - prompt = "caption es" - - image = ImageAsset("stop_sign").pil_image - - outputs = llm.generate({ - "prompt": prompt, - "multi_modal_data": { - "image": image - }, - }) - - for o in outputs: - generated_text = o.outputs[0].text - print(generated_text) - - -if __name__ == "__main__": - run_paligemma() diff --git a/examples/phi3v_example.py b/examples/phi3v_example.py deleted file mode 100644 index ae8c38d84e8fd..0000000000000 --- a/examples/phi3v_example.py +++ /dev/null @@ -1,40 +0,0 @@ -from vllm import LLM, SamplingParams -from vllm.assets.image import ImageAsset - - -def run_phi3v(): - model_path = "microsoft/Phi-3-vision-128k-instruct" - - # Note: The default setting of max_num_seqs (256) and - # max_model_len (128k) for this model may cause OOM. - # You may lower either to run this example on lower-end GPUs. - - # In this example, we override max_num_seqs to 5 while - # keeping the original context length of 128k. 
- llm = LLM( - model=model_path, - trust_remote_code=True, - max_num_seqs=5, - ) - - image = ImageAsset("cherry_blossom").pil_image - - # single-image prompt - prompt = "<|user|>\n<|image_1|>\nWhat is the season?<|end|>\n<|assistant|>\n" # noqa: E501 - sampling_params = SamplingParams(temperature=0, max_tokens=64) - - outputs = llm.generate( - { - "prompt": prompt, - "multi_modal_data": { - "image": image - }, - }, - sampling_params=sampling_params) - for o in outputs: - generated_text = o.outputs[0].text - print(generated_text) - - -if __name__ == "__main__": - run_phi3v() From 925de97e05dd4709fcd80691cb37da5e582c22e8 Mon Sep 17 00:00:00 2001 From: Roger Wang <136131678+ywang96@users.noreply.github.com> Date: Fri, 26 Jul 2024 23:24:08 -0700 Subject: [PATCH 065/167] [Bugfix] Fix VLM example typo (#6859) --- examples/offline_inference_vision_language.py | 1 - 1 file changed, 1 deletion(-) diff --git a/examples/offline_inference_vision_language.py b/examples/offline_inference_vision_language.py index 4a09f77ca59f8..8a63653343db6 100644 --- a/examples/offline_inference_vision_language.py +++ b/examples/offline_inference_vision_language.py @@ -158,7 +158,6 @@ def main(args): parser = FlexibleArgumentParser( description='Demo on using vLLM for offline inference with ' 'vision language models') - args = parser.parse_args() parser.add_argument('--model-type', '-m', type=str, From a57d75821c6177da75fdebf171d528eef5301961 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Wang=20Ran=20=28=E6=B1=AA=E7=84=B6=29?= Date: Sat, 27 Jul 2024 17:07:02 +0800 Subject: [PATCH 066/167] [bugfix] make args.stream work (#6831) --- examples/api_client.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/examples/api_client.py b/examples/api_client.py index 27a2a08b7b0c3..49a085febdc57 100644 --- a/examples/api_client.py +++ b/examples/api_client.py @@ -31,7 +31,10 @@ def post_http_request(prompt: str, "max_tokens": 16, "stream": stream, } - response = requests.post(api_url, headers=headers, json=pload, stream=True) + response = requests.post(api_url, + headers=headers, + json=pload, + stream=stream) return response From ecb33a28cb6c10ebf3b1aa139f72e759cacb8c15 Mon Sep 17 00:00:00 2001 From: Roger Wang <136131678+ywang96@users.noreply.github.com> Date: Sat, 27 Jul 2024 02:54:14 -0700 Subject: [PATCH 067/167] [CI/Build][Doc] Update CI and Doc for VLM example changes (#6860) --- .buildkite/test-pipeline.yaml | 3 +-- docs/source/models/vlm.rst | 2 +- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index 5b4a786305e1f..be8807df0b098 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -140,14 +140,13 @@ steps: working_dir: "/vllm-workspace/examples" mirror_hardwares: [amd] commands: - # install aws cli for llava_example.py # install tensorizer for tensorize_vllm_model.py - pip install awscli tensorizer - python3 offline_inference.py - python3 cpu_offload.py - python3 offline_inference_with_prefix.py - python3 llm_engine_example.py - - python3 llava_example.py + - python3 offline_inference_vision_language.py - python3 tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors - label: Inputs Test diff --git a/docs/source/models/vlm.rst b/docs/source/models/vlm.rst index ef4ce0d44a162..a385605c9f8f6 100644 --- 
a/docs/source/models/vlm.rst +++ b/docs/source/models/vlm.rst @@ -73,7 +73,7 @@ To pass an image to the model, note the following in :class:`vllm.inputs.PromptI generated_text = o.outputs[0].text print(generated_text) -A code example can be found in `examples/llava_example.py `_. +A code example can be found in `examples/offline_inference_vision_language.py `_. Online OpenAI Vision API Compatible Inference From 1ad86acf1789650e2ff27586e36a8159d52755dd Mon Sep 17 00:00:00 2001 From: Cyrus Leung Date: Sat, 27 Jul 2024 19:53:07 +0800 Subject: [PATCH 068/167] [Model] Initial support for BLIP-2 (#5920) Co-authored-by: ywang96 --- docs/source/models/supported_models.rst | 8 + examples/offline_inference_vision_language.py | 11 + examples/template_blip2.jinja | 11 + tests/models/test_blip2.py | 102 +++ tests/models/test_fuyu.py | 8 +- tests/models/test_minicpmv.py | 8 +- tests/models/test_phi3v.py | 8 +- vllm/model_executor/models/__init__.py | 6 +- vllm/model_executor/models/blip.py | 269 +++++++ vllm/model_executor/models/blip2.py | 669 ++++++++++++++++++ vllm/model_executor/models/opt.py | 17 +- vllm/multimodal/base.py | 11 +- 12 files changed, 1107 insertions(+), 21 deletions(-) create mode 100644 examples/template_blip2.jinja create mode 100644 tests/models/test_blip2.py create mode 100644 vllm/model_executor/models/blip.py create mode 100644 vllm/model_executor/models/blip2.py diff --git a/docs/source/models/supported_models.rst b/docs/source/models/supported_models.rst index 483f552bba238..83c1b9c8bce86 100644 --- a/docs/source/models/supported_models.rst +++ b/docs/source/models/supported_models.rst @@ -7,6 +7,8 @@ vLLM supports a variety of generative Transformer models in `HuggingFace Transfo The following is the list of model architectures that are currently supported by vLLM. Alongside each architecture, we include some popular models that use it. +---- + Decoder-only Language Models ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ .. list-table:: @@ -186,6 +188,10 @@ Vision Language Models - Models - Example HuggingFace Models - :ref:`LoRA ` + * - :code:`Blip2ForConditionalGeneration` + - BLIP-2 + - :code:`Salesforce/blip2-opt-2.7b`, :code:`Salesforce/blip2-opt-6.7b`, etc. + - * - :code:`ChameleonForConditionalGeneration` - Chameleon - :code:`facebook/chameleon-7b` etc. @@ -215,6 +221,8 @@ Vision Language Models - :code:`openbmb/MiniCPM-V-2`, :code:`openbmb/MiniCPM-Llama3-V-2_5`, etc. - +---- + If your model uses one of the above model architectures, you can seamlessly run your model with vLLM. Otherwise, please refer to :ref:`Adding a New Model ` and :ref:`Enabling Multimodal Inputs ` for instructions on how to implement support for your model. diff --git a/examples/offline_inference_vision_language.py b/examples/offline_inference_vision_language.py index 8a63653343db6..04ba1a96314c9 100644 --- a/examples/offline_inference_vision_language.py +++ b/examples/offline_inference_vision_language.py @@ -106,6 +106,16 @@ def run_minicpmv(question): return llm, prompt +# BLIP-2 +def run_blip2(question): + + # BLIP-2 prompt format is inaccurate on HuggingFace model repository. 
+ # See https://huggingface.co/Salesforce/blip2-opt-2.7b/discussions/15#64ff02f3f8cf9e4f5b038262 #noqa + prompt = f"Question: {question} Answer:" + llm = LLM(model="Salesforce/blip2-opt-2.7b") + return llm, prompt + + model_example_map = { "llava": run_llava, "llava-next": run_llava_next, @@ -114,6 +124,7 @@ def run_minicpmv(question): "paligemma": run_paligemma, "chameleon": run_chameleon, "minicpmv": run_minicpmv, + "blip-2": run_blip2, } diff --git a/examples/template_blip2.jinja b/examples/template_blip2.jinja new file mode 100644 index 0000000000000..fd41a7f7fa666 --- /dev/null +++ b/examples/template_blip2.jinja @@ -0,0 +1,11 @@ +{%- for message in messages -%} + {%- if message['role'] == 'user' -%} + {{- 'Question: ' + message['content'] + ' ' -}} + {%- elif message['role'] == 'assistant' -%} + {{- 'Answer: ' + message['content'] + ' ' -}} + {%- endif -%} +{%- endfor -%} + +{%- if add_generation_prompt -%} + {{- 'Answer:' -}} +{% endif %} diff --git a/tests/models/test_blip2.py b/tests/models/test_blip2.py new file mode 100644 index 0000000000000..26afd57ae6106 --- /dev/null +++ b/tests/models/test_blip2.py @@ -0,0 +1,102 @@ +from typing import List, Optional, Tuple + +import pytest +from transformers import AutoTokenizer + +from vllm.multimodal.utils import rescale_image_size +from vllm.sequence import SampleLogprobs + +from ..conftest import IMAGE_ASSETS +from .utils import check_logprobs_close + +pytestmark = pytest.mark.vlm + +HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts({ + "stop_sign": + "Question: What's the content of the image? Answer:", + "cherry_blossom": + "Question: What is the season? Answer:", +}) + + +def vllm_to_hf_output(vllm_output: Tuple[List[int], str, + Optional[SampleLogprobs]], + model: str): + """Sanitize vllm output to be comparable with hf output.""" + _, output_str, out_logprobs = vllm_output + + hf_output_str = output_str + "\n" + + tokenizer = AutoTokenizer.from_pretrained(model) + hf_output_ids = tokenizer.encode(hf_output_str) + assert hf_output_ids[0] == tokenizer.bos_token_id + hf_output_ids = hf_output_ids[1:] + + return hf_output_ids, hf_output_str, out_logprobs + + +@pytest.mark.parametrize("model", ["Salesforce/blip2-opt-2.7b"]) +@pytest.mark.parametrize( + "size_factors", + [ + # No image + [], + # Single-scale + [1.0], + # Single-scale, batched + [1.0, 1.0, 1.0], + # Multi-scale + [0.25, 0.5, 1.0], + ], +) +@pytest.mark.parametrize("dtype", ["half"]) +@pytest.mark.parametrize("max_tokens", [128]) +@pytest.mark.parametrize("num_logprobs", [5]) +def test_models(hf_runner, vllm_runner, image_assets, model, size_factors, + dtype: str, max_tokens: int, num_logprobs: int) -> None: + """Inference result should be the same between hf and vllm. + + All the image fixtures for the test is under tests/images. + For huggingface runner, we provide the PIL images as input. + For vllm runner, we provide MultiModalData objects and corresponding + vision language config as input. + Note, the text input is also adjusted to abide by vllm contract. + The text output is sanitized to be able to compare with hf. 
+ """ + images = [asset.pil_image for asset in image_assets] + + inputs_per_image = [( + [prompt for _ in size_factors], + [rescale_image_size(image, factor) for factor in size_factors], + ) for image, prompt in zip(images, HF_IMAGE_PROMPTS)] + + # max_model_len should be greater than image_feature_size + with vllm_runner(model, dtype=dtype, enforce_eager=True) as vllm_model: + vllm_outputs_per_image = [ + vllm_model.generate_greedy_logprobs(prompts, + max_tokens, + num_logprobs=num_logprobs, + images=images) + for prompts, images in inputs_per_image + ] + + with hf_runner(model, dtype=dtype, is_vision_model=True) as hf_model: + hf_outputs_per_image = [ + hf_model.generate_greedy_logprobs_limit(prompts, + max_tokens, + num_logprobs=num_logprobs, + images=images) + for prompts, images in inputs_per_image + ] + + for hf_outputs, vllm_outputs in zip(hf_outputs_per_image, + vllm_outputs_per_image): + check_logprobs_close( + outputs_0_lst=hf_outputs, + outputs_1_lst=[ + vllm_to_hf_output(vllm_output, model) + for vllm_output in vllm_outputs + ], + name_0="hf", + name_1="vllm", + ) diff --git a/tests/models/test_fuyu.py b/tests/models/test_fuyu.py index 25f63a3d64d0e..7d0f3be5ea008 100644 --- a/tests/models/test_fuyu.py +++ b/tests/models/test_fuyu.py @@ -77,8 +77,8 @@ def run_test( vllm_model.generate_greedy_logprobs(prompts, max_tokens, num_logprobs=num_logprobs, - images=vllm_images) - for prompts, vllm_images in inputs_per_image + images=images) + for prompts, images in inputs_per_image ] with hf_runner(model, dtype=dtype) as hf_model: @@ -89,9 +89,9 @@ def run_test( hf_model.generate_greedy_logprobs_limit(prompts, max_tokens, num_logprobs=num_logprobs, - images=hf_images, + images=images, eos_token_id=eos_token_id) - for prompts, hf_images in inputs_per_image + for prompts, images in inputs_per_image ] for hf_outputs, vllm_outputs in zip(hf_outputs_per_image, diff --git a/tests/models/test_minicpmv.py b/tests/models/test_minicpmv.py index 9124fa7a6238c..c57f0f8c08548 100644 --- a/tests/models/test_minicpmv.py +++ b/tests/models/test_minicpmv.py @@ -88,9 +88,9 @@ def run_test( vllm_model.generate_greedy_logprobs(prompts, max_tokens, num_logprobs=num_logprobs, - images=vllm_images, + images=images, stop_token_ids=stop_token_ids) - for prompts, vllm_images in inputs_per_image + for prompts, images in inputs_per_image ] with hf_runner(model, dtype=dtype) as hf_model, torch.no_grad(): @@ -114,9 +114,9 @@ def to(self, device: torch.types.Device): hf_model.generate_greedy_logprobs_limit(prompts, max_tokens, num_logprobs=num_logprobs, - images=hf_images, + images=images, tokenizer=tokenizer) - for prompts, hf_images in inputs_per_image + for prompts, images in inputs_per_image ] for hf_outputs, vllm_outputs in zip(hf_outputs_per_image, diff --git a/tests/models/test_phi3v.py b/tests/models/test_phi3v.py index 9da25ab8d78fe..35ffe4ef50a85 100644 --- a/tests/models/test_phi3v.py +++ b/tests/models/test_phi3v.py @@ -101,8 +101,8 @@ def run_test( vllm_model.generate_greedy_logprobs(prompts, max_tokens, num_logprobs=num_logprobs, - images=vllm_images) - for prompts, vllm_images in inputs_per_image + images=images) + for prompts, images in inputs_per_image ] # use eager mode for hf runner, since phi3_v didn't work with flash_attn @@ -114,9 +114,9 @@ def run_test( hf_model.generate_greedy_logprobs_limit(prompts, max_tokens, num_logprobs=num_logprobs, - images=hf_images, + images=images, eos_token_id=eos_token_id) - for prompts, hf_images in inputs_per_image + for prompts, images in inputs_per_image ] for 
hf_outputs, vllm_outputs in zip(hf_outputs_per_image, diff --git a/vllm/model_executor/models/__init__.py b/vllm/model_executor/models/__init__.py index ead64c0e92553..fe04c6db5fbc2 100644 --- a/vllm/model_executor/models/__init__.py +++ b/vllm/model_executor/models/__init__.py @@ -16,6 +16,8 @@ "BaiChuanForCausalLM": ("baichuan", "BaiChuanForCausalLM"), # baichuan-7b "BaichuanForCausalLM": ("baichuan", "BaichuanForCausalLM"), # baichuan-13b "BloomForCausalLM": ("bloom", "BloomForCausalLM"), + "Blip2ForConditionalGeneration": + ("blip2", "Blip2ForConditionalGeneration"), "ChameleonForConditionalGeneration": ("chameleon", "ChameleonForConditionalGeneration"), "ChatGLMModel": ("chatglm", "ChatGLMForCausalLM"), @@ -56,8 +58,8 @@ "OPTForCausalLM": ("opt", "OPTForCausalLM"), "OrionForCausalLM": ("orion", "OrionForCausalLM"), "PersimmonForCausalLM": ("persimmon", "PersimmonForCausalLM"), - "PaliGemmaForConditionalGeneration": - ("paligemma", "PaliGemmaForConditionalGeneration"), + "PaliGemmaForConditionalGeneration": ("paligemma", + "PaliGemmaForConditionalGeneration"), "PhiForCausalLM": ("phi", "PhiForCausalLM"), "Phi3ForCausalLM": ("llama", "LlamaForCausalLM"), "Phi3VForCausalLM": ("phi3v", "Phi3VForCausalLM"), diff --git a/vllm/model_executor/models/blip.py b/vllm/model_executor/models/blip.py new file mode 100644 index 0000000000000..0b124d5e8a85a --- /dev/null +++ b/vllm/model_executor/models/blip.py @@ -0,0 +1,269 @@ +"""Minimal implementation of BlipVisionModel intended to be only used +within a vision language model.""" +from typing import Optional, Union + +import torch +import torch.nn as nn +from PIL import Image +from transformers import Blip2VisionConfig, BlipVisionConfig +from transformers.models.blip.modeling_blip import BlipAttention + +from vllm.config import ModelConfig +from vllm.inputs import LLMInputs +from vllm.model_executor.layers.activation import get_act_fn +from vllm.model_executor.layers.linear import (ColumnParallelLinear, + RowParallelLinear) +from vllm.model_executor.layers.quantization import QuantizationConfig +from vllm.multimodal.image import (cached_get_tokenizer, + repeat_and_pad_image_tokens) +from vllm.sequence import SequenceData + + +def get_blip_patch_grid_length(*, image_size: int, patch_size: int) -> int: + assert image_size % patch_size == 0 + return image_size // patch_size + + +def get_blip_num_patches(*, image_size: int, patch_size: int) -> int: + grid_length = get_blip_patch_grid_length(image_size=image_size, + patch_size=patch_size) + return grid_length * grid_length + + +def get_blip_image_feature_size( + hf_config: Union[BlipVisionConfig, Blip2VisionConfig], ) -> int: + return get_blip_num_patches(image_size=hf_config.image_size, + patch_size=hf_config.patch_size) + + +def get_max_blip_image_tokens( + hf_config: Union[BlipVisionConfig, Blip2VisionConfig], ) -> int: + return get_blip_image_feature_size(hf_config) + + +def dummy_seq_data_for_blip( + hf_config: Union[BlipVisionConfig, Blip2VisionConfig], + seq_len: int, + *, + image_token_id: int, + image_feature_size_override: Optional[int] = None, +): + if image_feature_size_override is None: + image_feature_size = get_blip_image_feature_size(hf_config) + else: + image_feature_size = image_feature_size_override + + token_ids = [image_token_id] * image_feature_size + token_ids += [0] * (seq_len - image_feature_size) + return SequenceData(token_ids) + + +def dummy_image_for_blip( + hf_config: Union[BlipVisionConfig, Blip2VisionConfig], + *, + image_width_override: Optional[int] = None, + 
image_height_override: Optional[int] = None, +): + width = height = hf_config.image_size + if image_width_override is not None: + width = image_width_override + if image_height_override is not None: + height = image_height_override + + image = Image.new("RGB", (width, height), color=0) + return {"image": image} + + +def input_processor_for_blip( + model_config: ModelConfig, + hf_config: Union[BlipVisionConfig, Blip2VisionConfig], + llm_inputs: LLMInputs, + *, + image_token_id: int, + image_feature_size_override: Optional[int] = None, +): + multi_modal_data = llm_inputs.get("multi_modal_data") + if multi_modal_data is None or "image" not in multi_modal_data: + return llm_inputs + + tokenizer = cached_get_tokenizer(model_config.tokenizer) + + if image_feature_size_override is None: + image_feature_size = get_blip_image_feature_size(hf_config) + else: + image_feature_size = image_feature_size_override + + new_prompt, new_token_ids = repeat_and_pad_image_tokens( + tokenizer, + llm_inputs.get("prompt"), + llm_inputs["prompt_token_ids"], + image_token_id=image_token_id, + repeat_count=image_feature_size, + ) + + # NOTE: Create a defensive copy of the original inputs + return LLMInputs(prompt_token_ids=new_token_ids, + prompt=new_prompt, + multi_modal_data=multi_modal_data) + + +# Adapted from https://github.com/huggingface/transformers/blob/v4.39.0/src/transformers/models/blip/modeling_blip.py#L164 # noqa +class BlipVisionEmbeddings(nn.Module): + + def __init__(self, config: BlipVisionConfig): + super().__init__() + + self.config = config + self.embed_dim = config.hidden_size + self.image_size = config.image_size + self.patch_size = config.patch_size + + self.class_embedding = nn.Parameter(torch.randn(1, 1, self.embed_dim)) + + self.patch_embedding = nn.Conv2d( + in_channels=3, + out_channels=self.embed_dim, + kernel_size=self.patch_size, + stride=self.patch_size, + ) + + self.num_patches = get_blip_num_patches(image_size=self.image_size, + patch_size=self.patch_size) + self.num_positions = self.num_patches + 1 + + self.position_embedding = nn.Parameter( + torch.randn(1, self.num_positions, self.embed_dim)) + + def forward(self, pixel_values: torch.Tensor) -> torch.Tensor: + batch_size = pixel_values.shape[0] + target_dtype = self.patch_embedding.weight.dtype + patch_embeds = self.patch_embedding(pixel_values.to( + dtype=target_dtype)) # shape = [*, width, grid, grid] + patch_embeds = patch_embeds.flatten(2).transpose(1, 2) + + class_embeds = self.class_embedding.expand(batch_size, 1, -1) + embeddings = torch.cat([class_embeds, patch_embeds], dim=1) + + position_embeds = self.position_embedding.to(target_dtype) + embeddings = embeddings + position_embeds[:, :embeddings.size(1), :] + + return embeddings + + +class BlipMLP(nn.Module): + + def __init__(self, + config: BlipVisionConfig, + quant_config: Optional[QuantizationConfig] = None): + super().__init__() + + self.config = config + + self.activation_fn = get_act_fn(config.hidden_act) + self.fc1 = ColumnParallelLinear(config.hidden_size, + config.intermediate_size, + bias=True, + quant_config=quant_config) + self.fc2 = RowParallelLinear(config.intermediate_size, + config.hidden_size, + bias=True, + quant_config=quant_config) + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + hidden_states, _ = self.fc1(hidden_states) + hidden_states = self.activation_fn(hidden_states) + hidden_states, _ = self.fc2(hidden_states) + + return hidden_states + + +class BlipEncoderLayer(nn.Module): + + def __init__(self, + config: BlipVisionConfig, 
+ quant_config: Optional[QuantizationConfig] = None): + super().__init__() + + self.self_attn = BlipAttention(config) + self.layer_norm1 = nn.LayerNorm(config.hidden_size, + eps=config.layer_norm_eps) + self.mlp = BlipMLP(config, quant_config=quant_config) + self.layer_norm2 = nn.LayerNorm(config.hidden_size, + eps=config.layer_norm_eps) + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + residual = hidden_states + + hidden_states = self.layer_norm1(hidden_states) + hidden_states, _ = self.self_attn(hidden_states=hidden_states) + hidden_states = residual + hidden_states + + residual = hidden_states + hidden_states = self.layer_norm2(hidden_states) + hidden_states = self.mlp(hidden_states) + hidden_states = residual + hidden_states + + return hidden_states + + +class BlipEncoder(nn.Module): + """ + Transformer encoder consisting of `config.num_hidden_layers` self + attention layers. Each layer is a [`BlipEncoderLayer`]. + + Args: + config: BlipConfig + """ + + def __init__(self, + config: BlipVisionConfig, + quant_config: Optional[QuantizationConfig] = None, + num_hidden_layers_override: Optional[int] = None): + super().__init__() + + self.config = config + + if num_hidden_layers_override is None: + num_hidden_layers = config.num_hidden_layers + else: + num_hidden_layers = num_hidden_layers_override + + self.layers = nn.ModuleList([ + BlipEncoderLayer(config=config, quant_config=quant_config) + for _ in range(num_hidden_layers) + ]) + + def forward(self, inputs_embeds: torch.Tensor): + hidden_states = inputs_embeds + for encoder_layer in self.layers: + hidden_states = encoder_layer(hidden_states) + + return hidden_states + + +class BlipVisionModel(nn.Module): + config_class = BlipVisionConfig + main_input_name = "pixel_values" + + def __init__(self, + config: BlipVisionConfig, + quant_config: Optional[QuantizationConfig] = None, + num_hidden_layers_override: Optional[int] = None): + super().__init__() + + self.config = config + + self.embeddings = BlipVisionEmbeddings(config) + self.encoder = BlipEncoder( + config=config, + quant_config=quant_config, + num_hidden_layers_override=num_hidden_layers_override, + ) + self.post_layernorm = nn.LayerNorm(config.hidden_size, + eps=config.layer_norm_eps) + + def forward(self, pixel_values: torch.Tensor) -> torch.Tensor: + hidden_states = self.embeddings(pixel_values) + hidden_states = self.encoder(inputs_embeds=hidden_states) + + return self.post_layernorm(hidden_states) diff --git a/vllm/model_executor/models/blip2.py b/vllm/model_executor/models/blip2.py new file mode 100644 index 0000000000000..e00e6c0806957 --- /dev/null +++ b/vllm/model_executor/models/blip2.py @@ -0,0 +1,669 @@ +from typing import Iterable, List, Literal, Optional, Tuple, TypedDict + +import torch +import torch.nn as nn +from transformers import (Blip2Config, Blip2QFormerConfig, Blip2VisionConfig, + apply_chunking_to_forward) + +from vllm.attention import AttentionMetadata +from vllm.config import CacheConfig, MultiModalConfig +from vllm.inputs import INPUT_REGISTRY, InputContext, LLMInputs +from vllm.model_executor.layers.activation import get_act_fn +from vllm.model_executor.layers.logits_processor import LogitsProcessor +from vllm.model_executor.layers.quantization import QuantizationConfig +from vllm.model_executor.layers.sampler import Sampler +from vllm.model_executor.model_loader.weight_utils import default_weight_loader +from vllm.model_executor.models.opt import OPTModel +from vllm.model_executor.sampling_metadata import SamplingMetadata +from 
vllm.multimodal import MULTIMODAL_REGISTRY +from vllm.sequence import IntermediateTensors, SamplerOutput, SequenceData + +from .blip import (BlipVisionModel, dummy_image_for_blip, + get_max_blip_image_tokens) +from .interfaces import SupportsVision +from .utils import merge_vision_embeddings + +_KEYS_TO_MODIFY_MAPPING = { + "language_model.lm_head": "lm_head", + "language_model.model": "language_model", +} + + +class Blip2QFormerMultiHeadAttention(nn.Module): + + def __init__( + self, + config: Blip2QFormerConfig, + *, + quant_config: Optional[QuantizationConfig], + cache_config: Optional[CacheConfig], + is_cross_attention: bool = False, + ) -> None: + super().__init__() + + self.config = config + + if config.hidden_size % config.num_attention_heads != 0: + raise ValueError( + f"The hidden size ({config.hidden_size}) is not a multiple of " + f"the number of attention heads ({config.num_attention_heads})" + ) + + self.num_attention_heads = config.num_attention_heads + self.attention_head_size = (config.hidden_size // + config.num_attention_heads) + self.all_head_size = self.num_attention_heads * self.attention_head_size + self.scaling = self.attention_head_size**-0.5 + + self.query = nn.Linear(config.hidden_size, self.all_head_size) + if is_cross_attention: + kv_hidden_size = config.encoder_hidden_size + else: + kv_hidden_size = config.hidden_size + self.key = nn.Linear(kv_hidden_size, self.all_head_size) + self.value = nn.Linear(kv_hidden_size, self.all_head_size) + + self.position_embedding_type = getattr(config, + "position_embedding_type", + "absolute") + if self.position_embedding_type != "absolute": + raise NotImplementedError("Unsupported position_embedding_type: " + f"{self.position_embedding_type}") + + self.dropout = nn.Dropout(config.attention_probs_dropout_prob) + + def transpose_for_scores(self, x): + x = x.view(*x.size()[:-1], self.num_attention_heads, + self.attention_head_size) + return x.permute(0, 2, 1, 3) + + def forward( + self, + hidden_states: torch.Tensor, + encoder_hidden_states: Optional[torch.FloatTensor] = None, + ): + is_cross_attention = encoder_hidden_states is not None + + if is_cross_attention: + key_layer = self.transpose_for_scores( + self.key(encoder_hidden_states)) + value_layer = self.transpose_for_scores( + self.value(encoder_hidden_states)) + else: + key_layer = self.transpose_for_scores(self.key(hidden_states)) + value_layer = self.transpose_for_scores(self.value(hidden_states)) + + mixed_query_layer = self.query(hidden_states) + + query_layer = self.transpose_for_scores(mixed_query_layer) + + attention_scores = torch.matmul(query_layer, + key_layer.transpose(-1, -2)) + attention_probs = torch.softmax(attention_scores * self.scaling, + dim=-1) + + # This is actually dropping out entire tokens to attend to, which might + # seem a bit unusual, but is taken from the original Transformer paper. 
+ attention_probs_dropped = self.dropout(attention_probs) + + context_layer = torch.matmul(attention_probs_dropped, value_layer) + + context_layer = context_layer.permute(0, 2, 1, 3).contiguous() + context_layer = context_layer.view(*context_layer.size()[:-2], + self.all_head_size) + + return context_layer + + +class Blip2QFormerSelfOutput(nn.Module): + + def __init__(self, config: Blip2QFormerConfig) -> None: + super().__init__() + + self.dense = nn.Linear(config.hidden_size, config.hidden_size) + self.LayerNorm = nn.LayerNorm(config.hidden_size, + eps=config.layer_norm_eps) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + + def forward( + self, + hidden_states: torch.Tensor, + input_tensor: torch.Tensor, + ) -> torch.Tensor: + hidden_states = self.dense(hidden_states) + hidden_states = self.dropout(hidden_states) + hidden_states = self.LayerNorm(hidden_states + input_tensor) + return hidden_states + + +class Blip2QFormerAttention(nn.Module): + + def __init__( + self, + config: Blip2QFormerConfig, + *, + quant_config: Optional[QuantizationConfig], + cache_config: Optional[CacheConfig], + is_cross_attention: bool = False, + ) -> None: + super().__init__() + + self.attention = Blip2QFormerMultiHeadAttention( + config, + quant_config=quant_config, + cache_config=cache_config, + is_cross_attention=is_cross_attention, + ) + + self.output = Blip2QFormerSelfOutput(config) + + def forward( + self, + hidden_states: torch.Tensor, + encoder_hidden_states: Optional[torch.FloatTensor] = None, + ) -> Tuple[torch.Tensor]: + self_output = self.attention( + hidden_states, + encoder_hidden_states=encoder_hidden_states, + ) + attention_output = self.output(self_output, hidden_states) + + return attention_output + + +class Blip2QFormerIntermediate(nn.Module): + + def __init__(self, config: Blip2QFormerConfig) -> None: + super().__init__() + + self.dense = nn.Linear(config.hidden_size, config.intermediate_size) + self.intermediate_act_fn = get_act_fn(config.hidden_act) + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + hidden_states = self.dense(hidden_states) + hidden_states = self.intermediate_act_fn(hidden_states) + return hidden_states + + +class Blip2QFormerOutput(nn.Module): + + def __init__(self, config: Blip2QFormerConfig) -> None: + super().__init__() + + self.dense = nn.Linear(config.intermediate_size, config.hidden_size) + self.LayerNorm = nn.LayerNorm(config.hidden_size, + eps=config.layer_norm_eps) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + + def forward( + self, + hidden_states: torch.Tensor, + input_tensor: torch.Tensor, + ) -> torch.Tensor: + hidden_states = self.dense(hidden_states) + hidden_states = self.dropout(hidden_states) + hidden_states = self.LayerNorm(hidden_states + input_tensor) + return hidden_states + + +class Blip2QFormerLayer(nn.Module): + + def __init__( + self, + config: Blip2QFormerConfig, + *, + quant_config: Optional[QuantizationConfig], + cache_config: Optional[CacheConfig], + layer_idx: int, + ) -> None: + super().__init__() + + self.chunk_size_feed_forward = config.chunk_size_feed_forward + self.seq_len_dim = 1 + self.attention = Blip2QFormerAttention(config, + quant_config=quant_config, + cache_config=cache_config) + + self.layer_idx = layer_idx + + if layer_idx % config.cross_attention_frequency == 0: + self.crossattention = Blip2QFormerAttention( + config, + quant_config=quant_config, + cache_config=cache_config, + is_cross_attention=True) + self.has_cross_attention = True + else: + self.has_cross_attention = False + + 
self.intermediate_query = Blip2QFormerIntermediate(config) + self.output_query = Blip2QFormerOutput(config) + + def forward( + self, + hidden_states: torch.FloatTensor, + encoder_hidden_states: torch.FloatTensor, + query_length: int, + ): + attention_output = self.attention(hidden_states) + + if query_length > 0: + query_attention_output = attention_output[:, :query_length, :] + + if self.has_cross_attention: + query_attention_output = self.crossattention( + query_attention_output, + encoder_hidden_states=encoder_hidden_states, + ) + + layer_output = apply_chunking_to_forward( + self.feed_forward_chunk_query, + self.chunk_size_feed_forward, + self.seq_len_dim, + query_attention_output, + ) + + if attention_output.shape[1] > query_length: + layer_output_text = apply_chunking_to_forward( + self.feed_forward_chunk, + self.chunk_size_feed_forward, + self.seq_len_dim, + attention_output[:, query_length:, :], + ) + layer_output = torch.cat([layer_output, layer_output_text], + dim=1) + else: + layer_output = apply_chunking_to_forward( + self.feed_forward_chunk, + self.chunk_size_feed_forward, + self.seq_len_dim, + attention_output, + ) + + return layer_output + + def feed_forward_chunk(self, + attention_output: torch.Tensor) -> torch.Tensor: + intermediate_output = self.intermediate(attention_output) + layer_output = self.output(intermediate_output, attention_output) + return layer_output + + def feed_forward_chunk_query( + self, attention_output: torch.Tensor) -> torch.Tensor: + intermediate_output = self.intermediate_query(attention_output) + layer_output = self.output_query(intermediate_output, attention_output) + return layer_output + + +class Blip2QFormerEncoder(nn.Module): + + def __init__( + self, + config: Blip2QFormerConfig, + *, + quant_config: Optional[QuantizationConfig], + cache_config: Optional[CacheConfig], + ) -> None: + super().__init__() + + self.config = config + + self.layer = nn.ModuleList([ + Blip2QFormerLayer(config, + quant_config=quant_config, + cache_config=cache_config, + layer_idx=layer_idx) + for layer_idx in range(config.num_hidden_layers) + ]) + + def forward( + self, + hidden_states: torch.FloatTensor, + encoder_hidden_states: torch.FloatTensor, + query_length: int, + ) -> torch.Tensor: + for i in range(self.config.num_hidden_layers): + layer_module = self.layer[i] + + hidden_states = layer_module( + hidden_states, + encoder_hidden_states=encoder_hidden_states, + query_length=query_length, + ) + + return hidden_states + + +# Adapted from https://github.com/huggingface/transformers/blob/v4.41.2/src/transformers/models/blip_2/modeling_blip_2.py#L1025 +class Blip2QFormerModel(nn.Module): + + def __init__( + self, + config: Blip2QFormerConfig, + *, + quant_config: Optional[QuantizationConfig], + cache_config: Optional[CacheConfig], + ) -> None: + super().__init__() + + self.config = config + + self.layernorm = nn.LayerNorm(config.hidden_size, + eps=config.layer_norm_eps) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + + self.encoder = Blip2QFormerEncoder(config, + quant_config=quant_config, + cache_config=cache_config) + + def forward( + self, + query_embeds: torch.FloatTensor, + encoder_hidden_states: torch.FloatTensor, + ) -> torch.Tensor: + query_length = query_embeds.shape[1] + + embedding_output = self.layernorm(query_embeds) + embedding_output = self.dropout(embedding_output) + + sequence_output = self.encoder( + embedding_output, + encoder_hidden_states=encoder_hidden_states, + query_length=query_length, + ) + + return sequence_output + + +class 
Blip2ImagePixelInputs(TypedDict): + type: Literal["pixel_values"] + data: torch.Tensor + """Shape: (batch_size, num_channels, height, width)""" + + +Blip2ImageInputs = Blip2ImagePixelInputs + +# We use this internally as placeholders since there is no image token +# defined on the HuggingFace repo +BLIP2_IMAGE_TOKEN = "" +BLIP2_IMAGE_TOKEN_ID = 50265 + + +def get_blip2_image_feature_size(hf_config: Blip2Config) -> int: + return hf_config.num_query_tokens + + +def get_max_blip2_image_tokens(ctx: InputContext): + hf_config = ctx.get_hf_config(Blip2Config) + vision_config = hf_config.vision_config + + if isinstance(vision_config, Blip2VisionConfig): + return get_max_blip_image_tokens(vision_config) + + msg = f"Unsupported vision config: {type(vision_config)}" + raise NotImplementedError(msg) + + +def dummy_data_for_blip2(ctx: InputContext, seq_len: int): + hf_config = ctx.get_hf_config(Blip2Config) + vision_config = hf_config.vision_config + + image_feature_size = get_blip2_image_feature_size(hf_config) + token_ids = [BLIP2_IMAGE_TOKEN_ID] * image_feature_size + token_ids += [0] * (seq_len - image_feature_size) + seq_data = SequenceData(token_ids) + + if isinstance(vision_config, Blip2VisionConfig): + mm_data = dummy_image_for_blip(vision_config) + + return seq_data, mm_data + + msg = f"Unsupported vision config: {type(vision_config)}" + raise NotImplementedError(msg) + + +def input_processor_for_blip2(ctx: InputContext, llm_inputs: LLMInputs): + multi_modal_data = llm_inputs.get("multi_modal_data") + if multi_modal_data is None or "image" not in multi_modal_data: + return llm_inputs + + hf_config = ctx.get_hf_config(Blip2Config) + image_feature_size = get_blip2_image_feature_size(hf_config) + + # The original model places image tokens at the front + # https://github.com/huggingface/transformers/blob/v4.41.2/src/transformers/models/blip_2/modeling_blip_2.py#L1514 + new_token_ids = [BLIP2_IMAGE_TOKEN_ID] * image_feature_size + new_token_ids += llm_inputs["prompt_token_ids"] + + new_prompt = llm_inputs.get("prompt") + if new_prompt is not None: + new_prompt = BLIP2_IMAGE_TOKEN * image_feature_size + new_prompt + + return LLMInputs(prompt_token_ids=new_token_ids, + prompt=new_prompt, + multi_modal_data=multi_modal_data) + + +@MULTIMODAL_REGISTRY.register_image_input_mapper() +@MULTIMODAL_REGISTRY.register_max_image_tokens(get_max_blip2_image_tokens) +@INPUT_REGISTRY.register_dummy_data(dummy_data_for_blip2) +@INPUT_REGISTRY.register_input_processor(input_processor_for_blip2) +class Blip2ForConditionalGeneration(nn.Module, SupportsVision): + + def __init__(self, + config: Blip2Config, + multimodal_config: MultiModalConfig, + cache_config: Optional[CacheConfig] = None, + quant_config: Optional[QuantizationConfig] = None) -> None: + + super().__init__() + + self.config = config + self.multimodal_config = multimodal_config + + # TODO: Optionally initializes this for supporting embeddings. 
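The token-id expansion performed by `input_processor_for_blip2` above can be reproduced in isolation. The placeholder id 50265 matches `BLIP2_IMAGE_TOKEN_ID`; the feature size of 32 and the short prompt ids are assumed stand-ins for `hf_config.num_query_tokens` and real tokenizer output.

# Standalone sketch of the placeholder expansion done by input_processor_for_blip2.
BLIP2_IMAGE_TOKEN_ID = 50265
image_feature_size = 32                      # assumed value for num_query_tokens

prompt_token_ids = [2, 45641, 35]            # toy tokenizer output
new_token_ids = [BLIP2_IMAGE_TOKEN_ID] * image_feature_size + prompt_token_ids

assert new_token_ids[:image_feature_size] == [BLIP2_IMAGE_TOKEN_ID] * image_feature_size
assert new_token_ids[image_feature_size:] == prompt_token_ids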
+ self.vision_model = BlipVisionModel(config.vision_config) + + self.query_tokens = nn.Parameter( + torch.zeros(1, config.num_query_tokens, + config.qformer_config.hidden_size)) + + self.qformer = Blip2QFormerModel(config.qformer_config, + cache_config=cache_config, + quant_config=quant_config) + + self.language_projection = nn.Linear( + config.qformer_config.hidden_size, + config.text_config.hidden_size, + bias=True, + ) + + self.quant_config = quant_config + + self.language_model = OPTModel(config.text_config, cache_config, + quant_config) + + self.unpadded_vocab_size = config.text_config.vocab_size + self.logits_processor = LogitsProcessor(self.unpadded_vocab_size) + self.sampler = Sampler() + + def get_lm_head(self): + return self.language_model.decoder.embed_tokens + + def _validate_pixel_values(self, data: torch.Tensor) -> torch.Tensor: + h = w = self.config.vision_config.image_size + expected_dims = (3, h, w) + actual_dims = tuple(data.shape[1:]) + + if actual_dims != expected_dims: + expected_expr = ("batch_size", *map(str, expected_dims)) + raise ValueError( + f"The expected shape of pixel values is {expected_expr}. " + f"You supplied {tuple(data.shape)}.") + + return data + + def _parse_and_validate_image_input( + self, **kwargs: object) -> Optional[Blip2ImageInputs]: + pixel_values = kwargs.pop("pixel_values", None) + + if pixel_values is None: + return None + + if not isinstance(pixel_values, torch.Tensor): + raise ValueError("Incorrect type of pixel values. " + f"Got type: {type(pixel_values)}") + + return Blip2ImagePixelInputs( + type="pixel_values", + data=self._validate_pixel_values(pixel_values), + ) + + def _image_pixels_to_features(self, vision_model: BlipVisionModel, + pixel_values: torch.Tensor) -> torch.Tensor: + + # NOTE: we skip the step to select the vision feature layer since + # this is already done inside the vision tower + image_features = vision_model(pixel_values) + + return image_features + + def _process_image_pixels(self, + inputs: Blip2ImagePixelInputs) -> torch.Tensor: + assert self.vision_model is not None + + pixel_values = inputs["data"] + + return self._image_pixels_to_features(self.vision_model, pixel_values) + + def _process_image_input(self, + image_input: Blip2ImageInputs) -> torch.Tensor: + assert self.vision_model is not None + image_features = self._process_image_pixels(image_input) + + query_tokens = self.query_tokens.expand(image_features.shape[0], -1, + -1) + query_output = self.qformer( + query_embeds=query_tokens, + encoder_hidden_states=image_features, + ) + + return self.language_projection(query_output) + + def forward( + self, + input_ids: torch.Tensor, + positions: torch.Tensor, + kv_caches: List[torch.Tensor], + attn_metadata: AttentionMetadata, + intermediate_tensors: Optional[IntermediateTensors] = None, + **kwargs: object, + ) -> SamplerOutput: + """Run forward pass for BLIP-2. + + One key thing to understand is the `input_ids` already accounts for the + positions of the to-be-inserted image embeddings. + + Concretely, consider a text prompt: + `"Question: What's the content of the image? Answer:"`. + + Tokenizer outputs: + `[2, 45641, 35, 653, 18, 5, 1383, 9, 5, 2274, 116, 31652, 35]`. + + To reserve space in KV cache, we have to insert placeholder tokens + before they are inputted to the model, so the input processor prepends + dummy tokens (denoted as `50265`), resulting in: + `[50265, ..., 50265, 2, 45641, 35, ..., 31652, 35]`. 
+ + We insert 32 tokens since it corresponds to the number of query + embeddings outputted by the Q-Former and inputted to the language model. + + This way, the `positions` and `attn_metadata` are consistent + with the `input_ids`. + + Args: + input_ids: Flattened (concatenated) input_ids corresponding to a + batch. + pixel_values: The pixels in each input image. + + See also: + :class:`Blip2ImageInputs` + """ + image_input = self._parse_and_validate_image_input(**kwargs) + + if image_input is not None: + vision_embeddings = self._process_image_input(image_input) + inputs_embeds = self.language_model.get_input_embeddings(input_ids) + + inputs_embeds = merge_vision_embeddings(input_ids, inputs_embeds, + vision_embeddings, + BLIP2_IMAGE_TOKEN_ID) + + input_ids = None + else: + inputs_embeds = None + + hidden_states = self.language_model(input_ids, + positions, + kv_caches, + attn_metadata, + inputs_embeds=inputs_embeds) + + return hidden_states + + def compute_logits(self, hidden_states: torch.Tensor, + sampling_metadata: SamplingMetadata) -> torch.Tensor: + logits = self.logits_processor(self.get_lm_head(), hidden_states, + sampling_metadata) + return logits + + def sample( + self, + logits: torch.Tensor, + sampling_metadata: SamplingMetadata, + ) -> Optional[SamplerOutput]: + next_tokens = self.sampler(logits, sampling_metadata) + return next_tokens + + def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): + # only doing this for language model part for now. + stacked_params_mapping = [ + # (param_name, shard_name, shard_id) + ("qkv_proj", "q_proj", "q"), + ("qkv_proj", "k_proj", "k"), + ("qkv_proj", "v_proj", "v"), + ("gate_up_proj", "gate_proj", 0), + ("gate_up_proj", "up_proj", 1), + ] + params_dict = dict(self.named_parameters()) + + for name, loaded_weight in weights: + if "lm_head.weight" in name: + continue + if "rotary_emb.inv_freq" in name: + continue + for key_to_modify, new_key in _KEYS_TO_MODIFY_MAPPING.items(): + if key_to_modify in name: + name = name.replace(key_to_modify, new_key) + use_default_weight_loading = False + if "vision" in name: + if self.vision_model is not None: + # We only do sharding for language model and + # not vision model for now. 
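The placeholder replacement described in the docstring above can be pictured with toy tensors; the shapes and values below are invented purely for illustration, and the actual merging in the model is done by `merge_vision_embeddings`.

import torch

BLIP2_IMAGE_TOKEN_ID = 50265
input_ids = torch.tensor([50265, 50265, 2, 45641])   # 2 placeholders + 2 text tokens
inputs_embeds = torch.zeros(4, 8)                     # (seq_len, hidden_size)
vision_embeddings = torch.ones(2, 8)                  # one row per placeholder

mask = input_ids == BLIP2_IMAGE_TOKEN_ID
inputs_embeds[mask] = vision_embeddings               # placeholders now carry image features
assert inputs_embeds[:2].eq(1).all()
assert inputs_embeds[2:].eq(0).all()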
+ use_default_weight_loading = True + else: + for (param_name, weight_name, + shard_id) in stacked_params_mapping: + if weight_name not in name: + continue + param = params_dict[name.replace(weight_name, param_name)] + weight_loader = param.weight_loader + weight_loader(param, loaded_weight, shard_id) + break + else: + use_default_weight_loading = True + if use_default_weight_loading: + param = params_dict[name] + weight_loader = getattr(param, "weight_loader", + default_weight_loader) + weight_loader(param, loaded_weight) diff --git a/vllm/model_executor/models/opt.py b/vllm/model_executor/models/opt.py index edc16710c0229..a05090cd46648 100644 --- a/vllm/model_executor/models/opt.py +++ b/vllm/model_executor/models/opt.py @@ -237,14 +237,19 @@ def __init__( for _ in range(config.num_hidden_layers) ]) + def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor: + return self.embed_tokens(input_ids) + def forward( self, input_ids: torch.Tensor, positions: torch.Tensor, kv_caches: List[torch.Tensor], attn_metadata: AttentionMetadata, + inputs_embeds: Optional[torch.Tensor] = None, ) -> torch.Tensor: - inputs_embeds = self.embed_tokens(input_ids) + if inputs_embeds is None: + inputs_embeds = self.get_input_embeddings(input_ids) pos_embeds = self.embed_positions(positions) if self.project_in is not None: inputs_embeds, _ = self.project_in(inputs_embeds) @@ -272,14 +277,22 @@ def __init__( super().__init__() self.decoder = OPTDecoder(config, cache_config, quant_config) + def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor: + return self.decoder.get_input_embeddings(input_ids) + def forward( self, input_ids: torch.Tensor, positions: torch.Tensor, kv_caches: List[torch.Tensor], attn_metadata: AttentionMetadata, + inputs_embeds: Optional[torch.Tensor] = None, ) -> torch.Tensor: - return self.decoder(input_ids, positions, kv_caches, attn_metadata) + return self.decoder(input_ids, + positions, + kv_caches, + attn_metadata, + inputs_embeds=inputs_embeds) class OPTForCausalLM(nn.Module): diff --git a/vllm/multimodal/base.py b/vllm/multimodal/base.py index 0d435bd644e29..5abd0ad61cdf9 100644 --- a/vllm/multimodal/base.py +++ b/vllm/multimodal/base.py @@ -1,8 +1,9 @@ import sys from abc import ABC, abstractmethod from collections import UserDict, defaultdict -from typing import (Any, Callable, Dict, List, Optional, Type, TypedDict, - TypeVar, Union, cast) +from typing import Any, Callable, Dict, List, Optional +from typing import Sequence as GenericSequence +from typing import Type, TypedDict, TypeVar, Union, cast import torch import torch.types @@ -15,13 +16,13 @@ logger = init_logger(__name__) -NestedTensors = Union[List[torch.Tensor], torch.Tensor] +NestedTensors = Union[GenericSequence[torch.Tensor], torch.Tensor] """ Use a list instead of a tensor if the dimensions of each element do not match. Currently only supports up to singly nested list of tensors. 
""" -BatchedTensors = Union[List[NestedTensors], NestedTensors] +BatchedTensors = Union[GenericSequence[NestedTensors], NestedTensors] """ If each input tensor in the batch has the same size, this is a single batched tensor; otherwise, this is a list of :class:`NestedTensors` with one element @@ -53,7 +54,7 @@ def try_concat( # may be list rather than tensors if isinstance(tensors[0], list): return [[t.to(device=device) for t in tensor[0]] - for tensor in tensors] + for tensor in cast(List[List[torch.Tensor]], tensors)] tensors_ = cast(List[torch.Tensor], tensors) From f954d0715c8b68e780aac4a4f3ffd1ab56bebfcd Mon Sep 17 00:00:00 2001 From: Chenggang Wu Date: Sat, 27 Jul 2024 09:24:46 -0700 Subject: [PATCH 069/167] [Docs] Add RunLLM chat widget (#6857) --- docs/source/_static/custom.js | 16 ++++++++++++++++ docs/source/conf.py | 2 ++ 2 files changed, 18 insertions(+) create mode 100644 docs/source/_static/custom.js diff --git a/docs/source/_static/custom.js b/docs/source/_static/custom.js new file mode 100644 index 0000000000000..f475be71fc448 --- /dev/null +++ b/docs/source/_static/custom.js @@ -0,0 +1,16 @@ +document.addEventListener("DOMContentLoaded", function () { + var script = document.createElement("script"); + script.type = "module"; + script.id = "runllm-widget-script" + + script.src = "https://widget.runllm.com"; + + script.setAttribute("version", "stable"); + script.setAttribute("runllm-keyboard-shortcut", "Mod+j"); // cmd-j or ctrl-j to open the widget. + script.setAttribute("runllm-name", "vLLM"); + script.setAttribute("runllm-position", "BOTTOM_RIGHT"); + script.setAttribute("runllm-assistant-id", "207"); + + script.async = true; + document.head.appendChild(script); + }); \ No newline at end of file diff --git a/docs/source/conf.py b/docs/source/conf.py index b867bfd89dc17..1093b30bca11d 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -68,6 +68,8 @@ 'use_repository_button': True, 'use_edit_page_button': True, } +html_static_path = ["_static"] +html_js_files = ["custom.js"] # see https://docs.readthedocs.io/en/stable/reference/environment-variables.html # noqa READTHEDOCS_VERSION_TYPE = os.environ.get('READTHEDOCS_VERSION_TYPE') From fad5576c58864a6c2cf528f67e60e03a949b3dac Mon Sep 17 00:00:00 2001 From: Woosuk Kwon Date: Sat, 27 Jul 2024 10:28:33 -0700 Subject: [PATCH 070/167] [TPU] Reduce compilation time & Upgrade PyTorch XLA version (#6856) --- Dockerfile.tpu | 2 +- docs/source/getting_started/tpu-installation.rst | 9 ++++++++- vllm/attention/backends/pallas.py | 1 - .../device_communicators/tpu_communicator.py | 3 ++- vllm/worker/tpu_model_runner.py | 15 +++++++++++++-- vllm/worker/tpu_worker.py | 1 - 6 files changed, 24 insertions(+), 7 deletions(-) diff --git a/Dockerfile.tpu b/Dockerfile.tpu index be7dbe63cb237..4fc14d6bd186c 100644 --- a/Dockerfile.tpu +++ b/Dockerfile.tpu @@ -1,4 +1,4 @@ -ARG NIGHTLY_DATE="20240713" +ARG NIGHTLY_DATE="20240726" ARG BASE_IMAGE="us-central1-docker.pkg.dev/tpu-pytorch-releases/docker/xla:nightly_3.10_tpuvm_$NIGHTLY_DATE" FROM $BASE_IMAGE diff --git a/docs/source/getting_started/tpu-installation.rst b/docs/source/getting_started/tpu-installation.rst index 5e2f514a4a509..2e6c522422c22 100644 --- a/docs/source/getting_started/tpu-installation.rst +++ b/docs/source/getting_started/tpu-installation.rst @@ -56,7 +56,7 @@ First, install the dependencies: $ pip uninstall torch torch-xla -y $ # Install PyTorch and PyTorch XLA. 
- $ export DATE="+20240713" + $ export DATE="+20240726" $ pip install https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch-nightly${DATE}-cp310-cp310-linux_x86_64.whl $ pip install https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-nightly${DATE}-cp310-cp310-linux_x86_64.whl @@ -75,6 +75,13 @@ Next, build vLLM from source. This will only take a few seconds: $ VLLM_TARGET_DEVICE="tpu" python setup.py develop +.. note:: + + Since TPU relies on XLA which requires static shapes, vLLM bucketizes the possible input shapes and compiles an XLA graph for each different shape. + The compilation time may take 20~30 minutes in the first run. + However, the compilation time reduces to ~5 minutes afterwards because the XLA graphs are cached in the disk (in :code:`VLLM_XLA_CACHE_PATH` or :code:`~/.cache/vllm/xla_cache` by default). + + .. tip:: If you encounter the following error: diff --git a/vllm/attention/backends/pallas.py b/vllm/attention/backends/pallas.py index c53a2f91b89d7..2269ac2606e89 100644 --- a/vllm/attention/backends/pallas.py +++ b/vllm/attention/backends/pallas.py @@ -3,7 +3,6 @@ import torch import torch_xla.experimental.custom_kernel # Required to register custom ops. -import torch_xla.experimental.dynamo_set_buffer_donor from vllm.attention.backends.abstract import (AttentionBackend, AttentionImpl, AttentionMetadata, AttentionType) diff --git a/vllm/distributed/device_communicators/tpu_communicator.py b/vllm/distributed/device_communicators/tpu_communicator.py index 69a9a516f3ebe..16525887cf4eb 100644 --- a/vllm/distributed/device_communicators/tpu_communicator.py +++ b/vllm/distributed/device_communicators/tpu_communicator.py @@ -6,6 +6,7 @@ if current_platform.is_tpu(): import torch_xla.core.xla_model as xm + import torch_xla.runtime as xr from torch_xla._internal import pjrt @@ -20,7 +21,7 @@ def __init__(self, group: ProcessGroup): local_rank = dist.get_rank(group) world_size = dist.get_world_size(group) pjrt.initialize_multiprocess(local_rank, world_size) - xm._init_world_size_ordinal() + xr._init_world_size_ordinal() def all_reduce(self, x: torch.Tensor) -> torch.Tensor: return xm.all_reduce(xm.REDUCE_SUM, x) diff --git a/vllm/worker/tpu_model_runner.py b/vllm/worker/tpu_model_runner.py index e5bb101fc7df4..1692094af8c41 100644 --- a/vllm/worker/tpu_model_runner.py +++ b/vllm/worker/tpu_model_runner.py @@ -7,6 +7,7 @@ import torch import torch.nn as nn import torch_xla.core.xla_model as xm +import torch_xla.runtime as xr from vllm.attention import AttentionMetadata, get_attn_backend from vllm.config import (CacheConfig, DeviceConfig, LoadConfig, ModelConfig, @@ -127,7 +128,7 @@ def load_model(self) -> None: # determine the order of concatenating the output tensors. # As a workaround, we use the xm's rank assignment only when loading # the embedding weights. - xm_tp_rank = xm.get_ordinal() + xm_tp_rank = xr.global_ordinal() with patch( "vllm.model_executor.layers.vocab_parallel_embedding." "get_tensor_model_parallel_rank", @@ -146,7 +147,17 @@ def load_model(self) -> None: xm.wait_device_ops() model = ModelWrapper(model) - self.model = torch.compile(model, backend="openxla", fullgraph=True) + # NOTE(woosuk): There are two stages of compilation: torch.compile and + # XLA compilation. Setting dynamic=True can reduce the torch.compile + # overhead by reusing the FX graph for different shapes. + # However, the XLA graph will still require static shapes and needs to + # be re-compiled for every different shapes. 
This overhead is inevitable + # in the first run, but can be skipped afterwards as we cache the XLA + # graphs in the disk (VLLM_XLA_CACHE_PATH). + self.model = torch.compile(model, + backend="openxla", + fullgraph=True, + dynamic=True) def _dummy_run( self, diff --git a/vllm/worker/tpu_worker.py b/vllm/worker/tpu_worker.py index c88aba7ae08cd..17fa5c35457c2 100644 --- a/vllm/worker/tpu_worker.py +++ b/vllm/worker/tpu_worker.py @@ -3,7 +3,6 @@ import torch import torch_xla.core.xla_model as xm -import torch_xla.experimental.dynamo_set_buffer_donor # noqa: F401 import torch_xla.runtime as xr import vllm.envs as envs From 75acdaa4b616c2e95c55a47d3158ceec9c72c503 Mon Sep 17 00:00:00 2001 From: Alexander Matveev <59768536+alexm-neuralmagic@users.noreply.github.com> Date: Sat, 27 Jul 2024 17:52:33 -0400 Subject: [PATCH 071/167] [Kernel] Increase precision of GPTQ/AWQ Marlin kernel (#6795) --- benchmarks/kernels/benchmark_marlin.py | 23 ++- csrc/ops.h | 3 +- csrc/quantization/gptq_marlin/gptq_marlin.cu | 150 ++++++++++++++---- tests/kernels/test_marlin_gemm.py | 13 +- vllm/_custom_ops.py | 6 +- .../layers/quantization/utils/marlin_utils.py | 17 +- 6 files changed, 168 insertions(+), 44 deletions(-) diff --git a/benchmarks/kernels/benchmark_marlin.py b/benchmarks/kernels/benchmark_marlin.py index 3da4cecd7eeff..684985b81f690 100644 --- a/benchmarks/kernels/benchmark_marlin.py +++ b/benchmarks/kernels/benchmark_marlin.py @@ -10,7 +10,7 @@ GPTQ_MARLIN_24_SUPPORTED_GROUP_SIZES, GPTQ_MARLIN_24_SUPPORTED_NUM_BITS) from vllm.model_executor.layers.quantization.utils.marlin_utils import ( GPTQ_MARLIN_MAX_PARALLEL, GPTQ_MARLIN_MIN_THREAD_N, - GPTQ_MARLIN_SUPPORTED_GROUP_SIZES, GPTQ_MARLIN_SUPPORTED_NUM_BITS) + MARLIN_SUPPORTED_GROUP_SIZES, MARLIN_SUPPORTED_NUM_BITS) from vllm.model_executor.layers.quantization.utils.marlin_utils_test import ( MarlinWorkspace, marlin_quantize) from vllm.model_executor.layers.quantization.utils.marlin_utils_test_24 import ( @@ -56,6 +56,8 @@ def bench_run(results: List[benchmark.Measurement], model: str, (marlin_24_w_ref, marlin_24_q_w_comp, marlin_24_meta, marlin_24_s) = marlin_24_quantize(b, num_bits, group_size) + marlin_zp = torch.empty(0, dtype=torch.int, device=b.device) + # GPTQ quant (w_ref, q_w, s, g_idx, rand_perm) = quantize_weights(b, num_bits, group_size, act_order) @@ -87,6 +89,7 @@ def bench_run(results: List[benchmark.Measurement], model: str, "marlin_w_ref": marlin_w_ref, "marlin_q_w": marlin_q_w, "marlin_s": marlin_s, + "marlin_zp": marlin_zp, "marlin_g_idx": marlin_g_idx, "marlin_sort_indices": marlin_sort_indices, "marlin_rand_perm": marlin_rand_perm, @@ -125,11 +128,21 @@ def bench_run(results: List[benchmark.Measurement], model: str, results.append( benchmark.Timer( stmt= - "output = gptq_marlin_gemm(a, marlin_q_w, marlin_s, marlin_g_idx, marlin_sort_indices, marlin_workspace.scratch, num_bits, size_m, size_n, size_k, is_k_full)", # noqa: E501 + "output = gptq_marlin_gemm(a, marlin_q_w, marlin_s, marlin_zp, marlin_g_idx, marlin_sort_indices, marlin_workspace.scratch, num_bits, size_m, size_n, size_k, is_k_full, False, False)", # noqa: E501 + globals=globals, + label=label, + sub_label=sub_label, + description="gptq_marlin_gemm_fp16", + ).blocked_autorange(min_run_time=min_run_time)) + + results.append( + benchmark.Timer( + stmt= + "output = gptq_marlin_gemm(a, marlin_q_w, marlin_s, marlin_zp, marlin_g_idx, marlin_sort_indices, marlin_workspace.scratch, num_bits, size_m, size_n, size_k, is_k_full, False, True)", # noqa: E501 globals=globals, 
label=label, sub_label=sub_label, - description="gptq_marlin_gemm", + description="gptq_marlin_gemm_fp32", ).blocked_autorange(min_run_time=min_run_time)) if (num_bits in GPTQ_MARLIN_24_SUPPORTED_NUM_BITS @@ -183,12 +196,12 @@ def main(args): ) > 0 and is_k_full not in args.limit_k_full: continue - for num_bits in GPTQ_MARLIN_SUPPORTED_NUM_BITS: + for num_bits in MARLIN_SUPPORTED_NUM_BITS: if len(args.limit_num_bits ) > 0 and num_bits not in args.limit_num_bits: continue - for group_size in GPTQ_MARLIN_SUPPORTED_GROUP_SIZES: + for group_size in MARLIN_SUPPORTED_GROUP_SIZES: if len( args.limit_group_size ) > 0 and group_size not in args.limit_group_size: diff --git a/csrc/ops.h b/csrc/ops.h index 9ef1fcb465bf3..f075850248d1c 100644 --- a/csrc/ops.h +++ b/csrc/ops.h @@ -93,7 +93,8 @@ torch::Tensor gptq_marlin_gemm(torch::Tensor& a, torch::Tensor& b_q_weight, torch::Tensor& g_idx, torch::Tensor& perm, torch::Tensor& workspace, int64_t num_bits, int64_t size_m, int64_t size_n, int64_t size_k, - bool is_k_full, bool has_zp); + bool is_k_full, bool has_zp, + bool use_fp32_reduce); torch::Tensor gptq_marlin_repack(torch::Tensor& b_q_weight, torch::Tensor& perm, int64_t size_k, int64_t size_n, diff --git a/csrc/quantization/gptq_marlin/gptq_marlin.cu b/csrc/quantization/gptq_marlin/gptq_marlin.cu index 122c5c16b58ce..36ae2bfafa7c2 100644 --- a/csrc/quantization/gptq_marlin/gptq_marlin.cu +++ b/csrc/quantization/gptq_marlin/gptq_marlin.cu @@ -59,14 +59,16 @@ __global__ void Marlin( const int4* __restrict__ A, // fp16 input matrix of shape mxk const int4* __restrict__ B, // 4bit quantized weight matrix of shape kxn int4* __restrict__ C, // fp16 output buffer of shape mxn + int4* __restrict__ C_tmp, // fp32 tmp output buffer (for reduce) const int4* __restrict__ scales_ptr, // fp16 quantization scales of shape // (k/groupsize)xn const int* __restrict__ g_idx, // int32 group indices of shape k - int num_groups, // number of scale groups per output channel - int prob_m, // batch dimension m - int prob_n, // output dimension n - int prob_k, // reduction dimension k - int* locks // extra global storage for barrier synchronization + int num_groups, // number of scale groups per output channel + int prob_m, // batch dimension m + int prob_n, // output dimension n + int prob_k, // reduction dimension k + int* locks, // extra global storage for barrier synchronization + bool use_fp32_reduce // whether to use fp32 global reduce ) {} } // namespace gptq_marlin @@ -532,16 +534,18 @@ __global__ void Marlin( const int4* __restrict__ A, // fp16 input matrix of shape mxk const int4* __restrict__ B, // 4bit quantized weight matrix of shape kxn int4* __restrict__ C, // fp16 output buffer of shape mxn + int4* __restrict__ C_tmp, // fp32 tmp output buffer (for reduce) const int4* __restrict__ scales_ptr, // fp16 quantization scales of shape // (k/groupsize)xn const int4* __restrict__ zp_ptr, // 4bit packed zero-points of shape // (k/groupsize)x(n/pack_factor) const int* __restrict__ g_idx, // int32 group indices of shape k - int num_groups, // number of scale groups per output channel - int prob_m, // batch dimension m - int prob_n, // output dimension n - int prob_k, // reduction dimension k - int* locks // extra global storage for barrier synchronization + int num_groups, // number of scale groups per output channel + int prob_m, // batch dimension m + int prob_n, // output dimension n + int prob_k, // reduction dimension k + int* locks, // extra global storage for barrier synchronization + bool use_fp32_reduce // 
whether to use fp32 global reduce ) { // Each threadblock processes one "stripe" of the B matrix with (roughly) the // same size, which might involve multiple column "slices" (of width 16 * @@ -595,6 +599,8 @@ __global__ void Marlin( int slice_idx; // index of threadblock in current slice; numbered bottom to // top + int par_id = 0; + // We can easily implement parallel problem execution by just remapping // indices and advancing global pointers if (slice_col_par >= n_tiles) { @@ -602,6 +608,7 @@ __global__ void Marlin( C += (slice_col_par / n_tiles) * 16 * thread_m_blocks * prob_n / 8; locks += (slice_col_par / n_tiles) * n_tiles; slice_col = slice_col_par % n_tiles; + par_id = slice_col_par / n_tiles; } // Compute all information about the current slice which is required for @@ -632,6 +639,7 @@ __global__ void Marlin( C += 16 * thread_m_blocks * prob_n / 8; locks += n_tiles; slice_col = 0; + par_id++; } }; init_slice(); @@ -1321,7 +1329,7 @@ __global__ void Marlin( // finally have to globally reduce over the results. As the striped // partitioning minimizes the number of such reductions and our outputs are // usually rather small, we perform this reduction serially in L2 cache. - auto global_reduce = [&](bool first = false, bool last = false) { + auto global_reduce_fp16 = [&](bool first = false, bool last = false) { // We are very careful here to reduce directly in the output buffer to // maximize L2 cache utilization in this step. To do this, we write out // results in FP16 (but still reduce with FP32 compute). @@ -1382,6 +1390,53 @@ __global__ void Marlin( } }; + // Globally reduce over threadblocks that compute the same column block. + // We use a tmp C buffer to reduce in full fp32 precision. + auto global_reduce_fp32 = [&](bool first = false, bool last = false) { + constexpr int tb_m = thread_m_blocks * 16; + constexpr int tb_n = thread_n_blocks * 16; + + constexpr int c_size = tb_m * tb_n * sizeof(float) / 16; + + constexpr int active_threads = 32 * thread_n_blocks / 4; + bool is_th_active = threadIdx.x < active_threads; + + int par_offset = c_size * n_tiles * par_id; + int slice_offset = c_size * slice_col; + + constexpr int num_floats = thread_m_blocks * 4 * 2 * 4; + constexpr int th_size = num_floats * sizeof(float) / 16; + + int c_cur_offset = par_offset + slice_offset; + + if (!is_th_active) { + return; + } + + if (!first) { + float* frag_c_ptr = reinterpret_cast(&frag_c); + #pragma unroll + for (int k = 0; k < th_size; k++) { + sh[threadIdx.x] = + C_tmp[c_cur_offset + active_threads * k + threadIdx.x]; + + float* sh_c_ptr = reinterpret_cast(&sh[threadIdx.x]); + #pragma unroll + for (int f = 0; f < 4; f++) { + frag_c_ptr[k * 4 + f] += sh_c_ptr[f]; + } + } + } + + if (!last) { + int4* frag_c_ptr = reinterpret_cast(&frag_c); + #pragma unroll + for (int k = 0; k < th_size; k++) { + C_tmp[c_cur_offset + active_threads * k + threadIdx.x] = frag_c_ptr[k]; + } + } + }; + // Write out the reduce final result in the correct layout. We only actually // reshuffle matrix fragments in this step, the reduction above is performed // in fragment layout. 
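A toy numerical motivation for the fp32 reduce path above (plain PyTorch on the CPU, not a model of the CUDA kernel): a running sum kept in fp16 stops growing once the accumulator's spacing exceeds the addend, whereas accumulating in fp32 and casting once at the end stays close to the true value.

import torch

addend = torch.tensor(0.4, dtype=torch.float16)
acc_fp16 = torch.tensor(0.0, dtype=torch.float16)
acc_fp32 = torch.tensor(0.0, dtype=torch.float32)
for _ in range(5000):
    acc_fp16 = acc_fp16 + addend          # result rounded to fp16 after every step
    acc_fp32 = acc_fp32 + addend.float()  # accumulated in fp32

print(acc_fp16.item())                    # stalls at 1024.0
print(acc_fp32.to(torch.float16).item())  # ~2000 after a single final cast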
@@ -1606,7 +1661,11 @@ __global__ void Marlin( if (slice_count > 1) { // only globally reduce if there is more than one // block in a slice barrier_acquire(&locks[slice_col], slice_idx); - global_reduce(slice_idx == 0, last); + if (use_fp32_reduce) { + global_reduce_fp32(slice_idx == 0, last); + } else { + global_reduce_fp16(slice_idx == 0, last); + } barrier_release(&locks[slice_col], last); } if (last) // only the last block in a slice actually writes the result @@ -1661,8 +1720,8 @@ __global__ void Marlin( THREAD_N_BLOCKS, THREAD_K_BLOCKS, pipe_stages, HAS_ACT_ORDER, \ HAS_ZP, GROUP_BLOCKS> \ <<>>( \ - A_ptr, B_ptr, C_ptr, s_ptr, zp_ptr, g_idx_ptr, num_groups, \ - prob_m, prob_n, prob_k, locks); \ + A_ptr, B_ptr, C_ptr, C_tmp_ptr, s_ptr, zp_ptr, g_idx_ptr, \ + num_groups, prob_m, prob_n, prob_k, locks, use_fp32_reduce); \ } typedef struct { @@ -1801,6 +1860,27 @@ bool is_valid_config(thread_config_t const& th_config, int max_m_blocks, return true; } +int determine_reduce_max_m(int prob_m, int max_par) { + constexpr int tile_m_size = 16; + + if (prob_m <= tile_m_size) { + return tile_m_size; + + } else if (prob_m <= tile_m_size * 2) { + return tile_m_size * 2; + + } else if (prob_m <= tile_m_size * 3) { + return tile_m_size * 3; + + } else if (prob_m <= tile_m_size * 4) { + return tile_m_size * 4; + + } else { + int cur_par = min(div_ceil(prob_m, tile_m_size * 4), max_par); + return tile_m_size * 4 * cur_par; + } +} + exec_config_t determine_thread_config(int prob_m, int prob_n, int prob_k, int num_bits, int group_size, bool has_act_order, bool is_k_full, @@ -1880,13 +1960,13 @@ exec_config_t determine_thread_config(int prob_m, int prob_n, int prob_k, __CALL_IF(NUM_BITS, 4, N_BLOCKS, K_BLOCKS, false, true, 8, NUM_THREADS) template -void marlin_mm_f16i4(const void* A, const void* B, void* C, void* s, void* zp, - void* g_idx, void* perm, void* a_tmp, int prob_m, - int prob_n, int prob_k, void* workspace, int num_bits, - bool has_act_order, bool is_k_full, bool has_zp, - int num_groups, int group_size, int dev, +void marlin_mm_f16i4(const void* A, const void* B, void* C, void* C_tmp, + void* s, void* zp, void* g_idx, void* perm, void* a_tmp, + int prob_m, int prob_n, int prob_k, void* workspace, + int num_bits, bool has_act_order, bool is_k_full, + bool has_zp, int num_groups, int group_size, int dev, cudaStream_t stream, int thread_k, int thread_n, int sms, - int max_par) { + int max_par, bool use_fp32_reduce) { TORCH_CHECK(num_bits == 4 || num_bits == 8, "num_bits must be 4 or 8. Got = ", num_bits); TORCH_CHECK(prob_m > 0 && prob_n > 0 && prob_k > 0, "Invalid MNK = [", prob_m, @@ -1970,6 +2050,7 @@ void marlin_mm_f16i4(const void* A, const void* B, void* C, void* s, void* zp, const int4* A_ptr = (const int4*)A; const int4* B_ptr = (const int4*)B; int4* C_ptr = (int4*)C; + int4* C_tmp_ptr = (int4*)C_tmp; const int4* s_ptr = (const int4*)s; const int4* zp_ptr = (const int4*)zp; const int* g_idx_ptr = (const int*)g_idx; @@ -2049,7 +2130,8 @@ torch::Tensor gptq_marlin_gemm(torch::Tensor& a, torch::Tensor& b_q_weight, torch::Tensor& g_idx, torch::Tensor& perm, torch::Tensor& workspace, int64_t num_bits, int64_t size_m, int64_t size_n, int64_t size_k, - bool is_k_full, bool has_zp) { + bool is_k_full, bool has_zp, + bool use_fp32_reduce) { // Verify num_bits TORCH_CHECK(num_bits == 4 || num_bits == 8, "num_bits must be 4 or 8. 
Got = ", num_bits); @@ -2099,6 +2181,17 @@ torch::Tensor gptq_marlin_gemm(torch::Tensor& a, torch::Tensor& b_q_weight, torch::Tensor c = torch::empty({size_m, size_n}, options); torch::Tensor a_tmp = torch::empty({size_m, size_k}, options); + // Alloc C tmp buffer that is going to be used for the global reduce + int reduce_max_m = marlin::determine_reduce_max_m(size_m, marlin::max_par); + int reduce_n = size_n; + auto options_fp32 = + torch::TensorOptions().dtype(at::kFloat).device(a.device()); + if (!use_fp32_reduce) { + reduce_max_m = 0; + reduce_n = 0; + } + torch::Tensor c_tmp = torch::empty({reduce_max_m, reduce_n}, options_fp32); + // thread_k: `k` size of a thread_tile in `weights` (can usually be left as // auto -1) int thread_k = -1; @@ -2171,20 +2264,21 @@ torch::Tensor gptq_marlin_gemm(torch::Tensor& a, torch::Tensor& b_q_weight, if (a.scalar_type() == at::ScalarType::Half) { marlin::marlin_mm_f16i4( a.data_ptr(), b_q_weight.data_ptr(), c.data_ptr(), - b_scales.data_ptr(), b_zeros.data_ptr(), g_idx.data_ptr(), - perm.data_ptr(), a_tmp.data_ptr(), size_m, size_n, size_k, + c_tmp.data_ptr(), b_scales.data_ptr(), + b_zeros.data_ptr(), g_idx.data_ptr(), perm.data_ptr(), + a_tmp.data_ptr(), size_m, size_n, size_k, workspace.data_ptr(), num_bits, has_act_order, is_k_full, has_zp, num_groups, group_size, dev, at::cuda::getCurrentCUDAStream(dev), - thread_k, thread_n, sms, marlin::max_par); + thread_k, thread_n, sms, marlin::max_par, use_fp32_reduce); } else if (a.scalar_type() == at::ScalarType::BFloat16) { marlin::marlin_mm_f16i4( a.data_ptr(), b_q_weight.data_ptr(), - c.data_ptr(), b_scales.data_ptr(), - b_zeros.data_ptr(), g_idx.data_ptr(), perm.data_ptr(), - a_tmp.data_ptr(), size_m, size_n, size_k, + c.data_ptr(), c_tmp.data_ptr(), + b_scales.data_ptr(), b_zeros.data_ptr(), g_idx.data_ptr(), + perm.data_ptr(), a_tmp.data_ptr(), size_m, size_n, size_k, workspace.data_ptr(), num_bits, has_act_order, is_k_full, has_zp, num_groups, group_size, dev, at::cuda::getCurrentCUDAStream(dev), - thread_k, thread_n, sms, marlin::max_par); + thread_k, thread_n, sms, marlin::max_par, use_fp32_reduce); } else { TORCH_CHECK(false, "gpt_marlin_gemm only supports bfloat16 and float16"); } diff --git a/tests/kernels/test_marlin_gemm.py b/tests/kernels/test_marlin_gemm.py index 42087fdcce959..bd35ef2eb2552 100644 --- a/tests/kernels/test_marlin_gemm.py +++ b/tests/kernels/test_marlin_gemm.py @@ -27,6 +27,7 @@ ACT_ORDER_OPTS = [False, True] K_FULL_OPTS = [False, True] +USE_FP32_REDUCE_OPTS = [False, True] MARLIN_K_CHUNKS = [128] MARLIN_N_CHUNKS = [64, 128, 256] @@ -175,6 +176,7 @@ def test_awq_marlin_repack(k_chunk, n_chunk, num_bits, group_size, @pytest.mark.parametrize("mnk_factors", MNK_FACTORS) @pytest.mark.parametrize("act_order", ACT_ORDER_OPTS) @pytest.mark.parametrize("is_k_full", K_FULL_OPTS) +@pytest.mark.parametrize("use_fp32_reduce", USE_FP32_REDUCE_OPTS) def test_gptq_marlin_gemm( k_chunk, n_chunk, @@ -183,6 +185,7 @@ def test_gptq_marlin_gemm( mnk_factors, act_order, is_k_full, + use_fp32_reduce, ): m_factor, n_factor, k_factor = mnk_factors @@ -222,8 +225,9 @@ def test_gptq_marlin_gemm( a_input.shape[0], b_weight.shape[1], a_input.shape[1], - is_k_full, + is_k_full=is_k_full, has_zp=False, + use_fp32_reduce=use_fp32_reduce, ) output_ref = torch.matmul(a_input, w_ref) @@ -365,12 +369,14 @@ def test_fp8_marlin_gemm( @pytest.mark.parametrize("num_bits", MARLIN_SUPPORTED_NUM_BITS) @pytest.mark.parametrize("group_size", MARLIN_SUPPORTED_GROUP_SIZES) @pytest.mark.parametrize("mnk_factors", 
MNK_FACTORS) +@pytest.mark.parametrize("use_fp32_reduce", USE_FP32_REDUCE_OPTS) def test_awq_marlin_gemm( k_chunk, n_chunk, num_bits, group_size, mnk_factors, + use_fp32_reduce, ): m_factor, n_factor, k_factor = mnk_factors @@ -407,8 +413,9 @@ def test_awq_marlin_gemm( a_input.shape[0], b_weight.shape[1], a_input.shape[1], - is_k_full, - has_zp, + is_k_full=is_k_full, + has_zp=has_zp, + use_fp32_reduce=use_fp32_reduce, ) output_ref = torch.matmul(a_input, w_ref) diff --git a/vllm/_custom_ops.py b/vllm/_custom_ops.py index 0186594656cc1..ad9f01be6ddd4 100644 --- a/vllm/_custom_ops.py +++ b/vllm/_custom_ops.py @@ -286,12 +286,12 @@ def gptq_marlin_gemm(a: torch.Tensor, b_q_weight: torch.Tensor, b_scales: torch.Tensor, b_zeros: torch.Tensor, g_idx: torch.Tensor, perm: torch.Tensor, workspace: torch.Tensor, num_bits: int, size_m: int, - size_n: int, size_k: int, is_k_full: bool, - has_zp: bool) -> torch.Tensor: + size_n: int, size_k: int, is_k_full: bool, has_zp: bool, + use_fp32_reduce: bool) -> torch.Tensor: return torch.ops._C.gptq_marlin_gemm(a, b_q_weight, b_scales, b_zeros, g_idx, perm, workspace, num_bits, size_m, size_n, size_k, is_k_full, - has_zp) + has_zp, use_fp32_reduce) # fp8 marlin diff --git a/vllm/model_executor/layers/quantization/utils/marlin_utils.py b/vllm/model_executor/layers/quantization/utils/marlin_utils.py index 25a7cd7bde653..b789ca20cadb3 100644 --- a/vllm/model_executor/layers/quantization/utils/marlin_utils.py +++ b/vllm/model_executor/layers/quantization/utils/marlin_utils.py @@ -16,6 +16,11 @@ MARLIN_SUPPORTED_NUM_BITS = [4, 8] MARLIN_SUPPORTED_GROUP_SIZES = [-1, 32, 64, 128] +# In case there is a performance issue with Marlin, the variable below can be +# changed to False, which allows Marlin to perform global reductions in fp16 +# precision (instead of fp32), and therefore, save on some memory movements. 
+USE_FP32_REDUCE_DEFAULT = True + def _check_marlin_supported(num_bits: int, group_size: int, is_sym: bool, min_capability: Optional[int], @@ -244,7 +249,8 @@ def apply_gptq_marlin_linear( output_size_per_partition: int, input_size_per_partition: int, is_k_full: bool, - bias: Optional[torch.Tensor] = None) -> torch.Tensor: + bias: Optional[torch.Tensor] = None, + use_fp32_reduce: bool = USE_FP32_REDUCE_DEFAULT) -> torch.Tensor: reshaped_x = input.reshape(-1, input.shape[-1]) out_shape = input.shape[:-1] + (output_size_per_partition, ) @@ -260,7 +266,8 @@ def apply_gptq_marlin_linear( size_n=output_size_per_partition, size_k=input_size_per_partition, is_k_full=is_k_full, - has_zp=False) + has_zp=False, + use_fp32_reduce=use_fp32_reduce) if bias is not None: output.add_(bias) # In-place add @@ -279,7 +286,8 @@ def apply_awq_marlin_linear( num_bits: int, output_size_per_partition: int, input_size_per_partition: int, - bias: Optional[torch.Tensor] = None) -> torch.Tensor: + bias: Optional[torch.Tensor] = None, + use_fp32_reduce: bool = USE_FP32_REDUCE_DEFAULT) -> torch.Tensor: reshaped_x = input.reshape(-1, input.shape[-1]) out_shape = input.shape[:-1] + (output_size_per_partition, ) @@ -295,7 +303,8 @@ def apply_awq_marlin_linear( size_n=output_size_per_partition, size_k=input_size_per_partition, is_k_full=True, - has_zp=True) + has_zp=True, + use_fp32_reduce=use_fp32_reduce) if bias is not None: output.add_(bias) # In-place add From b1366a953498fde9c5e7ab91915367ebc69008b2 Mon Sep 17 00:00:00 2001 From: Michael Goin Date: Sat, 27 Jul 2024 18:05:17 -0400 Subject: [PATCH 072/167] Add Nemotron to PP_SUPPORTED_MODELS (#6863) --- vllm/config.py | 1 + 1 file changed, 1 insertion(+) diff --git a/vllm/config.py b/vllm/config.py index 92fde449b43fd..e7b54e04b00d5 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -39,6 +39,7 @@ "Phi3ForCausalLM", "GPT2LMHeadModel", "MixtralForCausalLM", + "NemotronForCausalLM", ] From 3eeb148f467e3619e8890b1a5ebe86a173f91bc9 Mon Sep 17 00:00:00 2001 From: Elsa Granger <6374697+zeyugao@users.noreply.github.com> Date: Sun, 28 Jul 2024 23:13:49 +0800 Subject: [PATCH 073/167] [Misc] Pass cutlass_fp8_supported correctly in fbgemm_fp8 (#6871) --- .../layers/quantization/fbgemm_fp8.py | 19 +++++++++++-------- 1 file changed, 11 insertions(+), 8 deletions(-) diff --git a/vllm/model_executor/layers/quantization/fbgemm_fp8.py b/vllm/model_executor/layers/quantization/fbgemm_fp8.py index 5e8d1f1947421..e7c3859967c71 100644 --- a/vllm/model_executor/layers/quantization/fbgemm_fp8.py +++ b/vllm/model_executor/layers/quantization/fbgemm_fp8.py @@ -9,6 +9,7 @@ UnquantizedLinearMethod) from vllm.model_executor.layers.quantization.base_config import ( QuantizationConfig, QuantizeMethodBase) +from vllm.model_executor.layers.quantization.fp8 import cutlass_fp8_supported from vllm.model_executor.layers.quantization.utils.marlin_utils_fp8 import ( apply_fp8_marlin_linear, prepare_fp8_layer_for_marlin) from vllm.model_executor.layers.quantization.utils.quant_utils import ( @@ -72,6 +73,7 @@ class FBGEMMFp8LinearMethod(LinearMethodBase): def __init__(self, quant_config: FBGEMMFp8Config): self.quant_config = quant_config + self.cutlass_fp8_supported = cutlass_fp8_supported() def create_weights( self, @@ -139,11 +141,12 @@ def apply(self, size_k=layer.input_size_per_partition, bias=bias) - return apply_fp8_linear(input=x, - weight=layer.weight, - weight_scale=layer.weight_scale, - input_scale=None, - input_scale_ub=layer.input_scale_ub, - bias=bias, - cutlass_fp8_supported=True, - 
use_per_token_if_dynamic=True) + return apply_fp8_linear( + input=x, + weight=layer.weight, + weight_scale=layer.weight_scale, + input_scale=None, + input_scale_ub=layer.input_scale_ub, + bias=bias, + cutlass_fp8_supported=self.cutlass_fp8_supported, + use_per_token_if_dynamic=True) From 7cbd9ec7a9bfd4952ad522355b6bbb8e82b54fc9 Mon Sep 17 00:00:00 2001 From: Isotr0py <2037008807@qq.com> Date: Mon, 29 Jul 2024 18:16:30 +0800 Subject: [PATCH 074/167] [Model] Initialize support for InternVL2 series models (#6514) Co-authored-by: Roger Wang --- docs/source/models/supported_models.rst | 4 + examples/offline_inference_vision_language.py | 15 + examples/openai_vision_api_client.py | 2 + requirements-test.txt | 1 + tests/models/test_internvl.py | 201 ++++++++ vllm/entrypoints/chat_utils.py | 2 +- vllm/model_executor/models/__init__.py | 1 + vllm/model_executor/models/intern_vit.py | 270 ++++++++++ vllm/model_executor/models/internlm2.py | 10 +- vllm/model_executor/models/internvl.py | 471 ++++++++++++++++++ vllm/model_executor/models/qwen2.py | 10 +- vllm/transformers_utils/config.py | 8 +- vllm/transformers_utils/configs/__init__.py | 2 + vllm/transformers_utils/configs/internvl.py | 51 ++ 14 files changed, 1042 insertions(+), 6 deletions(-) create mode 100644 tests/models/test_internvl.py create mode 100644 vllm/model_executor/models/intern_vit.py create mode 100644 vllm/model_executor/models/internvl.py create mode 100644 vllm/transformers_utils/configs/internvl.py diff --git a/docs/source/models/supported_models.rst b/docs/source/models/supported_models.rst index 83c1b9c8bce86..4fe33e5ab5d80 100644 --- a/docs/source/models/supported_models.rst +++ b/docs/source/models/supported_models.rst @@ -200,6 +200,10 @@ Vision Language Models - Fuyu - :code:`adept/fuyu-8b` etc. - + * - :code:`InternVLChatModel` + - InternVL2 + - :code:`OpenGVLab/InternVL2-4B`, :code:`OpenGVLab/InternVL2-8B`, etc. + - * - :code:`LlavaForConditionalGeneration` - LLaVA-1.5 - :code:`llava-hf/llava-1.5-7b-hf`, :code:`llava-hf/llava-1.5-13b-hf`, etc. 
diff --git a/examples/offline_inference_vision_language.py b/examples/offline_inference_vision_language.py index 04ba1a96314c9..846246a2062a6 100644 --- a/examples/offline_inference_vision_language.py +++ b/examples/offline_inference_vision_language.py @@ -106,6 +106,20 @@ def run_minicpmv(question): return llm, prompt +# InternVL +def run_internvl(question): + # Generally, InternVL can use chatml template for conversation + TEMPLATE = "<|im_start|>User\n{prompt}<|im_end|>\n<|im_start|>Assistant\n" + prompt = f"\n{question}\n" + prompt = TEMPLATE.format(prompt=prompt) + llm = LLM( + model="OpenGVLab/InternVL2-4B", + trust_remote_code=True, + max_num_seqs=5, + ) + return llm, prompt + + # BLIP-2 def run_blip2(question): @@ -125,6 +139,7 @@ def run_blip2(question): "chameleon": run_chameleon, "minicpmv": run_minicpmv, "blip-2": run_blip2, + "internvl_chat": run_internvl, } diff --git a/examples/openai_vision_api_client.py b/examples/openai_vision_api_client.py index 2082c378e267c..be90394511f89 100644 --- a/examples/openai_vision_api_client.py +++ b/examples/openai_vision_api_client.py @@ -42,6 +42,7 @@ ], }], model=model, + max_tokens=64, ) result = chat_completion_from_url.choices[0].message.content @@ -78,6 +79,7 @@ def encode_image_base64_from_url(image_url: str) -> str: ], }], model=model, + max_tokens=64, ) result = chat_completion_from_base64.choices[0].message.content diff --git a/requirements-test.txt b/requirements-test.txt index a7604d2e1015e..9b88fcce3e842 100644 --- a/requirements-test.txt +++ b/requirements-test.txt @@ -16,6 +16,7 @@ ray sentence-transformers # required for embedding sparseml==1.8.0 # required for compressed-tensors compressed-tensors==0.4.0 # required for compressed-tensors +timm # required for internvl test # Benchmarking aiohttp diff --git a/tests/models/test_internvl.py b/tests/models/test_internvl.py new file mode 100644 index 0000000000000..66cb8dda248db --- /dev/null +++ b/tests/models/test_internvl.py @@ -0,0 +1,201 @@ +import types +from typing import List, Optional, Type + +import pytest +import torch +from huggingface_hub import snapshot_download +from PIL.Image import Image + +from vllm.model_executor.models.internvl import (IMG_CONTEXT, IMG_END, + IMG_START, + image_to_pixel_values) +from vllm.multimodal.utils import rescale_image_size +from vllm.utils import is_cpu + +from ..conftest import IMAGE_ASSETS, HfRunner, VllmRunner, _ImageAssets +from .utils import check_logprobs_close + +pytestmark = pytest.mark.vlm + +HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts({ + "stop_sign": + "<|im_start|>User\n\nWhat's the content in the center of the image?<|im_end|>\n<|im_start|>Assistant\n", # noqa: E501 + "cherry_blossom": + "<|im_start|>User\n\nWhat is the season?<|im_end|>\n<|im_start|>Assistant\n", # noqa: E501 +}) + +# we use snapshot_download to prevent conflicts between +# dynamic_module and trust_remote_code for hf_runner +models = [ + snapshot_download("OpenGVLab/InternVL2-1B"), + snapshot_download("OpenGVLab/InternVL2-2B"), + # snapshot_download("OpenGVLab/InternVL2-4B"), # broken +] + + +class InternVLProcessor: + """A simple processor for InternVL2 HF model which misses a processor.""" + + def __init__(self, hf_runner: HfRunner): + self.num_image_token = hf_runner.model.num_image_token + self.tokenizer = hf_runner.tokenizer + self.dtype = hf_runner.model.dtype + + def __call__(self, text: str, images: Image, **kwargs): + pixel_values = image_to_pixel_values(images).to(self.dtype) + num_patches_list = [pixel_values.shape[0]] + for num_patches in 
num_patches_list: + context_tokens = IMG_CONTEXT * self.num_image_token * num_patches + image_tokens = IMG_START + context_tokens + IMG_END + text = text.replace('', image_tokens, 1) + prompt = self.tokenizer(text, return_tensors="pt") + prompt.update({"pixel_values": pixel_values}) + return prompt + + +# adapted from https://huggingface.co/OpenGVLab/InternVL2-1B/blob/main/modeling_internvl_chat.py +def generate( + self, + pixel_values: torch.FloatTensor, + input_ids: torch.FloatTensor, + attention_mask: Optional[torch.LongTensor] = None, + **generate_kwargs, +) -> torch.LongTensor: + """Generate method for InternVL2 model without fixed use_cache.""" + assert self.img_context_token_id is not None + vit_embeds = self.extract_feature(pixel_values) + input_embeds = self.language_model.get_input_embeddings()(input_ids) + B, N, C = input_embeds.shape + input_embeds = input_embeds.reshape(B * N, C) + + input_ids = input_ids.reshape(B * N) + selected = (input_ids == self.img_context_token_id) + assert selected.sum() != 0 + input_embeds[selected] = vit_embeds.reshape(-1, C).to(input_embeds.device) + + input_embeds = input_embeds.reshape(B, N, C) + + outputs = self.language_model.generate( + inputs_embeds=input_embeds, + attention_mask=attention_mask, + **generate_kwargs, + ) + + return outputs + + +def run_test( + hf_runner: Type[HfRunner], + vllm_runner: Type[VllmRunner], + image_assets: _ImageAssets, + model: str, + *, + size_factors: List[float], + dtype: str, + max_tokens: int, + num_logprobs: int, + tensor_parallel_size: int, + distributed_executor_backend: Optional[str] = None, +): + """Inference result should be the same between hf and vllm. + + All the image fixtures for the test is under tests/images. + For huggingface runner, we provide the PIL images as input. + For vllm runner, we provide MultiModalDataDict objects + and corresponding vision language config as input. + Note, the text input is also adjusted to abide by vllm contract. + The text output is sanitized to be able to compare with hf. + """ + images = [asset.pil_image for asset in image_assets] + + inputs_per_image = [( + [prompt for _ in size_factors], + [rescale_image_size(image, factor) for factor in size_factors], + ) for image, prompt in zip(images, HF_IMAGE_PROMPTS)] + + # NOTE: take care of the order. run vLLM first, and then run HF. + # vLLM needs a fresh new process without cuda initialization. + # if we run HF first, the cuda initialization will be done and it + # will hurt multiprocessing backend with fork method (the default method). 
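The ordering note above reflects a general PyTorch constraint rather than anything specific to this test: once the parent process has initialized CUDA, fork-started subprocesses cannot re-initialize it, so process pools must fall back to "spawn". A minimal sketch of that failure mode, with no vLLM involved, is:

import multiprocessing as mp

import torch


def worker() -> None:
    # Raises "Cannot re-initialize CUDA in forked subprocess" when the parent
    # already touched CUDA and this process was started with "fork".
    if torch.cuda.is_available():
        torch.zeros(1, device="cuda")


if __name__ == "__main__":
    if torch.cuda.is_available():
        torch.zeros(1, device="cuda")      # parent initializes CUDA first
    ctx = mp.get_context("spawn")          # "fork" here would hit the error above
    p = ctx.Process(target=worker)
    p.start()
    p.join()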
+ + # max_model_len should be greater than image_feature_size + with vllm_runner(model, + max_model_len=4096, + dtype=dtype, + tensor_parallel_size=tensor_parallel_size, + distributed_executor_backend=distributed_executor_backend, + enforce_eager=True) as vllm_model: + vllm_outputs_per_image = [ + vllm_model.generate_greedy_logprobs(prompts, + max_tokens, + num_logprobs=num_logprobs, + images=images) + for prompts, images in inputs_per_image + ] + + with hf_runner(model, dtype=dtype) as hf_model: + img_context_token_id = hf_model.tokenizer.convert_tokens_to_ids( + "") + hf_model.model.img_context_token_id = img_context_token_id + hf_model.processor = InternVLProcessor(hf_model) + hf_model.model.get_output_embeddings = lambda: \ + hf_model.model.language_model.get_output_embeddings() + hf_model.model.generate = types.MethodType(generate, hf_model.model) + eos_token_id = hf_model.tokenizer.eos_token_id + hf_outputs_per_image = [ + hf_model.generate_greedy_logprobs_limit(prompts, + max_tokens, + num_logprobs=num_logprobs, + images=hf_images, + eos_token_id=eos_token_id) + for prompts, hf_images in inputs_per_image + ] + + for hf_outputs, vllm_outputs in zip(hf_outputs_per_image, + vllm_outputs_per_image): + # TODO: Check whether using original CLIPVisionModel can improve + # consistency against HF + check_logprobs_close( + outputs_0_lst=hf_outputs, + outputs_1_lst=vllm_outputs, + name_0="hf", + name_1="vllm", + ) + + +target_dtype = "half" +if is_cpu(): + target_dtype = "bfloat16" + + +@pytest.mark.parametrize("model", models) +@pytest.mark.parametrize( + "size_factors", + [ + # No image + [], + # Single-scale + [1.0], + # Single-scale, batched + [1.0, 1.0, 1.0], + # Multi-scale + [0.25, 0.5, 1.0], + ], +) +@pytest.mark.parametrize("dtype", [target_dtype]) +@pytest.mark.parametrize("max_tokens", [128]) +@pytest.mark.parametrize("num_logprobs", [5]) +@torch.inference_mode() +def test_models(hf_runner, vllm_runner, image_assets, model, size_factors, + dtype: str, max_tokens: int, num_logprobs: int) -> None: + run_test( + hf_runner, + vllm_runner, + image_assets, + model, + size_factors=size_factors, + dtype=dtype, + max_tokens=max_tokens, + num_logprobs=num_logprobs, + tensor_parallel_size=1, + ) diff --git a/vllm/entrypoints/chat_utils.py b/vllm/entrypoints/chat_utils.py index 1f6d77b828459..fbb7f70b55e16 100644 --- a/vllm/entrypoints/chat_utils.py +++ b/vllm/entrypoints/chat_utils.py @@ -107,7 +107,7 @@ def _image_token_str(model_config: ModelConfig, return None if model_type.startswith("llava"): return tokenizer.decode(model_config.hf_config.image_token_index) - if model_type == "chameleon": + if model_type in ("chameleon", "internvl_chat"): return "" raise TypeError(f"Unknown model type: {model_type}") diff --git a/vllm/model_executor/models/__init__.py b/vllm/model_executor/models/__init__.py index fe04c6db5fbc2..94c3cea98be7b 100644 --- a/vllm/model_executor/models/__init__.py +++ b/vllm/model_executor/models/__init__.py @@ -37,6 +37,7 @@ "GPTNeoXForCausalLM": ("gpt_neox", "GPTNeoXForCausalLM"), "InternLMForCausalLM": ("llama", "LlamaForCausalLM"), "InternLM2ForCausalLM": ("internlm2", "InternLM2ForCausalLM"), + "InternVLChatModel": ("internvl", "InternVLChatModel"), "JAISLMHeadModel": ("jais", "JAISLMHeadModel"), "LlamaForCausalLM": ("llama", "LlamaForCausalLM"), "LlavaForConditionalGeneration": diff --git a/vllm/model_executor/models/intern_vit.py b/vllm/model_executor/models/intern_vit.py new file mode 100644 index 0000000000000..86d0930d80126 --- /dev/null +++ 
b/vllm/model_executor/models/intern_vit.py @@ -0,0 +1,270 @@ +# adapted from https://huggingface.co/OpenGVLab/InternVL2-4B/blob/main/modeling_intern_vit.py +# -------------------------------------------------------- +# InternVL +# Copyright (c) 2023 OpenGVLab +# Licensed under The MIT License [see LICENSE for details] +# -------------------------------------------------------- +from typing import Optional + +import torch +import torch.nn as nn +import torch.nn.functional as F +from transformers import PretrainedConfig + +from vllm.model_executor.layers.activation import get_act_fn +from vllm.model_executor.layers.layernorm import RMSNorm +from vllm.model_executor.layers.linear import (ColumnParallelLinear, + RowParallelLinear) +from vllm.model_executor.layers.quantization import QuantizationConfig + +NORM2FN = { + 'rms_norm': RMSNorm, + 'layer_norm': nn.LayerNorm, +} + + +class InternVisionEmbeddings(nn.Module): + + def __init__(self, config: PretrainedConfig): + super().__init__() + self.config = config + self.embed_dim = config.hidden_size + self.image_size = config.image_size + self.patch_size = config.patch_size + + self.class_embedding = nn.Parameter(torch.randn(1, 1, self.embed_dim)) + + self.patch_embedding = nn.Conv2d(in_channels=3, + out_channels=self.embed_dim, + kernel_size=self.patch_size, + stride=self.patch_size) + + self.num_patches = (self.image_size // self.patch_size)**2 + self.num_positions = self.num_patches + 1 + + self.position_embedding = nn.Parameter( + torch.randn(1, self.num_positions, self.embed_dim)) + + def _get_pos_embed(self, pos_embed, H, W): + target_dtype = pos_embed.dtype + pos_embed = pos_embed.float().reshape( + 1, self.image_size // self.patch_size, + self.image_size // self.patch_size, -1).permute(0, 3, 1, 2) + pos_embed = F.interpolate(pos_embed, + size=(H, W), + mode='bicubic', + align_corners=False) + pos_embed = pos_embed.reshape(1, -1, H * W).permute(0, 2, + 1).to(target_dtype) + return pos_embed + + def forward(self, pixel_values: torch.FloatTensor) -> torch.Tensor: + target_dtype = self.patch_embedding.weight.dtype + patch_embeds = self.patch_embedding(pixel_values.to( + target_dtype)) # shape = [*, channel, width, height] + batch_size, _, height, width = patch_embeds.shape + patch_embeds = patch_embeds.flatten(2).transpose(1, 2) + class_embeds = self.class_embedding.expand(batch_size, 1, + -1).to(target_dtype) + embeddings = torch.cat([class_embeds, patch_embeds], dim=1) + position_embedding = torch.cat([ + self.position_embedding[:, :1, :], + self._get_pos_embed(self.position_embedding[:, 1:, :], height, + width) + ], + dim=1) + embeddings = embeddings + position_embedding.to(target_dtype) + return embeddings + + +class InternAttention(nn.Module): + """Multi-headed attention from 'Attention Is All You Need' paper""" + + def __init__(self, config: PretrainedConfig): + super().__init__() + self.config = config + self.embed_dim = config.hidden_size + self.num_heads = config.num_attention_heads + self.head_dim = self.embed_dim // self.num_heads + if self.head_dim * self.num_heads != self.embed_dim: + raise ValueError( + f'embed_dim must be divisible by num_heads ' + f'(got `embed_dim`: {self.embed_dim} and `num_heads`:' + f' {self.num_heads}).') + + self.scale = self.head_dim**-0.5 + self.qkv = nn.Linear(self.embed_dim, + 3 * self.embed_dim, + bias=config.qkv_bias) + + self.qk_normalization = config.qk_normalization + + if self.qk_normalization: + self.q_norm = RMSNorm(self.embed_dim, eps=config.layer_norm_eps) + self.k_norm = 
RMSNorm(self.embed_dim, eps=config.layer_norm_eps) + + self.proj = nn.Linear(self.embed_dim, self.embed_dim) + + def forward(self, x): + B, N, C = x.shape + qkv = self.qkv(x).reshape(B, N, 3, self.num_heads, + C // self.num_heads).permute(2, 0, 3, 1, 4) + q, k, v = qkv.unbind(0) + + if self.qk_normalization: + B_, H_, N_, D_ = q.shape + q = self.q_norm(q.transpose(1, 2).flatten(-2, -1)).view( + B_, N_, H_, D_).transpose(1, 2) + k = self.k_norm(k.transpose(1, 2).flatten(-2, -1)).view( + B_, N_, H_, D_).transpose(1, 2) + + x = F.scaled_dot_product_attention(q, k, v, scale=self.scale) + x = x.transpose(1, 2).reshape(B, N, C) + + x = self.proj(x) + return x + + +class InternMLP(nn.Module): + + def __init__(self, + config: PretrainedConfig, + quant_config: Optional[QuantizationConfig] = None): + super().__init__() + self.config = config + self.activation_fn = get_act_fn(config.hidden_act) + self.fc1 = ColumnParallelLinear(config.hidden_size, + config.intermediate_size, + bias=True, + quant_config=quant_config) + self.fc2 = RowParallelLinear(config.intermediate_size, + config.hidden_size, + bias=True, + quant_config=quant_config) + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + hidden_states, _ = self.fc1(hidden_states) + hidden_states = self.activation_fn(hidden_states) + hidden_states, _ = self.fc2(hidden_states) + + return hidden_states + + +class InternVisionEncoderLayer(nn.Module): + + def __init__(self, + config: PretrainedConfig, + quant_config: Optional[QuantizationConfig] = None): + super().__init__() + self.embed_dim = config.hidden_size + self.intermediate_size = config.intermediate_size + self.norm_type = config.norm_type + + self.attn = InternAttention(config) + self.mlp = InternMLP(config, quant_config=quant_config) + self.norm1 = NORM2FN[self.norm_type](self.embed_dim, + eps=config.layer_norm_eps) + self.norm2 = NORM2FN[self.norm_type](self.embed_dim, + eps=config.layer_norm_eps) + + self.ls1 = nn.Parameter(config.initializer_factor * + torch.ones(self.embed_dim)) + self.ls2 = nn.Parameter(config.initializer_factor * + torch.ones(self.embed_dim)) + + def forward( + self, + hidden_states: torch.Tensor, + ): + hidden_states = hidden_states + self.attn( + self.norm1(hidden_states)) * self.ls1 + + hidden_states = hidden_states + self.mlp( + self.norm2(hidden_states)) * self.ls2 + + return hidden_states + + +class InternVisionEncoder(nn.Module): + + def __init__(self, + config: PretrainedConfig, + quant_config: Optional[QuantizationConfig] = None, + num_hidden_layers_override: Optional[int] = None): + super().__init__() + self.config = config + + if num_hidden_layers_override is None: + num_hidden_layers = config.num_hidden_layers + else: + num_hidden_layers = num_hidden_layers_override + self.layers = nn.ModuleList([ + InternVisionEncoderLayer(config=config, quant_config=quant_config) + for _ in range(num_hidden_layers) + ]) + + def forward(self, inputs_embeds: torch.Tensor): + + hidden_states = inputs_embeds + for encoder_layer in self.layers: + hidden_states = encoder_layer(hidden_states) + + return hidden_states + + +class InternVisionModel(nn.Module): + + def __init__(self, + config: PretrainedConfig, + quant_config: Optional[QuantizationConfig] = None, + num_hidden_layers_override: Optional[int] = None): + super().__init__() + self.config = config + + self.embeddings = InternVisionEmbeddings(config) + self.encoder = InternVisionEncoder( + config=config, + quant_config=quant_config, + num_hidden_layers_override=num_hidden_layers_override) + + def 
resize_pos_embeddings(self, old_size, new_size, patch_size): + pos_emb = self.embeddings.position_embedding + _, num_positions, embed_dim = pos_emb.shape + cls_emb = pos_emb[:, :1, :] + pos_emb = pos_emb[:, 1:, :].reshape(1, old_size // patch_size, + old_size // patch_size, + -1).permute(0, 3, 1, 2) + pos_emb = F.interpolate(pos_emb.float(), + size=new_size // patch_size, + mode='bicubic', + align_corners=False) + pos_emb = pos_emb.to(cls_emb.dtype).reshape(1, embed_dim, + -1).permute(0, 2, 1) + pos_emb = torch.cat([cls_emb, pos_emb], dim=1) + self.embeddings.position_embedding = nn.Parameter(pos_emb) + self.embeddings.image_size = new_size + + def get_input_embeddings(self): + return self.embeddings + + def forward( + self, + pixel_values: Optional[torch.Tensor] = None, + pixel_embeds: Optional[torch.Tensor] = None, + ) -> torch.FloatTensor: + if pixel_values is None and pixel_embeds is None: + raise ValueError( + 'You have to specify pixel_values or pixel_embeds') + + if pixel_embeds is not None: + hidden_states = pixel_embeds + elif pixel_values is not None: + if pixel_values.ndim == 4: + hidden_states = self.embeddings(pixel_values) + else: + raise ValueError( + f'wrong pixel_values size: {pixel_values.shape}') + + encoder_outputs = self.encoder(inputs_embeds=hidden_states) + + return encoder_outputs diff --git a/vllm/model_executor/models/internlm2.py b/vllm/model_executor/models/internlm2.py index 22132f40fc5e6..745fbf99a902d 100644 --- a/vllm/model_executor/models/internlm2.py +++ b/vllm/model_executor/models/internlm2.py @@ -219,14 +219,22 @@ def __init__( ]) self.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps) + def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor: + return self.tok_embeddings(input_ids) + def forward( self, input_ids: torch.Tensor, positions: torch.Tensor, kv_caches: List[torch.Tensor], attn_metadata: AttentionMetadata, + intermediate_tensors: IntermediateTensors = None, + inputs_embeds: Optional[torch.Tensor] = None, ) -> torch.Tensor: - hidden_states = self.tok_embeddings(input_ids) + if inputs_embeds is not None: + hidden_states = inputs_embeds + else: + hidden_states = self.tok_embeddings(input_ids) residual = None for i in range(len(self.layers)): layer = self.layers[i] diff --git a/vllm/model_executor/models/internvl.py b/vllm/model_executor/models/internvl.py new file mode 100644 index 0000000000000..f64c78c15f8ee --- /dev/null +++ b/vllm/model_executor/models/internvl.py @@ -0,0 +1,471 @@ +# adapted from https://huggingface.co/OpenGVLab/InternVL2-4B/blob/main/modeling_internvl_chat.py +# -------------------------------------------------------- +# InternVL +# Copyright (c) 2023 OpenGVLab +# Licensed under The MIT License [see LICENSE for details] +# -------------------------------------------------------- +from typing import Iterable, List, Literal, Optional, Tuple, TypedDict, Union + +import torch +import torch.nn as nn +import torchvision.transforms as T +from PIL import Image +from transformers import PretrainedConfig + +from vllm.attention import AttentionMetadata +from vllm.config import CacheConfig, MultiModalConfig +from vllm.inputs import INPUT_REGISTRY, InputContext, LLMInputs +from vllm.model_executor.layers.quantization import QuantizationConfig +from vllm.model_executor.model_loader.weight_utils import default_weight_loader +from vllm.model_executor.models import ModelRegistry +from vllm.model_executor.models.intern_vit import InternVisionModel +from vllm.model_executor.sampling_metadata import SamplingMetadata 
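The internlm2.py hunk above (and a matching qwen2.py hunk later in this patch) lets a multimodal wrapper hand the language model precomputed embeddings instead of token ids, so that vision features can be spliced in at image-placeholder positions. The following is a minimal, self-contained sketch of that pattern, not the vLLM call sites themselves; ToyDecoder, the vocabulary size, and image token id 99 are all illustrative assumptions, and merge_vision_embeddings here is a simplified stand-in for the helper imported from .utils.

import torch
import torch.nn as nn


def merge_vision_embeddings(input_ids: torch.Tensor,
                            inputs_embeds: torch.Tensor,
                            vision_embeds: torch.Tensor,
                            image_token_id: int) -> torch.Tensor:
    # Overwrite the embeddings of image placeholder tokens with the projected
    # vision features; every other position keeps its text embedding.
    mask = input_ids == image_token_id
    inputs_embeds[mask] = vision_embeds.reshape(-1, vision_embeds.shape[-1])
    return inputs_embeds


class ToyDecoder(nn.Module):
    def __init__(self, vocab_size: int = 128, hidden_size: int = 16):
        super().__init__()
        self.embed_tokens = nn.Embedding(vocab_size, hidden_size)

    def forward(self, input_ids, inputs_embeds=None):
        # Same fallback as the InternLM2Model / Qwen2Model changes in this
        # patch: only embed input_ids when no precomputed embeddings arrive.
        if inputs_embeds is not None:
            hidden_states = inputs_embeds
        else:
            hidden_states = self.embed_tokens(input_ids)
        return hidden_states


# Usage: three text tokens around two image placeholders (id 99 is assumed).
with torch.no_grad():
    decoder = ToyDecoder()
    input_ids = torch.tensor([1, 99, 99, 2, 3])
    embeds = decoder.embed_tokens(input_ids)
    vision = torch.zeros(2, 16)   # stand-in for projected ViT features
    embeds = merge_vision_embeddings(input_ids, embeds, vision,
                                     image_token_id=99)
    hidden = decoder(input_ids=None, inputs_embeds=embeds)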
+from vllm.multimodal import MULTIMODAL_REGISTRY, BatchedTensors +from vllm.multimodal.base import MultiModalInputs +from vllm.multimodal.image import cached_get_tokenizer +from vllm.sequence import IntermediateTensors, SamplerOutput + +from .clip import (dummy_image_for_clip, dummy_seq_data_for_clip, + get_clip_num_patches) +from .interfaces import SupportsVision +from .utils import merge_vision_embeddings + +IMG_START = '' +IMG_END = '' +IMG_CONTEXT = '' + +IMAGENET_MEAN = (0.485, 0.456, 0.406) +IMAGENET_STD = (0.229, 0.224, 0.225) + +MAX_IMAGE_FEATURE_SIZE_WIDTH = 3000 +MAX_IMAGE_FEATURE_SIZE_HEIGHT = 500 + + +class InternVLImagePixelInputs(TypedDict): + type: Literal["pixel_values"] + data: BatchedTensors + """ + Shape: `(batch_size, 1 + num_patches, num_channels, height, width)` + + Note that `num_patches` may be different for each batch, in which case + the data is passed as a list instead of a batched tensor. + """ + + +# copied from https://huggingface.co/OpenGVLab/InternVL2-1B +def build_transform(input_size): + MEAN, STD = IMAGENET_MEAN, IMAGENET_STD + transform = T.Compose([ + T.Lambda(lambda img: img.convert('RGB') if img.mode != 'RGB' else img), + T.Resize((input_size, input_size), + interpolation=T.InterpolationMode.BICUBIC), + T.ToTensor(), + T.Normalize(mean=MEAN, std=STD) + ]) + return transform + + +# copied from https://huggingface.co/OpenGVLab/InternVL2-1B +def find_closest_aspect_ratio(aspect_ratio, target_ratios, width, height, + image_size): + best_ratio_diff = float('inf') + best_ratio = (1, 1) + area = width * height + for ratio in target_ratios: + target_aspect_ratio = ratio[0] / ratio[1] + ratio_diff = abs(aspect_ratio - target_aspect_ratio) + if ratio_diff < best_ratio_diff: + best_ratio_diff = ratio_diff + best_ratio = ratio + elif ratio_diff == best_ratio_diff: + if area > 0.5 * image_size * image_size * ratio[0] * ratio[1]: + best_ratio = ratio + return best_ratio + + +def calculate_num_blocks(orig_width: int, + orig_height: int, + min_num=1, + max_num=6, + image_size=448): + aspect_ratio = orig_width / orig_height + + # calculate the existing image aspect ratio + target_ratios = set((i, j) for n in range(min_num, max_num + 1) + for i in range(1, n + 1) for j in range(1, n + 1) + if i * j <= max_num and i * j >= min_num) + target_ratios = sorted(target_ratios, key=lambda x: x[0] * x[1]) + + # find the closest aspect ratio to the target + target_aspect_ratio = find_closest_aspect_ratio(aspect_ratio, + target_ratios, orig_width, + orig_height, image_size) + + # calculate the target width and height + target_width = image_size * target_aspect_ratio[0] + target_height = image_size * target_aspect_ratio[1] + blocks = target_aspect_ratio[0] * target_aspect_ratio[1] + return blocks, target_width, target_height + + +# adapted from https://huggingface.co/OpenGVLab/InternVL2-1B +def dynamic_preprocess(image, + min_num=1, + max_num=6, + image_size=448, + use_thumbnail=False): + orig_width, orig_height = image.size + + blocks, target_width, target_height = calculate_num_blocks( + orig_width, orig_height, min_num, max_num, image_size) + # resize the image + resized_img = image.resize((target_width, target_height)) + processed_images = [] + for i in range(blocks): + box = ((i % (target_width // image_size)) * image_size, + (i // (target_width // image_size)) * image_size, + ((i % (target_width // image_size)) + 1) * image_size, + ((i // (target_width // image_size)) + 1) * image_size) + # split the image + split_img = resized_img.crop(box) + 
processed_images.append(split_img) + assert len(processed_images) == blocks + if use_thumbnail and len(processed_images) != 1: + thumbnail_img = image.resize((image_size, image_size)) + processed_images.append(thumbnail_img) + return processed_images + + +# adapted from https://huggingface.co/OpenGVLab/InternVL2-1B +def image_to_pixel_values(image: Image.Image, input_size=448, max_num=6): + transform = build_transform(input_size=input_size) + images = dynamic_preprocess(image, + image_size=input_size, + use_thumbnail=True, + max_num=max_num) + pixel_values = [transform(image) for image in images] + pixel_values = torch.stack(pixel_values) + return pixel_values + + +def get_internvl_num_patches(image_size: int, patch_size: int, + downsample_ratio: float): + return int( + get_clip_num_patches(image_size=image_size, patch_size=patch_size) * + (downsample_ratio**2)) + + +def get_max_internvl_image_tokens(ctx: InputContext): + hf_config = ctx.get_hf_config(PretrainedConfig) + vision_config = hf_config.vision_config + image_size = vision_config.image_size + patch_size = vision_config.patch_size + downsample_ratio = hf_config.downsample_ratio + num_patches = get_internvl_num_patches(image_size, patch_size, + downsample_ratio) + return num_patches * 7 + + +def input_processor_for_internvl(ctx: InputContext, llm_inputs: LLMInputs): + multi_modal_data = llm_inputs.get("multi_modal_data") + if multi_modal_data is None or "image" not in multi_modal_data: + return llm_inputs + + model_config = ctx.model_config + hf_config = ctx.get_hf_config(PretrainedConfig) + vision_config = hf_config.vision_config + + image_data = multi_modal_data["image"] + if isinstance(image_data, Image.Image): + width, height = image_data.size + num_blocks, _, _ = calculate_num_blocks(width, height) + elif isinstance(image_data, torch.Tensor): + raise NotImplementedError("Embeddings input is not supported yet") + else: + raise TypeError(f"Invalid image type: {type(image_data)}") + + image_size = vision_config.image_size + patch_size = vision_config.patch_size + downsample_ratio = hf_config.downsample_ratio + num_patches = get_internvl_num_patches(image_size, patch_size, + downsample_ratio) + + tokenizer = cached_get_tokenizer(model_config.tokenizer, + trust_remote_code=True) + + prompt = llm_inputs["prompt"] + prompt_token_ids = llm_inputs["prompt_token_ids"] + if prompt is None: + prompt = tokenizer.decode(prompt_token_ids) + image_prompt = IMG_START + IMG_CONTEXT * (num_blocks + + 1) * num_patches + IMG_END + new_prompt = prompt.replace('', image_prompt, 1) + new_prompt_token_ids = tokenizer.encode(new_prompt) + + return LLMInputs(prompt=prompt, + prompt_token_ids=new_prompt_token_ids, + multi_modal_data=multi_modal_data) + + +def input_mapper_for_internvl(ctx: InputContext, data: object): + if isinstance(data, Image.Image): + data = image_to_pixel_values(data) + model_config = ctx.model_config + tokenizer = cached_get_tokenizer(model_config.tokenizer, + trust_remote_code=True) + image_token_id = tokenizer.encode(IMG_CONTEXT, + add_special_tokens=False, + return_tensors="pt")[0] + + return MultiModalInputs({ + "pixel_values": data, + "image_token_id": image_token_id + }) + + +def dummy_data_for_internvl(ctx: InputContext, seq_len: int): + + image_feature_size = get_max_internvl_image_tokens(ctx) + model_config = ctx.model_config + hf_config = ctx.get_hf_config(PretrainedConfig) + vision_config = hf_config.vision_config + tokenizer = cached_get_tokenizer(model_config.tokenizer, + trust_remote_code=True) + + seq_data = 
dummy_seq_data_for_clip( + vision_config, + seq_len, + image_token_id=tokenizer.encode(IMG_CONTEXT, + add_special_tokens=False)[0], + image_feature_size_override=image_feature_size, + ) + mm_data = dummy_image_for_clip( + vision_config, + image_width_override=MAX_IMAGE_FEATURE_SIZE_WIDTH, + image_height_override=MAX_IMAGE_FEATURE_SIZE_HEIGHT, + ) + + return seq_data, mm_data + + +@MULTIMODAL_REGISTRY.register_image_input_mapper(input_mapper_for_internvl) +@MULTIMODAL_REGISTRY.register_max_image_tokens(get_max_internvl_image_tokens) +@INPUT_REGISTRY.register_dummy_data(dummy_data_for_internvl) +@INPUT_REGISTRY.register_input_processor(input_processor_for_internvl) +class InternVLChatModel(nn.Module, SupportsVision): + + def __init__(self, + config: PretrainedConfig, + multimodal_config: MultiModalConfig, + cache_config: Optional[CacheConfig] = None, + quant_config: Optional[QuantizationConfig] = None) -> None: + super().__init__() + + self.config = config + self.multimodal_config = multimodal_config + + image_size = config.force_image_size or config.vision_config.image_size + patch_size = config.vision_config.patch_size + self.patch_size = patch_size + self.select_layer = config.select_layer + self.num_image_token = int( + (image_size // patch_size)**2 * (config.downsample_ratio**2)) + self.downsample_ratio = config.downsample_ratio + self.ps_version = config.ps_version + + vision_feature_layer = self.select_layer + if vision_feature_layer < 0: + num_hidden_layers = config.vision_config.num_hidden_layers \ + + vision_feature_layer + 1 + else: + num_hidden_layers = vision_feature_layer + 1 + self.vision_model = InternVisionModel( + config.vision_config, num_hidden_layers_override=num_hidden_layers) + + llm_class = ModelRegistry.load_model_cls( + config.text_config.architectures[0]) + self.language_model = llm_class(config.text_config, cache_config, + quant_config) + + vit_hidden_size = config.vision_config.hidden_size + llm_hidden_size = config.text_config.hidden_size + + self.mlp1 = nn.Sequential( + nn.LayerNorm(vit_hidden_size * int(1 / self.downsample_ratio)**2), + nn.Linear(vit_hidden_size * int(1 / self.downsample_ratio)**2, + llm_hidden_size), nn.GELU(), + nn.Linear(llm_hidden_size, llm_hidden_size)) + + self.img_context_token_id = None + + def pixel_shuffle(self, x, scale_factor=0.5): + n, w, h, c = x.size() + # N, W, H, C --> N, W, H * scale, C // scale + x = x.view(n, w, int(h * scale_factor), int(c / scale_factor)) + # N, W, H * scale, C // scale --> N, H * scale, W, C // scale + x = x.permute(0, 2, 1, 3).contiguous() + x = x.view(n, int(h * scale_factor), int(w * scale_factor), + int(c / (scale_factor * scale_factor))) + if self.ps_version == 'v1': + pass + else: + x = x.permute(0, 2, 1, 3).contiguous() + return x + + def extract_feature(self, pixel_values): + vit_embeds = self.vision_model(pixel_values=pixel_values) + vit_embeds = vit_embeds[:, 1:, :] + + h = w = int(vit_embeds.shape[1]**0.5) + vit_embeds = vit_embeds.reshape(vit_embeds.shape[0], h, w, -1) + vit_embeds = self.pixel_shuffle(vit_embeds, + scale_factor=self.downsample_ratio) + vit_embeds = vit_embeds.reshape(vit_embeds.shape[0], -1, + vit_embeds.shape[-1]) + vit_embeds = self.mlp1(vit_embeds) + return vit_embeds + + def _validate_image_sizes(self, data: torch.Tensor) -> torch.Tensor: + if list(data.shape[1:]) != [2]: + raise ValueError( + f"The expected image sizes shape is batch dimension plus " + f"{[2]}. 
You supplied {data.shape}.") + + return data + + def _validate_pixel_values( + self, data: Union[torch.Tensor, List[torch.Tensor]] + ) -> Union[torch.Tensor, List[torch.Tensor]]: + + h = w = self.config.vision_config.image_size + expected_dims = (3, h, w) + + def _validate_shape(d: torch.Tensor): + actual_dims = tuple(d.shape) + + if actual_dims != expected_dims: + expected_expr = ("num_patches", *map(str, expected_dims)) + raise ValueError( + "The expected shape of pixel values in each batch element " + f"is {expected_expr}. You supplied {tuple(d.shape)}.") + + for d in data: + _validate_shape(d) + + return data + + def _parse_and_validate_image_input( + self, **kwargs: object) -> Optional[InternVLImagePixelInputs]: + pixel_values = kwargs.pop("pixel_values", None) + image_token_id = kwargs.pop("image_token_id", None) + + if pixel_values is None: + return None + + self.img_context_token_id = image_token_id[0] + + if not isinstance(pixel_values, (torch.Tensor, list)): + raise ValueError("Incorrect type of pixel values. " + f"Got type: {type(pixel_values)}") + + return InternVLImagePixelInputs( + type="pixel_values", + data=self._validate_pixel_values(pixel_values), + ) + + def forward( + self, + input_ids: torch.Tensor, + positions: torch.Tensor, + kv_caches: List[torch.Tensor], + attn_metadata: AttentionMetadata, + intermediate_tensors: Optional[IntermediateTensors] = None, + **kwargs: object, + ) -> SamplerOutput: + image_input = self._parse_and_validate_image_input(**kwargs) + if image_input is not None: + inputs_embeds = self.language_model.model.get_input_embeddings( + input_ids) + vit_embeds = self.extract_feature(image_input["data"]) + inputs_embeds = merge_vision_embeddings(input_ids, inputs_embeds, + vit_embeds, + self.img_context_token_id) + input_ids = None + else: + inputs_embeds = None + + hidden_states = self.language_model.model(input_ids, + positions, + kv_caches, + attn_metadata, + None, + inputs_embeds=inputs_embeds) + return hidden_states + + def compute_logits(self, hidden_states: torch.Tensor, + sampling_metadata: SamplingMetadata) -> torch.Tensor: + return self.language_model.compute_logits(hidden_states, + sampling_metadata) + + def sample( + self, + logits: torch.Tensor, + sampling_metadata: SamplingMetadata, + ) -> Optional[SamplerOutput]: + return self.language_model.sample(logits, sampling_metadata) + + def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): + stacked_params_mapping = [ + # (param_name, shard_name, shard_id) + (".qkv_proj", ".q_proj", "q"), + (".qkv_proj", ".k_proj", "k"), + (".qkv_proj", ".v_proj", "v"), + (".gate_up_proj", ".gate_proj", 0), + (".gate_up_proj", ".up_proj", 1), + (".gate_up_proj", ".w1", 0), + (".gate_up_proj", ".w3", 1), + ] + params_dict = dict(self.named_parameters()) + for name, loaded_weight in weights: + if "rotary_emb.inv_freq" in name: + continue + if self.config.text_config.tie_word_embeddings \ + and "lm_head.weight" in name: + continue + for (param_name, weight_name, shard_id) in stacked_params_mapping: + # We only do sharding for language model + # and not vision model for now. + if "vision_embed_tokens" in name and self.vision_embed_tokens: + continue + if weight_name not in name: + continue + param = params_dict[name.replace(weight_name, param_name)] + weight_loader = param.weight_loader + weight_loader(param, loaded_weight, shard_id) + break + else: + # Skip loading extra bias for GPTQ models. 
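The stacked_params_mapping loop above folds per-shard checkpoint weights (q_proj/k_proj/v_proj, gate_proj/up_proj) into vLLM's fused parameters, and the fallback branch just below additionally splits InternLM2's fused wqkv tensor into q/k/v shards. Here is a rough sketch of both steps; the layer name and the dimensions (8 query heads, 2 KV heads, hidden_size 64) are made up for illustration and are not taken from a real checkpoint.

import torch

# Illustrative InternLM2-style shapes (assumptions only).
num_heads, num_kv_heads, hidden_size = 8, 2, 64
head_dim = hidden_size // num_heads      # 8
kv_groups = num_heads // num_kv_heads    # 4 query heads per KV head

# 1) Name remapping: a checkpoint shard name is rewritten to the fused vLLM
#    parameter name before its weight_loader is called with a shard id.
stacked_params_mapping = [
    (".qkv_proj", ".q_proj", "q"),
    (".qkv_proj", ".k_proj", "k"),
    (".qkv_proj", ".v_proj", "v"),
]
name = "language_model.model.layers.0.self_attn.q_proj.weight"
for param_name, weight_name, shard_id in stacked_params_mapping:
    if weight_name in name:
        fused_name = name.replace(weight_name, param_name)
        print(fused_name, shard_id)  # ...self_attn.qkv_proj.weight q
        break

# 2) wqkv splitting: InternLM2 stores q, k and v interleaved per KV group, so
#    the loader reshapes to (-1, kv_groups + 2, head_dim, in_features) and
#    splits along dim=1 into kv_groups query slices plus one k and one v.
wqkv = torch.randn((kv_groups + 2) * num_kv_heads * head_dim, hidden_size)
wqkv = wqkv.view(-1, 2 + kv_groups, head_dim, wqkv.shape[-1])
wq, wk, wv = torch.split(wqkv, [kv_groups, 1, 1], dim=1)
wq = wq.reshape(-1, wq.shape[-1])  # (num_heads * head_dim, hidden) = (64, 64)
wk = wk.reshape(-1, wk.shape[-1])  # (num_kv_heads * head_dim, hidden) = (16, 64)
wv = wv.reshape(-1, wv.shape[-1])  # (16, 64)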
+ if name.endswith(".bias") and name not in params_dict: + continue + param = params_dict[name] + if "wqkv" in name: + config = self.config.text_config + kv_groups = (config.num_attention_heads // + config.num_key_value_heads) + head_dim = config.hidden_size // config.num_attention_heads + loaded_weight = loaded_weight.view(-1, 2 + kv_groups, + head_dim, + loaded_weight.shape[-1]) + wq, wk, wv = torch.split(loaded_weight, [kv_groups, 1, 1], + dim=1) + wq = wq.reshape(-1, wq.shape[-1]) + wk = wk.reshape(-1, wk.shape[-1]) + wv = wv.reshape(-1, wv.shape[-1]) + weight_loader = param.weight_loader + weight_loader(param, wq, 'q') + weight_loader(param, wk, 'k') + weight_loader(param, wv, 'v') + continue + weight_loader = getattr(param, "weight_loader", + default_weight_loader) + weight_loader(param, loaded_weight) diff --git a/vllm/model_executor/models/qwen2.py b/vllm/model_executor/models/qwen2.py index e9aa4416eded4..3deb3d8840cc4 100644 --- a/vllm/model_executor/models/qwen2.py +++ b/vllm/model_executor/models/qwen2.py @@ -243,14 +243,22 @@ def __init__( ]) self.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps) + def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor: + return self.embed_tokens(input_ids) + def forward( self, input_ids: torch.Tensor, positions: torch.Tensor, kv_caches: List[torch.Tensor], attn_metadata: AttentionMetadata, + intermediate_tensors: Optional[IntermediateTensors] = None, + inputs_embeds: Optional[torch.Tensor] = None, ) -> torch.Tensor: - hidden_states = self.embed_tokens(input_ids) + if inputs_embeds is not None: + hidden_states = inputs_embeds + else: + hidden_states = self.embed_tokens(input_ids) residual = None for i in range(len(self.layers)): layer = self.layers[i] diff --git a/vllm/transformers_utils/config.py b/vllm/transformers_utils/config.py index 3ba2e01985598..3d13631b9b2b6 100644 --- a/vllm/transformers_utils/config.py +++ b/vllm/transformers_utils/config.py @@ -6,9 +6,10 @@ from vllm.envs import VLLM_USE_MODELSCOPE from vllm.logger import init_logger from vllm.transformers_utils.configs import (ChatGLMConfig, DbrxConfig, - JAISConfig, MedusaConfig, - MLPSpeculatorConfig, MPTConfig, - NemotronConfig, RWConfig) + InternVLChatConfig, JAISConfig, + MedusaConfig, MLPSpeculatorConfig, + MPTConfig, NemotronConfig, + RWConfig) if VLLM_USE_MODELSCOPE: from modelscope import AutoConfig @@ -26,6 +27,7 @@ "jais": JAISConfig, "mlp_speculator": MLPSpeculatorConfig, "medusa": MedusaConfig, + "internvl_chat": InternVLChatConfig, "nemotron": NemotronConfig, } diff --git a/vllm/transformers_utils/configs/__init__.py b/vllm/transformers_utils/configs/__init__.py index 1750950b3c38b..5ccacd4a4c40a 100644 --- a/vllm/transformers_utils/configs/__init__.py +++ b/vllm/transformers_utils/configs/__init__.py @@ -4,6 +4,7 @@ # tiiuae/falcon-7b(-instruct) models. Newer Falcon models will use the # `FalconConfig` class from the official HuggingFace transformers library. 
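Stepping back to the internvl.py helpers earlier in this patch: each tile contributes get_internvl_num_patches(...) tokens after pixel shuffling, and input_processor_for_internvl expands the prompt with IMG_CONTEXT repeated once per token of every tile, including the thumbnail. A short worked example of that token budget follows; downsample_ratio=0.5 and max_dynamic_patch=6 come from the InternVLChatConfig defaults below, while the 448-pixel tiles with 14-pixel patches are typical InternVL2 vision-tower values and are an assumption here.

image_size, patch_size, downsample_ratio = 448, 14, 0.5

clip_patches = (image_size // patch_size) ** 2              # 32 * 32 = 1024
tokens_per_tile = int(clip_patches * downsample_ratio ** 2)  # 1024 / 4 = 256

num_blocks = 6               # dynamic tiles chosen by calculate_num_blocks
num_tiles = num_blocks + 1   # plus one thumbnail (use_thumbnail=True)
image_tokens = num_tiles * tokens_per_tile                   # 7 * 256 = 1792

# Matches get_max_internvl_image_tokens, which returns num_patches * 7.
assert image_tokens == tokens_per_tile * 7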
from vllm.transformers_utils.configs.falcon import RWConfig +from vllm.transformers_utils.configs.internvl import InternVLChatConfig from vllm.transformers_utils.configs.jais import JAISConfig from vllm.transformers_utils.configs.medusa import MedusaConfig from vllm.transformers_utils.configs.mlp_speculator import MLPSpeculatorConfig @@ -15,6 +16,7 @@ "DbrxConfig", "MPTConfig", "RWConfig", + "InternVLChatConfig", "JAISConfig", "MedusaConfig", "MLPSpeculatorConfig", diff --git a/vllm/transformers_utils/configs/internvl.py b/vllm/transformers_utils/configs/internvl.py new file mode 100644 index 0000000000000..ac2492317aa36 --- /dev/null +++ b/vllm/transformers_utils/configs/internvl.py @@ -0,0 +1,51 @@ +# Adapted from +# https://huggingface.co/OpenGVLab/InternVL2-1B/blob/main/configuration_internvl_chat.py +# -------------------------------------------------------- +# InternVL +# Copyright (c) 2024 OpenGVLab +# Licensed under The MIT License [see LICENSE for details] +# -------------------------------------------------------- +from transformers.configuration_utils import PretrainedConfig + + +class InternVLChatConfig(PretrainedConfig): + model_type = 'internvl_chat' + is_composition = True + + def __init__(self, + vision_config=None, + llm_config=None, + use_backbone_lora=0, + use_llm_lora=0, + select_layer=-1, + force_image_size=None, + downsample_ratio=0.5, + template=None, + dynamic_image_size=False, + use_thumbnail=False, + ps_version='v1', + min_dynamic_patch=1, + max_dynamic_patch=6, + **kwargs): + super().__init__(**kwargs) + + if vision_config is None: + vision_config = {} + + if llm_config is None: + llm_config = {} + + self.vision_config = PretrainedConfig(**vision_config) + self.text_config = PretrainedConfig(**llm_config) + + self.use_backbone_lora = use_backbone_lora + self.use_llm_lora = use_llm_lora + self.select_layer = select_layer + self.force_image_size = force_image_size + self.downsample_ratio = downsample_ratio + self.template = template + self.dynamic_image_size = dynamic_image_size + self.use_thumbnail = use_thumbnail + self.ps_version = ps_version # pixel shuffle version + self.min_dynamic_patch = min_dynamic_patch + self.max_dynamic_patch = max_dynamic_patch From 766435e660a786933392eb8ef0a873bc38cf0c8b Mon Sep 17 00:00:00 2001 From: Varun Sundar Rabindranath Date: Mon, 29 Jul 2024 11:42:35 -0400 Subject: [PATCH 075/167] [Kernel] Tuned FP8 Kernels for Ada Lovelace (#6677) Co-authored-by: Varun Sundar Rabindranath --- .../cutlass_benchmarks/w8a8_benchmarks.py | 2 +- .../cutlass_w8a8/scaled_mm_c2x.cu | 520 ++---------------- .../cutlass_w8a8/scaled_mm_c2x.cuh | 340 ++++++++++++ .../scaled_mm_c2x_sm80_dispatch.cuh | 139 +++++ .../scaled_mm_c2x_sm89_dispatch.cuh | 362 ++++++++++++ tests/kernels/test_cutlass.py | 4 +- 6 files changed, 877 insertions(+), 490 deletions(-) create mode 100644 csrc/quantization/cutlass_w8a8/scaled_mm_c2x.cuh create mode 100644 csrc/quantization/cutlass_w8a8/scaled_mm_c2x_sm80_dispatch.cuh create mode 100644 csrc/quantization/cutlass_w8a8/scaled_mm_c2x_sm89_dispatch.cuh diff --git a/benchmarks/cutlass_benchmarks/w8a8_benchmarks.py b/benchmarks/cutlass_benchmarks/w8a8_benchmarks.py index 234c2c8a1074c..70247e94e63cf 100644 --- a/benchmarks/cutlass_benchmarks/w8a8_benchmarks.py +++ b/benchmarks/cutlass_benchmarks/w8a8_benchmarks.py @@ -13,7 +13,7 @@ from vllm import _custom_ops as ops from vllm.utils import FlexibleArgumentParser -DEFAULT_MODELS = list(WEIGHT_SHAPES.keys())[1:] +DEFAULT_MODELS = list(WEIGHT_SHAPES.keys()) DEFAULT_BATCH_SIZES = 
[1, 16, 32, 64, 128, 256, 512] DEFAULT_TP_SIZES = [1] diff --git a/csrc/quantization/cutlass_w8a8/scaled_mm_c2x.cu b/csrc/quantization/cutlass_w8a8/scaled_mm_c2x.cu index 6ce25c5ac897b..d26c43de522c9 100644 --- a/csrc/quantization/cutlass_w8a8/scaled_mm_c2x.cu +++ b/csrc/quantization/cutlass_w8a8/scaled_mm_c2x.cu @@ -1,470 +1,16 @@ #include #include - -#include - -// clang-format will break include orders -// clang-format off -#include "cute/tensor.hpp" -#include "cute/atom/mma_atom.hpp" -#include "cutlass/numeric_types.h" - -#include "cutlass/util/device_memory.h" - #include "cutlass/cutlass.h" -#include "cutlass/gemm_coord.h" -#include "cutlass/arch/mma_sm75.h" -#include "cutlass/arch/arch.h" -#include "cutlass/arch/mma.h" -#include "cutlass/gemm/device/gemm.h" -#include "cutlass/gemm/device/gemm_universal_adapter.h" -#include "cutlass/epilogue/threadblock/fusion/visitors.hpp" -#include "cutlass/gemm/kernel/default_gemm_universal_with_visitor.h" - -#include "broadcast_load_epilogue_c2x.hpp" -#include "common.hpp" -// clang-format on - -using namespace cute; +#include "scaled_mm_c2x.cuh" +#include "scaled_mm_c2x_sm80_dispatch.cuh" +#include "scaled_mm_c2x_sm89_dispatch.cuh" /* This file defines quantized GEMM operations using the CUTLASS 2.x API, for NVIDIA GPUs with SM versions prior to sm90 (Hopper). - - Epilogue functions can be defined to post-process the output before it is - written to GPU memory. - Epilogues must contain a public type named EVTCompute of type Sm80EVT, - as well as a static prepare_args function that constructs an - EVTCompute::Arguments struct. */ -namespace { - -// Wrappers for the GEMM kernel that is used to guard against compilation on -// architectures that will never use the kernel. The purpose of this is to -// reduce the size of the compiled binary. -// __CUDA_ARCH__ is not defined in host code, so this lets us smuggle the ifdef -// into code that will be executed on the device where it is defined. -template -struct enable_sm75_to_sm80 : Kernel { - template - CUTLASS_DEVICE static void invoke(Args&&... args) { -#if defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 750 && __CUDA_ARCH__ < 800 - Kernel::invoke(std::forward(args)...); -#endif - } -}; - -template -struct enable_sm80_to_sm89 : Kernel { - template - CUTLASS_DEVICE static void invoke(Args&&... args) { -#if defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 800 && __CUDA_ARCH__ < 890 - Kernel::invoke(std::forward(args)...); -#endif - } -}; - -template -struct enable_sm89_to_sm90 : Kernel { - template - CUTLASS_DEVICE static void invoke(Args&&... args) { -#if defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 890 && __CUDA_ARCH__ < 900 - Kernel::invoke(std::forward(args)...); -#endif - } -}; - -/* - * This class provides the common ScaleA and ScaleB descriptors for the - * ScaledEpilogue and ScaledEpilogueBias classes. - */ -template -struct ScaledEpilogueBase { - protected: - using Accum = cutlass::epilogue::threadblock::VisitorAccFetch; - - using ScaleA = cutlass::epilogue::threadblock::VisitorColOrScalarBroadcast< - OutputTileThreadMap, float, Stride, Int<0>, Int<0>>>; - - using ScaleB = cutlass::epilogue::threadblock::VisitorRowOrScalarBroadcast< - OutputTileThreadMap, float, Stride, Int<1>, Int<0>>>; -}; - -/* - This epilogue function defines a quantized GEMM operation similar to - torch._scaled_mm. - - A and B may be both either int8 or fp8_e4m3. A can be quantized per-tensor or - per-row. B can be quantized per-tensor or per-column. - Any combination of per-tensor and per-row or column is supported. 
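The comment block being moved into scaled_mm_c2x.cuh here describes the epilogue semantics: D = (a_scales * A) (b_scales * B), with per-tensor or per-row/per-column scales broadcast numpy-style. The snippet below is a torch-level numerical reference of that computation for illustration only; it mirrors what the CUTLASS epilogue fuses into the GEMM, not how the kernel itself is written, and the shapes are arbitrary.

import torch

def scaled_mm_reference(a: torch.Tensor,        # (M, K) quantized values
                        b: torch.Tensor,        # (K, N) quantized values
                        a_scales: torch.Tensor,  # scalar or (M, 1)
                        b_scales: torch.Tensor,  # scalar or (1, N)
                        out_dtype=torch.bfloat16) -> torch.Tensor:
    # Dequantize with broadcasting, matmul in float32, then narrow to the
    # requested output dtype. The real kernel applies the scales in the
    # epilogue instead of materializing dequantized operands.
    a_fp = a.to(torch.float32) * a_scales.to(torch.float32)
    b_fp = b.to(torch.float32) * b_scales.to(torch.float32)
    return (a_fp @ b_fp).to(out_dtype)

# Per-row scaling of A and per-column scaling of B on an 8x16x4 problem.
a = torch.randint(-128, 127, (8, 16), dtype=torch.int8)
b = torch.randint(-128, 127, (16, 4), dtype=torch.int8)
a_scales = torch.rand(8, 1)
b_scales = torch.rand(1, 4)
d = scaled_mm_reference(a, b, a_scales, b_scales)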
- A and B must have symmetric quantization (zero point == 0). - - So the GEMM operation is D = (a_scales * A) (b_scales * B), where the - scales are applied elementwise with numpy-style broadcasting. - - ScaleA and ScaleB define the epilogue functions that apply the scales for - the A and B operands respectively. These scales may be either per-tensor or - per row or column. -*/ -template -struct ScaledEpilogue - : private ScaledEpilogueBase { - private: - using SUPER = ScaledEpilogueBase; - using Accum = typename SUPER::Accum; - using ScaleA = typename SUPER::ScaleA; - using ScaleB = typename SUPER::ScaleB; - - using Compute0 = cutlass::epilogue::threadblock::VisitorCompute< - cutlass::multiplies, float, float, - cutlass::FloatRoundStyle::round_to_nearest>; - - using EVTCompute0 = - cutlass::epilogue::threadblock::Sm80EVT; - - using Compute1 = cutlass::epilogue::threadblock::VisitorCompute< - cutlass::multiplies, ElementD, float, - cutlass::FloatRoundStyle::round_to_nearest>; - - public: - using EVTCompute = - cutlass::epilogue::threadblock::Sm80EVT; - using ArgumentType = typename EVTCompute::Arguments; - - static ArgumentType prepare_args(torch::Tensor const& a_scales, - torch::Tensor const& b_scales) { - using ScaleAArgs = typename ScaleA::Arguments; - using ScaleBArgs = typename ScaleB::Arguments; - - ScaleBArgs b_args{b_scales.data_ptr(), b_scales.numel() != 1, {}}; - ScaleAArgs a_args{a_scales.data_ptr(), a_scales.numel() != 1, {}}; - - typename EVTCompute0::Arguments evt0_compute_args{b_args}; - - typename EVTCompute::Arguments evt_compute_args{a_args, evt0_compute_args}; - return evt_compute_args; - } -}; - -template -struct ScaledEpilogueBias - : private ScaledEpilogueBase { - private: - using SUPER = ScaledEpilogueBase; - using Accum = typename SUPER::Accum; - using ScaleA = typename SUPER::ScaleA; - using ScaleB = typename SUPER::ScaleB; - - using Compute0 = cutlass::epilogue::threadblock::VisitorCompute< - cutlass::multiplies, float, float, - cutlass::FloatRoundStyle::round_to_nearest>; - - using EVTCompute0 = - cutlass::epilogue::threadblock::Sm80EVT; - - using Compute1 = cutlass::epilogue::threadblock::VisitorCompute< - cutlass::multiply_add, ElementD, float, - cutlass::FloatRoundStyle::round_to_nearest>; - - using Bias = cutlass::epilogue::threadblock::VisitorRowBroadcast< - OutputTileThreadMap, ElementD, Stride, Int<1>, Int<0>>>; - - public: - using EVTCompute = cutlass::epilogue::threadblock::Sm80EVT; - using ArgumentType = typename EVTCompute::Arguments; - - static ArgumentType prepare_args(torch::Tensor const& a_scales, - torch::Tensor const& b_scales, - torch::Tensor const& bias) { - using ScaleAArgs = typename ScaleA::Arguments; - using ScaleBArgs = typename ScaleB::Arguments; - using BiasArgs = typename Bias::Arguments; - - ScaleBArgs b_args{b_scales.data_ptr(), b_scales.numel() != 1, {}}; - ScaleAArgs a_args{a_scales.data_ptr(), a_scales.numel() != 1, {}}; - BiasArgs bias_args{static_cast(bias.data_ptr()), {}}; - - typename EVTCompute0::Arguments evt0_compute_args{b_args}; - - typename EVTCompute::Arguments evt_compute_args{a_args, evt0_compute_args, - bias_args}; - return evt_compute_args; - } -}; - -template typename ArchGuard, - typename ElementAB_, typename ElementD_, - template typename Epilogue_, typename TileShape, - typename WarpShape, typename InstructionShape, int32_t MainLoopStages> -struct cutlass_2x_gemm { - using ElementAB = ElementAB_; - using ElementD = ElementD_; - - using ElementAcc = - typename std::conditional, int32_t, - float>::type; - - using 
Operator = - typename std::conditional, - cutlass::arch::OpMultiplyAddSaturate, - cutlass::arch::OpMultiplyAdd>::type; - - using OutputTileThreadMap = - cutlass::epilogue::threadblock::OutputTileThreadLayout< - TileShape, WarpShape, float, 4, 1 /* epilogue stages */ - >; - - using Epilogue = Epilogue_; - using EVTCompute = typename Epilogue::EVTCompute; - - using D = cutlass::epilogue::threadblock::VisitorAuxStore< - OutputTileThreadMap, ElementD, cutlass::FloatRoundStyle::round_to_nearest, - Stride, Int<0>>>; - - using EVTD = cutlass::epilogue::threadblock::Sm80EVT; - - // clang-format off - using RowMajor = typename cutlass::layout::RowMajor; - using ColumnMajor = typename cutlass::layout::ColumnMajor; - using KernelType = - ArchGuard::GemmKernel>; - // clang-format on - - using Op = cutlass::gemm::device::GemmUniversalAdapter; -}; - -template -void cutlass_gemm_caller(torch::Tensor& out, torch::Tensor const& a, - torch::Tensor const& b, - EpilogueArgs&&... epilogue_params) { - using ElementAB = typename Gemm::ElementAB; - using ElementD = typename Gemm::ElementD; - - int32_t m = a.size(0); - int32_t n = b.size(1); - int32_t k = a.size(1); - cutlass::gemm::GemmCoord problem_size{m, n, k}; - - int64_t lda = a.stride(0); - int64_t ldb = b.stride(1); - int64_t ldc = out.stride(0); - - using StrideC = Stride, Int<0>>; - StrideC c_stride{ldc, Int<1>{}, Int<0>{}}; - - auto a_ptr = static_cast(a.data_ptr()); - auto b_ptr = static_cast(b.data_ptr()); - auto c_ptr = static_cast(out.data_ptr()); - - typename Gemm::D::Arguments d_args{c_ptr, c_stride}; - - using Epilogue = typename Gemm::Epilogue; - auto evt_args = - Epilogue::prepare_args(std::forward(epilogue_params)...); - - typename Gemm::EVTD::Arguments epilogue_args{ - evt_args, - d_args, - }; - - typename Gemm::Op::Arguments args{ - cutlass::gemm::GemmUniversalMode::kGemmSplitKParallel, // universal mode - problem_size, // problem size - 1, // batch count - epilogue_args, - a_ptr, - b_ptr, - nullptr, - nullptr, - 0, - 0, - 0, - 0, - lda, - ldb, - ldc, - ldc}; - - // Launch the CUTLASS GEMM kernel. - typename Gemm::Op gemm_op; - size_t workspace_size = gemm_op.get_workspace_size(args); - cutlass::device_memory::allocation workspace(workspace_size); - - auto stream = at::cuda::getCurrentCUDAStream(a.get_device()); - - CUTLASS_CHECK(gemm_op.can_implement(args)); - cutlass::Status status = gemm_op(args, workspace.get(), stream); - CUTLASS_CHECK(status); -} - -template -void fallback_cutlass_gemm_caller(torch::Tensor& out, torch::Tensor const& a, - torch::Tensor const& b, - EpilogueArgs&&... args) { - // In some cases, the GPU isn't able to accommodate the - // shared memory requirements of the Gemm. In such cases, use - // the FallbackGemm instead. 
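fallback_cutlass_gemm_caller (whose body is being moved into scaled_mm_c2x.cuh in the deleted lines here) picks between the tuned kernel and a smaller-footprint fallback by comparing each kernel's SharedStorage size against the device's opt-in shared-memory-per-block limit. Below is a plain-Python sketch of that decision with the sizes passed in as parameters rather than queried from CUDA; the 122880/61440-byte figures come from the sm80 config comments below, while the per-device opt-in limits in the usage lines are approximate values supplied as assumptions.

def choose_gemm(gemm_smem_bytes: int,
                fallback_smem_bytes: int,
                max_smem_per_block_optin: int) -> str:
    # Prefer the tuned config whenever it fits in the opt-in shared-memory
    # budget; otherwise require (and use) the fallback config.
    if gemm_smem_bytes <= max_smem_per_block_optin:
        return "primary"
    assert fallback_smem_bytes <= max_smem_per_block_optin, \
        "fallback kernel must always fit"
    return "fallback"

# sm80_config_M64 needs 122880 bytes of shared memory, which exceeds the
# roughly 99 KB opt-in limit of smaller Ampere parts, so they fall back to
# the M32 config (61440 bytes); A100-class parts (about 163 KB) do not.
print(choose_gemm(122880, 61440, 101376))   # fallback
print(choose_gemm(122880, 61440, 166912))   # primary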
- static const int max_shared_mem_per_block_opt_in = - get_cuda_max_shared_memory_per_block_opt_in(0); - - size_t const gemm_shared_mem_size = - sizeof(typename Gemm::KernelType::SharedStorage); - size_t const fallback_gemm_shared_mem_size = - sizeof(typename FallbackGemm::KernelType::SharedStorage); - - if (gemm_shared_mem_size <= max_shared_mem_per_block_opt_in) { - return cutlass_gemm_caller(out, a, b, - std::forward(args)...); - } else { - TORCH_CHECK(fallback_gemm_shared_mem_size <= - max_shared_mem_per_block_opt_in); - return cutlass_gemm_caller( - out, a, b, std::forward(args)...); - } -} - -template typename Epilogue> -struct sm80_config_default { - // This config is used in 2 cases, - // - M in (128, inf) - // - M in (64, 128] and N >= 8192 - // Shared Memory required by this Gemm - 81920 bytes - static_assert(std::is_same()); - using TileShape = typename cutlass::gemm::GemmShape<128, 128, 64>; - using WarpShape = typename cutlass::gemm::GemmShape<64, 64, 64>; - using InstructionShape = typename cutlass::gemm::GemmShape<16, 8, 32>; - using Cutlass2xGemm = - cutlass_2x_gemm; -}; - -template typename Epilogue> -struct sm80_config_M64 { - // This config is used in 2 cases, - // - M in (32, 64] - // - M in (64, 128] and N < 8192 - // Shared Memory required by this Gemm - 122880 bytes - static_assert(std::is_same()); - using TileShape = typename cutlass::gemm::GemmShape<64, 128, 128>; - using WarpShape = typename cutlass::gemm::GemmShape<64, 64, 64>; - using InstructionShape = typename cutlass::gemm::GemmShape<16, 8, 32>; - using Cutlass2xGemm = - cutlass_2x_gemm; -}; - -template typename Epilogue> -struct sm80_config_M32 { - // M in (16, 32] - // Shared Memory required by this Gemm - 61440 bytes - static_assert(std::is_same()); - using TileShape = typename cutlass::gemm::GemmShape<32, 64, 128>; - using WarpShape = typename cutlass::gemm::GemmShape<32, 64, 64>; - using InstructionShape = typename cutlass::gemm::GemmShape<16, 8, 32>; - using Cutlass2xGemm = - cutlass_2x_gemm; -}; - -template typename Epilogue> -struct sm80_config_M16 { - // M in [1, 16] - // Shared Memory required by this Gemm - 51200 bytes - static_assert(std::is_same()); - using TileShape = typename cutlass::gemm::GemmShape<16, 64, 128>; - using WarpShape = typename cutlass::gemm::GemmShape<16, 64, 64>; - using InstructionShape = typename cutlass::gemm::GemmShape<16, 8, 32>; - using Cutlass2xGemm = - cutlass_2x_gemm; -}; - -} // namespace - -template typename Epilogue, - typename... EpilogueArgs> -void cutlass_gemm_sm80_dispatch(torch::Tensor& out, torch::Tensor const& a, - torch::Tensor const& b, - EpilogueArgs&&... args) { - static_assert(std::is_same()); - TORCH_CHECK(a.dtype() == torch::kInt8); - TORCH_CHECK(b.dtype() == torch::kInt8); - - using Cutlass2xGemmDefault = - typename sm80_config_default::Cutlass2xGemm; - using Cutlass2xGemmM128BigN = - typename sm80_config_default::Cutlass2xGemm; - using Cutlass2xGemmM128SmallN = - typename sm80_config_M64::Cutlass2xGemm; - using Cutlass2xGemmM64 = - typename sm80_config_M64::Cutlass2xGemm; - using Cutlass2xGemmM32 = - typename sm80_config_M32::Cutlass2xGemm; - using Cutlass2xGemmM16 = - typename sm80_config_M16::Cutlass2xGemm; - - // Due to shared memory requirements, some Gemms may fail to run on some - // GPUs. As the name indicates, the Fallback Gemm is used as an alternative - // in such cases. - // sm80_config_M16 has the least shared-memory requirement. 
However, - // based on some profiling, we select sm80_config_M32 as a better alternative - // performance wise. - using FallbackGemm = - typename sm80_config_M32::Cutlass2xGemm; - - uint32_t const m = a.size(0); - uint32_t const mp2 = - std::max(static_cast(16), next_pow_2(m)); // next power of 2 - if (mp2 <= 16) { - // M in [1, 16] - return fallback_cutlass_gemm_caller( - out, a, b, std::forward(args)...); - } else if (mp2 <= 32) { - // M in (16, 32] - return fallback_cutlass_gemm_caller( - out, a, b, std::forward(args)...); - } else if (mp2 <= 64) { - // M in (32, 64] - return fallback_cutlass_gemm_caller( - out, a, b, std::forward(args)...); - } else if (mp2 <= 128) { - // M in (64, 128] - uint32_t const n = out.size(1); - bool const small_n = n < 8192; - if (small_n) { - return fallback_cutlass_gemm_caller( - out, a, b, std::forward(args)...); - } else { - return fallback_cutlass_gemm_caller( - out, a, b, std::forward(args)...); - } - } else { - // M in (128, inf) - return fallback_cutlass_gemm_caller( - out, a, b, std::forward(args)...); - } -} - template