From 8b6c5de6633579af55fdd61f1ec6f6cf6dc686a0 Mon Sep 17 00:00:00 2001 From: Muralidhar Andoorveedu Date: Wed, 31 Jul 2024 06:40:57 +0000 Subject: [PATCH 01/10] Support Qwen model Signed-off-by: Muralidhar Andoorveedu --- docs/source/serving/distributed_serving.rst | 2 +- vllm/config.py | 618 +++++++++----------- vllm/model_executor/models/qwen.py | 53 +- vllm/model_executor/models/utils.py | 2 +- 4 files changed, 327 insertions(+), 348 deletions(-) diff --git a/docs/source/serving/distributed_serving.rst b/docs/source/serving/distributed_serving.rst index 5f14fd2b0ee0a..ea36579b79323 100644 --- a/docs/source/serving/distributed_serving.rst +++ b/docs/source/serving/distributed_serving.rst @@ -50,7 +50,7 @@ You can also additionally specify :code:`--pipeline-parallel-size` to enable pip $ --pipeline-parallel-size 2 .. note:: - Pipeline parallel is a beta feature. It is only supported for online serving as well as LLaMa, GPT2, and Mixtral style models. + Pipeline parallel is a beta feature. It is only supported for online serving as well as LLaMa, GPT2, Mixtral, and Qwen style models. 
Multi-Node Inference and Serving -------------------------------- diff --git a/vllm/config.py b/vllm/config.py index e065744592378..8f4bdc81e3ec0 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -11,17 +11,24 @@ from vllm.model_executor.models import ModelRegistry from vllm.tracing import is_otel_installed from vllm.transformers_utils.config import get_config, get_hf_text_config -from vllm.utils import (cuda_device_count_stateless, get_cpu_memory, is_cpu, - is_hip, is_neuron, is_openvino, is_tpu, is_xpu, - print_warning_once) +from vllm.utils import ( + cuda_device_count_stateless, + get_cpu_memory, + is_cpu, + is_hip, + is_neuron, + is_openvino, + is_tpu, + is_xpu, + print_warning_once, +) if TYPE_CHECKING: from ray.util.placement_group import PlacementGroup from vllm.executor.executor_base import ExecutorBase from vllm.model_executor.model_loader.loader import BaseModelLoader - from vllm.transformers_utils.tokenizer_group.base_tokenizer_group import ( - BaseTokenizerGroup) + from vllm.transformers_utils.tokenizer_group.base_tokenizer_group import BaseTokenizerGroup logger = init_logger(__name__) @@ -50,8 +57,8 @@ class ModelConfig: Args: model: Name or path of the huggingface model to use. - It is also used as the content for `model_name` tag in metrics - output when `served_model_name` is not specified. + It is also used as the content for `model_name` tag in metrics + output when `served_model_name` is not specified. tokenizer: Name or path of the huggingface tokenizer to use. tokenizer_mode: Tokenizer mode. "auto" will use the fast tokenizer if available, and "slow" will always use the slow tokenizer. @@ -98,8 +105,8 @@ class ModelConfig: skip_tokenizer_init: If true, skip initialization of tokenizer and detokenizer. served_model_name: The model name used in metrics tag `model_name`, - matches the model name exposed via the APIs. If multiple model - names provided, the first name will be used. If not specified, + matches the model name exposed via the APIs. 
If multiple model + names provided, the first name will be used. If not specified, the model name will be the same as `model`. """ @@ -146,35 +153,36 @@ def __init__( self.quantization_param_path = quantization_param_path self.enforce_eager = enforce_eager if max_context_len_to_capture is not None: - raise ValueError("`max_context_len_to_capture` is deprecated. " - "Use `max_seq_len_to_capture` instead.") + raise ValueError("`max_context_len_to_capture` is deprecated. " "Use `max_seq_len_to_capture` instead.") self.max_seq_len_to_capture = max_seq_len_to_capture self.max_logprobs = max_logprobs self.disable_sliding_window = disable_sliding_window self.skip_tokenizer_init = skip_tokenizer_init - self.hf_config = get_config(self.model, trust_remote_code, revision, - code_revision, rope_scaling, rope_theta) + self.hf_config = get_config(self.model, trust_remote_code, revision, code_revision, rope_scaling, rope_theta) self.hf_text_config = get_hf_text_config(self.hf_config) self.dtype = _get_and_verify_dtype(self.hf_text_config, dtype) - if (not self.disable_sliding_window - and self.hf_text_config.model_type == "gemma2" - and self.hf_text_config.sliding_window is not None): + if ( + not self.disable_sliding_window + and self.hf_text_config.model_type == "gemma2" + and self.hf_text_config.sliding_window is not None + ): print_warning_once( "Gemma 2 uses sliding window attention for every odd layer, " "which is currently not supported by vLLM. Disabling sliding " "window and capping the max length to the sliding window size " - f"({self.hf_text_config.sliding_window}).") + f"({self.hf_text_config.sliding_window})." 
+ ) self.disable_sliding_window = True self.max_model_len = _get_and_verify_max_len( hf_config=self.hf_text_config, max_model_len=max_model_len, disable_sliding_window=self.disable_sliding_window, - sliding_window_len=self.get_hf_config_sliding_window()) - self.served_model_name = get_served_model_name(model, - served_model_name) + sliding_window_len=self.get_hf_config_sliding_window(), + ) + self.served_model_name = get_served_model_name(model, served_model_name) self.multimodal_config = multimodal_config if not self.skip_tokenizer_init: @@ -186,15 +194,12 @@ def __init__( def _verify_tokenizer_mode(self) -> None: tokenizer_mode = self.tokenizer_mode.lower() if tokenizer_mode not in ["auto", "slow"]: - raise ValueError( - f"Unknown tokenizer mode: {self.tokenizer_mode}. Must be " - "either 'auto' or 'slow'.") + raise ValueError(f"Unknown tokenizer mode: {self.tokenizer_mode}. Must be " "either 'auto' or 'slow'.") self.tokenizer_mode = tokenizer_mode def _verify_embedding_mode(self) -> None: architectures = getattr(self.hf_config, "architectures", []) - self.embedding_mode = any( - ModelRegistry.is_embedding_model(arch) for arch in architectures) + self.embedding_mode = any(ModelRegistry.is_embedding_model(arch) for arch in architectures) def _parse_quant_hf_config(self): quant_cfg = getattr(self.hf_config, "quantization_config", None) @@ -207,8 +212,14 @@ def _verify_quantization(self) -> None: supported_quantization = [*QUANTIZATION_METHODS] rocm_supported_quantization = ["gptq", "squeezellm"] optimized_quantization_methods = [ - "fp8", "marlin", "gptq_marlin_24", "gptq_marlin", "awq_marlin", - "fbgemm_fp8", "compressed_tensors", "compressed-tensors" + "fp8", + "marlin", + "gptq_marlin_24", + "gptq_marlin", + "awq_marlin", + "fbgemm_fp8", + "compressed_tensors", + "compressed-tensors", ] if self.quantization is not None: self.quantization = self.quantization.lower() @@ -221,8 +232,7 @@ def _verify_quantization(self) -> None: # Detect which checkpoint is it for _, 
method in QUANTIZATION_METHODS.items(): - quantization_override = method.override_quantization_method( - quant_cfg, self.quantization) + quantization_override = method.override_quantization_method(quant_cfg, self.quantization) if quantization_override: quant_method = quantization_override self.quantization = quantization_override @@ -236,60 +246,53 @@ def _verify_quantization(self) -> None: "Quantization method specified in the model config " f"({quant_method}) does not match the quantization " f"method specified in the `quantization` argument " - f"({self.quantization}).") + f"({self.quantization})." + ) if self.quantization is not None: if self.quantization not in supported_quantization: raise ValueError( - f"Unknown quantization method: {self.quantization}. Must " - f"be one of {supported_quantization}.") - if is_hip( - ) and self.quantization not in rocm_supported_quantization: - raise ValueError( - f"{self.quantization} quantization is currently not " - f"supported in ROCm.") + f"Unknown quantization method: {self.quantization}. Must " f"be one of {supported_quantization}." + ) + if is_hip() and self.quantization not in rocm_supported_quantization: + raise ValueError(f"{self.quantization} quantization is currently not " f"supported in ROCm.") if self.quantization not in optimized_quantization_methods: logger.warning( "%s quantization is not fully " "optimized yet. 
The speed can be slower than " - "non-quantized models.", self.quantization) + "non-quantized models.", + self.quantization, + ) def _verify_cuda_graph(self) -> None: if self.max_seq_len_to_capture is None: self.max_seq_len_to_capture = self.max_model_len - self.max_seq_len_to_capture = min(self.max_seq_len_to_capture, - self.max_model_len) + self.max_seq_len_to_capture = min(self.max_seq_len_to_capture, self.max_model_len) - def verify_with_parallel_config( - self, - parallel_config: "ParallelConfig", - ) -> None: - total_num_attention_heads = getattr(self.hf_text_config, - "num_attention_heads", 0) + def verify_with_parallel_config(self, parallel_config: "ParallelConfig") -> None: + total_num_attention_heads = getattr(self.hf_text_config, "num_attention_heads", 0) tensor_parallel_size = parallel_config.tensor_parallel_size if total_num_attention_heads % tensor_parallel_size != 0: raise ValueError( f"Total number of attention heads ({total_num_attention_heads})" " must be divisible by tensor parallel size " - f"({tensor_parallel_size}).") + f"({tensor_parallel_size})." + ) pipeline_parallel_size = parallel_config.pipeline_parallel_size architectures = getattr(self.hf_config, "architectures", []) - if not all(arch in _PP_SUPPORTED_MODELS - for arch in architectures) and pipeline_parallel_size > 1: + if not all(arch in _PP_SUPPORTED_MODELS for arch in architectures) and pipeline_parallel_size > 1: raise NotImplementedError( - "Pipeline parallelism is only supported for the following " - f" architectures: {_PP_SUPPORTED_MODELS}.") + "Pipeline parallelism is only supported for the following " f" architectures: {_PP_SUPPORTED_MODELS}." 
+ ) if self.quantization == "bitsandbytes" and ( - parallel_config.tensor_parallel_size > 1 - or parallel_config.pipeline_parallel_size > 1): - raise ValueError( - "BitAndBytes quantization with TP or PP is not supported yet.") + parallel_config.tensor_parallel_size > 1 or parallel_config.pipeline_parallel_size > 1 + ): + raise ValueError("BitAndBytes quantization with TP or PP is not supported yet.") if self.quantization == "bitsandbytes" and self.enforce_eager is False: - raise ValueError( - "BitAndBytes with enforce_eager = False is not supported yet.") + raise ValueError("BitAndBytes with enforce_eager = False is not supported yet.") def get_hf_config_sliding_window(self) -> Optional[int]: """Get the sliding window size, or None if disabled.""" @@ -297,14 +300,12 @@ def get_hf_config_sliding_window(self) -> Optional[int]: # Some models, like Qwen2 and Qwen1.5, use `use_sliding_window` in # addition to sliding window size. We check if that field is present # and if it's False, return None. - if (hasattr(self.hf_text_config, "use_sliding_window") - and not self.hf_text_config.use_sliding_window): + if hasattr(self.hf_text_config, "use_sliding_window") and not self.hf_text_config.use_sliding_window: return None return getattr(self.hf_text_config, "sliding_window", None) def get_sliding_window(self) -> Optional[int]: - """Get the sliding window size, or None if disabled. - """ + """Get the sliding window size, or None if disabled.""" # If user disables sliding window, return None. 
if self.disable_sliding_window: return None @@ -319,16 +320,14 @@ def get_hidden_size(self) -> int: def get_head_size(self) -> int: # TODO remove hard code - if hasattr(self.hf_text_config, "model_type" - ) and self.hf_text_config.model_type == 'deepseek_v2': + if hasattr(self.hf_text_config, "model_type") and self.hf_text_config.model_type == 'deepseek_v2': # FlashAttention supports only head_size 32, 64, 128, 256, # we need to pad head_size 192 to 256 return 256 if hasattr(self.hf_text_config, "head_dim"): return self.hf_text_config.head_dim # FIXME(woosuk): This may not be true for all models. - return (self.hf_text_config.hidden_size // - self.hf_text_config.num_attention_heads) + return self.hf_text_config.hidden_size // self.hf_text_config.num_attention_heads def get_total_num_kv_heads(self) -> int: """Returns the total number of KV heads.""" @@ -337,11 +336,10 @@ def get_total_num_kv_heads(self) -> int: # multi_query flag is ignored and we use n_head_kv for the number of # KV heads. falcon_model_types = ["falcon", "RefinedWeb", "RefinedWebModel"] - new_decoder_arch_falcon = ( - self.hf_config.model_type in falcon_model_types - and getattr(self.hf_config, "new_decoder_architecture", False)) - if not new_decoder_arch_falcon and getattr(self.hf_text_config, - "multi_query", False): + new_decoder_arch_falcon = self.hf_config.model_type in falcon_model_types and getattr( + self.hf_config, "new_decoder_architecture", False + ) + if not new_decoder_arch_falcon and getattr(self.hf_text_config, "multi_query", False): # Multi-query attention, only one KV head. # Currently, tensor parallelism is not supported in this case. 
return 1 @@ -352,8 +350,7 @@ def get_total_num_kv_heads(self) -> int: return self.hf_config.attn_config["kv_n_heads"] return self.hf_config.num_attention_heads if self.hf_config.model_type == "dbrx": - return getattr(self.hf_config.attn_config, "kv_n_heads", - self.hf_config.num_attention_heads) + return getattr(self.hf_config.attn_config, "kv_n_heads", self.hf_config.num_attention_heads) attributes = [ # For Falcon: @@ -380,48 +377,35 @@ def get_num_kv_heads(self, parallel_config: "ParallelConfig") -> int: # the tensor parallel size. We will replicate the KV heads in the # case where the number of KV heads is smaller than the tensor # parallel size so each GPU has at least one KV head. - return max(1, - total_num_kv_heads // parallel_config.tensor_parallel_size) + return max(1, total_num_kv_heads // parallel_config.tensor_parallel_size) - def get_num_attention_heads(self, - parallel_config: "ParallelConfig") -> int: + def get_num_attention_heads(self, parallel_config: "ParallelConfig") -> int: num_heads = getattr(self.hf_text_config, "num_attention_heads", 0) return num_heads // parallel_config.tensor_parallel_size def get_num_layers(self, parallel_config: "ParallelConfig") -> int: from vllm.distributed.utils import get_pp_indices - total_num_hidden_layers = getattr(self.hf_text_config, - "num_hidden_layers", 0) + + total_num_hidden_layers = getattr(self.hf_text_config, "num_hidden_layers", 0) pp_rank = parallel_config.rank // parallel_config.tensor_parallel_size pp_size = parallel_config.pipeline_parallel_size start, end = get_pp_indices(total_num_hidden_layers, pp_rank, pp_size) return end - start - def contains_seqlen_agnostic_layers( - self, parallel_config: "ParallelConfig") -> bool: + def contains_seqlen_agnostic_layers(self, parallel_config: "ParallelConfig") -> bool: """True for Mamba/SSM models (Jamba)""" return self._get_num_seqlen_agnostic_layers(parallel_config) > 0 - def get_layers_block_type(self, - parallel_config: "ParallelConfig") -> List[str]: + 
def get_layers_block_type(self, parallel_config: "ParallelConfig") -> List[str]: num_layers = self.get_num_layers(parallel_config) # Transformers supports layers_block_type @property - return getattr(self.hf_config, "layers_block_type", - ["attention"] * num_layers) + return getattr(self.hf_config, "layers_block_type", ["attention"] * num_layers) - def get_num_attention_layers(self, - parallel_config: "ParallelConfig") -> int: - return len([ - t for t in self.get_layers_block_type(parallel_config) - if t == "attention" - ]) + def get_num_attention_layers(self, parallel_config: "ParallelConfig") -> int: + return len([t for t in self.get_layers_block_type(parallel_config) if t == "attention"]) - def _get_num_seqlen_agnostic_layers( - self, parallel_config: "ParallelConfig") -> int: - return len([ - t for t in self.get_layers_block_type(parallel_config) - if t != "attention" - ]) + def _get_num_seqlen_agnostic_layers(self, parallel_config: "ParallelConfig") -> int: + return len([t for t in self.get_layers_block_type(parallel_config) if t != "attention"]) class CacheConfig: @@ -471,9 +455,7 @@ def metrics_info(self): def _verify_args(self) -> None: if self.gpu_memory_utilization > 1.0: - raise ValueError( - "GPU memory utilization must be less than 1.0. Got " - f"{self.gpu_memory_utilization}.") + raise ValueError("GPU memory utilization must be less than 1.0. Got " f"{self.gpu_memory_utilization}.") def _verify_cache_dtype(self) -> None: if self.cache_dtype == "auto": @@ -483,7 +465,8 @@ def _verify_cache_dtype(self) -> None: "Using fp8 data type to store kv cache. It reduces the GPU " "memory footprint and boosts the performance. 
" "Meanwhile, it may cause accuracy drop without a proper " - "scaling factor") + "scaling factor" + ) else: raise ValueError(f"Unknown kv cache dtype: {self.cache_dtype}") @@ -494,25 +477,26 @@ def _verify_prefix_caching(self) -> None: if self.sliding_window is not None: raise NotImplementedError( "Prefix caching is not supported with sliding window. " - "Run with --disable-sliding-window to use prefix caching.") + "Run with --disable-sliding-window to use prefix caching." + ) if self.cache_dtype == "fp8": raise NotImplementedError( "Prefix caching is not supported for fp8 cache_dtype. " - "Run with --kv-cache-dtype auto to use prefix caching.") + "Run with --kv-cache-dtype auto to use prefix caching." + ) - def verify_with_parallel_config( - self, - parallel_config: "ParallelConfig", - ) -> None: + def verify_with_parallel_config(self, parallel_config: "ParallelConfig") -> None: total_cpu_memory = get_cpu_memory() # FIXME(woosuk): Here, it is assumed that the GPUs in a tensor parallel # group are in the same node. However, the GPUs may span multiple nodes. num_gpus_per_node = parallel_config.tensor_parallel_size cpu_memory_usage = self.swap_space_bytes * num_gpus_per_node - msg = (f"{cpu_memory_usage / _GB:.2f} GiB out of " - f"the {total_cpu_memory / _GB:.2f} GiB total CPU memory is " - "allocated for the swap space.") + msg = ( + f"{cpu_memory_usage / _GB:.2f} GiB out of " + f"the {total_cpu_memory / _GB:.2f} GiB total CPU memory is " + "allocated for the swap space." + ) if cpu_memory_usage > 0.7 * total_cpu_memory: raise ValueError("Too large swap space. " + msg) elif cpu_memory_usage > 0.4 * total_cpu_memory: @@ -530,21 +514,20 @@ class TokenizerPoolConfig: The way the config will be used depends on the pool type. 
""" + pool_size: int pool_type: Union[str, Type["BaseTokenizerGroup"]] extra_config: dict def __post_init__(self): - if self.pool_type not in ("ray", ) and not isinstance( - self.pool_type, type): + if self.pool_type not in ("ray",) and not isinstance(self.pool_type, type): raise ValueError(f"Unknown pool type: {self.pool_type}") if not isinstance(self.extra_config, dict): raise ValueError("extra_config must be a dictionary.") @classmethod def create_config( - cls, tokenizer_pool_size: int, tokenizer_pool_type: str, - tokenizer_pool_extra_config: Optional[Union[str, dict]] + cls, tokenizer_pool_size: int, tokenizer_pool_type: str, tokenizer_pool_extra_config: Optional[Union[str, dict]] ) -> Optional["TokenizerPoolConfig"]: """Create a TokenizerPoolConfig from the given parameters. @@ -559,14 +542,10 @@ def create_config( """ if tokenizer_pool_size: if isinstance(tokenizer_pool_extra_config, str): - tokenizer_pool_extra_config_parsed = json.loads( - tokenizer_pool_extra_config) + tokenizer_pool_extra_config_parsed = json.loads(tokenizer_pool_extra_config) else: - tokenizer_pool_extra_config_parsed = ( - tokenizer_pool_extra_config or {}) - tokenizer_pool_config = cls(tokenizer_pool_size, - tokenizer_pool_type, - tokenizer_pool_extra_config_parsed) + tokenizer_pool_extra_config_parsed = tokenizer_pool_extra_config or {} + tokenizer_pool_config = cls(tokenizer_pool_size, tokenizer_pool_type, tokenizer_pool_extra_config_parsed) else: tokenizer_pool_config = None return tokenizer_pool_config @@ -586,44 +565,40 @@ class LoadFormat(str, enum.Enum): @dataclass class LoadConfig: """ - download_dir: Directory to download and load the weights, default to the - default cache directory of huggingface. - load_format: The format of the model weights to load: - "auto" will try to load the weights in the safetensors format and - fall back to the pytorch bin format if safetensors format is - not available. - "pt" will load the weights in the pytorch bin format. 
- "safetensors" will load the weights in the safetensors format. - "npcache" will load the weights in pytorch format and store - a numpy cache to speed up the loading. - "dummy" will initialize the weights with random values, which is - mainly for profiling. - "tensorizer" will use CoreWeave's tensorizer library for - fast weight loading. - "bitsandbytes" will load nf4 type weights. - ignore_patterns: The list of patterns to ignore when loading the model. - Default to "original/**/*" to avoid repeated loading of llama's - checkpoints. - + download_dir: Directory to download and load the weights, default to the + default cache directory of huggingface. + load_format: The format of the model weights to load: + "auto" will try to load the weights in the safetensors format and + fall back to the pytorch bin format if safetensors format is + not available. + "pt" will load the weights in the pytorch bin format. + "safetensors" will load the weights in the safetensors format. + "npcache" will load the weights in pytorch format and store + a numpy cache to speed up the loading. + "dummy" will initialize the weights with random values, which is + mainly for profiling. + "tensorizer" will use CoreWeave's tensorizer library for + fast weight loading. + "bitsandbytes" will load nf4 type weights. + ignore_patterns: The list of patterns to ignore when loading the model. + Default to "original/**/*" to avoid repeated loading of llama's + checkpoints. 
+ """ load_format: Union[str, LoadFormat, "BaseModelLoader"] = LoadFormat.AUTO download_dir: Optional[str] = None - model_loader_extra_config: Optional[Union[str, dict]] = field( - default_factory=dict) + model_loader_extra_config: Optional[Union[str, dict]] = field(default_factory=dict) ignore_patterns: Optional[Union[List[str], str]] = None def __post_init__(self): model_loader_extra_config = self.model_loader_extra_config or {} if isinstance(model_loader_extra_config, str): - self.model_loader_extra_config = json.loads( - model_loader_extra_config) + self.model_loader_extra_config = json.loads(model_loader_extra_config) self._verify_load_format() if self.ignore_patterns is not None and len(self.ignore_patterns) > 0: - logger.info( - "Ignoring the following patterns when downloading weights: %s", - self.ignore_patterns) + logger.info("Ignoring the following patterns when downloading weights: %s", self.ignore_patterns) else: self.ignore_patterns = ["original/**/*"] @@ -637,13 +612,13 @@ def _verify_load_format(self) -> None: rocm_not_supported_load_format: List[str] = [] if is_hip() and load_format in rocm_not_supported_load_format: rocm_supported_load_format = [ - f for f in LoadFormat.__members__ - if (f not in rocm_not_supported_load_format) + f for f in LoadFormat.__members__ if (f not in rocm_not_supported_load_format) ] raise ValueError( f"load format '{load_format}' is not supported in ROCm. 
" f"Supported load formats are " - f"{rocm_supported_load_format}") + f"{rocm_supported_load_format}" + ) class ParallelConfig: @@ -679,8 +654,7 @@ def __init__( tokenizer_pool_config: Optional[TokenizerPoolConfig] = None, ray_workers_use_nsight: bool = False, placement_group: Optional["PlacementGroup"] = None, - distributed_executor_backend: Optional[Union[ - str, Type["ExecutorBase"]]] = None, + distributed_executor_backend: Optional[Union[str, Type["ExecutorBase"]]] = None, ) -> None: self.pipeline_parallel_size = pipeline_parallel_size self.tensor_parallel_size = tensor_parallel_size @@ -696,36 +670,42 @@ def __init__( if self.distributed_executor_backend is None: self.distributed_executor_backend = "ray" elif not self.use_ray: - raise ValueError(f"worker-use-ray can't be used with " - f"distributed executor backend " - f"'{self.distributed_executor_backend}'.") + raise ValueError( + f"worker-use-ray can't be used with " + f"distributed executor backend " + f"'{self.distributed_executor_backend}'." + ) if self.distributed_executor_backend is None and self.world_size > 1: # We use multiprocessing by default if world_size fits on the # current node and we aren't in a ray placement group. from vllm.executor import ray_utils + backend = "mp" ray_found = ray_utils.ray_is_available() if cuda_device_count_stateless() < self.world_size: if not ray_found: - raise ValueError("Unable to load Ray which is " - "required for multi-node inference, " - "please install Ray with `pip install " - "ray`.") from ray_utils.ray_import_err + raise ValueError( + "Unable to load Ray which is " + "required for multi-node inference, " + "please install Ray with `pip install " + "ray`." 
+ ) from ray_utils.ray_import_err backend = "ray" elif ray_found: if self.placement_group: backend = "ray" else: from ray import is_initialized as ray_is_initialized + if ray_is_initialized(): from ray.util import get_current_placement_group + if get_current_placement_group(): backend = "ray" self.distributed_executor_backend = backend - logger.info("Defaulting to use %s for distributed inference", - backend) + logger.info("Defaulting to use %s for distributed inference", backend) self._verify_args() self.rank: int = 0 @@ -733,32 +713,31 @@ def __init__( @property def use_ray(self) -> bool: return self.distributed_executor_backend == "ray" or ( - isinstance(self.distributed_executor_backend, type) - and self.distributed_executor_backend.uses_ray) + isinstance(self.distributed_executor_backend, type) and self.distributed_executor_backend.uses_ray + ) def _verify_args(self) -> None: # Lazy import to avoid circular import from vllm.executor.executor_base import ExecutorBase - if self.distributed_executor_backend not in ( - "ray", "mp", None) and not (isinstance( - self.distributed_executor_backend, type) and issubclass( - self.distributed_executor_backend, ExecutorBase)): + if self.distributed_executor_backend not in ("ray", "mp", None) and not ( + isinstance(self.distributed_executor_backend, type) + and issubclass(self.distributed_executor_backend, ExecutorBase) + ): raise ValueError( "Unrecognized distributed executor backend " f"{self.distributed_executor_backend}. Supported " - "values are 'ray', 'mp' or custom ExecutorBase subclass.") + "values are 'ray', 'mp' or custom ExecutorBase subclass." 
+ ) if self.use_ray: from vllm.executor import ray_utils + ray_utils.assert_ray_available() if is_hip(): self.disable_custom_all_reduce = True - logger.info( - "Disabled the custom all-reduce kernel because it is not " - "supported on AMD GPUs.") + logger.info("Disabled the custom all-reduce kernel because it is not " "supported on AMD GPUs.") if self.ray_workers_use_nsight and not self.use_ray: - raise ValueError("Unable to use nsight profiling unless workers " - "run with Ray.") + raise ValueError("Unable to use nsight profiling unless workers " "run with Ray.") class SchedulerConfig: @@ -781,7 +760,7 @@ class SchedulerConfig: enable_chunked_prefill: If True, prefill requests can be chunked based on the remaining max_num_batched_tokens. embedding_mode: Whether the running model is for embedding. - preemption_mode: Whether to perform preemption by swapping or + preemption_mode: Whether to perform preemption by swapping or recomputation. If not specified, we determine the mode as follows: We use recomputation by default since it incurs lower overhead than swapping. However, when the sequence group has multiple sequences @@ -789,16 +768,18 @@ class SchedulerConfig: such a case, we use swapping instead. 
""" - def __init__(self, - max_num_batched_tokens: Optional[int], - max_num_seqs: int, - max_model_len: int, - use_v2_block_manager: bool = False, - num_lookahead_slots: int = 0, - delay_factor: float = 0.0, - enable_chunked_prefill: bool = False, - embedding_mode: Optional[bool] = False, - preemption_mode: Optional[str] = None) -> None: + def __init__( + self, + max_num_batched_tokens: Optional[int], + max_num_seqs: int, + max_model_len: int, + use_v2_block_manager: bool = False, + num_lookahead_slots: int = 0, + delay_factor: float = 0.0, + enable_chunked_prefill: bool = False, + embedding_mode: Optional[bool] = False, + preemption_mode: Optional[str] = None, + ) -> None: if max_num_batched_tokens is not None: self.max_num_batched_tokens = max_num_batched_tokens else: @@ -808,16 +789,13 @@ def __init__(self, self.max_num_batched_tokens = 512 elif embedding_mode: # For embedding, choose specific value for higher throughput - self.max_num_batched_tokens = max( - max_model_len, _EMBEDDING_MODEL_MAX_NUM_BATCHED_TOKENS) + self.max_num_batched_tokens = max(max_model_len, _EMBEDDING_MODEL_MAX_NUM_BATCHED_TOKENS) else: # If max_model_len is too short, use 2048 as the default value # for higher throughput. self.max_num_batched_tokens = max(max_model_len, 2048) if enable_chunked_prefill: - logger.info( - "Chunked prefill is enabled with max_num_batched_tokens=%d.", - self.max_num_batched_tokens) + logger.info("Chunked prefill is enabled with max_num_batched_tokens=%d.", self.max_num_batched_tokens) self.max_num_seqs = max_num_seqs self.max_model_len = max_model_len @@ -830,27 +808,27 @@ def __init__(self, self._verify_args() def _verify_args(self) -> None: - if (self.max_num_batched_tokens < self.max_model_len - and not self.chunked_prefill_enabled): + if self.max_num_batched_tokens < self.max_model_len and not self.chunked_prefill_enabled: raise ValueError( f"max_num_batched_tokens ({self.max_num_batched_tokens}) is " f"smaller than max_model_len ({self.max_model_len}). 
" "This effectively limits the maximum sequence length to " "max_num_batched_tokens and makes vLLM reject longer " "sequences. Please increase max_num_batched_tokens or " - "decrease max_model_len.") + "decrease max_model_len." + ) if self.max_num_batched_tokens < self.max_num_seqs: raise ValueError( f"max_num_batched_tokens ({self.max_num_batched_tokens}) must " "be greater than or equal to max_num_seqs " - f"({self.max_num_seqs}).") + f"({self.max_num_seqs})." + ) if self.num_lookahead_slots < 0: raise ValueError( - "num_lookahead_slots " - f"({self.num_lookahead_slots}) must be greater than or " - "equal to 0.") + "num_lookahead_slots " f"({self.num_lookahead_slots}) must be greater than or " "equal to 0." + ) class DeviceConfig: @@ -956,7 +934,7 @@ def maybe_create_spec_config( typical_acceptance_sampler_posterior_threshold (Optional[float]): A threshold value that sets a lower bound on the posterior probability of a token in the target model for it to be - accepted. This threshold is used only when we use the + accepted. This threshold is used only when we use the TypicalAcceptanceSampler for token acceptance. typical_acceptance_sampler_posterior_alpha (Optional[float]): A scaling factor for the entropy-based threshold in the @@ -966,7 +944,7 @@ def maybe_create_spec_config( If set to False, token log probabilities are returned according to the log probability settings in SamplingParams. If not specified, it defaults to True. - + Returns: Optional["SpeculativeConfig"]: An instance of SpeculativeConfig if the necessary conditions are met, else None. 
@@ -974,25 +952,26 @@ def maybe_create_spec_config( if speculative_model is None: if num_speculative_tokens is not None: - raise ValueError("num_speculative_tokens was provided without " - "speculative_model.") + raise ValueError("num_speculative_tokens was provided without " "speculative_model.") return None - if (speculative_disable_by_batch_size is not None - and speculative_disable_by_batch_size < 2): - raise ValueError("Expect the batch size threshold of disabling " - "speculative decoding is > 1, but got " - f"{speculative_disable_by_batch_size=}") + if speculative_disable_by_batch_size is not None and speculative_disable_by_batch_size < 2: + raise ValueError( + "Expect the batch size threshold of disabling " + "speculative decoding is > 1, but got " + f"{speculative_disable_by_batch_size=}" + ) if enable_chunked_prefill: raise ValueError( "Speculative decoding and chunked prefill are " - f"currently mutually exclusive ({enable_chunked_prefill=}).") + f"currently mutually exclusive ({enable_chunked_prefill=})." + ) if not use_v2_block_manager: raise ValueError( - "Speculative decoding requires usage of the V2 " - "block manager. Enable it with --use-v2-block-manager.") + "Speculative decoding requires usage of the V2 " "block manager. Enable it with --use-v2-block-manager." + ) # TODO: The user should be able to specify revision/quantization/max # model len for the draft model. It is not currently supported. 
@@ -1008,8 +987,7 @@ def maybe_create_spec_config( if ngram_prompt_lookup_min < 1: raise ValueError(f"{ngram_prompt_lookup_min=} must be > 0") if ngram_prompt_lookup_min > ngram_prompt_lookup_max: - raise ValueError(f"{ngram_prompt_lookup_min=} cannot be " - f"larger than {ngram_prompt_lookup_max=}") + raise ValueError(f"{ngram_prompt_lookup_min=} cannot be " f"larger than {ngram_prompt_lookup_max=}") # TODO: current we still need extract vocab_size from target model # config, in future, we may try refactor it out, and set @@ -1032,15 +1010,13 @@ def maybe_create_spec_config( max_model_len=None, quantization=draft_quantization, enforce_eager=target_model_config.enforce_eager, - max_seq_len_to_capture=target_model_config. - max_seq_len_to_capture, + max_seq_len_to_capture=target_model_config.max_seq_len_to_capture, max_logprobs=target_model_config.max_logprobs, ) draft_hf_config = draft_model_config.hf_config - if (num_speculative_tokens is not None - and hasattr(draft_hf_config, "num_lookahead_tokens")): + if num_speculative_tokens is not None and hasattr(draft_hf_config, "num_lookahead_tokens"): draft_hf_config.num_lookahead_tokens = num_speculative_tokens n_predict = getattr(draft_hf_config, "n_predict", None) @@ -1054,25 +1030,23 @@ def maybe_create_spec_config( raise ValueError( "This speculative model supports a maximum of " f"num_speculative_tokens={n_predict}, but " - f"{num_speculative_tokens=} was provided.") + f"{num_speculative_tokens=} was provided." 
+ ) - draft_model_config.max_model_len = ( - SpeculativeConfig._maybe_override_draft_max_model_len( - speculative_max_model_len, - draft_model_config.max_model_len, - target_model_config.max_model_len, - )) + draft_model_config.max_model_len = SpeculativeConfig._maybe_override_draft_max_model_len( + speculative_max_model_len, draft_model_config.max_model_len, target_model_config.max_model_len + ) - draft_parallel_config = ( - SpeculativeConfig.create_draft_parallel_config( - target_parallel_config, - speculative_draft_tensor_parallel_size)) + draft_parallel_config = SpeculativeConfig.create_draft_parallel_config( + target_parallel_config, speculative_draft_tensor_parallel_size + ) if num_speculative_tokens is None: raise ValueError( "num_speculative_tokens must be provided with " "speculative_model unless the draft model config contains an " - "n_predict parameter.") + "n_predict parameter." + ) if typical_acceptance_sampler_posterior_threshold is None: typical_acceptance_sampler_posterior_threshold = 0.09 @@ -1089,18 +1063,14 @@ def maybe_create_spec_config( ngram_prompt_lookup_max, ngram_prompt_lookup_min, draft_token_acceptance_method=draft_token_acceptance_method, - typical_acceptance_sampler_posterior_threshold=\ - typical_acceptance_sampler_posterior_threshold, - typical_acceptance_sampler_posterior_alpha=\ - typical_acceptance_sampler_posterior_alpha, - disable_logprobs=disable_logprobs + typical_acceptance_sampler_posterior_threshold=typical_acceptance_sampler_posterior_threshold, + typical_acceptance_sampler_posterior_alpha=typical_acceptance_sampler_posterior_alpha, + disable_logprobs=disable_logprobs, ) @staticmethod def _maybe_override_draft_max_model_len( - speculative_max_model_len: Optional[int], - draft_max_model_len: int, - target_max_model_len: int, + speculative_max_model_len: Optional[int], draft_max_model_len: int, target_max_model_len: int ) -> int: """Determine the max sequence len for the draft model. 
This is usually the draft_max_model_len, but may be the target_max_model_len if it is @@ -1117,51 +1087,37 @@ def _maybe_override_draft_max_model_len( if speculative_max_model_len is not None: if speculative_max_model_len > draft_max_model_len: - raise ValueError(f"{speculative_max_model_len=} cannot be " - f"larger than {draft_max_model_len=}") + raise ValueError(f"{speculative_max_model_len=} cannot be " f"larger than {draft_max_model_len=}") if speculative_max_model_len > target_max_model_len: - raise ValueError(f"{speculative_max_model_len=} cannot be " - f"larger than {target_max_model_len=}") + raise ValueError(f"{speculative_max_model_len=} cannot be " f"larger than {target_max_model_len=}") return speculative_max_model_len - return min( - draft_max_model_len, - target_max_model_len, - ) + return min(draft_max_model_len, target_max_model_len) @staticmethod def create_draft_parallel_config( - target_parallel_config: ParallelConfig, - speculative_draft_tensor_parallel_size: Optional[int] + target_parallel_config: ParallelConfig, speculative_draft_tensor_parallel_size: Optional[int] ) -> ParallelConfig: """Create a parallel config for use by the draft worker. This is mostly a copy of the target parallel config, except the tp_size. """ if speculative_draft_tensor_parallel_size is None: - speculative_draft_tensor_parallel_size = \ - target_parallel_config.tensor_parallel_size + speculative_draft_tensor_parallel_size = target_parallel_config.tensor_parallel_size elif speculative_draft_tensor_parallel_size != 1: # TODO(wooyeon): allow tp values larger than 1 - raise ValueError( - f"{speculative_draft_tensor_parallel_size=} cannot be" - f"other value than 1") + raise ValueError(f"{speculative_draft_tensor_parallel_size=} cannot be" f"other value than 1") draft_parallel_config = ParallelConfig( - pipeline_parallel_size=target_parallel_config. 
- pipeline_parallel_size, + pipeline_parallel_size=target_parallel_config.pipeline_parallel_size, tensor_parallel_size=speculative_draft_tensor_parallel_size, - distributed_executor_backend=target_parallel_config. - distributed_executor_backend, - max_parallel_loading_workers=target_parallel_config. - max_parallel_loading_workers, - disable_custom_all_reduce=target_parallel_config. - disable_custom_all_reduce, + distributed_executor_backend=target_parallel_config.distributed_executor_backend, + max_parallel_loading_workers=target_parallel_config.max_parallel_loading_workers, + disable_custom_all_reduce=target_parallel_config.disable_custom_all_reduce, tokenizer_pool_config=target_parallel_config.tokenizer_pool_config, - ray_workers_use_nsight=target_parallel_config. - ray_workers_use_nsight, + ray_workers_use_nsight=target_parallel_config.ray_workers_use_nsight, placement_group=target_parallel_config.placement_group, ) @@ -1200,13 +1156,13 @@ def __init__( typical_acceptance_sampler_posterior_threshold (Optional[float]): A threshold value that sets a lower bound on the posterior probability of a token in the target model for it to be - accepted. This threshold is used only when we use the + accepted. This threshold is used only when we use the TypicalAcceptanceSampler for token acceptance. typical_acceptance_sampler_posterior_alpha (Optional[float]): A scaling factor for the entropy-based threshold in the TypicalAcceptanceSampler. disable_logprobs: If set to True, token log probabilities will not - be returned even if requested by sampling parameters. This + be returned even if requested by sampling parameters. This reduces latency by skipping logprob calculation in proposal sampling, target sampling, and after accepted tokens are determined. 
If set to False, log probabilities will be @@ -1215,44 +1171,47 @@ def __init__( self.draft_model_config = draft_model_config self.draft_parallel_config = draft_parallel_config self.num_speculative_tokens = num_speculative_tokens - self.speculative_disable_by_batch_size = \ - speculative_disable_by_batch_size + self.speculative_disable_by_batch_size = speculative_disable_by_batch_size self.ngram_prompt_lookup_max = ngram_prompt_lookup_max or 0 self.ngram_prompt_lookup_min = ngram_prompt_lookup_min or 0 self.draft_token_acceptance_method = draft_token_acceptance_method - self.typical_acceptance_sampler_posterior_threshold = \ - typical_acceptance_sampler_posterior_threshold - self.typical_acceptance_sampler_posterior_alpha = \ - typical_acceptance_sampler_posterior_alpha + self.typical_acceptance_sampler_posterior_threshold = typical_acceptance_sampler_posterior_threshold + self.typical_acceptance_sampler_posterior_alpha = typical_acceptance_sampler_posterior_alpha self.disable_logprobs = disable_logprobs self._verify_args() def _verify_args(self) -> None: if self.num_speculative_tokens <= 0: - raise ValueError("Expected num_speculative_tokens to be greater " - f"than zero ({self.num_speculative_tokens}).") + raise ValueError( + "Expected num_speculative_tokens to be greater " f"than zero ({self.num_speculative_tokens})." + ) if self.draft_model_config: - self.draft_model_config.verify_with_parallel_config( - self.draft_parallel_config) + self.draft_model_config.verify_with_parallel_config(self.draft_parallel_config) # Validate and set draft token acceptance related settings. - if (self.draft_token_acceptance_method is None): - raise ValueError("draft_token_acceptance_method is not set. " - "Expected values are rejection_sampler or " - "typical_acceptance_sampler.") + if self.draft_token_acceptance_method is None: + raise ValueError( + "draft_token_acceptance_method is not set. " + "Expected values are rejection_sampler or " + "typical_acceptance_sampler." 
+ ) - if (self.draft_token_acceptance_method != 'rejection_sampler' - and self.draft_token_acceptance_method != - 'typical_acceptance_sampler'): + if ( + self.draft_token_acceptance_method != 'rejection_sampler' + and self.draft_token_acceptance_method != 'typical_acceptance_sampler' + ): raise ValueError( "Expected draft_token_acceptance_method to be either " "rejection_sampler or typical_acceptance_sampler. Instead it " - f"is {self.draft_token_acceptance_method}") + f"is {self.draft_token_acceptance_method}" + ) - if (self.typical_acceptance_sampler_posterior_threshold < 0 - or self.typical_acceptance_sampler_posterior_alpha < 0): + if ( + self.typical_acceptance_sampler_posterior_threshold < 0 + or self.typical_acceptance_sampler_posterior_alpha < 0 + ): raise ValueError( "Expected typical_acceptance_sampler_posterior_threshold " "and typical_acceptance_sampler_posterior_alpha to be > 0. " @@ -1260,7 +1219,8 @@ def _verify_args(self) -> None: f"typical_acceptance_sampler_posterior_threshold = " f"{self.typical_acceptance_sampler_posterior_threshold} and " f"typical_acceptance_sampler_posterior_alpha = " - f"{self.typical_acceptance_sampler_posterior_alpha}") + f"{self.typical_acceptance_sampler_posterior_alpha}" + ) @property def num_lookahead_slots(self) -> int: @@ -1298,40 +1258,35 @@ def __post_init__(self): possible_max_ranks = (8, 16, 32, 64) possible_lora_extra_vocab_size = (0, 256, 512) if self.max_lora_rank not in possible_max_ranks: - raise ValueError( - f"max_lora_rank ({self.max_lora_rank}) must be one of " - f"{possible_max_ranks}.") + raise ValueError(f"max_lora_rank ({self.max_lora_rank}) must be one of " f"{possible_max_ranks}.") if self.lora_extra_vocab_size not in possible_lora_extra_vocab_size: raise ValueError( f"lora_extra_vocab_size ({self.lora_extra_vocab_size}) " - f"must be one of {possible_lora_extra_vocab_size}.") + f"must be one of {possible_lora_extra_vocab_size}." 
+ ) if self.max_loras < 1: raise ValueError(f"max_loras ({self.max_loras}) must be >= 1.") if self.max_cpu_loras is None: self.max_cpu_loras = self.max_loras elif self.max_cpu_loras < self.max_loras: - raise ValueError( - f"max_cpu_loras ({self.max_cpu_loras}) must be >= " - f"max_loras ({self.max_loras})") + raise ValueError(f"max_cpu_loras ({self.max_cpu_loras}) must be >= " f"max_loras ({self.max_loras})") def verify_with_model_config(self, model_config: ModelConfig): if self.lora_dtype in (None, "auto"): self.lora_dtype = model_config.dtype elif isinstance(self.lora_dtype, str): self.lora_dtype = getattr(torch, self.lora_dtype) - if model_config.quantization and model_config.quantization not in [ - "awq", "gptq" - ]: + if model_config.quantization and model_config.quantization not in ["awq", "gptq"]: # TODO support marlin and squeezellm - logger.warning("%s quantization is not tested with LoRA yet.", - model_config.quantization) + logger.warning("%s quantization is not tested with LoRA yet.", model_config.quantization) def verify_with_scheduler_config(self, scheduler_config: SchedulerConfig): if scheduler_config.max_num_batched_tokens > 65528: raise ValueError( "Due to limitations of the custom LoRA CUDA kernel, " "max_num_batched_tokens must be <= 65528 when " - "LoRA is enabled.") + "LoRA is enabled." 
+ ) if scheduler_config.chunked_prefill_enabled: raise ValueError("LoRA is not supported with chunked prefill yet.") @@ -1354,8 +1309,7 @@ def __post_init__(self): ) from e if self.max_prompt_adapters < 1: - raise ValueError(f"max_prompt_adapters " - f"({self.max_prompt_adapters}) must be >= 1.") + raise ValueError(f"max_prompt_adapters " f"({self.max_prompt_adapters}) must be >= 1.") if self.max_prompt_adapter_token == 0: raise ValueError("max_prompt_adapter_token must be set.") if self.max_cpu_prompt_adapters is None: @@ -1365,14 +1319,14 @@ def verify_with_model_config(self, model_config: ModelConfig): if self.prompt_adapter_dtype in (None, "auto"): self.prompt_adapter_dtype = model_config.dtype elif isinstance(self.prompt_adapter_dtype, str): - self.prompt_adapter_dtype = getattr(torch, - self.prompt_adapter_dtype) + self.prompt_adapter_dtype = getattr(torch, self.prompt_adapter_dtype) @dataclass class MultiModalConfig: """Configs the input data format and how models should run for multimodal models.""" + # TODO: Add configs to init vision tower or not. pass @@ -1388,10 +1342,7 @@ class MultiModalConfig: _ROCM_NOT_SUPPORTED_DTYPE: List[str] = [] # -def _get_and_verify_dtype( - config: PretrainedConfig, - dtype: Union[str, torch.dtype], -) -> torch.dtype: +def _get_and_verify_dtype(config: PretrainedConfig, dtype: Union[str, torch.dtype]) -> torch.dtype: # NOTE: getattr(config, "torch_dtype", torch.float32) is not correct # because config.torch_dtype can be None. config_dtype = getattr(config, "torch_dtype", None) @@ -1406,7 +1357,8 @@ def _get_and_verify_dtype( logger.info( "For Gemma 2, we downcast float32 to bfloat16 instead " "of float16 by default. Please specify `dtype` if you " - "want to use float16.") + "want to use float16." 
+ ) torch_dtype = torch.bfloat16 else: # Following the common practice, we use float16 for float32 @@ -1469,15 +1421,13 @@ def _get_and_verify_max_len( for key in possible_keys: max_len = getattr(hf_config, key, None) if max_len is not None: - max_len_key = key if max_len < derived_max_model_len \ - else max_len_key + max_len_key = key if max_len < derived_max_model_len else max_len_key derived_max_model_len = min(derived_max_model_len, max_len) # If sliding window is manually disabled, max_length should be less # than the sliding window length in the model config. if disable_sliding_window and sliding_window_len is not None: - max_len_key = "sliding_window" \ - if sliding_window_len < derived_max_model_len else max_len_key + max_len_key = "sliding_window" if sliding_window_len < derived_max_model_len else max_len_key derived_max_model_len = min(derived_max_model_len, sliding_window_len) # If none of the keys were found in the config, use a default and @@ -1491,8 +1441,10 @@ def _get_and_verify_max_len( logger.warning( "The model's config.json does not contain any of the following " "keys to determine the original maximum length of the model: " - "%s. Assuming the model's maximum length is %d.", possible_keys, - default_max_len) + "%s. Assuming the model's maximum length is %d.", + possible_keys, + default_max_len, + ) derived_max_model_len = default_max_len rope_scaling = getattr(hf_config, "rope_scaling", None) @@ -1502,8 +1454,7 @@ def _get_and_verify_max_len( elif "rope_type" in rope_scaling: rope_type = rope_scaling["rope_type"] else: - raise ValueError( - "rope_scaling must have a 'type' or 'rope_type' key.") + raise ValueError("rope_scaling must have a 'type' or 'rope_type' key.") # The correct one should be "longrope", kept "su" here # to be backward compatible @@ -1514,13 +1465,13 @@ def _get_and_verify_max_len( raise NotImplementedError( "Disabling sliding window is not supported for models " "with rope_scaling. 
Please raise an issue so we can " - "investigate.") + "investigate." + ) assert "factor" in rope_scaling scaling_factor = rope_scaling["factor"] if rope_type == "yarn": - derived_max_model_len = rope_scaling[ - "original_max_position_embeddings"] + derived_max_model_len = rope_scaling["original_max_position_embeddings"] derived_max_model_len *= scaling_factor # If the user specified a max length, make sure it is smaller than the @@ -1539,7 +1490,8 @@ def _get_and_verify_max_len( raise NotImplementedError( "Disabling sliding window is not supported for models " "model_max_length in the config. Please raise an issue " - "so we can investigate.") + "so we can investigate." + ) pass else: raise ValueError( @@ -1548,17 +1500,17 @@ def _get_and_verify_max_len( f"({max_len_key}={derived_max_model_len} or model_max_length=" f"{model_max_length} in model's config.json). This may lead " "to incorrect model outputs or CUDA errors. Make sure the " - "value is correct and within the model context size.") + "value is correct and within the model context size." + ) return int(max_model_len) -def get_served_model_name(model: str, - served_model_name: Optional[Union[str, List[str]]]): +def get_served_model_name(model: str, served_model_name: Optional[Union[str, List[str]]]): """ - If the input is a non-empty list, the first model_name in - `served_model_name` is taken. - If the input is a non-empty string, it is used directly. - For cases where the input is either an empty string or an + If the input is a non-empty list, the first model_name in + `served_model_name` is taken. + If the input is a non-empty string, it is used directly. + For cases where the input is either an empty string or an empty list, the fallback is to use `self.model`. 
""" if not served_model_name: @@ -1579,19 +1531,18 @@ def __post_init__(self): valid_guided_backends = ['outlines', 'lm-format-enforcer'] backend = self.guided_decoding_backend if backend not in valid_guided_backends: - raise ValueError(f"Invalid guided_decoding_backend '{backend}," - f"must be one of {valid_guided_backends}") + raise ValueError(f"Invalid guided_decoding_backend '{backend}," f"must be one of {valid_guided_backends}") @dataclass class ObservabilityConfig: """Configuration for observability.""" + otlp_traces_endpoint: Optional[str] = None def __post_init__(self): if not is_otel_installed() and self.otlp_traces_endpoint is not None: - raise ValueError("OpenTelemetry packages must be installed before " - "configuring 'otlp_traces_endpoint'") + raise ValueError("OpenTelemetry packages must be installed before " "configuring 'otlp_traces_endpoint'") @dataclass(frozen=True) @@ -1614,21 +1565,16 @@ class EngineConfig: prompt_adapter_config: Optional[PromptAdapterConfig] def __post_init__(self): - """Verify configs are valid & consistent with each other. - """ + """Verify configs are valid & consistent with each other.""" self.model_config.verify_with_parallel_config(self.parallel_config) self.cache_config.verify_with_parallel_config(self.parallel_config) if self.lora_config: self.lora_config.verify_with_model_config(self.model_config) - self.lora_config.verify_with_scheduler_config( - self.scheduler_config) + self.lora_config.verify_with_scheduler_config(self.scheduler_config) if self.prompt_adapter_config: - self.prompt_adapter_config.verify_with_model_config( - self.model_config) + self.prompt_adapter_config.verify_with_model_config(self.model_config) def to_dict(self): - """Return the configs as a dictionary, for use in **kwargs. 
- """ - return dict( - (field.name, getattr(self, field.name)) for field in fields(self)) + """Return the configs as a dictionary, for use in **kwargs.""" + return dict((field.name, getattr(self, field.name)) for field in fields(self)) diff --git a/vllm/model_executor/models/qwen.py b/vllm/model_executor/models/qwen.py index 47c85c783db7a..981a1684c0bd7 100644 --- a/vllm/model_executor/models/qwen.py +++ b/vllm/model_executor/models/qwen.py @@ -12,7 +12,7 @@ from vllm.attention import Attention, AttentionMetadata from vllm.config import CacheConfig -from vllm.distributed import get_tensor_model_parallel_world_size +from vllm.distributed import (get_tensor_model_parallel_world_size, get_pp_group) from vllm.model_executor.layers.activation import SiluAndMul from vllm.model_executor.layers.layernorm import RMSNorm from vllm.model_executor.layers.linear import (MergedColumnParallelLinear, @@ -30,6 +30,8 @@ from vllm.sequence import IntermediateTensors, SamplerOutput from vllm.utils import print_warning_once +from .utils import is_pp_missing_parameter, make_layers + class QWenMLP(nn.Module): @@ -195,10 +197,10 @@ def __init__( config.vocab_size, config.hidden_size, ) - self.h = nn.ModuleList([ - QWenBlock(config, cache_config, quant_config) - for _ in range(config.num_hidden_layers) - ]) + self.start_layer, self.end_layer, self.layers = make_layers( + config.num_hidden_layers, + lambda prefix: QWenBlock(config, cache_config, quant_config), + ) self.ln_f = RMSNorm(config.hidden_size, eps=config.layer_norm_epsilon) def forward( @@ -207,18 +209,29 @@ def forward( positions: torch.Tensor, kv_caches: List[torch.Tensor], attn_metadata: AttentionMetadata, + intermediate_tensors: Optional[IntermediateTensors], ) -> torch.Tensor: - hidden_states = self.wte(input_ids) - residual = None - for i in range(len(self.h)): + if get_pp_group().is_first_rank: + hidden_states = self.wte(input_ids) + residual = None + else: + assert intermediate_tensors is not None + hidden_states = 
intermediate_tensors["hidden_states"] + residual = intermediate_tensors["residual"] + for i in range(self.start_layer, self.end_layer): layer = self.h[i] hidden_states, residual = layer( positions, hidden_states, - kv_caches[i], + kv_caches[i - self.start_layer], attn_metadata, residual, ) + if not get_pp_group().is_last_rank: + return IntermediateTensors({ + "hidden_states": hidden_states, + "residual": residual + }) hidden_states, _ = self.ln_f(hidden_states, residual) return hidden_states @@ -250,9 +263,23 @@ def forward( intermediate_tensors: Optional[IntermediateTensors] = None, ) -> torch.Tensor: hidden_states = self.transformer(input_ids, positions, kv_caches, - attn_metadata) + attn_metadata, intermediate_tensors) return hidden_states + def make_empty_intermediate_tensors( + self, batch_size: int, dtype: torch.dtype, + device: torch.device) -> IntermediateTensors: + return IntermediateTensors({ + "hidden_states": + torch.zeros((batch_size, self.config.hidden_size), + dtype=dtype, + device=device), + "residual": + torch.zeros((batch_size, self.config.hidden_size), + dtype=dtype, + device=device), + }) + def compute_logits(self, hidden_states: torch.Tensor, sampling_metadata: SamplingMetadata) -> torch.Tensor: logits = self.logits_processor(self.lm_head, hidden_states, @@ -284,6 +311,9 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): # Skip loading extra bias for GPTQ models. if name.endswith(".bias") and name not in params_dict: continue + # Skip layers on other devices. + if is_pp_missing_parameter(name, self): + continue param = params_dict[name] weight_loader = param.weight_loader weight_loader(param, loaded_weight, shard_id) @@ -292,6 +322,9 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): # Skip loading extra bias for GPTQ models. if name.endswith(".bias") and name not in params_dict: continue + # Skip layers on other devices. 
+ if is_pp_missing_parameter(name, self): + continue # Skip loading visual weights to support Qwen-VL models # in cases with text-only inputs # TODO: add support for Qwen-VL diff --git a/vllm/model_executor/models/utils.py b/vllm/model_executor/models/utils.py index 91b4a27814bf4..ff30c329ac245 100644 --- a/vllm/model_executor/models/utils.py +++ b/vllm/model_executor/models/utils.py @@ -132,7 +132,7 @@ def forward(*args, **kwargs): def make_layers( num_hidden_layers: int, layer_fn: LayerFn, - prefix: str, + prefix: str = "", ) -> Tuple[int, int, torch.nn.ModuleList]: """Make a list of layers with the given layer function, taking pipeline parallelism into account. From 998730910914f48c335994c11b3156d0e7a555c7 Mon Sep 17 00:00:00 2001 From: Muralidhar Andoorveedu Date: Wed, 31 Jul 2024 07:01:56 +0000 Subject: [PATCH 02/10] Add prefix Signed-off-by: Muralidhar Andoorveedu --- vllm/model_executor/models/qwen.py | 5 ++++- vllm/model_executor/models/utils.py | 2 +- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/vllm/model_executor/models/qwen.py b/vllm/model_executor/models/qwen.py index 981a1684c0bd7..3947b535e8aad 100644 --- a/vllm/model_executor/models/qwen.py +++ b/vllm/model_executor/models/qwen.py @@ -134,6 +134,7 @@ def __init__( config: PretrainedConfig, cache_config: Optional[CacheConfig] = None, quant_config: Optional[QuantizationConfig] = None, + prefix: str = "", ): super().__init__() self.ln_1 = RMSNorm(config.hidden_size, eps=config.layer_norm_epsilon) @@ -188,6 +189,7 @@ def __init__( config: PretrainedConfig, cache_config: Optional[CacheConfig] = None, quant_config: Optional[QuantizationConfig] = None, + prefix: str = "", ): super().__init__() self.config = config @@ -199,7 +201,8 @@ def __init__( ) self.start_layer, self.end_layer, self.layers = make_layers( config.num_hidden_layers, - lambda prefix: QWenBlock(config, cache_config, quant_config), + lambda prefix: QWenBlock(config, cache_config, quant_config, prefix), + prefix=f"{prefix}" 
) self.ln_f = RMSNorm(config.hidden_size, eps=config.layer_norm_epsilon) diff --git a/vllm/model_executor/models/utils.py b/vllm/model_executor/models/utils.py index ff30c329ac245..91b4a27814bf4 100644 --- a/vllm/model_executor/models/utils.py +++ b/vllm/model_executor/models/utils.py @@ -132,7 +132,7 @@ def forward(*args, **kwargs): def make_layers( num_hidden_layers: int, layer_fn: LayerFn, - prefix: str = "", + prefix: str, ) -> Tuple[int, int, torch.nn.ModuleList]: """Make a list of layers with the given layer function, taking pipeline parallelism into account. From 1d50eca3944f15267dc1b9bd52cd5c231f36ba5e Mon Sep 17 00:00:00 2001 From: Muralidhar Andoorveedu Date: Wed, 31 Jul 2024 17:50:43 +0000 Subject: [PATCH 03/10] Change prefix Signed-off-by: Muralidhar Andoorveedu --- vllm/model_executor/models/qwen.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/vllm/model_executor/models/qwen.py b/vllm/model_executor/models/qwen.py index 3947b535e8aad..1b4611ef577ae 100644 --- a/vllm/model_executor/models/qwen.py +++ b/vllm/model_executor/models/qwen.py @@ -134,7 +134,6 @@ def __init__( config: PretrainedConfig, cache_config: Optional[CacheConfig] = None, quant_config: Optional[QuantizationConfig] = None, - prefix: str = "", ): super().__init__() self.ln_1 = RMSNorm(config.hidden_size, eps=config.layer_norm_epsilon) @@ -202,7 +201,7 @@ def __init__( self.start_layer, self.end_layer, self.layers = make_layers( config.num_hidden_layers, lambda prefix: QWenBlock(config, cache_config, quant_config, prefix), - prefix=f"{prefix}" + prefix=f"{prefix}.layers" ) self.ln_f = RMSNorm(config.hidden_size, eps=config.layer_norm_epsilon) From 3556334216d9dfefc58bb74f0a655f82b30739cd Mon Sep 17 00:00:00 2001 From: Muralidhar Andoorveedu Date: Thu, 1 Aug 2024 01:14:11 +0000 Subject: [PATCH 04/10] Fix name Signed-off-by: Muralidhar Andoorveedu --- vllm/model_executor/models/qwen.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git 
a/vllm/model_executor/models/qwen.py b/vllm/model_executor/models/qwen.py index 1b4611ef577ae..60b4e231aaf46 100644 --- a/vllm/model_executor/models/qwen.py +++ b/vllm/model_executor/models/qwen.py @@ -198,10 +198,10 @@ def __init__( config.vocab_size, config.hidden_size, ) - self.start_layer, self.end_layer, self.layers = make_layers( + self.start_layer, self.end_layer, self.h = make_layers( config.num_hidden_layers, - lambda prefix: QWenBlock(config, cache_config, quant_config, prefix), - prefix=f"{prefix}.layers" + lambda prefix: QWenBlock(config, cache_config, quant_config), + prefix=f"{prefix}.h" ) self.ln_f = RMSNorm(config.hidden_size, eps=config.layer_norm_epsilon) @@ -324,9 +324,6 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): # Skip loading extra bias for GPTQ models. if name.endswith(".bias") and name not in params_dict: continue - # Skip layers on other devices. - if is_pp_missing_parameter(name, self): - continue # Skip loading visual weights to support Qwen-VL models # in cases with text-only inputs # TODO: add support for Qwen-VL @@ -336,6 +333,9 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): "Only text inputs are allowed. Images won't be handled " "until Qwen-VL models are fully supported.") continue + # Skip layers on other devices. 
+ if is_pp_missing_parameter(name, self): + continue param = params_dict[name] weight_loader = getattr(param, "weight_loader", default_weight_loader) From 0f5b096b3fe47c978060c1efe2041e458fd57079 Mon Sep 17 00:00:00 2001 From: Muralidhar Andoorveedu Date: Thu, 1 Aug 2024 01:14:22 +0000 Subject: [PATCH 05/10] Fix name Signed-off-by: Muralidhar Andoorveedu --- vllm/config.py | 1 + 1 file changed, 1 insertion(+) diff --git a/vllm/config.py b/vllm/config.py index 8f4bdc81e3ec0..d05e959622ca4 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -49,6 +49,7 @@ "NemotronForCausalLM", "Qwen2ForCausalLM", "Qwen2MoeForCausalLM", + "QWenLMHeadModel", ] From 9501b117072eae82ad07b9c918662e1802539b63 Mon Sep 17 00:00:00 2001 From: Muralidhar Andoorveedu Date: Thu, 1 Aug 2024 01:15:44 +0000 Subject: [PATCH 06/10] Format Signed-off-by: Muralidhar Andoorveedu --- vllm/model_executor/models/qwen.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/vllm/model_executor/models/qwen.py b/vllm/model_executor/models/qwen.py index 60b4e231aaf46..a6ce0d084b830 100644 --- a/vllm/model_executor/models/qwen.py +++ b/vllm/model_executor/models/qwen.py @@ -12,7 +12,8 @@ from vllm.attention import Attention, AttentionMetadata from vllm.config import CacheConfig -from vllm.distributed import (get_tensor_model_parallel_world_size, get_pp_group) +from vllm.distributed import (get_tensor_model_parallel_world_size, + get_pp_group) from vllm.model_executor.layers.activation import SiluAndMul from vllm.model_executor.layers.layernorm import RMSNorm from vllm.model_executor.layers.linear import (MergedColumnParallelLinear, @@ -201,8 +202,7 @@ def __init__( self.start_layer, self.end_layer, self.h = make_layers( config.num_hidden_layers, lambda prefix: QWenBlock(config, cache_config, quant_config), - prefix=f"{prefix}.h" - ) + prefix=f"{prefix}.h") self.ln_f = RMSNorm(config.hidden_size, eps=config.layer_norm_epsilon) def forward( From f5702dd9f4b8fa8bae234c510465c6fcf7684df1 Mon Sep 
17 00:00:00 2001 From: Muralidhar Andoorveedu Date: Thu, 1 Aug 2024 01:16:05 +0000 Subject: [PATCH 07/10] Format Signed-off-by: Muralidhar Andoorveedu --- vllm/model_executor/models/qwen.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/vllm/model_executor/models/qwen.py b/vllm/model_executor/models/qwen.py index a6ce0d084b830..eb61adf34e9a7 100644 --- a/vllm/model_executor/models/qwen.py +++ b/vllm/model_executor/models/qwen.py @@ -12,8 +12,7 @@ from vllm.attention import Attention, AttentionMetadata from vllm.config import CacheConfig -from vllm.distributed import (get_tensor_model_parallel_world_size, - get_pp_group) +from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size from vllm.model_executor.layers.activation import SiluAndMul from vllm.model_executor.layers.layernorm import RMSNorm from vllm.model_executor.layers.linear import (MergedColumnParallelLinear, From ca1cadff7c48c2e62f901954996ecdf6557b5fec Mon Sep 17 00:00:00 2001 From: Murali Andoorveedu <37849411+andoorve@users.noreply.github.com> Date: Wed, 31 Jul 2024 23:16:12 -0700 Subject: [PATCH 08/10] Update distributed_serving.rst --- docs/source/serving/distributed_serving.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/serving/distributed_serving.rst b/docs/source/serving/distributed_serving.rst index ea36579b79323..fcb2646df50d3 100644 --- a/docs/source/serving/distributed_serving.rst +++ b/docs/source/serving/distributed_serving.rst @@ -50,7 +50,7 @@ You can also additionally specify :code:`--pipeline-parallel-size` to enable pip $ --pipeline-parallel-size 2 .. note:: - Pipeline parallel is a beta feature. It is only supported for online serving as well as LLaMa, GPT2, Mixtral, and Qwen style models. + Pipeline parallel is a beta feature. It is only supported for online serving as well as LLaMa, GPT2, Mixtral, Qwen, Qwen2, and Nemotron style models. 
Multi-Node Inference and Serving -------------------------------- From 06aa6f01b5e5da9bbaabbeb36bcd7420a9cfdb9f Mon Sep 17 00:00:00 2001 From: Muralidhar Andoorveedu Date: Thu, 1 Aug 2024 06:42:49 +0000 Subject: [PATCH 09/10] Revert config Signed-off-by: Muralidhar Andoorveedu --- vllm/config.py | 619 +++++++++++++++++++++++++++---------------------- 1 file changed, 336 insertions(+), 283 deletions(-) diff --git a/vllm/config.py b/vllm/config.py index d05e959622ca4..e065744592378 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -11,24 +11,17 @@ from vllm.model_executor.models import ModelRegistry from vllm.tracing import is_otel_installed from vllm.transformers_utils.config import get_config, get_hf_text_config -from vllm.utils import ( - cuda_device_count_stateless, - get_cpu_memory, - is_cpu, - is_hip, - is_neuron, - is_openvino, - is_tpu, - is_xpu, - print_warning_once, -) +from vllm.utils import (cuda_device_count_stateless, get_cpu_memory, is_cpu, + is_hip, is_neuron, is_openvino, is_tpu, is_xpu, + print_warning_once) if TYPE_CHECKING: from ray.util.placement_group import PlacementGroup from vllm.executor.executor_base import ExecutorBase from vllm.model_executor.model_loader.loader import BaseModelLoader - from vllm.transformers_utils.tokenizer_group.base_tokenizer_group import BaseTokenizerGroup + from vllm.transformers_utils.tokenizer_group.base_tokenizer_group import ( + BaseTokenizerGroup) logger = init_logger(__name__) @@ -49,7 +42,6 @@ "NemotronForCausalLM", "Qwen2ForCausalLM", "Qwen2MoeForCausalLM", - "QWenLMHeadModel", ] @@ -58,8 +50,8 @@ class ModelConfig: Args: model: Name or path of the huggingface model to use. - It is also used as the content for `model_name` tag in metrics - output when `served_model_name` is not specified. + It is also used as the content for `model_name` tag in metrics + output when `served_model_name` is not specified. tokenizer: Name or path of the huggingface tokenizer to use. tokenizer_mode: Tokenizer mode. 
"auto" will use the fast tokenizer if available, and "slow" will always use the slow tokenizer. @@ -106,8 +98,8 @@ class ModelConfig: skip_tokenizer_init: If true, skip initialization of tokenizer and detokenizer. served_model_name: The model name used in metrics tag `model_name`, - matches the model name exposed via the APIs. If multiple model - names provided, the first name will be used. If not specified, + matches the model name exposed via the APIs. If multiple model + names provided, the first name will be used. If not specified, the model name will be the same as `model`. """ @@ -154,36 +146,35 @@ def __init__( self.quantization_param_path = quantization_param_path self.enforce_eager = enforce_eager if max_context_len_to_capture is not None: - raise ValueError("`max_context_len_to_capture` is deprecated. " "Use `max_seq_len_to_capture` instead.") + raise ValueError("`max_context_len_to_capture` is deprecated. " + "Use `max_seq_len_to_capture` instead.") self.max_seq_len_to_capture = max_seq_len_to_capture self.max_logprobs = max_logprobs self.disable_sliding_window = disable_sliding_window self.skip_tokenizer_init = skip_tokenizer_init - self.hf_config = get_config(self.model, trust_remote_code, revision, code_revision, rope_scaling, rope_theta) + self.hf_config = get_config(self.model, trust_remote_code, revision, + code_revision, rope_scaling, rope_theta) self.hf_text_config = get_hf_text_config(self.hf_config) self.dtype = _get_and_verify_dtype(self.hf_text_config, dtype) - if ( - not self.disable_sliding_window - and self.hf_text_config.model_type == "gemma2" - and self.hf_text_config.sliding_window is not None - ): + if (not self.disable_sliding_window + and self.hf_text_config.model_type == "gemma2" + and self.hf_text_config.sliding_window is not None): print_warning_once( "Gemma 2 uses sliding window attention for every odd layer, " "which is currently not supported by vLLM. 
Disabling sliding " "window and capping the max length to the sliding window size " - f"({self.hf_text_config.sliding_window})." - ) + f"({self.hf_text_config.sliding_window}).") self.disable_sliding_window = True self.max_model_len = _get_and_verify_max_len( hf_config=self.hf_text_config, max_model_len=max_model_len, disable_sliding_window=self.disable_sliding_window, - sliding_window_len=self.get_hf_config_sliding_window(), - ) - self.served_model_name = get_served_model_name(model, served_model_name) + sliding_window_len=self.get_hf_config_sliding_window()) + self.served_model_name = get_served_model_name(model, + served_model_name) self.multimodal_config = multimodal_config if not self.skip_tokenizer_init: @@ -195,12 +186,15 @@ def __init__( def _verify_tokenizer_mode(self) -> None: tokenizer_mode = self.tokenizer_mode.lower() if tokenizer_mode not in ["auto", "slow"]: - raise ValueError(f"Unknown tokenizer mode: {self.tokenizer_mode}. Must be " "either 'auto' or 'slow'.") + raise ValueError( + f"Unknown tokenizer mode: {self.tokenizer_mode}. 
Must be " + "either 'auto' or 'slow'.") self.tokenizer_mode = tokenizer_mode def _verify_embedding_mode(self) -> None: architectures = getattr(self.hf_config, "architectures", []) - self.embedding_mode = any(ModelRegistry.is_embedding_model(arch) for arch in architectures) + self.embedding_mode = any( + ModelRegistry.is_embedding_model(arch) for arch in architectures) def _parse_quant_hf_config(self): quant_cfg = getattr(self.hf_config, "quantization_config", None) @@ -213,14 +207,8 @@ def _verify_quantization(self) -> None: supported_quantization = [*QUANTIZATION_METHODS] rocm_supported_quantization = ["gptq", "squeezellm"] optimized_quantization_methods = [ - "fp8", - "marlin", - "gptq_marlin_24", - "gptq_marlin", - "awq_marlin", - "fbgemm_fp8", - "compressed_tensors", - "compressed-tensors", + "fp8", "marlin", "gptq_marlin_24", "gptq_marlin", "awq_marlin", + "fbgemm_fp8", "compressed_tensors", "compressed-tensors" ] if self.quantization is not None: self.quantization = self.quantization.lower() @@ -233,7 +221,8 @@ def _verify_quantization(self) -> None: # Detect which checkpoint is it for _, method in QUANTIZATION_METHODS.items(): - quantization_override = method.override_quantization_method(quant_cfg, self.quantization) + quantization_override = method.override_quantization_method( + quant_cfg, self.quantization) if quantization_override: quant_method = quantization_override self.quantization = quantization_override @@ -247,53 +236,60 @@ def _verify_quantization(self) -> None: "Quantization method specified in the model config " f"({quant_method}) does not match the quantization " f"method specified in the `quantization` argument " - f"({self.quantization})." - ) + f"({self.quantization}).") if self.quantization is not None: if self.quantization not in supported_quantization: raise ValueError( - f"Unknown quantization method: {self.quantization}. Must " f"be one of {supported_quantization}." 
- ) - if is_hip() and self.quantization not in rocm_supported_quantization: - raise ValueError(f"{self.quantization} quantization is currently not " f"supported in ROCm.") + f"Unknown quantization method: {self.quantization}. Must " + f"be one of {supported_quantization}.") + if is_hip( + ) and self.quantization not in rocm_supported_quantization: + raise ValueError( + f"{self.quantization} quantization is currently not " + f"supported in ROCm.") if self.quantization not in optimized_quantization_methods: logger.warning( "%s quantization is not fully " "optimized yet. The speed can be slower than " - "non-quantized models.", - self.quantization, - ) + "non-quantized models.", self.quantization) def _verify_cuda_graph(self) -> None: if self.max_seq_len_to_capture is None: self.max_seq_len_to_capture = self.max_model_len - self.max_seq_len_to_capture = min(self.max_seq_len_to_capture, self.max_model_len) + self.max_seq_len_to_capture = min(self.max_seq_len_to_capture, + self.max_model_len) - def verify_with_parallel_config(self, parallel_config: "ParallelConfig") -> None: - total_num_attention_heads = getattr(self.hf_text_config, "num_attention_heads", 0) + def verify_with_parallel_config( + self, + parallel_config: "ParallelConfig", + ) -> None: + total_num_attention_heads = getattr(self.hf_text_config, + "num_attention_heads", 0) tensor_parallel_size = parallel_config.tensor_parallel_size if total_num_attention_heads % tensor_parallel_size != 0: raise ValueError( f"Total number of attention heads ({total_num_attention_heads})" " must be divisible by tensor parallel size " - f"({tensor_parallel_size})." 
- ) + f"({tensor_parallel_size}).") pipeline_parallel_size = parallel_config.pipeline_parallel_size architectures = getattr(self.hf_config, "architectures", []) - if not all(arch in _PP_SUPPORTED_MODELS for arch in architectures) and pipeline_parallel_size > 1: + if not all(arch in _PP_SUPPORTED_MODELS + for arch in architectures) and pipeline_parallel_size > 1: raise NotImplementedError( - "Pipeline parallelism is only supported for the following " f" architectures: {_PP_SUPPORTED_MODELS}." - ) + "Pipeline parallelism is only supported for the following " + f" architectures: {_PP_SUPPORTED_MODELS}.") if self.quantization == "bitsandbytes" and ( - parallel_config.tensor_parallel_size > 1 or parallel_config.pipeline_parallel_size > 1 - ): - raise ValueError("BitAndBytes quantization with TP or PP is not supported yet.") + parallel_config.tensor_parallel_size > 1 + or parallel_config.pipeline_parallel_size > 1): + raise ValueError( + "BitAndBytes quantization with TP or PP is not supported yet.") if self.quantization == "bitsandbytes" and self.enforce_eager is False: - raise ValueError("BitAndBytes with enforce_eager = False is not supported yet.") + raise ValueError( + "BitAndBytes with enforce_eager = False is not supported yet.") def get_hf_config_sliding_window(self) -> Optional[int]: """Get the sliding window size, or None if disabled.""" @@ -301,12 +297,14 @@ def get_hf_config_sliding_window(self) -> Optional[int]: # Some models, like Qwen2 and Qwen1.5, use `use_sliding_window` in # addition to sliding window size. We check if that field is present # and if it's False, return None. 
- if hasattr(self.hf_text_config, "use_sliding_window") and not self.hf_text_config.use_sliding_window: + if (hasattr(self.hf_text_config, "use_sliding_window") + and not self.hf_text_config.use_sliding_window): return None return getattr(self.hf_text_config, "sliding_window", None) def get_sliding_window(self) -> Optional[int]: - """Get the sliding window size, or None if disabled.""" + """Get the sliding window size, or None if disabled. + """ # If user disables sliding window, return None. if self.disable_sliding_window: return None @@ -321,14 +319,16 @@ def get_hidden_size(self) -> int: def get_head_size(self) -> int: # TODO remove hard code - if hasattr(self.hf_text_config, "model_type") and self.hf_text_config.model_type == 'deepseek_v2': + if hasattr(self.hf_text_config, "model_type" + ) and self.hf_text_config.model_type == 'deepseek_v2': # FlashAttention supports only head_size 32, 64, 128, 256, # we need to pad head_size 192 to 256 return 256 if hasattr(self.hf_text_config, "head_dim"): return self.hf_text_config.head_dim # FIXME(woosuk): This may not be true for all models. - return self.hf_text_config.hidden_size // self.hf_text_config.num_attention_heads + return (self.hf_text_config.hidden_size // + self.hf_text_config.num_attention_heads) def get_total_num_kv_heads(self) -> int: """Returns the total number of KV heads.""" @@ -337,10 +337,11 @@ def get_total_num_kv_heads(self) -> int: # multi_query flag is ignored and we use n_head_kv for the number of # KV heads. 
falcon_model_types = ["falcon", "RefinedWeb", "RefinedWebModel"] - new_decoder_arch_falcon = self.hf_config.model_type in falcon_model_types and getattr( - self.hf_config, "new_decoder_architecture", False - ) - if not new_decoder_arch_falcon and getattr(self.hf_text_config, "multi_query", False): + new_decoder_arch_falcon = ( + self.hf_config.model_type in falcon_model_types + and getattr(self.hf_config, "new_decoder_architecture", False)) + if not new_decoder_arch_falcon and getattr(self.hf_text_config, + "multi_query", False): # Multi-query attention, only one KV head. # Currently, tensor parallelism is not supported in this case. return 1 @@ -351,7 +352,8 @@ def get_total_num_kv_heads(self) -> int: return self.hf_config.attn_config["kv_n_heads"] return self.hf_config.num_attention_heads if self.hf_config.model_type == "dbrx": - return getattr(self.hf_config.attn_config, "kv_n_heads", self.hf_config.num_attention_heads) + return getattr(self.hf_config.attn_config, "kv_n_heads", + self.hf_config.num_attention_heads) attributes = [ # For Falcon: @@ -378,35 +380,48 @@ def get_num_kv_heads(self, parallel_config: "ParallelConfig") -> int: # the tensor parallel size. We will replicate the KV heads in the # case where the number of KV heads is smaller than the tensor # parallel size so each GPU has at least one KV head. 
- return max(1, total_num_kv_heads // parallel_config.tensor_parallel_size) + return max(1, + total_num_kv_heads // parallel_config.tensor_parallel_size) - def get_num_attention_heads(self, parallel_config: "ParallelConfig") -> int: + def get_num_attention_heads(self, + parallel_config: "ParallelConfig") -> int: num_heads = getattr(self.hf_text_config, "num_attention_heads", 0) return num_heads // parallel_config.tensor_parallel_size def get_num_layers(self, parallel_config: "ParallelConfig") -> int: from vllm.distributed.utils import get_pp_indices - - total_num_hidden_layers = getattr(self.hf_text_config, "num_hidden_layers", 0) + total_num_hidden_layers = getattr(self.hf_text_config, + "num_hidden_layers", 0) pp_rank = parallel_config.rank // parallel_config.tensor_parallel_size pp_size = parallel_config.pipeline_parallel_size start, end = get_pp_indices(total_num_hidden_layers, pp_rank, pp_size) return end - start - def contains_seqlen_agnostic_layers(self, parallel_config: "ParallelConfig") -> bool: + def contains_seqlen_agnostic_layers( + self, parallel_config: "ParallelConfig") -> bool: """True for Mamba/SSM models (Jamba)""" return self._get_num_seqlen_agnostic_layers(parallel_config) > 0 - def get_layers_block_type(self, parallel_config: "ParallelConfig") -> List[str]: + def get_layers_block_type(self, + parallel_config: "ParallelConfig") -> List[str]: num_layers = self.get_num_layers(parallel_config) # Transformers supports layers_block_type @property - return getattr(self.hf_config, "layers_block_type", ["attention"] * num_layers) + return getattr(self.hf_config, "layers_block_type", + ["attention"] * num_layers) - def get_num_attention_layers(self, parallel_config: "ParallelConfig") -> int: - return len([t for t in self.get_layers_block_type(parallel_config) if t == "attention"]) + def get_num_attention_layers(self, + parallel_config: "ParallelConfig") -> int: + return len([ + t for t in self.get_layers_block_type(parallel_config) + if t == "attention" 
+ ]) - def _get_num_seqlen_agnostic_layers(self, parallel_config: "ParallelConfig") -> int: - return len([t for t in self.get_layers_block_type(parallel_config) if t != "attention"]) + def _get_num_seqlen_agnostic_layers( + self, parallel_config: "ParallelConfig") -> int: + return len([ + t for t in self.get_layers_block_type(parallel_config) + if t != "attention" + ]) class CacheConfig: @@ -456,7 +471,9 @@ def metrics_info(self): def _verify_args(self) -> None: if self.gpu_memory_utilization > 1.0: - raise ValueError("GPU memory utilization must be less than 1.0. Got " f"{self.gpu_memory_utilization}.") + raise ValueError( + "GPU memory utilization must be less than 1.0. Got " + f"{self.gpu_memory_utilization}.") def _verify_cache_dtype(self) -> None: if self.cache_dtype == "auto": @@ -466,8 +483,7 @@ def _verify_cache_dtype(self) -> None: "Using fp8 data type to store kv cache. It reduces the GPU " "memory footprint and boosts the performance. " "Meanwhile, it may cause accuracy drop without a proper " - "scaling factor" - ) + "scaling factor") else: raise ValueError(f"Unknown kv cache dtype: {self.cache_dtype}") @@ -478,26 +494,25 @@ def _verify_prefix_caching(self) -> None: if self.sliding_window is not None: raise NotImplementedError( "Prefix caching is not supported with sliding window. " - "Run with --disable-sliding-window to use prefix caching." - ) + "Run with --disable-sliding-window to use prefix caching.") if self.cache_dtype == "fp8": raise NotImplementedError( "Prefix caching is not supported for fp8 cache_dtype. " - "Run with --kv-cache-dtype auto to use prefix caching." - ) + "Run with --kv-cache-dtype auto to use prefix caching.") - def verify_with_parallel_config(self, parallel_config: "ParallelConfig") -> None: + def verify_with_parallel_config( + self, + parallel_config: "ParallelConfig", + ) -> None: total_cpu_memory = get_cpu_memory() # FIXME(woosuk): Here, it is assumed that the GPUs in a tensor parallel # group are in the same node. 
However, the GPUs may span multiple nodes. num_gpus_per_node = parallel_config.tensor_parallel_size cpu_memory_usage = self.swap_space_bytes * num_gpus_per_node - msg = ( - f"{cpu_memory_usage / _GB:.2f} GiB out of " - f"the {total_cpu_memory / _GB:.2f} GiB total CPU memory is " - "allocated for the swap space." - ) + msg = (f"{cpu_memory_usage / _GB:.2f} GiB out of " + f"the {total_cpu_memory / _GB:.2f} GiB total CPU memory is " + "allocated for the swap space.") if cpu_memory_usage > 0.7 * total_cpu_memory: raise ValueError("Too large swap space. " + msg) elif cpu_memory_usage > 0.4 * total_cpu_memory: @@ -515,20 +530,21 @@ class TokenizerPoolConfig: The way the config will be used depends on the pool type. """ - pool_size: int pool_type: Union[str, Type["BaseTokenizerGroup"]] extra_config: dict def __post_init__(self): - if self.pool_type not in ("ray",) and not isinstance(self.pool_type, type): + if self.pool_type not in ("ray", ) and not isinstance( + self.pool_type, type): raise ValueError(f"Unknown pool type: {self.pool_type}") if not isinstance(self.extra_config, dict): raise ValueError("extra_config must be a dictionary.") @classmethod def create_config( - cls, tokenizer_pool_size: int, tokenizer_pool_type: str, tokenizer_pool_extra_config: Optional[Union[str, dict]] + cls, tokenizer_pool_size: int, tokenizer_pool_type: str, + tokenizer_pool_extra_config: Optional[Union[str, dict]] ) -> Optional["TokenizerPoolConfig"]: """Create a TokenizerPoolConfig from the given parameters. 
@@ -543,10 +559,14 @@ def create_config( """ if tokenizer_pool_size: if isinstance(tokenizer_pool_extra_config, str): - tokenizer_pool_extra_config_parsed = json.loads(tokenizer_pool_extra_config) + tokenizer_pool_extra_config_parsed = json.loads( + tokenizer_pool_extra_config) else: - tokenizer_pool_extra_config_parsed = tokenizer_pool_extra_config or {} - tokenizer_pool_config = cls(tokenizer_pool_size, tokenizer_pool_type, tokenizer_pool_extra_config_parsed) + tokenizer_pool_extra_config_parsed = ( + tokenizer_pool_extra_config or {}) + tokenizer_pool_config = cls(tokenizer_pool_size, + tokenizer_pool_type, + tokenizer_pool_extra_config_parsed) else: tokenizer_pool_config = None return tokenizer_pool_config @@ -566,40 +586,44 @@ class LoadFormat(str, enum.Enum): @dataclass class LoadConfig: """ - download_dir: Directory to download and load the weights, default to the - default cache directory of huggingface. - load_format: The format of the model weights to load: - "auto" will try to load the weights in the safetensors format and - fall back to the pytorch bin format if safetensors format is - not available. - "pt" will load the weights in the pytorch bin format. - "safetensors" will load the weights in the safetensors format. - "npcache" will load the weights in pytorch format and store - a numpy cache to speed up the loading. - "dummy" will initialize the weights with random values, which is - mainly for profiling. - "tensorizer" will use CoreWeave's tensorizer library for - fast weight loading. - "bitsandbytes" will load nf4 type weights. - ignore_patterns: The list of patterns to ignore when loading the model. - Default to "original/**/*" to avoid repeated loading of llama's - checkpoints. - + download_dir: Directory to download and load the weights, default to the + default cache directory of huggingface. 
+ load_format: The format of the model weights to load: + "auto" will try to load the weights in the safetensors format and + fall back to the pytorch bin format if safetensors format is + not available. + "pt" will load the weights in the pytorch bin format. + "safetensors" will load the weights in the safetensors format. + "npcache" will load the weights in pytorch format and store + a numpy cache to speed up the loading. + "dummy" will initialize the weights with random values, which is + mainly for profiling. + "tensorizer" will use CoreWeave's tensorizer library for + fast weight loading. + "bitsandbytes" will load nf4 type weights. + ignore_patterns: The list of patterns to ignore when loading the model. + Default to "original/**/*" to avoid repeated loading of llama's + checkpoints. + """ load_format: Union[str, LoadFormat, "BaseModelLoader"] = LoadFormat.AUTO download_dir: Optional[str] = None - model_loader_extra_config: Optional[Union[str, dict]] = field(default_factory=dict) + model_loader_extra_config: Optional[Union[str, dict]] = field( + default_factory=dict) ignore_patterns: Optional[Union[List[str], str]] = None def __post_init__(self): model_loader_extra_config = self.model_loader_extra_config or {} if isinstance(model_loader_extra_config, str): - self.model_loader_extra_config = json.loads(model_loader_extra_config) + self.model_loader_extra_config = json.loads( + model_loader_extra_config) self._verify_load_format() if self.ignore_patterns is not None and len(self.ignore_patterns) > 0: - logger.info("Ignoring the following patterns when downloading weights: %s", self.ignore_patterns) + logger.info( + "Ignoring the following patterns when downloading weights: %s", + self.ignore_patterns) else: self.ignore_patterns = ["original/**/*"] @@ -613,13 +637,13 @@ def _verify_load_format(self) -> None: rocm_not_supported_load_format: List[str] = [] if is_hip() and load_format in rocm_not_supported_load_format: rocm_supported_load_format = [ - f for f in 
LoadFormat.__members__ if (f not in rocm_not_supported_load_format) + f for f in LoadFormat.__members__ + if (f not in rocm_not_supported_load_format) ] raise ValueError( f"load format '{load_format}' is not supported in ROCm. " f"Supported load formats are " - f"{rocm_supported_load_format}" - ) + f"{rocm_supported_load_format}") class ParallelConfig: @@ -655,7 +679,8 @@ def __init__( tokenizer_pool_config: Optional[TokenizerPoolConfig] = None, ray_workers_use_nsight: bool = False, placement_group: Optional["PlacementGroup"] = None, - distributed_executor_backend: Optional[Union[str, Type["ExecutorBase"]]] = None, + distributed_executor_backend: Optional[Union[ + str, Type["ExecutorBase"]]] = None, ) -> None: self.pipeline_parallel_size = pipeline_parallel_size self.tensor_parallel_size = tensor_parallel_size @@ -671,42 +696,36 @@ def __init__( if self.distributed_executor_backend is None: self.distributed_executor_backend = "ray" elif not self.use_ray: - raise ValueError( - f"worker-use-ray can't be used with " - f"distributed executor backend " - f"'{self.distributed_executor_backend}'." - ) + raise ValueError(f"worker-use-ray can't be used with " + f"distributed executor backend " + f"'{self.distributed_executor_backend}'.") if self.distributed_executor_backend is None and self.world_size > 1: # We use multiprocessing by default if world_size fits on the # current node and we aren't in a ray placement group. from vllm.executor import ray_utils - backend = "mp" ray_found = ray_utils.ray_is_available() if cuda_device_count_stateless() < self.world_size: if not ray_found: - raise ValueError( - "Unable to load Ray which is " - "required for multi-node inference, " - "please install Ray with `pip install " - "ray`." 
- ) from ray_utils.ray_import_err + raise ValueError("Unable to load Ray which is " + "required for multi-node inference, " + "please install Ray with `pip install " + "ray`.") from ray_utils.ray_import_err backend = "ray" elif ray_found: if self.placement_group: backend = "ray" else: from ray import is_initialized as ray_is_initialized - if ray_is_initialized(): from ray.util import get_current_placement_group - if get_current_placement_group(): backend = "ray" self.distributed_executor_backend = backend - logger.info("Defaulting to use %s for distributed inference", backend) + logger.info("Defaulting to use %s for distributed inference", + backend) self._verify_args() self.rank: int = 0 @@ -714,31 +733,32 @@ def __init__( @property def use_ray(self) -> bool: return self.distributed_executor_backend == "ray" or ( - isinstance(self.distributed_executor_backend, type) and self.distributed_executor_backend.uses_ray - ) + isinstance(self.distributed_executor_backend, type) + and self.distributed_executor_backend.uses_ray) def _verify_args(self) -> None: # Lazy import to avoid circular import from vllm.executor.executor_base import ExecutorBase - if self.distributed_executor_backend not in ("ray", "mp", None) and not ( - isinstance(self.distributed_executor_backend, type) - and issubclass(self.distributed_executor_backend, ExecutorBase) - ): + if self.distributed_executor_backend not in ( + "ray", "mp", None) and not (isinstance( + self.distributed_executor_backend, type) and issubclass( + self.distributed_executor_backend, ExecutorBase)): raise ValueError( "Unrecognized distributed executor backend " f"{self.distributed_executor_backend}. Supported " - "values are 'ray', 'mp' or custom ExecutorBase subclass." 
- ) + "values are 'ray', 'mp' or custom ExecutorBase subclass.") if self.use_ray: from vllm.executor import ray_utils - ray_utils.assert_ray_available() if is_hip(): self.disable_custom_all_reduce = True - logger.info("Disabled the custom all-reduce kernel because it is not " "supported on AMD GPUs.") + logger.info( + "Disabled the custom all-reduce kernel because it is not " + "supported on AMD GPUs.") if self.ray_workers_use_nsight and not self.use_ray: - raise ValueError("Unable to use nsight profiling unless workers " "run with Ray.") + raise ValueError("Unable to use nsight profiling unless workers " + "run with Ray.") class SchedulerConfig: @@ -761,7 +781,7 @@ class SchedulerConfig: enable_chunked_prefill: If True, prefill requests can be chunked based on the remaining max_num_batched_tokens. embedding_mode: Whether the running model is for embedding. - preemption_mode: Whether to perform preemption by swapping or + preemption_mode: Whether to perform preemption by swapping or recomputation. If not specified, we determine the mode as follows: We use recomputation by default since it incurs lower overhead than swapping. However, when the sequence group has multiple sequences @@ -769,18 +789,16 @@ class SchedulerConfig: such a case, we use swapping instead. 
""" - def __init__( - self, - max_num_batched_tokens: Optional[int], - max_num_seqs: int, - max_model_len: int, - use_v2_block_manager: bool = False, - num_lookahead_slots: int = 0, - delay_factor: float = 0.0, - enable_chunked_prefill: bool = False, - embedding_mode: Optional[bool] = False, - preemption_mode: Optional[str] = None, - ) -> None: + def __init__(self, + max_num_batched_tokens: Optional[int], + max_num_seqs: int, + max_model_len: int, + use_v2_block_manager: bool = False, + num_lookahead_slots: int = 0, + delay_factor: float = 0.0, + enable_chunked_prefill: bool = False, + embedding_mode: Optional[bool] = False, + preemption_mode: Optional[str] = None) -> None: if max_num_batched_tokens is not None: self.max_num_batched_tokens = max_num_batched_tokens else: @@ -790,13 +808,16 @@ def __init__( self.max_num_batched_tokens = 512 elif embedding_mode: # For embedding, choose specific value for higher throughput - self.max_num_batched_tokens = max(max_model_len, _EMBEDDING_MODEL_MAX_NUM_BATCHED_TOKENS) + self.max_num_batched_tokens = max( + max_model_len, _EMBEDDING_MODEL_MAX_NUM_BATCHED_TOKENS) else: # If max_model_len is too short, use 2048 as the default value # for higher throughput. self.max_num_batched_tokens = max(max_model_len, 2048) if enable_chunked_prefill: - logger.info("Chunked prefill is enabled with max_num_batched_tokens=%d.", self.max_num_batched_tokens) + logger.info( + "Chunked prefill is enabled with max_num_batched_tokens=%d.", + self.max_num_batched_tokens) self.max_num_seqs = max_num_seqs self.max_model_len = max_model_len @@ -809,27 +830,27 @@ def __init__( self._verify_args() def _verify_args(self) -> None: - if self.max_num_batched_tokens < self.max_model_len and not self.chunked_prefill_enabled: + if (self.max_num_batched_tokens < self.max_model_len + and not self.chunked_prefill_enabled): raise ValueError( f"max_num_batched_tokens ({self.max_num_batched_tokens}) is " f"smaller than max_model_len ({self.max_model_len}). 
" "This effectively limits the maximum sequence length to " "max_num_batched_tokens and makes vLLM reject longer " "sequences. Please increase max_num_batched_tokens or " - "decrease max_model_len." - ) + "decrease max_model_len.") if self.max_num_batched_tokens < self.max_num_seqs: raise ValueError( f"max_num_batched_tokens ({self.max_num_batched_tokens}) must " "be greater than or equal to max_num_seqs " - f"({self.max_num_seqs})." - ) + f"({self.max_num_seqs}).") if self.num_lookahead_slots < 0: raise ValueError( - "num_lookahead_slots " f"({self.num_lookahead_slots}) must be greater than or " "equal to 0." - ) + "num_lookahead_slots " + f"({self.num_lookahead_slots}) must be greater than or " + "equal to 0.") class DeviceConfig: @@ -935,7 +956,7 @@ def maybe_create_spec_config( typical_acceptance_sampler_posterior_threshold (Optional[float]): A threshold value that sets a lower bound on the posterior probability of a token in the target model for it to be - accepted. This threshold is used only when we use the + accepted. This threshold is used only when we use the TypicalAcceptanceSampler for token acceptance. typical_acceptance_sampler_posterior_alpha (Optional[float]): A scaling factor for the entropy-based threshold in the @@ -945,7 +966,7 @@ def maybe_create_spec_config( If set to False, token log probabilities are returned according to the log probability settings in SamplingParams. If not specified, it defaults to True. - + Returns: Optional["SpeculativeConfig"]: An instance of SpeculativeConfig if the necessary conditions are met, else None. 
@@ -953,26 +974,25 @@ def maybe_create_spec_config( if speculative_model is None: if num_speculative_tokens is not None: - raise ValueError("num_speculative_tokens was provided without " "speculative_model.") + raise ValueError("num_speculative_tokens was provided without " + "speculative_model.") return None - if speculative_disable_by_batch_size is not None and speculative_disable_by_batch_size < 2: - raise ValueError( - "Expect the batch size threshold of disabling " - "speculative decoding is > 1, but got " - f"{speculative_disable_by_batch_size=}" - ) + if (speculative_disable_by_batch_size is not None + and speculative_disable_by_batch_size < 2): + raise ValueError("Expect the batch size threshold of disabling " + "speculative decoding is > 1, but got " + f"{speculative_disable_by_batch_size=}") if enable_chunked_prefill: raise ValueError( "Speculative decoding and chunked prefill are " - f"currently mutually exclusive ({enable_chunked_prefill=})." - ) + f"currently mutually exclusive ({enable_chunked_prefill=}).") if not use_v2_block_manager: raise ValueError( - "Speculative decoding requires usage of the V2 " "block manager. Enable it with --use-v2-block-manager." - ) + "Speculative decoding requires usage of the V2 " + "block manager. Enable it with --use-v2-block-manager.") # TODO: The user should be able to specify revision/quantization/max # model len for the draft model. It is not currently supported. 
@@ -988,7 +1008,8 @@ def maybe_create_spec_config( if ngram_prompt_lookup_min < 1: raise ValueError(f"{ngram_prompt_lookup_min=} must be > 0") if ngram_prompt_lookup_min > ngram_prompt_lookup_max: - raise ValueError(f"{ngram_prompt_lookup_min=} cannot be " f"larger than {ngram_prompt_lookup_max=}") + raise ValueError(f"{ngram_prompt_lookup_min=} cannot be " + f"larger than {ngram_prompt_lookup_max=}") # TODO: current we still need extract vocab_size from target model # config, in future, we may try refactor it out, and set @@ -1011,13 +1032,15 @@ def maybe_create_spec_config( max_model_len=None, quantization=draft_quantization, enforce_eager=target_model_config.enforce_eager, - max_seq_len_to_capture=target_model_config.max_seq_len_to_capture, + max_seq_len_to_capture=target_model_config. + max_seq_len_to_capture, max_logprobs=target_model_config.max_logprobs, ) draft_hf_config = draft_model_config.hf_config - if num_speculative_tokens is not None and hasattr(draft_hf_config, "num_lookahead_tokens"): + if (num_speculative_tokens is not None + and hasattr(draft_hf_config, "num_lookahead_tokens")): draft_hf_config.num_lookahead_tokens = num_speculative_tokens n_predict = getattr(draft_hf_config, "n_predict", None) @@ -1031,23 +1054,25 @@ def maybe_create_spec_config( raise ValueError( "This speculative model supports a maximum of " f"num_speculative_tokens={n_predict}, but " - f"{num_speculative_tokens=} was provided." 
- ) + f"{num_speculative_tokens=} was provided.") - draft_model_config.max_model_len = SpeculativeConfig._maybe_override_draft_max_model_len( - speculative_max_model_len, draft_model_config.max_model_len, target_model_config.max_model_len - ) + draft_model_config.max_model_len = ( + SpeculativeConfig._maybe_override_draft_max_model_len( + speculative_max_model_len, + draft_model_config.max_model_len, + target_model_config.max_model_len, + )) - draft_parallel_config = SpeculativeConfig.create_draft_parallel_config( - target_parallel_config, speculative_draft_tensor_parallel_size - ) + draft_parallel_config = ( + SpeculativeConfig.create_draft_parallel_config( + target_parallel_config, + speculative_draft_tensor_parallel_size)) if num_speculative_tokens is None: raise ValueError( "num_speculative_tokens must be provided with " "speculative_model unless the draft model config contains an " - "n_predict parameter." - ) + "n_predict parameter.") if typical_acceptance_sampler_posterior_threshold is None: typical_acceptance_sampler_posterior_threshold = 0.09 @@ -1064,14 +1089,18 @@ def maybe_create_spec_config( ngram_prompt_lookup_max, ngram_prompt_lookup_min, draft_token_acceptance_method=draft_token_acceptance_method, - typical_acceptance_sampler_posterior_threshold=typical_acceptance_sampler_posterior_threshold, - typical_acceptance_sampler_posterior_alpha=typical_acceptance_sampler_posterior_alpha, - disable_logprobs=disable_logprobs, + typical_acceptance_sampler_posterior_threshold=\ + typical_acceptance_sampler_posterior_threshold, + typical_acceptance_sampler_posterior_alpha=\ + typical_acceptance_sampler_posterior_alpha, + disable_logprobs=disable_logprobs ) @staticmethod def _maybe_override_draft_max_model_len( - speculative_max_model_len: Optional[int], draft_max_model_len: int, target_max_model_len: int + speculative_max_model_len: Optional[int], + draft_max_model_len: int, + target_max_model_len: int, ) -> int: """Determine the max sequence len for the draft 
model. This is usually the draft_max_model_len, but may be the target_max_model_len if it is @@ -1088,37 +1117,51 @@ def _maybe_override_draft_max_model_len( if speculative_max_model_len is not None: if speculative_max_model_len > draft_max_model_len: - raise ValueError(f"{speculative_max_model_len=} cannot be " f"larger than {draft_max_model_len=}") + raise ValueError(f"{speculative_max_model_len=} cannot be " + f"larger than {draft_max_model_len=}") if speculative_max_model_len > target_max_model_len: - raise ValueError(f"{speculative_max_model_len=} cannot be " f"larger than {target_max_model_len=}") + raise ValueError(f"{speculative_max_model_len=} cannot be " + f"larger than {target_max_model_len=}") return speculative_max_model_len - return min(draft_max_model_len, target_max_model_len) + return min( + draft_max_model_len, + target_max_model_len, + ) @staticmethod def create_draft_parallel_config( - target_parallel_config: ParallelConfig, speculative_draft_tensor_parallel_size: Optional[int] + target_parallel_config: ParallelConfig, + speculative_draft_tensor_parallel_size: Optional[int] ) -> ParallelConfig: """Create a parallel config for use by the draft worker. This is mostly a copy of the target parallel config, except the tp_size. """ if speculative_draft_tensor_parallel_size is None: - speculative_draft_tensor_parallel_size = target_parallel_config.tensor_parallel_size + speculative_draft_tensor_parallel_size = \ + target_parallel_config.tensor_parallel_size elif speculative_draft_tensor_parallel_size != 1: # TODO(wooyeon): allow tp values larger than 1 - raise ValueError(f"{speculative_draft_tensor_parallel_size=} cannot be" f"other value than 1") + raise ValueError( + f"{speculative_draft_tensor_parallel_size=} cannot be " + f"other value than 1") draft_parallel_config = ParallelConfig( - pipeline_parallel_size=target_parallel_config.pipeline_parallel_size, + pipeline_parallel_size=target_parallel_config. 
+ pipeline_parallel_size, tensor_parallel_size=speculative_draft_tensor_parallel_size, - distributed_executor_backend=target_parallel_config.distributed_executor_backend, - max_parallel_loading_workers=target_parallel_config.max_parallel_loading_workers, - disable_custom_all_reduce=target_parallel_config.disable_custom_all_reduce, + distributed_executor_backend=target_parallel_config. + distributed_executor_backend, + max_parallel_loading_workers=target_parallel_config. + max_parallel_loading_workers, + disable_custom_all_reduce=target_parallel_config. + disable_custom_all_reduce, tokenizer_pool_config=target_parallel_config.tokenizer_pool_config, - ray_workers_use_nsight=target_parallel_config.ray_workers_use_nsight, + ray_workers_use_nsight=target_parallel_config. + ray_workers_use_nsight, placement_group=target_parallel_config.placement_group, ) @@ -1157,13 +1200,13 @@ def __init__( typical_acceptance_sampler_posterior_threshold (Optional[float]): A threshold value that sets a lower bound on the posterior probability of a token in the target model for it to be - accepted. This threshold is used only when we use the + accepted. This threshold is used only when we use the TypicalAcceptanceSampler for token acceptance. typical_acceptance_sampler_posterior_alpha (Optional[float]): A scaling factor for the entropy-based threshold in the TypicalAcceptanceSampler. disable_logprobs: If set to True, token log probabilities will not - be returned even if requested by sampling parameters. This + be returned even if requested by sampling parameters. This reduces latency by skipping logprob calculation in proposal sampling, target sampling, and after accepted tokens are determined. 
If set to False, log probabilities will be @@ -1172,47 +1215,44 @@ def __init__( self.draft_model_config = draft_model_config self.draft_parallel_config = draft_parallel_config self.num_speculative_tokens = num_speculative_tokens - self.speculative_disable_by_batch_size = speculative_disable_by_batch_size + self.speculative_disable_by_batch_size = \ + speculative_disable_by_batch_size self.ngram_prompt_lookup_max = ngram_prompt_lookup_max or 0 self.ngram_prompt_lookup_min = ngram_prompt_lookup_min or 0 self.draft_token_acceptance_method = draft_token_acceptance_method - self.typical_acceptance_sampler_posterior_threshold = typical_acceptance_sampler_posterior_threshold - self.typical_acceptance_sampler_posterior_alpha = typical_acceptance_sampler_posterior_alpha + self.typical_acceptance_sampler_posterior_threshold = \ + typical_acceptance_sampler_posterior_threshold + self.typical_acceptance_sampler_posterior_alpha = \ + typical_acceptance_sampler_posterior_alpha self.disable_logprobs = disable_logprobs self._verify_args() def _verify_args(self) -> None: if self.num_speculative_tokens <= 0: - raise ValueError( - "Expected num_speculative_tokens to be greater " f"than zero ({self.num_speculative_tokens})." - ) + raise ValueError("Expected num_speculative_tokens to be greater " + f"than zero ({self.num_speculative_tokens}).") if self.draft_model_config: - self.draft_model_config.verify_with_parallel_config(self.draft_parallel_config) + self.draft_model_config.verify_with_parallel_config( + self.draft_parallel_config) # Validate and set draft token acceptance related settings. - if self.draft_token_acceptance_method is None: - raise ValueError( - "draft_token_acceptance_method is not set. " - "Expected values are rejection_sampler or " - "typical_acceptance_sampler." - ) + if (self.draft_token_acceptance_method is None): + raise ValueError("draft_token_acceptance_method is not set. 
" + "Expected values are rejection_sampler or " + "typical_acceptance_sampler.") - if ( - self.draft_token_acceptance_method != 'rejection_sampler' - and self.draft_token_acceptance_method != 'typical_acceptance_sampler' - ): + if (self.draft_token_acceptance_method != 'rejection_sampler' + and self.draft_token_acceptance_method != + 'typical_acceptance_sampler'): raise ValueError( "Expected draft_token_acceptance_method to be either " "rejection_sampler or typical_acceptance_sampler. Instead it " - f"is {self.draft_token_acceptance_method}" - ) + f"is {self.draft_token_acceptance_method}") - if ( - self.typical_acceptance_sampler_posterior_threshold < 0 - or self.typical_acceptance_sampler_posterior_alpha < 0 - ): + if (self.typical_acceptance_sampler_posterior_threshold < 0 + or self.typical_acceptance_sampler_posterior_alpha < 0): raise ValueError( "Expected typical_acceptance_sampler_posterior_threshold " "and typical_acceptance_sampler_posterior_alpha to be > 0. " @@ -1220,8 +1260,7 @@ def _verify_args(self) -> None: f"typical_acceptance_sampler_posterior_threshold = " f"{self.typical_acceptance_sampler_posterior_threshold} and " f"typical_acceptance_sampler_posterior_alpha = " - f"{self.typical_acceptance_sampler_posterior_alpha}" - ) + f"{self.typical_acceptance_sampler_posterior_alpha}") @property def num_lookahead_slots(self) -> int: @@ -1259,35 +1298,40 @@ def __post_init__(self): possible_max_ranks = (8, 16, 32, 64) possible_lora_extra_vocab_size = (0, 256, 512) if self.max_lora_rank not in possible_max_ranks: - raise ValueError(f"max_lora_rank ({self.max_lora_rank}) must be one of " f"{possible_max_ranks}.") + raise ValueError( + f"max_lora_rank ({self.max_lora_rank}) must be one of " + f"{possible_max_ranks}.") if self.lora_extra_vocab_size not in possible_lora_extra_vocab_size: raise ValueError( f"lora_extra_vocab_size ({self.lora_extra_vocab_size}) " - f"must be one of {possible_lora_extra_vocab_size}." 
- ) + f"must be one of {possible_lora_extra_vocab_size}.") if self.max_loras < 1: raise ValueError(f"max_loras ({self.max_loras}) must be >= 1.") if self.max_cpu_loras is None: self.max_cpu_loras = self.max_loras elif self.max_cpu_loras < self.max_loras: - raise ValueError(f"max_cpu_loras ({self.max_cpu_loras}) must be >= " f"max_loras ({self.max_loras})") + raise ValueError( + f"max_cpu_loras ({self.max_cpu_loras}) must be >= " + f"max_loras ({self.max_loras})") def verify_with_model_config(self, model_config: ModelConfig): if self.lora_dtype in (None, "auto"): self.lora_dtype = model_config.dtype elif isinstance(self.lora_dtype, str): self.lora_dtype = getattr(torch, self.lora_dtype) - if model_config.quantization and model_config.quantization not in ["awq", "gptq"]: + if model_config.quantization and model_config.quantization not in [ + "awq", "gptq" + ]: # TODO support marlin and squeezellm - logger.warning("%s quantization is not tested with LoRA yet.", model_config.quantization) + logger.warning("%s quantization is not tested with LoRA yet.", + model_config.quantization) def verify_with_scheduler_config(self, scheduler_config: SchedulerConfig): if scheduler_config.max_num_batched_tokens > 65528: raise ValueError( "Due to limitations of the custom LoRA CUDA kernel, " "max_num_batched_tokens must be <= 65528 when " - "LoRA is enabled." 
- ) + "LoRA is enabled.") if scheduler_config.chunked_prefill_enabled: raise ValueError("LoRA is not supported with chunked prefill yet.") @@ -1310,7 +1354,8 @@ def __post_init__(self): ) from e if self.max_prompt_adapters < 1: - raise ValueError(f"max_prompt_adapters " f"({self.max_prompt_adapters}) must be >= 1.") + raise ValueError(f"max_prompt_adapters " + f"({self.max_prompt_adapters}) must be >= 1.") if self.max_prompt_adapter_token == 0: raise ValueError("max_prompt_adapter_token must be set.") if self.max_cpu_prompt_adapters is None: @@ -1320,14 +1365,14 @@ def verify_with_model_config(self, model_config: ModelConfig): if self.prompt_adapter_dtype in (None, "auto"): self.prompt_adapter_dtype = model_config.dtype elif isinstance(self.prompt_adapter_dtype, str): - self.prompt_adapter_dtype = getattr(torch, self.prompt_adapter_dtype) + self.prompt_adapter_dtype = getattr(torch, + self.prompt_adapter_dtype) @dataclass class MultiModalConfig: """Configs the input data format and how models should run for multimodal models.""" - # TODO: Add configs to init vision tower or not. pass @@ -1343,7 +1388,10 @@ class MultiModalConfig: _ROCM_NOT_SUPPORTED_DTYPE: List[str] = [] # -def _get_and_verify_dtype(config: PretrainedConfig, dtype: Union[str, torch.dtype]) -> torch.dtype: +def _get_and_verify_dtype( + config: PretrainedConfig, + dtype: Union[str, torch.dtype], +) -> torch.dtype: # NOTE: getattr(config, "torch_dtype", torch.float32) is not correct # because config.torch_dtype can be None. config_dtype = getattr(config, "torch_dtype", None) @@ -1358,8 +1406,7 @@ def _get_and_verify_dtype(config: PretrainedConfig, dtype: Union[str, torch.dtyp logger.info( "For Gemma 2, we downcast float32 to bfloat16 instead " "of float16 by default. Please specify `dtype` if you " - "want to use float16." 
- ) + "want to use float16.") torch_dtype = torch.bfloat16 else: # Following the common practice, we use float16 for float32 @@ -1422,13 +1469,15 @@ def _get_and_verify_max_len( for key in possible_keys: max_len = getattr(hf_config, key, None) if max_len is not None: - max_len_key = key if max_len < derived_max_model_len else max_len_key + max_len_key = key if max_len < derived_max_model_len \ + else max_len_key derived_max_model_len = min(derived_max_model_len, max_len) # If sliding window is manually disabled, max_length should be less # than the sliding window length in the model config. if disable_sliding_window and sliding_window_len is not None: - max_len_key = "sliding_window" if sliding_window_len < derived_max_model_len else max_len_key + max_len_key = "sliding_window" \ + if sliding_window_len < derived_max_model_len else max_len_key derived_max_model_len = min(derived_max_model_len, sliding_window_len) # If none of the keys were found in the config, use a default and @@ -1442,10 +1491,8 @@ def _get_and_verify_max_len( logger.warning( "The model's config.json does not contain any of the following " "keys to determine the original maximum length of the model: " - "%s. Assuming the model's maximum length is %d.", - possible_keys, - default_max_len, - ) + "%s. Assuming the model's maximum length is %d.", possible_keys, + default_max_len) derived_max_model_len = default_max_len rope_scaling = getattr(hf_config, "rope_scaling", None) @@ -1455,7 +1502,8 @@ def _get_and_verify_max_len( elif "rope_type" in rope_scaling: rope_type = rope_scaling["rope_type"] else: - raise ValueError("rope_scaling must have a 'type' or 'rope_type' key.") + raise ValueError( + "rope_scaling must have a 'type' or 'rope_type' key.") # The correct one should be "longrope", kept "su" here # to be backward compatible @@ -1466,13 +1514,13 @@ def _get_and_verify_max_len( raise NotImplementedError( "Disabling sliding window is not supported for models " "with rope_scaling. 
Please raise an issue so we can " - "investigate." - ) + "investigate.") assert "factor" in rope_scaling scaling_factor = rope_scaling["factor"] if rope_type == "yarn": - derived_max_model_len = rope_scaling["original_max_position_embeddings"] + derived_max_model_len = rope_scaling[ + "original_max_position_embeddings"] derived_max_model_len *= scaling_factor # If the user specified a max length, make sure it is smaller than the @@ -1491,8 +1539,7 @@ def _get_and_verify_max_len( raise NotImplementedError( "Disabling sliding window is not supported for models " "model_max_length in the config. Please raise an issue " - "so we can investigate." - ) + "so we can investigate.") pass else: raise ValueError( @@ -1501,17 +1548,17 @@ def _get_and_verify_max_len( f"({max_len_key}={derived_max_model_len} or model_max_length=" f"{model_max_length} in model's config.json). This may lead " "to incorrect model outputs or CUDA errors. Make sure the " - "value is correct and within the model context size." - ) + "value is correct and within the model context size.") return int(max_model_len) -def get_served_model_name(model: str, served_model_name: Optional[Union[str, List[str]]]): +def get_served_model_name(model: str, + served_model_name: Optional[Union[str, List[str]]]): """ - If the input is a non-empty list, the first model_name in - `served_model_name` is taken. - If the input is a non-empty string, it is used directly. - For cases where the input is either an empty string or an + If the input is a non-empty list, the first model_name in + `served_model_name` is taken. + If the input is a non-empty string, it is used directly. + For cases where the input is either an empty string or an empty list, the fallback is to use `self.model`. 
""" if not served_model_name: @@ -1532,18 +1579,19 @@ def __post_init__(self): valid_guided_backends = ['outlines', 'lm-format-enforcer'] backend = self.guided_decoding_backend if backend not in valid_guided_backends: - raise ValueError(f"Invalid guided_decoding_backend '{backend}," f"must be one of {valid_guided_backends}") + raise ValueError(f"Invalid guided_decoding_backend '{backend}," + f"must be one of {valid_guided_backends}") @dataclass class ObservabilityConfig: """Configuration for observability.""" - otlp_traces_endpoint: Optional[str] = None def __post_init__(self): if not is_otel_installed() and self.otlp_traces_endpoint is not None: - raise ValueError("OpenTelemetry packages must be installed before " "configuring 'otlp_traces_endpoint'") + raise ValueError("OpenTelemetry packages must be installed before " + "configuring 'otlp_traces_endpoint'") @dataclass(frozen=True) @@ -1566,16 +1614,21 @@ class EngineConfig: prompt_adapter_config: Optional[PromptAdapterConfig] def __post_init__(self): - """Verify configs are valid & consistent with each other.""" + """Verify configs are valid & consistent with each other. + """ self.model_config.verify_with_parallel_config(self.parallel_config) self.cache_config.verify_with_parallel_config(self.parallel_config) if self.lora_config: self.lora_config.verify_with_model_config(self.model_config) - self.lora_config.verify_with_scheduler_config(self.scheduler_config) + self.lora_config.verify_with_scheduler_config( + self.scheduler_config) if self.prompt_adapter_config: - self.prompt_adapter_config.verify_with_model_config(self.model_config) + self.prompt_adapter_config.verify_with_model_config( + self.model_config) def to_dict(self): - """Return the configs as a dictionary, for use in **kwargs.""" - return dict((field.name, getattr(self, field.name)) for field in fields(self)) + """Return the configs as a dictionary, for use in **kwargs. 
+ """ + return dict( + (field.name, getattr(self, field.name)) for field in fields(self)) From ec09d4d88b26e89492c92afdbc5ff4547ea762d9 Mon Sep 17 00:00:00 2001 From: Muralidhar Andoorveedu Date: Thu, 1 Aug 2024 06:43:15 +0000 Subject: [PATCH 10/10] Add QWenLMHeadModel to supported models Signed-off-by: Muralidhar Andoorveedu --- vllm/config.py | 1 + 1 file changed, 1 insertion(+) diff --git a/vllm/config.py b/vllm/config.py index e065744592378..ef56e2b6395be 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -42,6 +42,7 @@ "NemotronForCausalLM", "Qwen2ForCausalLM", "Qwen2MoeForCausalLM", + "QWenLMHeadModel", ]