From 66d617e3437e62a6650ffcb85b3190669d37a468 Mon Sep 17 00:00:00 2001 From: Cyrus Leung Date: Wed, 7 Aug 2024 17:12:05 +0800 Subject: [PATCH 01/18] [Frontend] Gracefully handle missing chat template and fix CI failure (#7238) Co-authored-by: Roger Wang --- tests/async_engine/test_chat_template.py | 21 ++--- tests/async_engine/test_openapi_server_ray.py | 10 ++- .../openai/test_oot_registration.py | 87 ++++++++++++------- tests/utils.py | 4 +- vllm/entrypoints/chat_utils.py | 37 +++++++- vllm/entrypoints/openai/protocol.py | 5 +- vllm/entrypoints/openai/serving_chat.py | 8 +- .../openai/serving_tokenization.py | 14 +-- vllm/transformers_utils/tokenizer.py | 8 +- 9 files changed, 125 insertions(+), 69 deletions(-) diff --git a/tests/async_engine/test_chat_template.py b/tests/async_engine/test_chat_template.py index aea8a7fed6e33..4df6c02973284 100644 --- a/tests/async_engine/test_chat_template.py +++ b/tests/async_engine/test_chat_template.py @@ -1,22 +1,16 @@ -import os -import pathlib - import pytest -from vllm.entrypoints.chat_utils import load_chat_template +from vllm.entrypoints.chat_utils import apply_chat_template, load_chat_template from vllm.entrypoints.openai.protocol import ChatCompletionRequest from vllm.transformers_utils.tokenizer import get_tokenizer -chatml_jinja_path = pathlib.Path(os.path.dirname(os.path.abspath( - __file__))).parent.parent / "examples/template_chatml.jinja" +from ..utils import VLLM_PATH + +chatml_jinja_path = VLLM_PATH / "examples/template_chatml.jinja" assert chatml_jinja_path.exists() # Define models, templates, and their corresponding expected outputs MODEL_TEMPLATE_GENERATON_OUTPUT = [ - ("facebook/opt-125m", None, True, - "HelloHi there!What is the capital of"), - ("facebook/opt-125m", None, False, - "HelloHi there!What is the capital of"), ("facebook/opt-125m", chatml_jinja_path, True, """<|im_start|>user Hello<|im_end|> <|im_start|>assistant @@ -93,11 +87,12 @@ def test_get_gen_prompt(model, template, add_generation_prompt, add_generation_prompt=add_generation_prompt) # Call the function and get the result - result = tokenizer.apply_chat_template( + result = apply_chat_template( + tokenizer, conversation=mock_request.messages, - tokenize=False, + chat_template=mock_request.chat_template or template_content, add_generation_prompt=mock_request.add_generation_prompt, - chat_template=mock_request.chat_template or template_content) + ) # Test assertion assert result == expected_output, ( diff --git a/tests/async_engine/test_openapi_server_ray.py b/tests/async_engine/test_openapi_server_ray.py index 5ecd770ede836..0d53b39e7ce1c 100644 --- a/tests/async_engine/test_openapi_server_ray.py +++ b/tests/async_engine/test_openapi_server_ray.py @@ -1,10 +1,12 @@ import openai # use the official client for correctness check import pytest -from ..utils import RemoteOpenAIServer +from ..utils import VLLM_PATH, RemoteOpenAIServer # any model with a chat template should work here MODEL_NAME = "facebook/opt-125m" +chatml_jinja_path = VLLM_PATH / "examples/template_chatml.jinja" +assert chatml_jinja_path.exists() @pytest.fixture(scope="module") @@ -16,7 +18,9 @@ def server(): "--max-model-len", "2048", "--enforce-eager", - "--engine-use-ray" + "--engine-use-ray", + "--chat-template", + str(chatml_jinja_path), ] with RemoteOpenAIServer(MODEL_NAME, args) as remote_server: @@ -83,7 +87,7 @@ async def test_single_chat_session(client: openai.AsyncOpenAI): choice = chat_completion.choices[0] assert choice.finish_reason == "length" assert chat_completion.usage == 
openai.types.CompletionUsage( - completion_tokens=10, prompt_tokens=13, total_tokens=23) + completion_tokens=10, prompt_tokens=55, total_tokens=65) message = choice.message assert message.content is not None and len(message.content) >= 10 diff --git a/tests/entrypoints/openai/test_oot_registration.py b/tests/entrypoints/openai/test_oot_registration.py index 5272ac4065f1d..9f9a4cd972c51 100644 --- a/tests/entrypoints/openai/test_oot_registration.py +++ b/tests/entrypoints/openai/test_oot_registration.py @@ -9,6 +9,11 @@ from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.utils import get_open_port +from ...utils import VLLM_PATH, RemoteOpenAIServer + +chatml_jinja_path = VLLM_PATH / "examples/template_chatml.jinja" +assert chatml_jinja_path.exists() + class MyOPTForCausalLM(OPTForCausalLM): @@ -21,12 +26,25 @@ def compute_logits(self, hidden_states: torch.Tensor, return logits -def server_function(port): +def server_function(port: int): # register our dummy model ModelRegistry.register_model("OPTForCausalLM", MyOPTForCausalLM) - sys.argv = ["placeholder.py"] + \ - ("--model facebook/opt-125m --gpu-memory-utilization 0.10 " - f"--dtype float32 --api-key token-abc123 --port {port}").split() + + sys.argv = ["placeholder.py"] + [ + "--model", + "facebook/opt-125m", + "--gpu-memory-utilization", + "0.10", + "--dtype", + "float32", + "--api-key", + "token-abc123", + "--port", + str(port), + "--chat-template", + str(chatml_jinja_path), + ] + import runpy runpy.run_module('vllm.entrypoints.openai.api_server', run_name='__main__') @@ -36,35 +54,40 @@ def test_oot_registration_for_api_server(): ctx = torch.multiprocessing.get_context() server = ctx.Process(target=server_function, args=(port, )) server.start() - MAX_SERVER_START_WAIT_S = 60 - client = OpenAI( - base_url=f"http://localhost:{port}/v1", - api_key="token-abc123", - ) - now = time.time() - while True: - try: - completion = client.chat.completions.create( - model="facebook/opt-125m", - messages=[{ - "role": "system", - "content": "You are a helpful assistant." - }, { - "role": "user", - "content": "Hello!" - }], - temperature=0, - ) - break - except OpenAIError as e: - if "Connection error" in str(e): - time.sleep(3) - if time.time() - now > MAX_SERVER_START_WAIT_S: - raise RuntimeError("Server did not start in time") from e - else: - raise e - server.kill() + + try: + client = OpenAI( + base_url=f"http://localhost:{port}/v1", + api_key="token-abc123", + ) + now = time.time() + while True: + try: + completion = client.chat.completions.create( + model="facebook/opt-125m", + messages=[{ + "role": "system", + "content": "You are a helpful assistant." + }, { + "role": "user", + "content": "Hello!" 
+ }], + temperature=0, + ) + break + except OpenAIError as e: + if "Connection error" in str(e): + time.sleep(3) + if time.time() - now > RemoteOpenAIServer.MAX_START_WAIT_S: + msg = "Server did not start in time" + raise RuntimeError(msg) from e + else: + raise e + finally: + server.terminate() + generated_text = completion.choices[0].message.content + assert generated_text is not None # make sure only the first token is generated rest = generated_text.replace("", "") assert rest == "" diff --git a/tests/utils.py b/tests/utils.py index bd431b85d2663..e3d04cc638a95 100644 --- a/tests/utils.py +++ b/tests/utils.py @@ -50,7 +50,7 @@ def _nvml(): class RemoteOpenAIServer: DUMMY_API_KEY = "token-abc123" # vLLM's OpenAI server does not need API key - MAX_SERVER_START_WAIT_S = 120 # wait for server to start for 120 seconds + MAX_START_WAIT_S = 120 # wait for server to start for 120 seconds def __init__( self, @@ -85,7 +85,7 @@ def __init__( stdout=sys.stdout, stderr=sys.stderr) self._wait_for_server(url=self.url_for("health"), - timeout=self.MAX_SERVER_START_WAIT_S) + timeout=self.MAX_START_WAIT_S) def __enter__(self): return self diff --git a/vllm/entrypoints/chat_utils.py b/vllm/entrypoints/chat_utils.py index 072450a6146ee..12634c3261856 100644 --- a/vllm/entrypoints/chat_utils.py +++ b/vllm/entrypoints/chat_utils.py @@ -1,8 +1,9 @@ import codecs from dataclasses import dataclass from functools import lru_cache -from typing import (Awaitable, Iterable, List, Optional, Tuple, Union, cast, - final) +from pathlib import Path +from typing import (Any, Awaitable, Iterable, List, Optional, Tuple, Union, + cast, final) # yapf conflicts with isort for this block # yapf: disable @@ -22,6 +23,7 @@ from vllm.logger import init_logger from vllm.multimodal import MultiModalDataDict from vllm.multimodal.utils import async_get_and_parse_image +from vllm.transformers_utils.tokenizer import AnyTokenizer logger = init_logger(__name__) @@ -69,13 +71,17 @@ class ChatMessageParseResult: mm_futures: List[Awaitable[MultiModalDataDict]] -def load_chat_template(chat_template: Optional[str]) -> Optional[str]: +def load_chat_template( + chat_template: Optional[Union[Path, str]]) -> Optional[str]: if chat_template is None: return None try: with open(chat_template, "r") as f: resolved_chat_template = f.read() except OSError as e: + if isinstance(chat_template, Path): + raise + JINJA_CHARS = "{}\n" if not any(c in chat_template for c in JINJA_CHARS): msg = (f"The supplied chat template ({chat_template}) " @@ -208,3 +214,28 @@ def parse_chat_messages( mm_futures.extend(parse_result.mm_futures) return conversation, mm_futures + + +def apply_chat_template( + tokenizer: AnyTokenizer, + conversation: List[ConversationMessage], + chat_template: Optional[str], + *, + tokenize: bool = False, # Different from HF's default + **kwargs: Any, +) -> str: + if chat_template is None and tokenizer.chat_template is None: + raise ValueError( + "As of transformers v4.44, default chat template is no longer " + "allowed, so you must provide a chat template if the tokenizer " + "does not define one.") + + prompt = tokenizer.apply_chat_template( + conversation=conversation, + chat_template=chat_template, + tokenize=tokenize, + **kwargs, + ) + assert isinstance(prompt, str) + + return prompt diff --git a/vllm/entrypoints/openai/protocol.py b/vllm/entrypoints/openai/protocol.py index 76318a1271229..70467bd879690 100644 --- a/vllm/entrypoints/openai/protocol.py +++ b/vllm/entrypoints/openai/protocol.py @@ -190,8 +190,9 @@ class 
ChatCompletionRequest(OpenAIBaseModel): default=None, description=( "A Jinja template to use for this conversion. " - "If this is not passed, the model's default chat template will be " - "used instead."), + "As of transformers v4.44, default chat template is no longer " + "allowed, so you must provide a chat template if the tokenizer " + "does not define one."), ) chat_template_kwargs: Optional[Dict[str, Any]] = Field( default=None, diff --git a/vllm/entrypoints/openai/serving_chat.py b/vllm/entrypoints/openai/serving_chat.py index add1ce8acc95e..2167b967b14b5 100644 --- a/vllm/entrypoints/openai/serving_chat.py +++ b/vllm/entrypoints/openai/serving_chat.py @@ -10,6 +10,7 @@ from vllm.config import ModelConfig from vllm.engine.protocol import AsyncEngineClient from vllm.entrypoints.chat_utils import (ConversationMessage, + apply_chat_template, load_chat_template, parse_chat_messages) from vllm.entrypoints.logger import RequestLogger @@ -99,16 +100,15 @@ async def create_chat_completion( tool.model_dump() for tool in request.tools ] - prompt = tokenizer.apply_chat_template( + prompt = apply_chat_template( + tokenizer, conversation=conversation, - tokenize=False, + chat_template=request.chat_template or self.chat_template, add_generation_prompt=request.add_generation_prompt, tools=tool_dicts, documents=request.documents, - chat_template=request.chat_template or self.chat_template, **(request.chat_template_kwargs or {}), ) - assert isinstance(prompt, str) except Exception as e: logger.error("Error in applying chat template from request: %s", e) return self.create_error_response(str(e)) diff --git a/vllm/entrypoints/openai/serving_tokenization.py b/vllm/entrypoints/openai/serving_tokenization.py index 5b6b979b9b9e7..1aeabb7a7d729 100644 --- a/vllm/entrypoints/openai/serving_tokenization.py +++ b/vllm/entrypoints/openai/serving_tokenization.py @@ -2,7 +2,9 @@ from vllm.config import ModelConfig from vllm.engine.protocol import AsyncEngineClient -from vllm.entrypoints.chat_utils import load_chat_template, parse_chat_messages +from vllm.entrypoints.chat_utils import (apply_chat_template, + load_chat_template, + parse_chat_messages) from vllm.entrypoints.logger import RequestLogger # yapf conflicts with isort for this block # yapf: disable @@ -70,12 +72,12 @@ async def create_tokenize( logger.warning( "Multi-modal inputs are ignored during tokenization") - prompt = tokenizer.apply_chat_template( - add_generation_prompt=request.add_generation_prompt, + prompt = apply_chat_template( + tokenizer, conversation=conversation, - tokenize=False, - chat_template=self.chat_template) - assert isinstance(prompt, str) + chat_template=self.chat_template, + add_generation_prompt=request.add_generation_prompt, + ) else: prompt = request.prompt diff --git a/vllm/transformers_utils/tokenizer.py b/vllm/transformers_utils/tokenizer.py index bf26d889d1388..25e4c41592c68 100644 --- a/vllm/transformers_utils/tokenizer.py +++ b/vllm/transformers_utils/tokenizer.py @@ -12,12 +12,12 @@ from vllm.transformers_utils.tokenizers import BaichuanTokenizer from vllm.utils import make_async +from .tokenizer_group import AnyTokenizer + logger = init_logger(__name__) -def get_cached_tokenizer( - tokenizer: Union[PreTrainedTokenizer, PreTrainedTokenizerFast] -) -> Union[PreTrainedTokenizer, PreTrainedTokenizerFast]: +def get_cached_tokenizer(tokenizer: AnyTokenizer) -> AnyTokenizer: """Get tokenizer with cached properties. This will patch the tokenizer object in place. 
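
For reference, the behaviour the first patch introduces can be exercised directly through the new apply_chat_template helper. The sketch below assumes a vLLM checkout that includes this change (run from the repo root) and network access to fetch the facebook/opt-125m tokenizer; it illustrates the intended behaviour and is not code taken from the patch itself:

    from vllm.entrypoints.chat_utils import (apply_chat_template,
                                             load_chat_template)
    from vllm.transformers_utils.tokenizer import get_tokenizer

    tokenizer = get_tokenizer("facebook/opt-125m")
    conversation = [{"role": "user", "content": "Hello"}]

    # OPT's tokenizer defines no chat template, so omitting one now raises
    # ValueError instead of silently relying on a transformers default.
    try:
        apply_chat_template(tokenizer, conversation, chat_template=None)
    except ValueError as err:
        print(err)

    # Supplying an explicit template (here the repo's ChatML example, path
    # assumed relative to the vLLM source tree) works as before.
    template = load_chat_template("examples/template_chatml.jinja")
    prompt = apply_chat_template(tokenizer,
                                 conversation,
                                 chat_template=template,
                                 add_generation_prompt=True)
    # Expected to render roughly:
    # "<|im_start|>user\nHello<|im_end|>\n<|im_start|>assistant\n"
    print(prompt)
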
@@ -63,7 +63,7 @@ def get_tokenizer( revision: Optional[str] = None, download_dir: Optional[str] = None, **kwargs, -) -> Union[PreTrainedTokenizer, PreTrainedTokenizerFast]: +) -> AnyTokenizer: """Gets a tokenizer for the given model name via HuggingFace or ModelScope. """ if VLLM_USE_MODELSCOPE: From 639159b2a665173bc7a81625887b8e76f85e2e32 Mon Sep 17 00:00:00 2001 From: youkaichao Date: Wed, 7 Aug 2024 08:54:52 -0700 Subject: [PATCH 02/18] [distributed][misc] add specialized method for cuda platform (#7249) --- .../device_communicators/custom_all_reduce.py | 8 ++- vllm/platforms/cuda.py | 37 +++++++++++++- vllm/utils.py | 50 ------------------- 3 files changed, 42 insertions(+), 53 deletions(-) diff --git a/vllm/distributed/device_communicators/custom_all_reduce.py b/vllm/distributed/device_communicators/custom_all_reduce.py index a4f30808d32e1..479dc95a8b667 100644 --- a/vllm/distributed/device_communicators/custom_all_reduce.py +++ b/vllm/distributed/device_communicators/custom_all_reduce.py @@ -11,7 +11,8 @@ gpu_p2p_access_check) from vllm.distributed.parallel_state import in_the_same_node_as from vllm.logger import init_logger -from vllm.utils import cuda_device_count_stateless, is_full_nvlink +from vllm.platforms import current_platform +from vllm.utils import cuda_device_count_stateless try: assert ops.is_custom_op_supported("_C_custom_ar::meta_size") @@ -113,7 +114,10 @@ def __init__(self, # test nvlink first, this will filter out most of the cases # where custom allreduce is not supported # this checks hardware and driver support for NVLink - full_nvlink = is_full_nvlink(physical_device_ids) + assert current_platform.is_cuda() + from vllm.platforms.cuda import CudaPlatform + cuda_platform: CudaPlatform = current_platform + full_nvlink = cuda_platform.is_full_nvlink(physical_device_ids) if world_size > 2 and not full_nvlink: logger.warning( "Custom allreduce is disabled because it's not supported on" diff --git a/vllm/platforms/cuda.py b/vllm/platforms/cuda.py index 02ba227460e3f..a7e760cc16408 100644 --- a/vllm/platforms/cuda.py +++ b/vllm/platforms/cuda.py @@ -4,12 +4,21 @@ import os from functools import lru_cache, wraps -from typing import Tuple +from typing import List, Tuple import pynvml +from vllm.logger import init_logger + from .interface import Platform, PlatformEnum +logger = init_logger(__name__) + +# NVML utils +# Note that NVML is not affected by `CUDA_VISIBLE_DEVICES`, +# all the related functions work on real physical device ids. +# the major benefit of using NVML is that it will not initialize CUDA + def with_nvml_context(fn): @@ -47,3 +56,29 @@ class CudaPlatform(Platform): def get_device_capability(device_id: int = 0) -> Tuple[int, int]: physical_device_id = device_id_to_physical_device_id(device_id) return get_physical_device_capability(physical_device_id) + + @staticmethod + @with_nvml_context + def is_full_nvlink(physical_device_ids: List[int]) -> bool: + """ + query if the set of gpus are fully connected by nvlink (1 hop) + """ + handles = [ + pynvml.nvmlDeviceGetHandleByIndex(i) for i in physical_device_ids + ] + for i, handle in enumerate(handles): + for j, peer_handle in enumerate(handles): + if i < j: + try: + p2p_status = pynvml.nvmlDeviceGetP2PStatus( + handle, peer_handle, + pynvml.NVML_P2P_CAPS_INDEX_NVLINK) + if p2p_status != pynvml.NVML_P2P_STATUS_OK: + return False + except pynvml.NVMLError as error: + logger.error( + "NVLink detection failed. 
This is normal if your" + " machine has no NVLink equipped.", + exc_info=error) + return False + return True diff --git a/vllm/utils.py b/vllm/utils.py index 61e3bb0bfc333..08aa889b5e447 100644 --- a/vllm/utils.py +++ b/vllm/utils.py @@ -1034,56 +1034,6 @@ def cuda_device_count_stateless() -> int: return _cuda_device_count_stateless(envs.CUDA_VISIBLE_DEVICES) -# NVML utils -# Note that NVML is not affected by `CUDA_VISIBLE_DEVICES`, -# all the related functions work on real physical device ids. -# the major benefit of using NVML is that it will not initialize CUDA - -try: - import pynvml -except ImportError: - # For non-NV devices - pynvml = None - - -def with_nvml_context(fn): - - @wraps(fn) - def wrapper(*args, **kwargs): - if pynvml is not None: - pynvml.nvmlInit() - try: - return fn(*args, **kwargs) - finally: - if pynvml is not None: - pynvml.nvmlShutdown() - - return wrapper - - -@with_nvml_context -def is_full_nvlink(device_ids: List[int]) -> bool: - """ - query if the set of gpus are fully connected by nvlink (1 hop) - """ - handles = [pynvml.nvmlDeviceGetHandleByIndex(i) for i in device_ids] - for i, handle in enumerate(handles): - for j, peer_handle in enumerate(handles): - if i < j: - try: - p2p_status = pynvml.nvmlDeviceGetP2PStatus( - handle, peer_handle, pynvml.NVML_P2P_CAPS_INDEX_NVLINK) - if p2p_status != pynvml.NVML_P2P_STATUS_OK: - return False - except pynvml.NVMLError as error: - logger.error( - "NVLink detection failed. This is normal if your" - " machine has no NVLink equipped.", - exc_info=error) - return False - return True - - #From: https://stackoverflow.com/a/4104188/2749989 def run_once(f): From 0f7052bc7e7c3301588705abf7c7fadf3db293a6 Mon Sep 17 00:00:00 2001 From: Dipika Sikka Date: Wed, 7 Aug 2024 12:17:58 -0400 Subject: [PATCH 03/18] [Misc] Refactor linear layer weight loading; introduce `BasevLLMParameter` and `weight_loader_v2` (#5874) --- tests/quantization/test_compressed_tensors.py | 20 +- vllm/model_executor/__init__.py | 4 + vllm/model_executor/layers/linear.py | 153 +++++++++- .../compressed_tensors/compressed_tensors.py | 21 +- .../schemes/compressed_tensors_unquantized.py | 20 +- .../schemes/compressed_tensors_w4a16_24.py | 112 ++++--- .../schemes/compressed_tensors_w8a16_fp8.py | 51 ++-- .../schemes/compressed_tensors_w8a8_fp8.py | 51 ++-- .../schemes/compressed_tensors_w8a8_int8.py | 47 +-- .../schemes/compressed_tensors_wNa16.py | 102 +++---- vllm/model_executor/parameter.py | 277 ++++++++++++++++++ 11 files changed, 655 insertions(+), 203 deletions(-) create mode 100644 vllm/model_executor/parameter.py diff --git a/tests/quantization/test_compressed_tensors.py b/tests/quantization/test_compressed_tensors.py index bd79da84a7764..2ea340779b819 100644 --- a/tests/quantization/test_compressed_tensors.py +++ b/tests/quantization/test_compressed_tensors.py @@ -9,7 +9,7 @@ from vllm.model_executor.layers.quantization.compressed_tensors.compressed_tensors import ( # noqa: E501 CompressedTensorsLinearMethod, CompressedTensorsW4A16Sparse24, CompressedTensorsW8A8Fp8, CompressedTensorsW8A8Int8, - CompressedTensorsWNA16) + CompressedTensorsW8A16Fp8, CompressedTensorsWNA16) from vllm.model_executor.layers.quantization.compressed_tensors.utils import ( QuantizationType) @@ -109,7 +109,7 @@ def test_compressed_tensors_wNa16(vllm_runner, wNa16_args): assert qkv_proj.weight_packed.dtype is torch.int32 assert qkv_proj.weight_scale.dtype is torch.float16 - assert qkv_proj.weight_packed.pack_factor == pack_factor + assert qkv_proj.scheme.pack_factor == pack_factor 
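
As an aside on the assertion just above: pack_factor is plain bit-packing arithmetic, a 32-bit storage word divided by the weight's bit width, matching how the WNA16 scheme later in this patch computes it (pack_factor = 32 // num_bits). A self-contained illustration, not taken from the patch:

    import torch

    def pack_factor(num_bits: int, storage_bits: int = 32) -> int:
        # Mirrors `pack_factor = 32 // num_bits` in the compressed-tensors schemes.
        return storage_bits // num_bits

    assert pack_factor(4) == 8  # eight 4-bit weights per int32 word
    assert pack_factor(8) == 4  # four 8-bit weights per int32 word

    # The packed weight shrinks its packed (input) dimension by that factor,
    # which is why the test sees an int32 weight_packed tensor.
    out_features, in_features, num_bits = 128, 256, 4
    packed = torch.empty(out_features, in_features // pack_factor(num_bits),
                         dtype=torch.int32)
    assert packed.shape == (128, 32)
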
output = llm.generate_greedy("Hello my name is", max_tokens=20) assert output @@ -140,13 +140,17 @@ def test_compressed_tensors_fp8(vllm_runner): qkv_proj = layer.self_attn.qkv_proj assert isinstance(qkv_proj.quant_method, CompressedTensorsLinearMethod) - assert isinstance(qkv_proj.scheme, CompressedTensorsW8A8Fp8) - assert qkv_proj.weight.dtype is torch.float8_e4m3fn + assert isinstance( + qkv_proj.scheme, + (CompressedTensorsW8A8Fp8, CompressedTensorsW8A16Fp8)) + assert qkv_proj.input_scale.dtype is torch.float32 - assert qkv_proj.weight_scale.dtype is torch.float32 - # should be scalars after processing - assert len(qkv_proj.input_scale.shape) == 0 - assert len(qkv_proj.weight_scale.shape) == 0 + + if isinstance(qkv_proj.scheme, CompressedTensorsW8A8Fp8): + assert len(qkv_proj.input_scale.shape) == 0 + assert qkv_proj.weight.dtype is torch.float8_e4m3fn + assert qkv_proj.weight_scale.dtype is torch.float32 + assert len(qkv_proj.weight_scale.shape) == 0 output = llm.generate_greedy("Hello my name is", max_tokens=20) assert output diff --git a/vllm/model_executor/__init__.py b/vllm/model_executor/__init__.py index fb98f4a6b46f4..5c767e22de4d0 100644 --- a/vllm/model_executor/__init__.py +++ b/vllm/model_executor/__init__.py @@ -1,7 +1,11 @@ +from vllm.model_executor.parameter import (BasevLLMParameter, + PackedvLLMParameter) from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.model_executor.utils import set_random_seed __all__ = [ "SamplingMetadata", "set_random_seed", + "BasevLLMParameter", + "PackedvLLMParameter", ] diff --git a/vllm/model_executor/layers/linear.py b/vllm/model_executor/layers/linear.py index cd53c2b916211..646839ff303ee 100644 --- a/vllm/model_executor/layers/linear.py +++ b/vllm/model_executor/layers/linear.py @@ -13,10 +13,14 @@ from vllm.logger import init_logger from vllm.model_executor.layers.quantization.base_config import ( QuantizationConfig, QuantizeMethodBase) +from vllm.model_executor.parameter import (BasevLLMParameter, + PackedvLLMParameter) from vllm.model_executor.utils import set_weight_attrs logger = init_logger(__name__) +WEIGHT_LOADER_V2_SUPPORTED = ["CompressedTensorsLinearMethod"] + def adjust_marlin_shard(param, shard_size, shard_offset): marlin_tile_size = getattr(param, "marlin_tile_size", None) @@ -288,6 +292,7 @@ def __init__(self, if output_sizes is None: output_sizes = [output_size] + self.quant_method.create_weights( layer=self, input_size_per_partition=self.input_size, @@ -295,7 +300,9 @@ def __init__(self, input_size=self.input_size, output_size=self.output_size, params_dtype=self.params_dtype, - weight_loader=self.weight_loader, + weight_loader=( + self.weight_loader_v2 if self.quant_method.__class__.__name__ + in WEIGHT_LOADER_V2_SUPPORTED else self.weight_loader), prefix=prefix) if bias: self.bias = Parameter( @@ -337,6 +344,9 @@ def weight_loader(self, param: Parameter, loaded_weight: torch.Tensor): assert param_data.shape == loaded_weight.shape param_data.copy_(loaded_weight) + def weight_loader_v2(self, param: Parameter, loaded_weight: torch.Tensor): + param.load_column_parallel_weight(loaded_weight=loaded_weight) + def forward(self, input_): bias = self.bias if not self.skip_bias_add else None @@ -527,6 +537,62 @@ def weight_loader(self, assert param_data.shape == loaded_weight.shape param_data.copy_(loaded_weight) + def _load_fused_module_from_checkpoint(self, param: BasevLLMParameter, + loaded_weight: torch.Tensor): + """ + Handle special case for models where MLP layers are already + fused on disk. 
In this case, we have no shard id. This function + determmines the shard id by splitting these layers and then calls + the weight loader using the shard id. + + An example of a model with these fused layers: + https://huggingface.co/microsoft/Phi-3-mini-4k-instruct + """ + + current_shard_offset = 0 + shard_offsets: List[Tuple[int, int, int]] = [] + for i, output_size in enumerate(self.output_sizes): + shard_offsets.append((i, current_shard_offset, output_size)) + current_shard_offset += output_size + + for shard_id, shard_offset, shard_size in shard_offsets: + # Special case for Quantization. + # If quantized, we need to adjust the offset and size to account + # for the packing. + if isinstance(param, PackedvLLMParameter + ) and param.packed_dim == param.output_dim: + param.adjust_shard_indexes_for_packing( + shard_size=shard_size, shard_offset=shard_offset) + + loaded_weight_shard = loaded_weight.narrow(param.output_dim, + shard_offset, + shard_size) + self.weight_loader_v2(param, loaded_weight_shard, shard_id) + + def weight_loader_v2(self, + param: BasevLLMParameter, + loaded_weight: torch.Tensor, + loaded_shard_id: Optional[int] = None): + param_data = param.data + if loaded_shard_id is None: + if param.output_dim is None: + assert param_data.shape == loaded_weight.shape + param_data.copy_(loaded_weight) + return + self._load_fused_module_from_checkpoint(param, loaded_weight) + return + + assert loaded_shard_id < len(self.output_sizes) + + tp_size = get_tensor_model_parallel_world_size() + shard_offset = sum(self.output_sizes[:loaded_shard_id]) // tp_size + shard_size = self.output_sizes[loaded_shard_id] // tp_size + + param.load_merged_column_weight(loaded_weight=loaded_weight, + shard_id=loaded_shard_id, + shard_offset=shard_offset, + shard_size=shard_size) + class QKVParallelLinear(ColumnParallelLinear): """Linear layers for the attention's QKV transformation. @@ -598,6 +664,82 @@ def __init__(self, quant_config=quant_config, prefix=prefix) + def _get_shard_offset_mapping(self, loaded_shard_id: str): + shard_offset_mapping = { + "q": 0, + "k": self.num_heads * self.head_size, + "v": (self.num_heads + self.num_kv_heads) * self.head_size, + "total": (self.num_heads + 2 * self.num_kv_heads) * self.head_size + } + return shard_offset_mapping.get(loaded_shard_id) + + def _get_shard_size_mapping(self, loaded_shard_id: str): + shard_size_mapping = { + "q": self.num_heads * self.head_size, + "k": self.num_kv_heads * self.head_size, + "v": self.num_kv_heads * self.head_size, + } + return shard_size_mapping.get(loaded_shard_id) + + def _load_fused_module_from_checkpoint(self, param: BasevLLMParameter, + loaded_weight: torch.Tensor): + """ + Handle special case for models where QKV layers are already + fused on disk. In this case, we have no shard id. This function + determmines the shard id by splitting these layers and then calls + the weight loader using the shard id. + + An example of a model with these fused layers: + https://huggingface.co/microsoft/Phi-3-mini-4k-instruct + """ + shard_offsets = [ + # (shard_id, shard_offset, shard_size) + ("q", 0, self.total_num_heads * self.head_size), + ("k", self.total_num_heads * self.head_size, + self.total_num_kv_heads * self.head_size), + ("v", + (self.total_num_heads + self.total_num_kv_heads) * self.head_size, + self.total_num_kv_heads * self.head_size), + ] + + for shard_id, shard_offset, shard_size in shard_offsets: + # Special case for Quantization. + # If quantized, we need to adjust the offset and size to account + # for the packing. 
+ if isinstance(param, PackedvLLMParameter + ) and param.packed_dim == param.output_dim: + param.adjust_shard_indexes_for_packing( + shard_size=shard_size, shard_offset=shard_offset) + + loaded_weight_shard = loaded_weight.narrow(param.output_dim, + shard_offset, + shard_size) + self.weight_loader_v2(param, loaded_weight_shard, shard_id) + + def weight_loader_v2(self, + param: BasevLLMParameter, + loaded_weight: torch.Tensor, + loaded_shard_id: Optional[str] = None): + param_data = param.data + if loaded_shard_id is None: # special case for certain models + if param.output_dim is None: + assert param_data.shape == loaded_weight.shape + param_data.copy_(loaded_weight) + return + self._load_fused_module_from_checkpoint(param, loaded_weight) + return + + assert loaded_shard_id in ["q", "k", "v"] + + shard_offset = self._get_shard_offset_mapping(loaded_shard_id) + shard_size = self._get_shard_size_mapping(loaded_shard_id) + + param.load_qkv_weight(loaded_weight=loaded_weight, + num_heads=self.num_kv_head_replicas, + shard_id=loaded_shard_id, + shard_offset=shard_offset, + shard_size=shard_size) + def weight_loader(self, param: Parameter, loaded_weight: torch.Tensor, @@ -798,6 +940,7 @@ def __init__(self, self.tp_size = get_tensor_model_parallel_world_size() self.input_size_per_partition = divide(input_size, self.tp_size) assert self.quant_method is not None + self.quant_method.create_weights( layer=self, input_size_per_partition=self.input_size_per_partition, @@ -805,7 +948,9 @@ def __init__(self, input_size=self.input_size, output_size=self.output_size, params_dtype=self.params_dtype, - weight_loader=self.weight_loader, + weight_loader=( + self.weight_loader_v2 if self.quant_method.__class__.__name__ + in WEIGHT_LOADER_V2_SUPPORTED else self.weight_loader), prefix=prefix) if not reduce_results and (bias and not skip_bias_add): raise ValueError("When not reduce the results, adding bias to the " @@ -850,6 +995,10 @@ def weight_loader(self, param: Parameter, loaded_weight: torch.Tensor): assert param_data.shape == loaded_weight.shape param_data.copy_(loaded_weight) + def weight_loader_v2(self, param: BasevLLMParameter, + loaded_weight: torch.Tensor): + param.load_row_parallel_weight(loaded_weight=loaded_weight) + def forward(self, input_): if self.input_is_parallel: input_parallel = input_ diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py index 39d00bd5733ff..ae75781927381 100644 --- a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py +++ b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py @@ -19,6 +19,8 @@ from vllm.model_executor.layers.quantization.kv_cache import BaseKVCacheMethod from vllm.platforms import current_platform +__all__ = ["CompressedTensorsLinearMethod"] + class CompressedTensorsConfig(QuantizationConfig): @@ -146,18 +148,15 @@ def _is_fp8_w8a8(self, weight_quant: BaseModel, if weight_quant is None or input_quant is None: return False - # Confirm we have floating points. - if not (weight_quant.type == QuantizationType.FLOAT - and input_quant.type == QuantizationType.FLOAT): - return False - # Confirm weight scheme is supported. 
+ is_floating_point = (weight_quant.type == QuantizationType.FLOAT + and input_quant.type == QuantizationType.FLOAT) is_symmetric_weight = weight_quant.symmetric is_static_weight = not weight_quant.dynamic is_per_tensor_or_channel_weight = (weight_quant.strategy in [ QuantizationStrategy.TENSOR, QuantizationStrategy.CHANNEL ]) - if not (is_symmetric_weight and is_static_weight + if not (is_floating_point and is_symmetric_weight and is_static_weight and is_per_tensor_or_channel_weight): return False @@ -169,11 +168,7 @@ def _is_fp8_w8a8(self, weight_quant: BaseModel, is_symmetric_activation = input_quant.symmetric is_per_tensor_activation = ( input_quant.strategy == QuantizationStrategy.TENSOR) - if not (is_symmetric_activation and is_per_tensor_activation): - return False - - # All conditions satisfied. - return True + return is_symmetric_activation and is_per_tensor_activation def _is_fp8_w8a16(self, weight_quant: BaseModel, input_quant: BaseModel) -> bool: @@ -230,6 +225,7 @@ def _get_scheme_from_parts( group_size=weight_quant.group_size) # Detect If Activation Quantization. + # TODO @dsikka: clean-up conditions if is_activation_quantization_format(self.quant_format): if self._is_fp8_w8a8(weight_quant, input_quant): is_fp8_w8a8_supported = self._check_scheme_supported( @@ -237,7 +233,8 @@ def _get_scheme_from_parts( if is_fp8_w8a8_supported: return CompressedTensorsW8A8Fp8( strategy=weight_quant.strategy, - is_static_input_scheme=(not input_quant.dynamic)) + is_static_input_scheme=(input_quant + and not input_quant.dynamic)) else: return CompressedTensorsW8A16Fp8( strategy=weight_quant.strategy, diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_unquantized.py b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_unquantized.py index b7ba29ddc9840..2e8d520eacc81 100644 --- a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_unquantized.py +++ b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_unquantized.py @@ -2,11 +2,10 @@ import torch import torch.nn.functional as F -from torch.nn import Parameter from vllm.model_executor.layers.quantization.compressed_tensors.schemes import ( CompressedTensorsScheme) -from vllm.model_executor.utils import set_weight_attrs +from vllm.model_executor.parameter import ModelWeightParameter __all__ = ["CompressedTensorsUnquantized"] @@ -24,7 +23,9 @@ def get_min_capability(cls) -> int: return 70 def process_weights_after_loading(self, layer: torch.nn.Module) -> None: - pass + # required by torch.compile to be torch.nn.Parameter + layer.weight = torch.nn.Parameter(layer.weight.data, + requires_grad=False) def create_weights(self, layer: torch.nn.Module, output_partition_sizes: List[int], @@ -32,14 +33,15 @@ def create_weights(self, layer: torch.nn.Module, params_dtype: torch.dtype, weight_loader: Callable, **kwargs): - weight = Parameter(torch.empty(sum(output_partition_sizes), - input_size_per_partition, - dtype=params_dtype), - requires_grad=False) + weight = ModelWeightParameter(data=torch.empty( + sum(output_partition_sizes), + input_size_per_partition, + dtype=params_dtype), + input_dim=1, + output_dim=0, + weight_loader=weight_loader) - set_weight_attrs(weight, {"input_dim": 1, "output_dim": 0}) layer.register_parameter("weight", weight) - set_weight_attrs(weight, {"weight_loader": weight_loader}) def apply_weights(self, layer: torch.nn.Module, x: torch.Tensor, bias: Optional[torch.Tensor]) -> 
torch.Tensor: diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a16_24.py b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a16_24.py index c1adfdb2980b6..9ad61a64e406c 100644 --- a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a16_24.py +++ b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a16_24.py @@ -8,7 +8,10 @@ CompressedTensorsScheme) from vllm.model_executor.layers.quantization.gptq_marlin_24 import ( GPTQ_MARLIN_24_MAX_PARALLEL, GPTQ_MARLIN_24_MIN_THREAD_N) -from vllm.model_executor.utils import set_weight_attrs +from vllm.model_executor.parameter import (BasevLLMParameter, + ChannelQuantScaleParameter, + GroupQuantScaleParameter, + PackedvLLMParameter) from vllm.scalar_type import scalar_types __all__ = ["CompressedTensorsW4A16Sparse24"] @@ -45,7 +48,12 @@ def get_min_capability(cls) -> int: return 80 def process_weights_after_loading(self, layer: torch.nn.Module) -> None: - pass + # required by torch.compile to be torch.nn.Parameter + layer.weight_packed = Parameter(layer.weight_packed.data, + requires_grad=False) + layer.scale_packed = Parameter(layer.scale_packed.data, + requires_grad=False) + layer.meta = Parameter(layer.meta.data, requires_grad=False) def create_weights(self, layer: torch.nn.Module, input_size: int, output_partition_sizes: List[int], @@ -56,79 +64,65 @@ def create_weights(self, layer: torch.nn.Module, input_size: int, pack_factor = 32 // self.quant_type.size_bits output_size_per_partition = sum(output_partition_sizes) - qweight = Parameter( - torch.empty( - input_size_per_partition // self.tile_size // 2, - output_size_per_partition * self.tile_size // pack_factor, - dtype=torch.int32, - ), - requires_grad=False, - ) - set_weight_attrs( - qweight, - { - "input_dim": 0, - "output_dim": 1, - "packed_dim": 1, - "pack_factor": pack_factor, - "marlin_tile_size": self.tile_size, - "weight_loader": weight_loader - }, - ) - - layer.register_parameter("weight_packed", qweight) + qweight = PackedvLLMParameter(data=torch.empty( + input_size_per_partition // self.tile_size // 2, + output_size_per_partition * self.tile_size // pack_factor, + dtype=torch.int32, + ), + input_dim=0, + output_dim=1, + packed_dim=1, + packed_factor=pack_factor, + marlin_tile_size=self.tile_size, + weight_loader=weight_loader) input_groups = (1 if self.group_size is None else input_size_per_partition // self.group_size) - scales = Parameter( + weight_scale_args = { + "data": torch.empty( input_groups, output_size_per_partition, dtype=params_dtype, ), - requires_grad=False, - ) - set_weight_attrs( - scales, - { - "output_dim": 1, - "input_dim": None if input_groups == 1 else 0, - "weight_loader": weight_loader - }, - ) - layer.register_parameter("scale_packed", scales) - - weight_shape = Parameter(torch.empty(2, dtype=torch.int64), - requires_grad=False) + "weight_loader": + weight_loader + } + + if self.group_size is not None: + scales = GroupQuantScaleParameter(output_dim=1, + input_dim=0, + **weight_scale_args) + else: + scales = ChannelQuantScaleParameter(output_dim=1, + **weight_scale_args) + + weight_shape = BasevLLMParameter(data=torch.empty(2, + dtype=torch.int64), + weight_loader=weight_loader) + + meta = PackedvLLMParameter(data=torch.empty( + input_size_per_partition // 8 // 2 // 2, + output_size_per_partition * 2, + dtype=torch.int16, + ), + input_dim=0, + output_dim=1, + packed_dim=1, + packed_factor=1, + 
marlin_tile_size=2, + weight_loader=weight_loader) + layer.register_parameter("weight_packed", qweight) layer.register_parameter("weight_shape", weight_shape) - set_weight_attrs(weight_shape, {"weight_loader": weight_loader}) - - meta = Parameter( - torch.empty( - input_size_per_partition // 8 // 2 // 2, - output_size_per_partition * 2, - dtype=torch.int16, - ), - requires_grad=False, - ) - set_weight_attrs( - meta, - { - "input_dim": 0, - "packed_dim": 1, - "pack_factor": 1, - "output_dim": 1, - "marlin_tile_size": 2, - "weight_loader": weight_loader - }, - ) + layer.register_parameter("scale_packed", scales) layer.register_parameter("meta", meta) max_workspace_size = ( output_size_per_partition // GPTQ_MARLIN_24_MIN_THREAD_N) * GPTQ_MARLIN_24_MAX_PARALLEL + workspace = Parameter(torch.zeros(max_workspace_size, dtype=torch.int), requires_grad=False) layer.workspace = workspace diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a16_fp8.py b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a16_fp8.py index eeb7c042e1d1f..3d55d55cc390d 100644 --- a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a16_fp8.py +++ b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a16_fp8.py @@ -9,9 +9,10 @@ from vllm.model_executor.layers.quantization.utils.marlin_utils_fp8 import ( apply_fp8_marlin_linear, prepare_fp8_layer_for_marlin) from vllm.model_executor.layers.quantization.utils.w8a8_utils import ( - convert_to_channelwise, create_per_channel_scale_param, - create_per_tensor_scale_param) -from vllm.model_executor.utils import set_weight_attrs + convert_to_channelwise) +from vllm.model_executor.parameter import (ChannelQuantScaleParameter, + ModelWeightParameter, + PerTensorScaleParameter) __all__ = ["CompressedTensorsW8A16Fp8"] @@ -40,11 +41,19 @@ def process_weights_after_loading(self, layer) -> None: layer.logical_widths) layer.weight_scale = torch.nn.Parameter(ws_channelwise, requires_grad=False) + else: + # required by torch.compile to be torch.nn.Parameter + layer.weight_scale = torch.nn.Parameter(layer.weight_scale.data, + requires_grad=False) # Weights must be transposed for marlin layer.weight = torch.nn.Parameter(layer.weight.t(), requires_grad=False) + if self.is_static_input_scheme: + # required by torch.compile to be torch.nn.Parameter + layer.input_scale = torch.nn.Parameter(layer.input_scale.data, + requires_grad=False) prepare_fp8_layer_for_marlin(layer, strategy="channel") def create_weights(self, layer: torch.nn.Module, input_size: int, @@ -60,35 +69,39 @@ def create_weights(self, layer: torch.nn.Module, input_size: int, layer.orig_dtype = params_dtype # WEIGHT - weight = torch.nn.Parameter(torch.empty(output_size_per_partition, - input_size_per_partition, - dtype=torch.float8_e4m3fn), - requires_grad=False) + weight = ModelWeightParameter(data=torch.empty( + output_size_per_partition, + input_size_per_partition, + dtype=torch.float8_e4m3fn), + input_dim=1, + output_dim=0, + weight_loader=weight_loader) layer.register_parameter("weight", weight) - set_weight_attrs(weight, { - "input_dim": 1, - "output_dim": 0, - "weight_loader": weight_loader, - }) # WEIGHT SCALE - layer_kwargs = {"weight_loader": weight_loader} if self.strategy == QuantizationStrategy.CHANNEL: - weight_scale = create_per_channel_scale_param( - output_partition_sizes, **layer_kwargs) + weight_scale = ChannelQuantScaleParameter( + 
data=torch.empty((sum(output_partition_sizes), 1), + dtype=torch.float32), + output_dim=0, + weight_loader=weight_loader) elif self.strategy == QuantizationStrategy.TENSOR: - weight_scale = create_per_tensor_scale_param( - output_partition_sizes, **layer_kwargs) + weight_scale = PerTensorScaleParameter(data=torch.empty( + len(output_partition_sizes), dtype=torch.float32), + weight_loader=weight_loader) else: raise ValueError( f"Unsupported weight strategy={self.strategy}, " f"supported strategies are {SUPPORTED_STRATEGIES}") + + weight_scale[:] = torch.finfo(torch.float32).min layer.register_parameter("weight_scale", weight_scale) # INPUT SCALE (to deal with converted checkpoints) if self.is_static_input_scheme: - input_scale = create_per_tensor_scale_param( - output_partition_sizes, **layer_kwargs) + input_scale = PerTensorScaleParameter(data=torch.empty( + len(output_partition_sizes), dtype=torch.float32), + weight_loader=weight_loader) layer.register_parameter("input_scale", input_scale) def apply_weights(self, diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py index cc9d71db140c2..8a3d24e2fd258 100644 --- a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py +++ b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py @@ -8,10 +8,10 @@ from vllm.model_executor.layers.quantization.compressed_tensors.utils import ( QuantizationStrategy) from vllm.model_executor.layers.quantization.utils.w8a8_utils import ( - apply_fp8_linear, create_per_channel_scale_param, - create_per_tensor_scale_param, cutlass_fp8_supported, - requantize_with_max_scale) -from vllm.model_executor.utils import set_weight_attrs + apply_fp8_linear, cutlass_fp8_supported, requantize_with_max_scale) +from vllm.model_executor.parameter import (ChannelQuantScaleParameter, + ModelWeightParameter, + PerTensorScaleParameter) __all__ = ["CompressedTensorsW8A8Fp8"] @@ -46,6 +46,9 @@ def process_weights_after_loading(self, layer) -> None: elif self.strategy == QuantizationStrategy.CHANNEL: weight = layer.weight layer.weight = Parameter(weight.t(), requires_grad=False) + # required by torch.compile to be torch.nn.Parameter + layer.weight_scale = Parameter(layer.weight_scale.data, + requires_grad=False) else: raise ValueError(f"Unknown quantization strategy {self.strategy}") @@ -66,32 +69,40 @@ def create_weights(self, layer: torch.nn.Module, layer.logical_widths = output_partition_sizes # WEIGHT - weight = torch.nn.Parameter(torch.empty(output_size_per_partition, - input_size_per_partition, - dtype=torch.float8_e4m3fn), - requires_grad=False) + weight = ModelWeightParameter(data=torch.empty( + output_size_per_partition, + input_size_per_partition, + dtype=torch.float8_e4m3fn), + input_dim=1, + output_dim=0, + weight_loader=weight_loader) layer.register_parameter("weight", weight) - set_weight_attrs(weight, { - "input_dim": 1, - "output_dim": 0, - "weight_loader": weight_loader, - }) # WEIGHT SCALE - layer_kwargs = {"weight_loader": weight_loader} + # TODO: update create_xxx_parameter functions to return + # the newly added parameters if self.strategy == QuantizationStrategy.CHANNEL: - weight_scale = create_per_channel_scale_param( - output_partition_sizes, **layer_kwargs) + weight_scale = ChannelQuantScaleParameter( + data=torch.empty((sum(output_partition_sizes), 1), + 
dtype=torch.float32), + output_dim=0, + weight_loader=weight_loader) else: assert self.strategy == QuantizationStrategy.TENSOR - weight_scale = create_per_tensor_scale_param( - output_partition_sizes, **layer_kwargs) + weight_scale = PerTensorScaleParameter(data=torch.empty( + len(output_partition_sizes), dtype=torch.float32), + weight_loader=weight_loader) + + # min requirement for fp8 kernels + weight_scale[:] = torch.finfo(torch.float32).min layer.register_parameter("weight_scale", weight_scale) # INPUT SCALE if self.is_static_input_scheme: - input_scale = create_per_tensor_scale_param( - output_partition_sizes, **layer_kwargs) + input_scale = PerTensorScaleParameter(data=torch.empty( + len(output_partition_sizes), dtype=torch.float32), + weight_loader=weight_loader) + input_scale[:] = torch.finfo(torch.float32).min layer.register_parameter("input_scale", input_scale) def apply_weights(self, diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_int8.py b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_int8.py index 3a80863d3abbe..078380f159291 100644 --- a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_int8.py +++ b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_int8.py @@ -8,9 +8,11 @@ from vllm.model_executor.layers.quantization.compressed_tensors.utils import ( QuantizationStrategy) from vllm.model_executor.layers.quantization.utils.w8a8_utils import ( - apply_int8_linear, convert_to_channelwise, create_per_channel_scale_param, - create_per_tensor_scale_param) -from vllm.model_executor.utils import set_weight_attrs + apply_int8_linear, convert_to_channelwise) +from vllm.model_executor.parameter import (BasevLLMParameter, + ChannelQuantScaleParameter, + ModelWeightParameter, + PerTensorScaleParameter) class CompressedTensorsW8A8Int8(CompressedTensorsScheme): @@ -39,7 +41,9 @@ def process_weights_after_loading(self, layer: torch.nn.Module) -> None: ws_channelwise = convert_to_channelwise(layer.weight_scale, self.logical_widths) layer.weight_scale = Parameter(ws_channelwise, requires_grad=False) - + else: + layer.weight_scale = Parameter(layer.weight_scale.data, + requires_grad=False) # INPUT SCALE if self.is_static_input_scheme: layer.input_scale = Parameter(layer.input_scale.max(), @@ -55,32 +59,35 @@ def create_weights(self, layer: torch.nn.Module, self.logical_widths = output_partition_sizes # WEIGHT - weight = Parameter(torch.empty(sum(output_partition_sizes), - input_size_per_partition, - dtype=torch.int8), - requires_grad=False) + weight = ModelWeightParameter(data=torch.empty( + sum(output_partition_sizes), + input_size_per_partition, + dtype=torch.int8), + input_dim=1, + output_dim=0, + weight_loader=weight_loader) + layer.register_parameter("weight", weight) - set_weight_attrs(weight, { - "input_dim": 1, - "output_dim": 0, - "weight_loader": weight_loader, - }) # WEIGHT SCALE - layer_kwargs = {"weight_loader": weight_loader} if self.strategy == QuantizationStrategy.CHANNEL: - weight_scale = create_per_channel_scale_param( - output_partition_sizes, **layer_kwargs) + weight_scale = ChannelQuantScaleParameter( + data=torch.empty((sum(output_partition_sizes), 1), + dtype=torch.float32), + output_dim=0, + weight_loader=weight_loader) else: assert self.strategy == QuantizationStrategy.TENSOR - weight_scale = create_per_tensor_scale_param( - output_partition_sizes, **layer_kwargs) + weight_scale = 
PerTensorScaleParameter(data=torch.empty( + len(output_partition_sizes), dtype=torch.float32), + weight_loader=weight_loader) layer.register_parameter("weight_scale", weight_scale) # INPUT SCALE if self.is_static_input_scheme: - input_scale = create_per_tensor_scale_param( - output_partition_sizes, **layer_kwargs) + input_scale = BasevLLMParameter(data=torch.empty( + 1, dtype=torch.float32), + weight_loader=weight_loader) layer.register_parameter("input_scale", input_scale) def apply_weights(self, layer: torch.nn.Module, x: torch.Tensor, diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_wNa16.py b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_wNa16.py index b8880f7ac136f..94699c27d5cee 100644 --- a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_wNa16.py +++ b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_wNa16.py @@ -1,7 +1,6 @@ from typing import Callable, List, Optional import torch -from torch.nn import Parameter from vllm import _custom_ops as ops from vllm.model_executor.layers.quantization.compressed_tensors.schemes import ( @@ -10,7 +9,10 @@ apply_gptq_marlin_linear, marlin_make_empty_g_idx, marlin_make_workspace, marlin_permute_scales, replace_tensor, verify_marlin_supported, verify_marlin_supports_shape) -from vllm.model_executor.utils import set_weight_attrs +from vllm.model_executor.parameter import (BasevLLMParameter, + ChannelQuantScaleParameter, + GroupQuantScaleParameter, + PackedvLLMParameter) from vllm.scalar_type import scalar_types __all__ = ["CompressedTensorsWNA16"] @@ -30,17 +32,12 @@ def __init__(self, self.pack_factor = 32 // num_bits self.strategy = strategy + self.group_size = -1 if group_size is None else group_size - self.group_size: int - if group_size is None: - if self.strategy != "channel": - raise ValueError( - "Marlin kernels require group quantization or " - "channelwise quantization, but found no group " - "size and strategy is not channelwise.") - self.group_size = -1 - else: - self.group_size = group_size + if self.group_size == -1 and self.strategy != "channel": + raise ValueError("Marlin kernels require group quantization or " + "channelwise quantization, but found no group " + "size and strategy is not channelwise.") if num_bits not in WNA16_SUPPORTED_TYPES_MAP: raise ValueError( @@ -63,11 +60,12 @@ def create_weights(self, layer: torch.nn.Module, input_size: int, input_size_per_partition: int, params_dtype: torch.dtype, weight_loader: Callable, **kwargs): + output_size_per_partition = sum(output_partition_sizes) # If group_size is -1, we are in channelwise case. channelwise = (self.group_size == -1) - group_size = input_size if channelwise else self.group_size + group_size = self.group_size if self.group_size != -1 else input_size row_parallel = (input_size != input_size_per_partition) # In the case of channelwise quantization, we need to replicate the # scales across all gpus. 
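
For orientation before the next hunk: the group-size handling above boils down to simple arithmetic. Channelwise quantization (group_size == -1) treats the whole input dimension as one group, so a single scale survives per output channel, while grouped quantization keeps input_size // group_size scale entries per output channel. A small standalone illustration, not part of the patch:

    def num_scale_groups(input_size: int, group_size: int) -> int:
        # group_size == -1 denotes channelwise quantization, as in the scheme above.
        effective_group = input_size if group_size == -1 else group_size
        assert input_size % effective_group == 0
        return input_size // effective_group

    assert num_scale_groups(4096, -1) == 1     # channelwise: one scale per output channel
    assert num_scale_groups(4096, 128) == 32   # grouped: 32 scale entries per output channel
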
@@ -79,60 +77,51 @@ def create_weights(self, layer: torch.nn.Module, input_size: int, input_size=input_size, group_size=group_size) - weight_scale_dim = None scales_and_zp_size = input_size // group_size if partition_scales: assert input_size_per_partition % group_size == 0 - weight_scale_dim = 1 scales_and_zp_size = input_size_per_partition // group_size - weight = Parameter( - torch.empty( - output_size_per_partition, - input_size_per_partition // self.pack_factor, - dtype=torch.int32, - ), - requires_grad=False, - ) - - set_weight_attrs( - weight, { - "input_dim": 1, - "output_dim": 0, - "packed_dim": 1, - "pack_factor": self.pack_factor, - "weight_loader": weight_loader - }) - layer.register_parameter("weight_packed", weight) - - weight_scale = Parameter( + weight = PackedvLLMParameter(input_dim=1, + output_dim=0, + weight_loader=weight_loader, + packed_factor=self.pack_factor, + packed_dim=1, + data=torch.empty( + output_size_per_partition, + input_size_per_partition // + self.pack_factor, + dtype=torch.int32, + )) + + weight_scale_args = { + "weight_loader": + weight_loader, + "data": torch.empty( output_size_per_partition, scales_and_zp_size, dtype=params_dtype, - ), - requires_grad=False, - ) - - set_weight_attrs( - weight_scale, { - "weight_loader": weight_loader, - "input_dim": weight_scale_dim, - "output_dim": 0 - }) - layer.register_parameter("weight_scale", weight_scale) + ) + } + if self.group_size == -1: + weight_scale = ChannelQuantScaleParameter(output_dim=0, + **weight_scale_args) + else: + weight_scale = GroupQuantScaleParameter(output_dim=0, + input_dim=1, + **weight_scale_args) # A 2D array defining the original shape of the weights # before packing - weight_shape = Parameter(torch.empty(2, dtype=torch.int64), - requires_grad=False) + weight_shape = BasevLLMParameter(data=torch.empty(2, + dtype=torch.int64), + weight_loader=weight_loader) + layer.register_parameter("weight_packed", weight) + layer.register_parameter("weight_scale", weight_scale) layer.register_parameter("weight_shape", weight_shape) - set_weight_attrs(weight_shape, { - "weight_loader": weight_loader, - "ignore_warning": True, - }) layer.input_size_per_partition = input_size_per_partition layer.output_size_per_partition = output_size_per_partition @@ -154,10 +143,15 @@ def process_weights_after_loading(self, layer: torch.nn.Module) -> None: # No zero-point layer.weight_zp = marlin_make_empty_g_idx(device) + # Update for kernel + layer.weight_packed = torch.nn.Parameter( + layer.weight_packed.t().contiguous(), requires_grad=False) + layer.weight_scale = torch.nn.Parameter( + layer.weight_scale.squeeze().t().contiguous(), requires_grad=False) # Repack weights from compressed-tensors format to marlin format. marlin_qweight = ops.gptq_marlin_repack( - layer.weight_packed.t().contiguous(), + layer.weight_packed, perm=layer.g_idx_sort_indices, size_k=layer.input_size_per_partition, size_n=layer.output_size_per_partition, @@ -166,7 +160,7 @@ def process_weights_after_loading(self, layer: torch.nn.Module) -> None: # Permute scales from compressed-tensors format to marlin format. 
marlin_scales = marlin_permute_scales( - layer.weight_scale.squeeze().t().contiguous(), + layer.weight_scale, size_k=layer.input_size_per_partition, size_n=layer.output_size_per_partition, group_size=layer.group_size) diff --git a/vllm/model_executor/parameter.py b/vllm/model_executor/parameter.py new file mode 100644 index 0000000000000..10239843b3222 --- /dev/null +++ b/vllm/model_executor/parameter.py @@ -0,0 +1,277 @@ +from typing import Callable, Optional, Union + +import torch +from torch.nn import Parameter + +from vllm.distributed import get_tensor_model_parallel_rank +from vllm.logger import init_logger + +__all__ = [ + "BasevLLMParameter", "PackedvLLMParameter", "PerTensorScaleParameter", + "ModelWeightParameter", "ChannelQuantScaleParameter", + "GroupQuantScaleParameter" +] + +logger = init_logger(__name__) + + +class BasevLLMParameter(Parameter): + """ + Base parameter for vLLM linear layers. Extends the torch.nn.parameter + by taking in a linear weight loader. Will copy the loaded weight + into the parameter when the provided weight loader is called. + """ + + def __new__(cls, data: torch.Tensor, **kwargs): + + return super().__new__(cls, data=data, requires_grad=False) + + def __init__(self, data: torch.Tensor, weight_loader: Callable): + """ + Initialize the BasevLLMParameter + + :param data: torch tensor with the parameter data + :param weight_loader: weight loader callable + + :returns: a torch.nn.parameter + """ + + self._weight_loader = weight_loader + + @property + def weight_loader(self): + return self._weight_loader + + def _assert_and_load(self, loaded_weight: torch.Tensor): + assert self.data.shape == loaded_weight.shape + self.data.copy_(loaded_weight) + + def load_column_parallel_weight(self, loaded_weight: torch.Tensor): + self._assert_and_load(loaded_weight) + + def load_row_parallel_weight(self, loaded_weight: torch.Tensor): + self._assert_and_load(loaded_weight) + + def load_merged_column_weight(self, loaded_weight: torch.Tensor, **kwargs): + self._assert_and_load(loaded_weight) + + def load_qkv_weight(self, loaded_weight: torch.Tensor, **kwargs): + self._assert_and_load(loaded_weight) + + +class _ColumnvLLMParameter(BasevLLMParameter): + """ + Private class defining weight loading functionality + (load_merged_column_weight, load_qkv_weight) + for parameters being loaded into linear layers with column + parallelism. This includes QKV and MLP layers which are + not already fused on disk. Requires an output dimension + to be defined. Called within the weight loader of + each of the column parallel linear layers. 
+ """ + + def __init__(self, output_dim: int, **kwargs): + self._output_dim = output_dim + super().__init__(**kwargs) + + @property + def output_dim(self): + return self._output_dim + + def load_column_parallel_weight(self, loaded_weight: torch.Tensor): + tp_rank = get_tensor_model_parallel_rank() + shard_size = self.data.shape[self.output_dim] + loaded_weight = loaded_weight.narrow(self.output_dim, + tp_rank * shard_size, shard_size) + assert self.data.shape == loaded_weight.shape + self.data.copy_(loaded_weight) + + def load_merged_column_weight(self, loaded_weight: torch.Tensor, **kwargs): + + shard_offset = kwargs.get("shard_offset") + shard_size = kwargs.get("shard_size") + if isinstance( + self, + PackedvLLMParameter) and self.packed_dim == self.output_dim: + shard_size, shard_offset = self.adjust_shard_indexes_for_packing( + shard_offset=shard_offset, shard_size=shard_size) + + param_data = self.data + + tp_rank = get_tensor_model_parallel_rank() + param_data = param_data.narrow(self.output_dim, shard_offset, + shard_size) + loaded_weight = loaded_weight.narrow(self.output_dim, + tp_rank * shard_size, shard_size) + assert param_data.shape == loaded_weight.shape + param_data.copy_(loaded_weight) + + def load_qkv_weight(self, loaded_weight: torch.Tensor, **kwargs): + + shard_offset = kwargs.get("shard_offset") + shard_size = kwargs.get("shard_size") + shard_id = kwargs.get("shard_id") + num_heads = kwargs.get("num_heads") + + if isinstance( + self, + PackedvLLMParameter) and self.output_dim == self.packed_dim: + shard_size, shard_offset = self.adjust_shard_indexes_for_packing( + shard_offset=shard_offset, shard_size=shard_size) + + param_data = self.data + tp_rank = get_tensor_model_parallel_rank() + shard_id = tp_rank if shard_id == "q" else tp_rank // num_heads + param_data = param_data.narrow(self.output_dim, shard_offset, + shard_size) + loaded_weight = loaded_weight.narrow(self.output_dim, + shard_id * shard_size, shard_size) + + assert param_data.shape == loaded_weight.shape + param_data.copy_(loaded_weight) + + +class ModelWeightParameter(_ColumnvLLMParameter): + """ + Parameter class for linear layer weights. Extends the + _ColumnvLLMParameter by adding loading functionality + for linear layers with row parallel functionality. + Requires an input dimension to be defined. + """ + + def __init__(self, input_dim: int, **kwargs): + self._input_dim = input_dim + super().__init__(**kwargs) + + @property + def input_dim(self): + return self._input_dim + + def load_row_parallel_weight(self, loaded_weight: torch.Tensor): + tp_rank = get_tensor_model_parallel_rank() + shard_size = self.data.shape[self.input_dim] + loaded_weight = loaded_weight.narrow(self.input_dim, + tp_rank * shard_size, shard_size) + + if len(loaded_weight.shape) == 0: + loaded_weight = loaded_weight.reshape(1) + + assert self.data.shape == loaded_weight.shape + self.data.copy_(loaded_weight) + + +class GroupQuantScaleParameter(ModelWeightParameter): + """ + Parameter class for weight scales loaded for weights with + grouped quantization. Equivalent to ModelWeightParameter. + """ + pass + + +class ChannelQuantScaleParameter(_ColumnvLLMParameter): + """ + Parameter class for weight scales loaded for weights with + channel-wise quantization. Equivalent to _ColumnvLLMParameter. + """ + pass + + +class PerTensorScaleParameter(BasevLLMParameter): + """ + Parameter class for scales where the number of scales is + equivalent to the number of logical matrices in fused linear + layers (e.g. 
for QKV, there are 3 scales loaded from disk). + This is relevant to weights with per-tensor quantization. + Adds functionality to map the scalers to a shard during + weight loading. + + Note: additional parameter manipulation may be handled + for each quantization config specifically, within + process_weights_after_loading + """ + + def __init__(self, **kwargs): + self.qkv_idxs = {"q": 0, "k": 1, "v": 2} + super().__init__(**kwargs) + + def _shard_id_as_int(self, shard_id: Union[str, int]) -> int: + if isinstance(shard_id, int): + return shard_id + + assert isinstance(shard_id, str) + assert shard_id in self.qkv_idxs + return self.qkv_idxs[shard_id] + + def load_merged_column_weight(self, *args, **kwargs): + self._load_into_shard_id(*args, **kwargs) + + def load_qkv_weight(self, *args, **kwargs): + self._load_into_shard_id(*args, **kwargs) + + def load_column_parallel_weight(self, *args, **kwargs): + self._load_into_shard_id(*args, **kwargs) + + def _load_into_shard_id(self, loaded_weight: torch.Tensor, + shard_id: Union[str, int], **kwargs): + """ + Slice the parameter data based on the shard id for + loading. + """ + + param_data = self.data + shard_id = self._shard_id_as_int(shard_id) + + # AutoFP8 scales do not have a shape + # compressed-tensors scales do have a shape + if len(loaded_weight.shape) != 0: + assert loaded_weight.shape[0] == 1 + loaded_weight = loaded_weight[0] + + param_data = param_data[shard_id] + assert param_data.shape == loaded_weight.shape + param_data.copy_(loaded_weight) + + +class PackedvLLMParameter(ModelWeightParameter): + """ + Parameter for model weights which are packed on disk. + Example: GPTQ Marlin weights are int4 or int8, packed into int32. + Extends the ModelWeightParameter to take in the + packed factor, the packed dimension, and optionally, marlin + tile size for marlin kernels. Adjusts the shard_size and + shard_offset for fused linear layers model weight loading + by accounting for packing and optionally, marlin tile size. 
+ """ + + def __init__(self, + packed_factor: int, + packed_dim: int, + marlin_tile_size: Optional[int] = None, + **kwargs): + self._packed_factor = packed_factor + self._packed_dim = packed_dim + self._marlin_tile = marlin_tile_size + super().__init__(**kwargs) + + @property + def packed_dim(self): + return self._packed_dim + + @property + def packed_factor(self): + return self._packed_factor + + @property + def marlin_tile(self): + return self._marlin_tile + + def _adjust_shard_indexes_for_marlin(self, shard_size, shard_offset): + return shard_size * self.marlin_tile, shard_offset * self.marlin_tile + + def adjust_shard_indexes_for_packing(self, shard_size, shard_offset): + shard_size = shard_size // self.packed_factor + shard_offset = shard_offset // self.packed_factor + if self.marlin_tile is not None: + return self._adjust_shard_indexes_for_marlin( + shard_size, shard_offset) + return shard_size, shard_offset From 564985729abc267af281de11f737cfb29b5c0abb Mon Sep 17 00:00:00 2001 From: Robert Shaw <114415538+robertgshaw2-neuralmagic@users.noreply.github.com> Date: Wed, 7 Aug 2024 12:24:56 -0400 Subject: [PATCH 04/18] [ BugFix ] Move `zmq` frontend to IPC instead of TCP (#7222) --- vllm/entrypoints/openai/api_server.py | 12 ++++++++---- vllm/entrypoints/openai/rpc/client.py | 6 +++--- vllm/entrypoints/openai/rpc/server.py | 10 ++++------ vllm/envs.py | 11 ++++++----- vllm/utils.py | 12 ++++++++---- 5 files changed, 29 insertions(+), 22 deletions(-) diff --git a/vllm/entrypoints/openai/api_server.py b/vllm/entrypoints/openai/api_server.py index 88f0bd4ee4dbe..48aa904d4721d 100644 --- a/vllm/entrypoints/openai/api_server.py +++ b/vllm/entrypoints/openai/api_server.py @@ -43,7 +43,7 @@ OpenAIServingTokenization) from vllm.logger import init_logger from vllm.usage.usage_lib import UsageContext -from vllm.utils import FlexibleArgumentParser, get_open_port +from vllm.utils import FlexibleArgumentParser, get_open_zmq_ipc_path from vllm.version import __version__ as VLLM_VERSION TIMEOUT_KEEP_ALIVE = 5 # seconds @@ -106,16 +106,20 @@ async def build_async_engine_client(args) -> AsyncIterator[AsyncEngineClient]: # Otherwise, use the multiprocessing AsyncLLMEngine. else: + # Select random path for IPC. + rpc_path = get_open_zmq_ipc_path() + logger.info("Multiprocessing frontend to use %s for RPC Path.", + rpc_path) + # Start RPCServer in separate process (holds the AsyncLLMEngine). - port = get_open_port(envs.VLLM_RPC_PORT) rpc_server_process = Process(target=run_rpc_server, args=(engine_args, UsageContext.OPENAI_API_SERVER, - port)) + rpc_path)) rpc_server_process.start() # Build RPCClient, which conforms to AsyncEngineClient Protocol. - async_engine_client = AsyncEngineRPCClient(port) + async_engine_client = AsyncEngineRPCClient(rpc_path) await async_engine_client.setup() try: diff --git a/vllm/entrypoints/openai/rpc/client.py b/vllm/entrypoints/openai/rpc/client.py index 043649131560c..8552c286eeeea 100644 --- a/vllm/entrypoints/openai/rpc/client.py +++ b/vllm/entrypoints/openai/rpc/client.py @@ -21,9 +21,9 @@ class AsyncEngineRPCClient: - def __init__(self, port: int): + def __init__(self, rpc_path: str): self.context = zmq.asyncio.Context() - self.path = f"tcp://localhost:{port}" + self.rpc_path = rpc_path async def setup(self): """Setup the client before it starts sending server requests.""" @@ -58,7 +58,7 @@ def socket(self): # to enable streaming. 
socket = self.context.socket(zmq.constants.DEALER) try: - socket.connect(self.path) + socket.connect(self.rpc_path) yield socket finally: socket.close() diff --git a/vllm/entrypoints/openai/rpc/server.py b/vllm/entrypoints/openai/rpc/server.py index 60bb23b9bde05..617c9b7070e2c 100644 --- a/vllm/entrypoints/openai/rpc/server.py +++ b/vllm/entrypoints/openai/rpc/server.py @@ -20,7 +20,7 @@ class AsyncEngineRPCServer: def __init__(self, async_engine_args: AsyncEngineArgs, - usage_context: UsageContext, port: int): + usage_context: UsageContext, rpc_path: str): # Initialize engine first. self.engine = AsyncLLMEngine.from_engine_args(async_engine_args, usage_context) @@ -30,9 +30,7 @@ def __init__(self, async_engine_args: AsyncEngineArgs, # Init socket for readiness state. self.socket = self.context.socket(zmq.constants.ROUTER) - # Note numeric form of localhost should be used for zmq bind(), - # see https://stackoverflow.com/a/8958414 - self.socket.bind(f"tcp://127.0.0.1:{port}") + self.socket.bind(rpc_path) def cleanup(self): """Cleanup all resources.""" @@ -213,6 +211,6 @@ def signal_handler() -> None: def run_rpc_server(async_engine_args: AsyncEngineArgs, - usage_context: UsageContext, port: int): - server = AsyncEngineRPCServer(async_engine_args, usage_context, port) + usage_context: UsageContext, rpc_path: str): + server = AsyncEngineRPCServer(async_engine_args, usage_context, rpc_path) asyncio.run(run_server(server)) diff --git a/vllm/envs.py b/vllm/envs.py index 81d2d80e65e46..df4c994359dbd 100644 --- a/vllm/envs.py +++ b/vllm/envs.py @@ -1,10 +1,11 @@ import os +import tempfile from typing import TYPE_CHECKING, Any, Callable, Dict, Optional if TYPE_CHECKING: VLLM_HOST_IP: str = "" VLLM_PORT: Optional[int] = None - VLLM_RPC_PORT: int = 5570 + VLLM_RPC_BASE_PATH: str = tempfile.gettempdir() VLLM_USE_MODELSCOPE: bool = False VLLM_RINGBUFFER_WARNING_INTERVAL: int = 60 VLLM_INSTANCE_ID: Optional[str] = None @@ -142,10 +143,10 @@ def get_default_config_root(): lambda: int(os.getenv('VLLM_PORT', '0')) if 'VLLM_PORT' in os.environ else None, - # used when the frontend api server is running in multi-processing mode, - # to communicate with the backend engine process over ZMQ. - 'VLLM_RPC_PORT': - lambda: int(os.getenv('VLLM_RPC_PORT', '5570')), + # path used for ipc when the frontend api server is running in + # multi-processing mode to communicate with the backend engine process. + 'VLLM_RPC_BASE_PATH': + lambda: os.getenv('VLLM_RPC_BASE_PATH', tempfile.gettempdir()), # If true, will load models from ModelScope instead of Hugging Face Hub. 
# note that the value is true or false, not numbers diff --git a/vllm/utils.py b/vllm/utils.py index 08aa889b5e447..1fd395c04ca24 100644 --- a/vllm/utils.py +++ b/vllm/utils.py @@ -19,6 +19,7 @@ from typing import (Any, AsyncGenerator, Awaitable, Callable, Dict, Generic, Hashable, List, Optional, OrderedDict, Set, Tuple, TypeVar, Union, overload) +from uuid import uuid4 import numpy as np import numpy.typing as npt @@ -484,10 +485,13 @@ def get_distributed_init_method(ip: str, port: int) -> str: return f"tcp://[{ip}]:{port}" if ":" in ip else f"tcp://{ip}:{port}" -def get_open_port(port: Optional[int] = None) -> int: - if port is None: - # Default behavior here is to return a port for multi-gpu communication - port = envs.VLLM_PORT +def get_open_zmq_ipc_path() -> str: + base_rpc_path = envs.VLLM_RPC_BASE_PATH + return f"ipc://{base_rpc_path}/{uuid4()}" + + +def get_open_port() -> int: + port = envs.VLLM_PORT if port is not None: while True: try: From ab0f5e2823d31c150c01fc30c146b6ea801d6bdd Mon Sep 17 00:00:00 2001 From: Rafael Vasquez Date: Wed, 7 Aug 2024 12:29:27 -0400 Subject: [PATCH 05/18] Fixes typo in function name (#7275) Signed-off-by: Rafael Vasquez --- vllm/scripts.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/vllm/scripts.py b/vllm/scripts.py index 403b22239aed0..f45bfe06047de 100644 --- a/vllm/scripts.py +++ b/vllm/scripts.py @@ -14,7 +14,7 @@ from vllm.utils import FlexibleArgumentParser -def registrer_signal_handlers(): +def register_signal_handlers(): def signal_handler(sig, frame): sys.exit(0) @@ -31,7 +31,7 @@ def serve(args: argparse.Namespace) -> None: def interactive_cli(args: argparse.Namespace) -> None: - registrer_signal_handlers() + register_signal_handlers() base_url = args.url api_key = args.api_key or os.environ.get("OPENAI_API_KEY", "EMPTY") From b764547616e6ae1517929985400b1dc62cdbc3a3 Mon Sep 17 00:00:00 2001 From: Isotr0py <2037008807@qq.com> Date: Thu, 8 Aug 2024 00:32:07 +0800 Subject: [PATCH 06/18] [Bugfix] Fix input processor for InternVL2 model (#7164) Co-authored-by: Cyrus Leung --- tests/models/test_internvl.py | 23 +++++-- vllm/model_executor/models/internvl.py | 84 +++++++++++++++++--------- 2 files changed, 73 insertions(+), 34 deletions(-) diff --git a/tests/models/test_internvl.py b/tests/models/test_internvl.py index 66cb8dda248db..6aa0189648d72 100644 --- a/tests/models/test_internvl.py +++ b/tests/models/test_internvl.py @@ -5,6 +5,7 @@ import torch from huggingface_hub import snapshot_download from PIL.Image import Image +from transformers import AutoConfig from vllm.model_executor.models.internvl import (IMG_CONTEXT, IMG_END, IMG_START, @@ -26,10 +27,15 @@ # we use snapshot_download to prevent conflicts between # dynamic_module and trust_remote_code for hf_runner +DOWNLOAD_PATTERN = ["*.json", "*.py", "*.safetensors", "*.txt", "*.model"] models = [ - snapshot_download("OpenGVLab/InternVL2-1B"), - snapshot_download("OpenGVLab/InternVL2-2B"), - # snapshot_download("OpenGVLab/InternVL2-4B"), # broken + snapshot_download("OpenGVLab/InternVL2-1B", + allow_patterns=DOWNLOAD_PATTERN), + snapshot_download("OpenGVLab/InternVL2-2B", + allow_patterns=DOWNLOAD_PATTERN), + # Broken due to outdated implementation of Phi-3 + # See: https://huggingface.co/OpenGVLab/InternVL2-4B/discussions/3 + # snapshot_download("OpenGVLab/InternVL2-4B"), ] @@ -41,8 +47,17 @@ def __init__(self, hf_runner: HfRunner): self.tokenizer = hf_runner.tokenizer self.dtype = hf_runner.model.dtype + self.config = 
AutoConfig.from_pretrained(hf_runner.model_name) + self.vision_config = self.config.vision_config + self.use_thumbnail = self.config.use_thumbnail + self.min_num = self.config.min_dynamic_patch + self.max_num = self.config.max_dynamic_patch + self.image_size = self.vision_config.image_size + def __call__(self, text: str, images: Image, **kwargs): - pixel_values = image_to_pixel_values(images).to(self.dtype) + pixel_values = image_to_pixel_values(images, self.image_size, + self.min_num, self.max_num, + self.use_thumbnail).to(self.dtype) num_patches_list = [pixel_values.shape[0]] for num_patches in num_patches_list: context_tokens = IMG_CONTEXT * self.num_image_token * num_patches diff --git a/vllm/model_executor/models/internvl.py b/vllm/model_executor/models/internvl.py index 8850fd7c6763b..49f9a4c85f2d0 100644 --- a/vllm/model_executor/models/internvl.py +++ b/vllm/model_executor/models/internvl.py @@ -38,9 +38,6 @@ IMAGENET_MEAN = (0.485, 0.456, 0.406) IMAGENET_STD = (0.229, 0.224, 0.225) -MAX_IMAGE_FEATURE_SIZE_WIDTH = 3000 -MAX_IMAGE_FEATURE_SIZE_HEIGHT = 500 - class InternVLImagePixelInputs(TypedDict): type: Literal["pixel_values"] @@ -84,11 +81,9 @@ def find_closest_aspect_ratio(aspect_ratio, target_ratios, width, height, return best_ratio -def calculate_num_blocks(orig_width: int, - orig_height: int, - min_num=1, - max_num=6, - image_size=448): +def calculate_num_blocks(orig_width: int, orig_height: int, min_num: int, + max_num: int, + image_size: int) -> Tuple[int, int, int]: aspect_ratio = orig_width / orig_height # calculate the existing image aspect ratio @@ -110,11 +105,9 @@ def calculate_num_blocks(orig_width: int, # adapted from https://huggingface.co/OpenGVLab/InternVL2-1B -def dynamic_preprocess(image, - min_num=1, - max_num=6, - image_size=448, - use_thumbnail=False): +def dynamic_preprocess(image: Image.Image, min_num: int, max_num: int, + image_size: int, + use_thumbnail: int) -> List[Image.Image]: orig_width, orig_height = image.size blocks, target_width, target_height = calculate_num_blocks( @@ -138,12 +131,14 @@ def dynamic_preprocess(image, # adapted from https://huggingface.co/OpenGVLab/InternVL2-1B -def image_to_pixel_values(image: Image.Image, input_size=448, max_num=6): +def image_to_pixel_values(image: Image.Image, input_size: int, min_num: int, + max_num: int, use_thumbnail: bool) -> torch.Tensor: transform = build_transform(input_size=input_size) images = dynamic_preprocess(image, + min_num=min_num, + max_num=max_num, image_size=input_size, - use_thumbnail=True, - max_num=max_num) + use_thumbnail=use_thumbnail) pixel_values = [transform(image) for image in images] pixel_values = torch.stack(pixel_values) return pixel_values @@ -159,12 +154,18 @@ def get_internvl_num_patches(image_size: int, patch_size: int, def get_max_internvl_image_tokens(ctx: InputContext): hf_config = ctx.get_hf_config(PretrainedConfig) vision_config = hf_config.vision_config + + use_thumbnail = hf_config.use_thumbnail + max_dynamic_patch = hf_config.max_dynamic_patch + if use_thumbnail: + max_dynamic_patch += 1 + downsample_ratio = hf_config.downsample_ratio + image_size = vision_config.image_size patch_size = vision_config.patch_size - downsample_ratio = hf_config.downsample_ratio num_patches = get_internvl_num_patches(image_size, patch_size, downsample_ratio) - return num_patches * 7 + return num_patches * max_dynamic_patch def input_processor_for_internvl(ctx: InputContext, llm_inputs: LLMInputs): @@ -176,21 +177,27 @@ def input_processor_for_internvl(ctx: InputContext, llm_inputs: 
LLMInputs): hf_config = ctx.get_hf_config(PretrainedConfig) vision_config = hf_config.vision_config + image_size = vision_config.image_size + patch_size = vision_config.patch_size + downsample_ratio = hf_config.downsample_ratio + num_patches = get_internvl_num_patches(image_size, patch_size, + downsample_ratio) + image_data = multi_modal_data["image"] if isinstance(image_data, Image.Image): width, height = image_data.size - num_blocks, _, _ = calculate_num_blocks(width, height) + min_num = hf_config.min_dynamic_patch + max_num = hf_config.max_dynamic_patch + num_blocks, _, _ = calculate_num_blocks(width, height, min_num, + max_num, image_size) + # add thumbnail image if num_blocks > 1 + if hf_config.use_thumbnail and num_blocks > 1: + num_blocks += 1 elif isinstance(image_data, torch.Tensor): raise NotImplementedError("Embeddings input is not supported yet") else: raise TypeError(f"Invalid image type: {type(image_data)}") - image_size = vision_config.image_size - patch_size = vision_config.patch_size - downsample_ratio = hf_config.downsample_ratio - num_patches = get_internvl_num_patches(image_size, patch_size, - downsample_ratio) - tokenizer = cached_get_tokenizer(model_config.tokenizer, trust_remote_code=True) @@ -198,8 +205,7 @@ def input_processor_for_internvl(ctx: InputContext, llm_inputs: LLMInputs): prompt_token_ids = llm_inputs["prompt_token_ids"] if prompt is None: prompt = tokenizer.decode(prompt_token_ids) - image_prompt = IMG_START + IMG_CONTEXT * (num_blocks + - 1) * num_patches + IMG_END + image_prompt = IMG_START + IMG_CONTEXT * num_blocks * num_patches + IMG_END new_prompt = prompt.replace('', image_prompt, 1) new_prompt_token_ids = tokenizer.encode(new_prompt) @@ -209,8 +215,19 @@ def input_processor_for_internvl(ctx: InputContext, llm_inputs: LLMInputs): def input_mapper_for_internvl(ctx: InputContext, data: object): + hf_config = ctx.get_hf_config(PretrainedConfig) + + use_thumbnail = hf_config.use_thumbnail + min_num = hf_config.min_dynamic_patch + max_num = hf_config.max_dynamic_patch + image_size = hf_config.vision_config.image_size + if isinstance(data, Image.Image): - data = image_to_pixel_values(data) + data = image_to_pixel_values(data, + image_size, + min_num, + max_num, + use_thumbnail=use_thumbnail) model_config = ctx.model_config tokenizer = cached_get_tokenizer(model_config.tokenizer, trust_remote_code=True) @@ -240,10 +257,17 @@ def dummy_data_for_internvl(ctx: InputContext, seq_len: int): add_special_tokens=False)[0], image_feature_size_override=image_feature_size, ) + + image_size = vision_config.image_size + min_num = hf_config.min_dynamic_patch + max_num = hf_config.max_dynamic_patch + max_image_width = max_num * image_size + max_image_height = min_num * image_size + mm_data = dummy_image_for_clip( vision_config, - image_width_override=MAX_IMAGE_FEATURE_SIZE_WIDTH, - image_height_override=MAX_IMAGE_FEATURE_SIZE_HEIGHT, + image_width_override=max_image_width, + image_height_override=max_image_height, ) return seq_data, mm_data From 80cbe10c59a3354d0fc7c841fdc6422fc64899aa Mon Sep 17 00:00:00 2001 From: Ilya Lavrenov Date: Wed, 7 Aug 2024 20:49:10 +0400 Subject: [PATCH 07/18] [OpenVINO] migrate to latest dependencies versions (#7251) --- Dockerfile.openvino | 2 +- .../getting_started/openvino-installation.rst | 2 +- requirements-openvino.txt | 33 ++----------------- setup.py | 2 +- 4 files changed, 6 insertions(+), 33 deletions(-) diff --git a/Dockerfile.openvino b/Dockerfile.openvino index c84dea419e58a..06ca4638dfeb9 100644 --- a/Dockerfile.openvino +++ 
b/Dockerfile.openvino @@ -21,7 +21,7 @@ COPY setup.py /workspace/vllm/ # install build requirements RUN PIP_EXTRA_INDEX_URL="https://download.pytorch.org/whl/cpu" python3 -m pip install -r /workspace/vllm/requirements-build.txt # build vLLM with OpenVINO backend -RUN PIP_EXTRA_INDEX_URL="https://download.pytorch.org/whl/cpu https://storage.openvinotoolkit.org/simple/wheels/pre-release" VLLM_TARGET_DEVICE="openvino" python3 -m pip install /workspace/vllm/ +RUN PIP_EXTRA_INDEX_URL="https://download.pytorch.org/whl/cpu" VLLM_TARGET_DEVICE="openvino" python3 -m pip install /workspace/vllm/ COPY examples/ /workspace/vllm/examples COPY benchmarks/ /workspace/vllm/benchmarks diff --git a/docs/source/getting_started/openvino-installation.rst b/docs/source/getting_started/openvino-installation.rst index 62256df091a44..d8f27c4328a58 100644 --- a/docs/source/getting_started/openvino-installation.rst +++ b/docs/source/getting_started/openvino-installation.rst @@ -57,7 +57,7 @@ Install from source .. code-block:: console - $ PIP_EXTRA_INDEX_URL="https://download.pytorch.org/whl/cpu https://storage.openvinotoolkit.org/simple/wheels/pre-release" VLLM_TARGET_DEVICE=openvino python -m pip install -v . + $ PIP_EXTRA_INDEX_URL="https://download.pytorch.org/whl/cpu" VLLM_TARGET_DEVICE=openvino python -m pip install -v . .. _openvino_backend_performance_tips: diff --git a/requirements-openvino.txt b/requirements-openvino.txt index 2dd971d6400be..419294aa75626 100644 --- a/requirements-openvino.txt +++ b/requirements-openvino.txt @@ -1,34 +1,7 @@ # Common dependencies -# -r requirements-common.txt -# TODO: remove temporary copy of all common dependencies once Optimum Intel will support Transformers >= 4.43.2 -cmake >= 3.21 -ninja # For faster builds. -psutil -sentencepiece # Required for LLaMA tokenizer. -numpy < 2.0.0 -requests -tqdm -py-cpuinfo -transformers < 4.43 -tokenizers >= 0.19.1 # Required for Llama 3. -fastapi -aiohttp -openai -uvicorn[standard] -pydantic >= 2.0 # Required for OpenAI server. 
-pillow # Required for image processing -prometheus_client >= 0.18.0 -prometheus-fastapi-instrumentator >= 7.0.0 -tiktoken >= 0.6.0 # Required for DBRX tokenizer -lm-format-enforcer == 0.10.3 -outlines >= 0.0.43, < 0.1 # Requires torch >= 2.1.0 -typing_extensions -filelock >= 3.10.4 # filelock starts to support `mode` argument from 3.10.4 -pyzmq -gguf == 0.9.1 +-r requirements-common.txt # OpenVINO dependencies torch >= 2.1.2 -openvino ~= 2024.3.0.dev -openvino-tokenizers[transformers] ~= 2024.3.0.0.dev -optimum-intel[openvino] >= 1.18.1 +openvino ~= 2024.3.0 +optimum-intel[openvino] >= 1.18.2 diff --git a/setup.py b/setup.py index b146299f8269d..f6e005879aeff 100644 --- a/setup.py +++ b/setup.py @@ -272,7 +272,7 @@ def _build_custom_ops() -> bool: def _build_core_ext() -> bool: - return not _is_neuron() and not _is_tpu() + return not _is_neuron() and not _is_tpu() and not _is_openvino() def get_hipcc_rocm_version(): From 0e12cd67a8f84047e6d28084bfe94a8278e10218 Mon Sep 17 00:00:00 2001 From: Stas Bekman Date: Wed, 7 Aug 2024 09:58:02 -0700 Subject: [PATCH 08/18] [Doc] add online speculative decoding example (#7243) --- docs/source/models/spec_decode.rst | 66 +++++++++++++++++++++++++----- 1 file changed, 55 insertions(+), 11 deletions(-) diff --git a/docs/source/models/spec_decode.rst b/docs/source/models/spec_decode.rst index be901fa881b12..d3c196faff25d 100644 --- a/docs/source/models/spec_decode.rst +++ b/docs/source/models/spec_decode.rst @@ -14,17 +14,17 @@ Speculative decoding is a technique which improves inter-token latency in memory Speculating with a draft model ------------------------------ -The following code configures vLLM to use speculative decoding with a draft model, speculating 5 tokens at a time. +The following code configures vLLM in an offline mode to use speculative decoding with a draft model, speculating 5 tokens at a time. .. code-block:: python from vllm import LLM, SamplingParams - + prompts = [ "The future of AI is", ] sampling_params = SamplingParams(temperature=0.8, top_p=0.95) - + llm = LLM( model="facebook/opt-6.7b", tensor_parallel_size=1, @@ -33,12 +33,56 @@ The following code configures vLLM to use speculative decoding with a draft mode use_v2_block_manager=True, ) outputs = llm.generate(prompts, sampling_params) - + for output in outputs: prompt = output.prompt generated_text = output.outputs[0].text print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") +To perform the same with an online mode launch the server: + +.. code-block:: bash + + python -m vllm.entrypoints.openai.api_server --host 0.0.0.0 --port 8000 --model facebook/opt-6.7b \ + --seed 42 -tp 1 --speculative_model facebook/opt-125m --use-v2-block-manager \ + --num_speculative_tokens 5 --gpu_memory_utilization 0.8 + + Then use a client: + +.. code-block:: python + + from openai import OpenAI + + # Modify OpenAI's API key and API base to use vLLM's API server. 
+ openai_api_key = "EMPTY" + openai_api_base = "http://localhost:8000/v1" + + client = OpenAI( + # defaults to os.environ.get("OPENAI_API_KEY") + api_key=openai_api_key, + base_url=openai_api_base, + ) + + models = client.models.list() + model = models.data[0].id + + # Completion API + stream = False + completion = client.completions.create( + model=model, + prompt="The future of AI is", + echo=False, + n=1, + stream=stream, + ) + + print("Completion results:") + if stream: + for c in completion: + print(c) + else: + print(completion) + Speculating by matching n-grams in the prompt --------------------------------------------- @@ -48,12 +92,12 @@ matching n-grams in the prompt. For more information read `this thread. `_ or +For more information see `this blog `_ or `this technical report `_. .. code-block:: python @@ -100,9 +144,9 @@ For more information see `this blog Date: Wed, 7 Aug 2024 15:09:36 -0300 Subject: [PATCH 09/18] [BugFix] Fix frontend multiprocessing hang (#7217) Signed-off-by: Max de Bayser Co-authored-by: Robert Shaw <114415538+robertgshaw2-neuralmagic@users.noreply.github.com> --- tests/entrypoints/openai/test_mp_crash.py | 35 +++++++++++++++++++++++ vllm/entrypoints/openai/api_server.py | 11 ++++++- vllm/entrypoints/openai/rpc/client.py | 26 ++++++++++++++--- 3 files changed, 67 insertions(+), 5 deletions(-) create mode 100644 tests/entrypoints/openai/test_mp_crash.py diff --git a/tests/entrypoints/openai/test_mp_crash.py b/tests/entrypoints/openai/test_mp_crash.py new file mode 100644 index 0000000000000..7dc595a7be351 --- /dev/null +++ b/tests/entrypoints/openai/test_mp_crash.py @@ -0,0 +1,35 @@ +from typing import Any + +import pytest + +from vllm.engine.async_llm_engine import AsyncLLMEngine +from vllm.entrypoints.openai.api_server import build_async_engine_client +from vllm.entrypoints.openai.cli_args import make_arg_parser +from vllm.utils import FlexibleArgumentParser + + +def crashing_from_engine_args( + cls, + engine_args: Any = None, + start_engine_loop: Any = None, + usage_context: Any = None, + stat_loggers: Any = None, +) -> "AsyncLLMEngine": + raise Exception("foo") + + +@pytest.mark.asyncio +async def test_mp_crash_detection(monkeypatch): + + with pytest.raises(RuntimeError) as excinfo, monkeypatch.context() as m: + m.setattr(AsyncLLMEngine, "from_engine_args", + crashing_from_engine_args) + parser = FlexibleArgumentParser( + description="vLLM's remote OpenAI server.") + parser = make_arg_parser(parser) + args = parser.parse_args([]) + + async with build_async_engine_client(args): + pass + assert "The server process died before responding to the readiness probe"\ + in str(excinfo.value) diff --git a/vllm/entrypoints/openai/api_server.py b/vllm/entrypoints/openai/api_server.py index 48aa904d4721d..d44604b12fb69 100644 --- a/vllm/entrypoints/openai/api_server.py +++ b/vllm/entrypoints/openai/api_server.py @@ -120,9 +120,18 @@ async def build_async_engine_client(args) -> AsyncIterator[AsyncEngineClient]: # Build RPCClient, which conforms to AsyncEngineClient Protocol. 
async_engine_client = AsyncEngineRPCClient(rpc_path) - await async_engine_client.setup() try: + while True: + try: + await async_engine_client.setup() + break + except TimeoutError as e: + if not rpc_server_process.is_alive(): + raise RuntimeError( + "The server process died before " + "responding to the readiness probe") from e + yield async_engine_client finally: # Ensure rpc server process was terminated diff --git a/vllm/entrypoints/openai/rpc/client.py b/vllm/entrypoints/openai/rpc/client.py index 8552c286eeeea..d69b202e2d1bb 100644 --- a/vllm/entrypoints/openai/rpc/client.py +++ b/vllm/entrypoints/openai/rpc/client.py @@ -18,6 +18,9 @@ from vllm.sampling_params import SamplingParams from vllm.transformers_utils.tokenizer_group import init_tokenizer_from_configs +# Time to wait before checking it the server process is alive. +SERVER_START_TIMEOUT_MS = 1000 + class AsyncEngineRPCClient: @@ -61,7 +64,16 @@ def socket(self): socket.connect(self.rpc_path) yield socket finally: - socket.close() + # linger == 0 means discard unsent messages + # when the socket is closed. This is necessary + # because otherwise self.context.destroy() will + # wait for 30 seconds until unsent messages are + # received, which is impossible if the server + # crashed. In the absence of a server crash we + # always expect a response before closing the + # socket anyway. + # Reference: http://api.zeromq.org/4-2:zmq-setsockopt#toc24 + socket.close(linger=0) async def _send_get_data_rpc_request(self, request: RPCUtilityRequest, expected_type: Any, @@ -85,14 +97,19 @@ async def _send_get_data_rpc_request(self, request: RPCUtilityRequest, return data - async def _send_one_way_rpc_request(self, request: RPC_REQUEST_TYPE, - error_message: str): + async def _send_one_way_rpc_request(self, + request: RPC_REQUEST_TYPE, + error_message: str, + timeout: Optional[int] = None): """Send one-way RPC request to trigger an action.""" with self.socket() as socket: # Ping RPC Server with request. await socket.send(cloudpickle.dumps(request)) # Await acknowledgement from RPCServer. 
+ if timeout is not None and await socket.poll(timeout=timeout) == 0: + raise TimeoutError(f"server didn't reply within {timeout} ms") + response = cloudpickle.loads(await socket.recv()) if not isinstance(response, str) or response != VLLM_RPC_SUCCESS_STR: @@ -117,7 +134,8 @@ async def wait_for_server(self): await self._send_one_way_rpc_request( request=RPCUtilityRequest.IS_SERVER_READY, - error_message="Unable to start RPC Server.") + error_message="Unable to start RPC Server.", + timeout=SERVER_START_TIMEOUT_MS) async def _get_model_config_rpc(self) -> ModelConfig: """Get the ModelConfig object from the RPC Server""" From 5223199e03ac3729eb60043a1ef57156c8af1bc9 Mon Sep 17 00:00:00 2001 From: Michael Goin Date: Wed, 7 Aug 2024 14:23:12 -0400 Subject: [PATCH 10/18] [Bugfix][FP8] Fix dynamic FP8 Marlin quantization (#7219) --- tests/quantization/test_fp8.py | 19 +++++++++++++++---- vllm/envs.py | 8 ++++++++ .../model_executor/layers/quantization/fp8.py | 11 ++++++++++- 3 files changed, 33 insertions(+), 5 deletions(-) diff --git a/tests/quantization/test_fp8.py b/tests/quantization/test_fp8.py index a020f7bf37262..ebb06ed20f249 100644 --- a/tests/quantization/test_fp8.py +++ b/tests/quantization/test_fp8.py @@ -9,6 +9,7 @@ from vllm import _custom_ops as ops from vllm.model_executor.layers.quantization.fp8 import (Fp8KVCacheMethod, Fp8LinearMethod) +from vllm.platforms import current_platform MODELS = [ "neuralmagic/Meta-Llama-3-8B-Instruct-FP8-KV", @@ -20,7 +21,12 @@ @pytest.mark.skipif(not is_quant_method_supported("fp8"), reason="FP8 is not supported on this GPU type.") @pytest.mark.parametrize("model_id", MODELS) -def test_model_load_and_run(vllm_runner, model_id: str): +@pytest.mark.parametrize("force_marlin", [False, True]) +def test_model_load_and_run(vllm_runner, model_id: str, force_marlin: bool, + monkeypatch) -> None: + if force_marlin: + monkeypatch.setenv("VLLM_TEST_FORCE_FP8_MARLIN", "1") + with vllm_runner(model_id) as llm: # note: this does not test accuracy, just that we can run through # see lm-eval tests for accuracy @@ -61,7 +67,12 @@ def test_kv_cache_model_load_and_run(vllm_runner, model_id: str): @pytest.mark.skipif(not is_quant_method_supported("fp8"), reason="FP8 is not supported on this GPU type.") @pytest.mark.parametrize("kv_cache_dtype", ["auto", "fp8"]) -def test_load_fp16_model(vllm_runner, kv_cache_dtype: str) -> None: +@pytest.mark.parametrize("force_marlin", [False, True]) +def test_load_fp16_model(vllm_runner, kv_cache_dtype: str, force_marlin: bool, + monkeypatch) -> None: + if force_marlin: + monkeypatch.setenv("VLLM_TEST_FORCE_FP8_MARLIN", "1") + with vllm_runner("facebook/opt-125m", quantization="fp8", kv_cache_dtype=kv_cache_dtype) as llm: @@ -75,9 +86,9 @@ def test_load_fp16_model(vllm_runner, kv_cache_dtype: str) -> None: assert attn._k_scale == 1.0 assert attn._v_scale == 1.0 - capability = torch.cuda.get_device_capability() + capability = current_platform.get_device_capability() capability = capability[0] * 10 + capability[1] - if capability >= 89: + if capability >= 89 and not force_marlin: # For GPUs with hardware support, we keep weights in fp8 assert fc1.weight.dtype == torch.float8_e4m3fn else: diff --git a/vllm/envs.py b/vllm/envs.py index df4c994359dbd..81f30b1d42a13 100644 --- a/vllm/envs.py +++ b/vllm/envs.py @@ -52,6 +52,7 @@ CMAKE_BUILD_TYPE: Optional[str] = None VERBOSE: bool = False VLLM_ALLOW_LONG_MAX_MODEL_LEN: bool = False + VLLM_TEST_FORCE_FP8_MARLIN: bool = False def get_default_cache_root(): @@ -342,6 +343,13 @@ def 
get_default_config_root(): lambda: (os.environ.get("VLLM_ALLOW_LONG_MAX_MODEL_LEN", "0").strip().lower() in ("1", "true")), + + # If set, forces FP8 Marlin to be used for FP8 quantization regardless + # of the hardware support for FP8 compute. + "VLLM_TEST_FORCE_FP8_MARLIN": + lambda: + (os.environ.get("VLLM_TEST_FORCE_FP8_MARLIN", "0").strip().lower() in + ("1", "true")), } # end-env-vars-definition diff --git a/vllm/model_executor/layers/quantization/fp8.py b/vllm/model_executor/layers/quantization/fp8.py index c829cb836ee4c..cdd2413f5b2c4 100644 --- a/vllm/model_executor/layers/quantization/fp8.py +++ b/vllm/model_executor/layers/quantization/fp8.py @@ -4,6 +4,7 @@ from torch.nn import Module from torch.nn.parameter import Parameter +import vllm.envs as envs from vllm import _custom_ops as ops from vllm.logger import init_logger from vllm.model_executor.layers.fused_moe import FusedMoE, FusedMoEMethodBase @@ -118,7 +119,7 @@ def __init__(self, quant_config: Fp8Config): # kernel for fast weight-only FP8 quantization capability = current_platform.get_device_capability() capability = capability[0] * 10 + capability[1] - self.use_marlin = capability < 89 + self.use_marlin = capability < 89 or envs.VLLM_TEST_FORCE_FP8_MARLIN def create_weights( self, @@ -174,6 +175,14 @@ def process_weights_after_loading(self, layer: Module) -> None: qweight, weight_scale = ops.scaled_fp8_quant(layer.weight, scale=None) + # If using marlin (w8a16), kernel uses channelwise weights, + # so extend the weight scales to be channelwise. + if self.use_marlin: + assert weight_scale.numel() == 1 + weight_scale = convert_to_channelwise( + weight_scale.expand(len(layer.logical_widths)), + layer.logical_widths) + # Update the layer with the new values. layer.weight = Parameter(qweight.t(), requires_grad=False) layer.weight_scale = Parameter(weight_scale, requires_grad=False) From 469b3bc538bcbdc1d9b7d30c415bdb0361853871 Mon Sep 17 00:00:00 2001 From: "Kevin H. Luu" Date: Wed, 7 Aug 2024 11:34:25 -0700 Subject: [PATCH 11/18] [ci] Make building wheels per commit optional (#7278) Signed-off-by: kevin --- .buildkite/release-pipeline.yaml | 27 ++++++++++++++++++++------- 1 file changed, 20 insertions(+), 7 deletions(-) diff --git a/.buildkite/release-pipeline.yaml b/.buildkite/release-pipeline.yaml index 5be9a553dddd4..416fe344a36ea 100644 --- a/.buildkite/release-pipeline.yaml +++ b/.buildkite/release-pipeline.yaml @@ -1,9 +1,27 @@ steps: - - label: "Build wheel - CUDA {{matrix.cuda_version}}" + - label: "Build wheel - CUDA 12.1" agents: queue: cpu_queue commands: - - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg buildkite_commit=$BUILDKITE_COMMIT --build-arg USE_SCCACHE=1 --build-arg CUDA_VERSION={{matrix.cuda_version}} --tag vllm-ci:build-image --target build --progress plain ." + - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg buildkite_commit=$BUILDKITE_COMMIT --build-arg USE_SCCACHE=1 --build-arg CUDA_VERSION=12.1.0 --tag vllm-ci:build-image --target build --progress plain ." 
+ - "mkdir artifacts" + - "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'" + # rename the files to change linux -> manylinux1 + - "for f in artifacts/dist/*.whl; do mv -- \"$$f\" \"$${f/linux/manylinux1}\"; done" + - "aws s3 cp --recursive artifacts/dist s3://vllm-wheels/$BUILDKITE_COMMIT/" + - "aws s3 cp --recursive artifacts/dist s3://vllm-wheels/nightly/" + env: + DOCKER_BUILDKIT: "1" + + - block: "Build CUDA 11.8 wheel" + key: block-build-cu118-wheel + + - label: "Build wheel - CUDA 11.8" + depends_on: block-build-cu118-wheel + agents: + queue: cpu_queue + commands: + - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg buildkite_commit=$BUILDKITE_COMMIT --build-arg USE_SCCACHE=1 --build-arg CUDA_VERSION=11.8.0 --tag vllm-ci:build-image --target build --progress plain ." - "mkdir artifacts" - "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'" # rename the files to change linux -> manylinux1 @@ -12,8 +30,3 @@ steps: - "aws s3 cp --recursive artifacts/dist s3://vllm-wheels/nightly/" env: DOCKER_BUILDKIT: "1" - matrix: - setup: - cuda_version: - - "11.8.0" - - "12.1.0" From 311f743831f164f83361295b562f6cdb88ad3418 Mon Sep 17 00:00:00 2001 From: Lucas Wilkinson Date: Wed, 7 Aug 2024 16:05:37 -0400 Subject: [PATCH 12/18] [Bugfix] Fix gptq failure on T4s (#7264) --- .../layers/quantization/awq_marlin.py | 3 +-- .../layers/quantization/gptq_marlin.py | 3 +-- .../layers/quantization/utils/marlin_utils.py | 23 ++++++++++--------- 3 files changed, 14 insertions(+), 15 deletions(-) diff --git a/vllm/model_executor/layers/quantization/awq_marlin.py b/vllm/model_executor/layers/quantization/awq_marlin.py index 2cc080608c7a9..1b0453c2bd6f8 100644 --- a/vllm/model_executor/layers/quantization/awq_marlin.py +++ b/vllm/model_executor/layers/quantization/awq_marlin.py @@ -126,8 +126,7 @@ def is_awq_marlin_compatible(cls, quant_config: Dict[str, Any]): return check_marlin_supported(quant_type=cls.TYPE_MAP[num_bits], group_size=group_size, - has_zp=has_zp, - min_capability=cls.get_min_capability()) + has_zp=has_zp) class AWQMarlinLinearMethod(LinearMethodBase): diff --git a/vllm/model_executor/layers/quantization/gptq_marlin.py b/vllm/model_executor/layers/quantization/gptq_marlin.py index 066102f3a01c0..b92697531c299 100644 --- a/vllm/model_executor/layers/quantization/gptq_marlin.py +++ b/vllm/model_executor/layers/quantization/gptq_marlin.py @@ -136,8 +136,7 @@ def is_gptq_marlin_compatible(cls, quant_config: Dict[str, Any]): return False return check_marlin_supported(quant_type=cls.TYPE_MAP[(num_bits, sym)], - group_size=group_size, - min_capability=cls.get_min_capability()) + group_size=group_size) class GPTQMarlinLinearMethod(LinearMethodBase): diff --git a/vllm/model_executor/layers/quantization/utils/marlin_utils.py b/vllm/model_executor/layers/quantization/utils/marlin_utils.py index 6e84d36219361..0ec68ac5b0f21 100644 --- a/vllm/model_executor/layers/quantization/utils/marlin_utils.py +++ b/vllm/model_executor/layers/quantization/utils/marlin_utils.py @@ -26,12 +26,13 @@ # without runtime zero-point. We support common cases, i.e. AWQ and GPTQ. 
# TODO: we may want to move this into the C++ so its closer to the actual impl def query_marlin_supported_quant_types(has_zp: bool, - min_capability: Optional[int] = None): - if min_capability is None: + device_capability: Optional[int] = None + ): + if device_capability is None: major, minor = current_platform.get_device_capability() - min_capability = major * 10 + minor + device_capability = major * 10 + minor - if min_capability < 80: + if device_capability < 80: return [] if has_zp: @@ -48,20 +49,20 @@ def _check_marlin_supported( quant_type: ScalarType, group_size: Optional[int], has_zp: bool, - min_capability: Optional[int] = None) -> Tuple[bool, Optional[str]]: + device_capability: Optional[int] = None) -> Tuple[bool, Optional[str]]: - if min_capability is None: + if device_capability is None: major, minor = current_platform.get_device_capability() - min_capability = major * 10 + minor + device_capability = major * 10 + minor supported_types = query_marlin_supported_quant_types( - has_zp, min_capability) + has_zp, device_capability) if quant_type not in supported_types: return (False, f"Marlin does not support weight_bits = {quant_type}. " f"Only types = {supported_types} " f"are supported (for group_size = {group_size}, " - f"min_capability = {min_capability}, zp = {has_zp}).") + f"device_capability = {device_capability}, zp = {has_zp}).") if (group_size is None or group_size not in MARLIN_SUPPORTED_GROUP_SIZES): return (False, f"Marlin does not support group_size = {group_size}. " f"Only group_sizes = {MARLIN_SUPPORTED_GROUP_SIZES} " @@ -73,9 +74,9 @@ def _check_marlin_supported( def check_marlin_supported(quant_type: ScalarType, group_size: int, has_zp: bool = False, - min_capability: Optional[int] = None) -> bool: + device_capability: Optional[int] = None) -> bool: cond, _ = _check_marlin_supported(quant_type, group_size, has_zp, - min_capability) + device_capability) return cond From fc1493a01edf0ae6f50299daae15679e4c08d74c Mon Sep 17 00:00:00 2001 From: Nick Hill Date: Wed, 7 Aug 2024 13:35:14 -0700 Subject: [PATCH 13/18] [FrontEnd] Make `merge_async_iterators` `is_cancelled` arg optional (#7282) --- vllm/utils.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/vllm/utils.py b/vllm/utils.py index 1fd395c04ca24..4137aaec8a93c 100644 --- a/vllm/utils.py +++ b/vllm/utils.py @@ -405,7 +405,7 @@ async def iterate_with_cancellation( async def merge_async_iterators( *iterators: AsyncGenerator[T, None], - is_cancelled: Callable[[], Awaitable[bool]], + is_cancelled: Optional[Callable[[], Awaitable[bool]]] = None, ) -> AsyncGenerator[Tuple[int, T], None]: """Merge multiple asynchronous iterators into a single iterator. @@ -413,8 +413,8 @@ async def merge_async_iterators( When it yields, it yields a tuple (i, item) where i is the index of the iterator that yields the item. - It also polls the provided function at least once per second to check - for client cancellation. + It also optionally polls a provided function at least once per second + to check for client cancellation. 
""" # Can use anext() in python >= 3.10 @@ -422,12 +422,13 @@ async def merge_async_iterators( ensure_future(pair[1].__anext__()): pair for pair in enumerate(iterators) } + timeout = None if is_cancelled is None else 1 try: while awaits: done, pending = await asyncio.wait(awaits.keys(), return_when=FIRST_COMPLETED, - timeout=1) - if await is_cancelled(): + timeout=timeout) + if is_cancelled is not None and await is_cancelled(): raise asyncio.CancelledError("client cancelled") for d in done: pair = awaits.pop(d) From 6d94420246f2d88137c04f357502a6a8c38a2e8e Mon Sep 17 00:00:00 2001 From: Michael Goin Date: Wed, 7 Aug 2024 17:21:50 -0400 Subject: [PATCH 14/18] [Doc] Update supported_hardware.rst (#7276) --- .../quantization/supported_hardware.rst | 28 ++++++++++--------- 1 file changed, 15 insertions(+), 13 deletions(-) diff --git a/docs/source/quantization/supported_hardware.rst b/docs/source/quantization/supported_hardware.rst index ecc330d866dbd..bb41bfed342c6 100644 --- a/docs/source/quantization/supported_hardware.rst +++ b/docs/source/quantization/supported_hardware.rst @@ -5,18 +5,20 @@ Supported Hardware for Quantization Kernels The table below shows the compatibility of various quantization implementations with different hardware platforms in vLLM: -============== ====== ======= ======= ===== ====== ======= ========= ======= ============== ========== -Implementation Volta Turing Ampere Ada Hopper AMD GPU Intel GPU x86 CPU AWS Inferentia Google TPU -============== ====== ======= ======= ===== ====== ======= ========= ======= ============== ========== -AQLM ✅ ✅ ✅ ✅ ✅ ❌ ❌ ❌ ❌ ❌ -AWQ ❌ ✅ ✅ ✅ ✅ ❌ ❌ ❌ ❌ ❌ -DeepSpeedFP ✅ ✅ ✅ ✅ ✅ ❌ ❌ ❌ ❌ ❌ -FP8 ❌ ❌ ✅ ✅ ✅ ❌ ❌ ❌ ❌ ❌ -Marlin ❌ ❌ ✅ ✅ ✅ ❌ ❌ ❌ ❌ ❌ -GPTQ ✅ ✅ ✅ ✅ ✅ ❌ ❌ ❌ ❌ ❌ -SqueezeLLM ✅ ✅ ✅ ✅ ✅ ❌ ❌ ❌ ❌ ❌ -bitsandbytes ✅ ✅ ✅ ✅ ✅ ❌ ❌ ❌ ❌ ❌ -============== ====== ======= ======= ===== ====== ======= ========= ======= ============== ========== +===================== ====== ======= ======= ===== ====== ======= ========= ======= ============== ========== +Implementation Volta Turing Ampere Ada Hopper AMD GPU Intel GPU x86 CPU AWS Inferentia Google TPU +===================== ====== ======= ======= ===== ====== ======= ========= ======= ============== ========== +AWQ ❌ ✅ ✅ ✅ ✅ ❌ ❌ ❌ ❌ ❌ +GPTQ ✅ ✅ ✅ ✅ ✅ ❌ ❌ ❌ ❌ ❌ +Marlin (GPTQ/AWQ/FP8) ❌ ❌ ✅ ✅ ✅ ❌ ❌ ❌ ❌ ❌ +INT8 (W8A8) ❌ ✅ ✅ ✅ ✅ ❌ ❌ ❌ ❌ ❌ +FP8 (W8A8) ❌ ❌ ❌ ✅ ✅ ❌ ❌ ❌ ❌ ❌ +AQLM ✅ ✅ ✅ ✅ ✅ ❌ ❌ ❌ ❌ ❌ +bitsandbytes ✅ ✅ ✅ ✅ ✅ ❌ ❌ ❌ ❌ ❌ +DeepSpeedFP ✅ ✅ ✅ ✅ ✅ ❌ ❌ ❌ ❌ ❌ +GGUF ✅ ✅ ✅ ✅ ✅ ❌ ❌ ❌ ❌ ❌ +SqueezeLLM ✅ ✅ ✅ ✅ ✅ ❌ ❌ ❌ ❌ ❌ +===================== ====== ======= ======= ===== ====== ======= ========= ======= ============== ========== Notes: ^^^^^^ @@ -27,4 +29,4 @@ Notes: Please note that this compatibility chart may be subject to change as vLLM continues to evolve and expand its support for different hardware platforms and quantization methods. -For the most up-to-date information on hardware support and quantization methods, please check the `quantization directory `_ or consult with the vLLM development team. \ No newline at end of file +For the most up-to-date information on hardware support and quantization methods, please check the `quantization directory `_ or consult with the vLLM development team. 
From e53dfd3eafd9bd5cc217c3796c7d1911a88ec893 Mon Sep 17 00:00:00 2001 From: Lily Liu Date: Wed, 7 Aug 2024 16:26:52 -0700 Subject: [PATCH 15/18] [Kernel] Fix Flashinfer Correctness (#7284) --- vllm/attention/backends/flashinfer.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/vllm/attention/backends/flashinfer.py b/vllm/attention/backends/flashinfer.py index 03188164a9637..64b1f4a89f23c 100644 --- a/vllm/attention/backends/flashinfer.py +++ b/vllm/attention/backends/flashinfer.py @@ -127,6 +127,7 @@ def __post_init__(self): raise ValueError( f"Only {supported_head_sizes} are supported for head_dim,", f"received {self.head_dim}.") + self.is_profile_run = is_block_tables_empty(self.block_tables) def begin_forward(self): if self.num_prefill_tokens > 0: @@ -140,11 +141,14 @@ def begin_forward(self): assert self.paged_kv_last_page_len is not None batch_size = self.query_start_loc.shape[0] - 1 assert batch_size >= 0 - # The prefill stage does not read kv cache. + # The profile run does not read kv cache. # Both paged_kv_indices and paged_kv_last_page_len are empty. # paged_kv_indptr is a zero tensor with size batch_size + 1. - self.paged_kv_indptr = torch.zeros(batch_size + 1, - device=self.device) + if self.is_profile_run: + self.paged_kv_indptr = torch.zeros(batch_size + 1, + device=self.device) + else: + self.paged_kv_indptr = self.paged_kv_indptr.to(self.device) self.paged_kv_last_page_len = self.paged_kv_last_page_len.to( self.device) self.paged_kv_indices = self.paged_kv_indices.to(self.device) From 746709642c81aa22926765aef67e086a15aef076 Mon Sep 17 00:00:00 2001 From: Rui Qiao <161574667+ruisearch42@users.noreply.github.com> Date: Wed, 7 Aug 2024 17:06:01 -0700 Subject: [PATCH 16/18] [Misc] Fix typos in scheduler.py (#7285) Signed-off-by: Rui Qiao --- vllm/core/scheduler.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/vllm/core/scheduler.py b/vllm/core/scheduler.py index f60463107be44..950abfccba4c3 100644 --- a/vllm/core/scheduler.py +++ b/vllm/core/scheduler.py @@ -678,7 +678,7 @@ def _schedule_prefills( all tokens. Returns: - SchedulerSwappedInOutputs. + SchedulerPrefillOutputs. """ ignored_seq_groups: List[SequenceGroup] = [] seq_groups: List[SequenceGroup] = [] @@ -851,7 +851,7 @@ def _schedule_default(self) -> SchedulerOutputs: preempted=preempted, ) - def _schedule_chunked_prefill(self): + def _schedule_chunked_prefill(self) -> SchedulerOutputs: """Schedule queued requests. Chunked prefill allows to chunk prefill requests, batch them together @@ -862,7 +862,7 @@ def _schedule_chunked_prefill(self): The policy can sustain the high GPU utilization because it can put prefill and decodes requests to the same batch, while it improves - inter token latency because decodes requests don't need to blocked + inter token latency because decodes requests don't need to be blocked by prefill requests. 
""" budget = SchedulingBudget( From 48abee9e5492924a69551d859d66d98874d72d60 Mon Sep 17 00:00:00 2001 From: Cherilyn Buren <88433283+NiuBlibing@users.noreply.github.com> Date: Thu, 8 Aug 2024 14:17:29 +0800 Subject: [PATCH 17/18] [Frontend] remove max_num_batched_tokens limit for lora (#7288) --- vllm/config.py | 5 ----- 1 file changed, 5 deletions(-) diff --git a/vllm/config.py b/vllm/config.py index ec6d587e7925b..2ac31657979f2 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -1377,11 +1377,6 @@ def verify_with_model_config(self, model_config: ModelConfig): model_config.quantization) def verify_with_scheduler_config(self, scheduler_config: SchedulerConfig): - if scheduler_config.max_num_batched_tokens > 65528: - raise ValueError( - "Due to limitations of the custom LoRA CUDA kernel, " - "max_num_batched_tokens must be <= 65528 when " - "LoRA is enabled.") if scheduler_config.chunked_prefill_enabled: raise ValueError("LoRA is not supported with chunked prefill yet.") From 6dffa4b0a6120159ef2fe44d695a46817aff65bc Mon Sep 17 00:00:00 2001 From: Murali Andoorveedu <37849411+andoorve@users.noreply.github.com> Date: Thu, 8 Aug 2024 00:02:27 -0700 Subject: [PATCH 18/18] [Bugfix] Fix LoRA with PP (#7292) --- vllm/lora/models.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/vllm/lora/models.py b/vllm/lora/models.py index 279477562a940..bc4cab1470f44 100644 --- a/vllm/lora/models.py +++ b/vllm/lora/models.py @@ -25,6 +25,7 @@ from vllm.lora.utils import (from_layer, from_layer_logits_processor, parse_fine_tuned_lora_name, replace_submodule) from vllm.model_executor.models.interfaces import SupportsLoRA +from vllm.model_executor.models.utils import PPMissingLayer from vllm.utils import is_pin_memory_available logger = init_logger(__name__) @@ -432,6 +433,8 @@ def remove_all_adapters(self): def _create_lora_modules(self): for module_name, module in self.model.named_modules( remove_duplicate=False): + if isinstance(module, PPMissingLayer): + continue if not self._match_target_modules(module_name): continue parts = module_name.split(".")[-1]