Merge branch 'vllm-project:main' into punica-kernel-fusion
jeejeelee authored Jan 6, 2025
2 parents 7ffd15e + 996357e commit c1c5b4b
Showing 31 changed files with 1,153 additions and 803 deletions.
2 changes: 1 addition & 1 deletion docs/source/serving/deploying_with_k8s.md
@@ -43,7 +43,7 @@ metadata:
   name: hf-token-secret
   namespace: default
 type: Opaque
-data:
+stringData:
   token: "REPLACE_WITH_TOKEN"
 ```
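Note on the doc fix above: a Secret's `data:` field expects base64-encoded values, while `stringData:` accepts the plain-text token as written in the guide. As a hedged illustration (not part of the diff), this is roughly the extra step you would need if you kept `data:`:

```python
import base64

# Hypothetical token value, for illustration only.
token = "REPLACE_WITH_TOKEN"

# `data:` entries must hold base64-encoded values; `stringData:` lets
# Kubernetes handle the encoding for you.
encoded = base64.b64encode(token.encode("utf-8")).decode("ascii")
print(encoded)  # paste this value under `data:` if you keep that field
```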
7 changes: 4 additions & 3 deletions tests/multimodal/test_processing.py
@@ -586,17 +586,18 @@ def test_limit_mm_per_prompt_dummy(model_id, limit, num_supported, is_valid):
     )
 
     processor = processor_factory(ctx, cache=None)
+    profiler = processor.profiling_info
 
     mock_supported_mm_limits = MagicMock(return_value={"image": num_supported})
-    processor.get_supported_mm_limits = mock_supported_mm_limits
+    profiler.get_supported_mm_limits = mock_supported_mm_limits
 
     if is_valid:
         exc_ctx = nullcontext()
     else:
         exc_ctx = pytest.raises(ValueError, match="this model only supports")
 
     with exc_ctx:
-        processor._get_and_validate_dummy_mm_counts()
+        profiler.get_mm_limits()
 
 
 @pytest.mark.parametrize("model_id", ["llava-hf/llava-v1.6-mistral-7b-hf"])
@@ -723,7 +724,7 @@ def _test_processing_cache_correctness(
     }
 
     mm_counts = {k: len(vs) for k, vs in mm_data.items()}
-    prompt = baseline_processor._get_dummy_processor_inputs(
+    prompt = baseline_processor.profiling_info.get_dummy_processor_inputs(
         model_config.max_model_len,
         mm_counts,
     ).prompt_text
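The test updates above track an API move: multimodal profiling helpers now live on a separate `profiling_info` object instead of private methods on the processor. A minimal sketch of the migration, using only names that appear in this diff (`processor`, `seq_len`, and `mm_counts` are assumed to come from the surrounding test setup):

```python
def dummy_prompt_text(processor, seq_len, mm_counts):
    """Sketch: build the dummy prompt via the new profiling_info object."""
    profiler = processor.profiling_info
    # get_mm_limits() replaces the removed _get_and_validate_dummy_mm_counts().
    profiler.get_mm_limits()
    # get_dummy_processor_inputs() replaces _get_dummy_processor_inputs().
    return profiler.get_dummy_processor_inputs(seq_len, mm_counts).prompt_text
```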
6 changes: 3 additions & 3 deletions tests/v1/engine/test_engine_core.py
@@ -8,8 +8,8 @@
 from vllm.engine.arg_utils import EngineArgs
 from vllm.platforms import current_platform
 from vllm.v1.engine import EngineCoreRequest
-from vllm.v1.engine.async_llm import AsyncLLM
 from vllm.v1.engine.core import EngineCore
+from vllm.v1.executor.abstract import Executor
 
 if not current_platform.is_cuda():
     pytest.skip(reason="V1 currently only supported on CUDA.",
@@ -43,7 +43,7 @@ def test_engine_core(monkeypatch):
     """Setup the EngineCore."""
     engine_args = EngineArgs(model=MODEL_NAME)
     vllm_config = engine_args.create_engine_config()
-    executor_class = AsyncLLM._get_executor_cls(vllm_config)
+    executor_class = Executor.get_class(vllm_config)
 
     engine_core = EngineCore(vllm_config=vllm_config,
                              executor_class=executor_class)
@@ -149,7 +149,7 @@ def test_engine_core_advanced_sampling(monkeypatch):
     """Setup the EngineCore."""
     engine_args = EngineArgs(model=MODEL_NAME)
     vllm_config = engine_args.create_engine_config()
-    executor_class = AsyncLLM._get_executor_cls(vllm_config)
+    executor_class = Executor.get_class(vllm_config)
 
     engine_core = EngineCore(vllm_config=vllm_config,
                              executor_class=executor_class)
6 changes: 3 additions & 3 deletions tests/v1/engine/test_engine_core_client.py
@@ -11,8 +11,8 @@
 from vllm.platforms import current_platform
 from vllm.usage.usage_lib import UsageContext
 from vllm.v1.engine import EngineCoreRequest
-from vllm.v1.engine.async_llm import AsyncLLM
 from vllm.v1.engine.core_client import EngineCoreClient
+from vllm.v1.executor.abstract import Executor
 
 if not current_platform.is_cuda():
     pytest.skip(reason="V1 currently only supported on CUDA.",
@@ -84,7 +84,7 @@ def test_engine_core_client(monkeypatch, multiprocessing_mode: bool):
     engine_args = EngineArgs(model=MODEL_NAME, compilation_config=3)
     vllm_config = engine_args.create_engine_config(
         UsageContext.UNKNOWN_CONTEXT)
-    executor_class = AsyncLLM._get_executor_cls(vllm_config)
+    executor_class = Executor.get_class(vllm_config)
     client = EngineCoreClient.make_client(
         multiprocess_mode=multiprocessing_mode,
         asyncio_mode=False,
@@ -152,7 +152,7 @@ async def test_engine_core_client_asyncio(monkeypatch):
     engine_args = EngineArgs(model=MODEL_NAME)
     vllm_config = engine_args.create_engine_config(
         usage_context=UsageContext.UNKNOWN_CONTEXT)
-    executor_class = AsyncLLM._get_executor_cls(vllm_config)
+    executor_class = Executor.get_class(vllm_config)
     client = EngineCoreClient.make_client(
         multiprocess_mode=True,
         asyncio_mode=True,
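Both V1 engine test files above swap the removed `AsyncLLM._get_executor_cls` helper for `Executor.get_class`. A hedged sketch of the new wiring, using only calls shown in this diff (the model name is a placeholder, and `EngineCore`/`EngineCoreClient` take further arguments that the diff truncates):

```python
from vllm.engine.arg_utils import EngineArgs
from vllm.v1.executor.abstract import Executor

# Placeholder model name -- substitute any model vLLM can load.
engine_args = EngineArgs(model="facebook/opt-125m")
vllm_config = engine_args.create_engine_config()

# New: the Executor abstraction resolves the concrete executor class.
# Old (removed): executor_class = AsyncLLM._get_executor_cls(vllm_config)
executor_class = Executor.get_class(vllm_config)
```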
5 changes: 0 additions & 5 deletions vllm/config.py
@@ -1015,11 +1015,6 @@ def _verify_args(self) -> None:
             raise ValueError(
                 "GPU memory utilization must be less than 1.0. Got "
                 f"{self.gpu_memory_utilization}.")
-        from vllm.platforms import current_platform
-        if (current_platform.is_cuda() and self.block_size is not None
-                and self.block_size > 32):
-            raise ValueError("CUDA Paged Attention kernel only supports "
-                             f"block sizes up to 32. Got {self.block_size}.")
 
     def _verify_cache_dtype(self) -> None:
         if self.cache_dtype == "auto":
6 changes: 3 additions & 3 deletions vllm/entrypoints/openai/api_server.py
@@ -767,11 +767,11 @@ async def run_server(args, **uvicorn_kwargs) -> None:
     if args.tool_parser_plugin and len(args.tool_parser_plugin) > 3:
         ToolParserManager.import_tool_parser(args.tool_parser_plugin)
 
-    valide_tool_parses = ToolParserManager.tool_parsers.keys()
+    valid_tool_parses = ToolParserManager.tool_parsers.keys()
     if args.enable_auto_tool_choice \
-        and args.tool_call_parser not in valide_tool_parses:
+        and args.tool_call_parser not in valid_tool_parses:
         raise KeyError(f"invalid tool call parser: {args.tool_call_parser} "
-                       f"(chose from {{ {','.join(valide_tool_parses)} }})")
+                       f"(chose from {{ {','.join(valid_tool_parses)} }})")
 
     # workaround to make sure that we bind the port before the engine is set up.
     # This avoids race conditions with ray.
12 changes: 9 additions & 3 deletions vllm/lora/layers.py
@@ -405,7 +405,9 @@ def __init__(self, base_layer: ReplicatedLinear) -> None:
         self.output_size = self.base_layer.output_size
         self.n_slices = 1
 
-    def forward(self, input_):
+    def forward(
+        self, input_: torch.Tensor
+    ) -> Tuple[Optional[torch.Tensor], Optional[torch.Tensor]]:
         """Forward of ReplicatedLinearWithLoRA
 
         Args:
@@ -496,7 +498,9 @@ def slice_bias(self, bias: torch.Tensor) -> torch.Tensor:
         bias = bias[start_idx:end_idx]
         return bias
 
-    def forward(self, input_):
+    def forward(
+        self, input_: torch.Tensor
+    ) -> Tuple[Optional[torch.Tensor], Optional[torch.Tensor]]:
         """Forward of ColumnParallelLinear
 
         Args:
@@ -833,7 +837,9 @@ def slice_lora_b(self, lora_b: torch.Tensor) -> torch.Tensor:
     def slice_bias(self, bias: torch.Tensor) -> torch.Tensor:
         return bias
 
-    def forward(self, input_):
+    def forward(
+        self, input_: torch.Tensor
+    ) -> Tuple[Optional[torch.Tensor], Optional[torch.Tensor]]:
         """Forward of RowParallelLinear
 
         Args:
3 changes: 2 additions & 1 deletion vllm/lora/models.py
@@ -4,7 +4,7 @@
 import os
 import re
 from dataclasses import dataclass, field
-from typing import Any, Callable, Dict, List, Optional, Sequence, Type
+from typing import Any, Callable, Dict, List, Optional, Sequence, Type, Union
 
 import safetensors.torch
 import torch
@@ -219,6 +219,7 @@ def from_local_checkpoint(
 
         config["vllm_max_position_embeddings"] = max_position_embeddings
         peft_helper = PEFTHelper.from_dict(config)
+        unexpected_modules: List[Union[list[str], str]]
         if os.path.isfile(lora_tensor_path):
             tensors: Dict[str, torch.Tensor] = {}
             # Find unexpected modules.
4 changes: 3 additions & 1 deletion vllm/model_executor/layers/linear.py
@@ -238,7 +238,9 @@ def weight_loader(self, param: Parameter, loaded_weight: torch.Tensor):
         assert param.size() == loaded_weight.size()
         param.data.copy_(loaded_weight)
 
-    def forward(self, x: torch.Tensor) -> torch.Tensor:
+    def forward(
+        self, x: torch.Tensor
+    ) -> Tuple[Optional[torch.Tensor], Optional[torch.Tensor]]:
         bias = self.bias if not self.skip_bias_add else None
         assert self.quant_method is not None
         output = self.quant_method.apply(self, x, bias)
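The `forward` signature changes in vllm/lora/layers.py and vllm/model_executor/layers/linear.py only spell out the return type callers already rely on: a `(output, output_bias)` pair rather than a bare tensor. A hedged sketch of how such a layer is typically consumed, assuming the usual `skip_bias_add` convention (when the layer defers its bias, it hands the bias back for the caller to fuse later):

```python
from typing import Optional, Tuple

import torch


def apply_linear(layer, x: torch.Tensor) -> torch.Tensor:
    """Sketch: unpack the (output, output_bias) tuple returned by forward()."""
    result: Tuple[Optional[torch.Tensor],
                  Optional[torch.Tensor]] = layer.forward(x)
    output, output_bias = result
    assert output is not None
    # With skip_bias_add enabled, the bias comes back separately for later fusion.
    return output if output_bias is None else output + output_bias
```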
79 changes: 47 additions & 32 deletions vllm/model_executor/models/aria.py
@@ -24,8 +24,9 @@
 from vllm.multimodal.inputs import (MultiModalFieldConfig, MultiModalKwargs,
                                     NestedTensors)
 from vllm.multimodal.processing import (BaseMultiModalProcessor,
-                                        MultiModalDataItems, ProcessorInputs,
+                                        MultiModalDataItems, ProcessingMixin,
                                         PromptReplacement)
+from vllm.multimodal.profiling import BaseProfilingInfo, ProcessorInputs
 from vllm.sequence import IntermediateTensors
 from vllm.transformers_utils.configs.aria import (AriaMoELMConfig,
                                                   AriaVisionConfig)
@@ -444,18 +445,58 @@ def build_mm_projector(config: PretrainedConfig):
     )
 
 
-class AriaMultiModalProcessor(BaseMultiModalProcessor):
+class AriaProcessingMixin(ProcessingMixin):
 
-    def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]:
-        return {"image": None}
+    def _get_hf_config(self):
+        return self.ctx.get_hf_config()
 
+    def _get_vision_config(self) -> AriaVisionConfig:
+        return self._get_hf_config().vision_config
+
     def _get_num_image_tokens(self) -> int:
-        hf_config = self.ctx.get_hf_config()
+        hf_config = self._get_hf_config()
         return max(hf_config.projector_patch_to_query_dict.values())
 
+
+class AriaProfilingInfo(AriaProcessingMixin, BaseProfilingInfo):
+
+    def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]:
+        return {"image": None}
+
     def get_mm_max_tokens_per_item(self, seq_len: int) -> Mapping[str, int]:
         return {"image": self._get_num_image_tokens()}
 
+    def get_dummy_processor_inputs(
+        self,
+        seq_len: int,
+        mm_counts: Mapping[str, int],
+    ) -> ProcessorInputs:
+        vision_config = self._get_vision_config()
+
+        max_image_size = vision_config.image_size
+        num_images = mm_counts.get("image", 0)
+
+        mm_data = {
+            "image":
+            self._get_dummy_images(width=max_image_size,
+                                   height=max_image_size,
+                                   num_images=num_images)
+        }
+
+        hf_processor = self._get_hf_processor()
+        image_token: str = hf_processor.image_token  # type: ignore
+
+        return ProcessorInputs(
+            prompt_text=image_token * num_images,
+            mm_data=mm_data,
+        )
+
+
+class AriaMultiModalProcessor(AriaProcessingMixin, BaseMultiModalProcessor):
+
+    def _get_profiling_info(self) -> BaseProfilingInfo:
+        return AriaProfilingInfo(self.ctx)
+
     def _get_mm_fields_config(
         self,
         hf_inputs: BatchFeature,
@@ -472,7 +513,7 @@ def _get_prompt_replacements(
         hf_processor_mm_kwargs: Mapping[str, object],
         out_mm_kwargs: MultiModalKwargs,
     ) -> list[PromptReplacement]:
-        hf_config = self.ctx.get_hf_config()
+        hf_config = self._get_hf_config()
         image_token_id = hf_config.image_token_index
 
         num_image_tokens = self._get_num_image_tokens()
@@ -485,32 +526,6 @@ def _get_prompt_replacements(
             )
         ]
 
-    def _get_dummy_processor_inputs(
-        self,
-        seq_len: int,
-        mm_counts: Mapping[str, int],
-    ) -> ProcessorInputs:
-        hf_config = self.ctx.get_hf_config()
-        vision_config: AriaVisionConfig = hf_config.vision_config
-
-        max_image_size = vision_config.image_size
-        num_images = mm_counts.get("image", 0)
-
-        mm_data = {
-            "image":
-            self._get_dummy_images(width=max_image_size,
-                                   height=max_image_size,
-                                   num_images=num_images)
-        }
-
-        hf_processor = self._get_hf_processor()
-        image_token: str = hf_processor.image_token  # type: ignore
-
-        return ProcessorInputs(
-            prompt_text=image_token * num_images,
-            mm_data=mm_data,
-        )
-
 
 @MULTIMODAL_REGISTRY.register_processor(AriaMultiModalProcessor)
 class AriaForConditionalGeneration(nn.Module, SupportsMultiModal):
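The aria.py refactor above shows the new split for multimodal models: shared config helpers live in a `ProcessingMixin` subclass, profiling and dummy-data generation move to a `BaseProfilingInfo` subclass, and the processor merely wires in its profiling class via `_get_profiling_info`. A skeletal sketch of that pattern for a hypothetical model, trimmed to the hooks that appear in this diff (real processors must still implement the remaining abstract methods):

```python
from collections.abc import Mapping
from typing import Optional

from vllm.multimodal.processing import BaseMultiModalProcessor, ProcessingMixin
from vllm.multimodal.profiling import BaseProfilingInfo, ProcessorInputs


class MyModelProcessingMixin(ProcessingMixin):
    """Config helpers shared by processing and profiling."""

    def _get_hf_config(self):
        return self.ctx.get_hf_config()


class MyModelProfilingInfo(MyModelProcessingMixin, BaseProfilingInfo):
    """Profiling limits and dummy inputs, split out of the processor."""

    def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]:
        return {"image": None}

    def get_mm_max_tokens_per_item(self, seq_len: int) -> Mapping[str, int]:
        return {"image": 1}  # placeholder; derive from the HF config in practice

    def get_dummy_processor_inputs(
        self,
        seq_len: int,
        mm_counts: Mapping[str, int],
    ) -> ProcessorInputs:
        return ProcessorInputs(prompt_text="", mm_data={})  # placeholder


class MyModelMultiModalProcessor(MyModelProcessingMixin, BaseMultiModalProcessor):
    """The processor now only points at its profiling info."""

    def _get_profiling_info(self) -> BaseProfilingInfo:
        return MyModelProfilingInfo(self.ctx)
```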
(The diffs for the remaining 21 changed files are not shown here.)
