From 84b2bb8307406a0dcdf63185fdee4272f8fd1039 Mon Sep 17 00:00:00 2001
From: Harry Mellor <19981378+hmellor@users.noreply.github.com>
Date: Wed, 5 Feb 2025 06:42:46 +0000
Subject: [PATCH] [Bugfix] Fix OpenVINO model runner (#12750)

Signed-off-by: Felix Marty
---
 vllm/attention/backends/openvino.py          |  4 ++++
 vllm/model_executor/model_loader/openvino.py | 11 +++++------
 vllm/worker/openvino_model_runner.py         |  9 +++------
 3 files changed, 12 insertions(+), 12 deletions(-)

diff --git a/vllm/attention/backends/openvino.py b/vllm/attention/backends/openvino.py
index f58528dbf5b78..9908620a32a23 100644
--- a/vllm/attention/backends/openvino.py
+++ b/vllm/attention/backends/openvino.py
@@ -140,3 +140,7 @@ class OpenVINOAttentionMetadata:
     # `model_executable`.
     multi_modal_placeholder_index_maps: Optional[Dict[
         str, MultiModalPlaceholderMap.IndexMap]]
+
+    # Enable/disable KV scales calculation. This is so that we can disable the
+    # calculation until after prefill and cuda graph capture.
+    enable_kv_scales_calculation: bool
diff --git a/vllm/model_executor/model_loader/openvino.py b/vllm/model_executor/model_loader/openvino.py
index 7bd531c568f5e..fde200d576e2f 100644
--- a/vllm/model_executor/model_loader/openvino.py
+++ b/vllm/model_executor/model_loader/openvino.py
@@ -13,7 +13,7 @@

 import vllm.envs as envs
 from vllm.attention.backends.openvino import OpenVINOAttentionMetadata
-from vllm.config import DeviceConfig, ModelConfig
+from vllm.config import ModelConfig, VllmConfig, set_current_vllm_config
 from vllm.logger import init_logger
 from vllm.model_executor.layers.logits_processor import (LogitsProcessor,
                                                          _prune_hidden_states)
@@ -103,7 +103,6 @@ def __init__(
         self,
         ov_core: ov.Core,
         model_config: ModelConfig,
-        device_config: DeviceConfig,
         kv_cache_dtype: ov.Type,
     ) -> None:
         super().__init__()
@@ -187,8 +186,7 @@ def sample(


 def get_model(
-    model_config: ModelConfig,
-    device_config: DeviceConfig,
+    vllm_config: VllmConfig,
     kv_cache_dtype: ov.Type,
     **kwargs,
 ) -> torch.nn.Module:
@@ -201,5 +199,6 @@ def get_model(
             "be added in the future. If this is important to you, "
             "please open an issue on github.")

-    return OpenVINOCausalLM(ov_core, model_config, device_config,
-                            kv_cache_dtype)
+    with set_current_vllm_config(vllm_config):
+        return OpenVINOCausalLM(ov_core, vllm_config.model_config,
+                                kv_cache_dtype)
diff --git a/vllm/worker/openvino_model_runner.py b/vllm/worker/openvino_model_runner.py
index 44442cddbd4a2..f7a5ab9de9fa6 100644
--- a/vllm/worker/openvino_model_runner.py
+++ b/vllm/worker/openvino_model_runner.py
@@ -54,15 +54,13 @@
     ):
         self.ov_core = ov_core
         ModelRunnerBase.__init__(self, vllm_config=vllm_config)
-        cache_config = self.cache_config
-        model_config = self.model_config
         self.is_driver_worker = is_driver_worker

         self.device = self.device_config.device

         self.kv_cache_dtype = kv_cache_dtype
-        self.sliding_window = model_config.get_sliding_window()
-        self.block_size = cache_config.block_size
+        self.sliding_window = self.model_config.get_sliding_window()
+        self.block_size = self.cache_config.block_size

         self.attn_backend = get_attn_backend(
             self.model_config.get_head_size(),
@@ -81,8 +79,7 @@
         self.model: nn.Module  # Set after init_Model

     def load_model(self) -> None:
-        self.model = get_model(model_config=self.model_config,
-                               device_config=self.device_config,
+        self.model = get_model(vllm_config=self.vllm_config,
                                kv_cache_dtype=self.kv_cache_dtype,
                                ov_core=self.ov_core)
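
For reference, the reworked loader entry point is used like this. This is a
minimal sketch, not part of the patch: it assumes a fully populated
VllmConfig, and the build_openvino_model helper is hypothetical, purely for
illustration.

    import openvino as ov

    from vllm.config import VllmConfig
    from vllm.model_executor.model_loader.openvino import get_model

    # Hypothetical helper, for illustration only.
    def build_openvino_model(vllm_config: VllmConfig):
        # get_model now takes the whole VllmConfig instead of separate
        # ModelConfig/DeviceConfig objects; it reads model_config off the
        # config internally and constructs OpenVINOCausalLM inside a
        # set_current_vllm_config(vllm_config) scope.
        return get_model(vllm_config=vllm_config,
                         kv_cache_dtype=ov.Type.undefined,  # FP8 KV cache unsupported
                         ov_core=ov.Core())

The set_current_vllm_config(...) scope mirrors how vLLM's other model loaders
construct models, so layers that consult the current config during __init__
pick up this vllm_config rather than a stale or missing one.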