diff --git a/vllm/worker/model_runner.py b/vllm/worker/model_runner.py
index aca25b9995205..d1a07d42e1830 100644
--- a/vllm/worker/model_runner.py
+++ b/vllm/worker/model_runner.py
@@ -2,6 +2,7 @@
 import os
 from typing import Dict, List, Optional, Tuple, Union
 import math
+import gc
 
 import numpy as np
 import torch
@@ -296,28 +297,26 @@ def wrapper(module, target_op, *args, **kwargs):
     model._openvino_patch_orig_forward = model.forward
     model.forward = partial(ov_wrapper, model)
 
-
-def patch_stateful_model(model):
+def patch_stateful_model(model, factory):
     print('TRANSFORMING OPTIMUM-INTEL MODEL TO vLLM COMPATIBLE FORM')
     from openvino.runtime.passes import Manager, MatcherPass, WrapType, Matcher, AnyInput, Or
     from openvino.runtime import opset13
-    from openvino.runtime.utils.node_factory import NodeFactory
     from openvino.runtime.utils import replace_node
-    factory = NodeFactory()
-    factory.add_extension("libuser_ov_extensions.so")
 
     #model.remove_parameter(model.input('beam_idx').get_node())
-    max_context_len = opset13.parameter(shape=[], dtype=np.int32, name='max_context_len') # max_context_len
+    max_context_len = opset13.parameter(shape=[], dtype=np.int64, name='max_context_len') # max_context_len
     model_remaining_params = [
         opset13.parameter(shape=[], dtype=bool, name='is_prompt'), # is_prompt
         opset13.parameter(shape=[-1, -1], dtype=np.int64, name='slot_mapping'), # slot mapping
         max_context_len,
-        opset13.parameter(shape=[-1], dtype=np.int32, name='context_lens'), # context_lens
+        opset13.parameter(shape=[-1], dtype=np.int64, name='context_lens'), # context_lens
         opset13.parameter(shape=[-1, -1], dtype=np.int32, name='block_tables'), # block_tables
     ]
+    for parameter in model_remaining_params:
+        parameter.get_output_tensor(0).set_names({parameter.get_friendly_name()})
     paged_attention_remaining_args = [
-        opset13.constant([]), # alibi_slopes
-        opset13.constant(0), # sliding_window
+        opset13.constant(np.array([], np.float32)), # alibi_slopes
+        opset13.constant(np.array(0, np.int32)), # sliding_window
     ]
 
     kv_parameters = []
@@ -468,6 +467,7 @@ def callback(m: Matcher) -> bool:
             position_ids_parameter.append(opset13.parameter(shape=[-1, -1], dtype=np.int64, name="position_ids"))
             print('CREATED A NEW position_ids PARAMETER')
         replace_node(mapping[position_ids].get_node(), position_ids_parameter[0])
+        position_ids_parameter[0].get_output_tensor(0).set_names({'position_ids'})
         print('APPLIED position_ids PARAMETER INSTEAD OF attention_mask-BASED SUB-GRAPH')
         return True
 
@@ -548,8 +548,13 @@ def load_model(self) -> None:
             if is_openvino_optimum_intel:
                 import openvino as ov
                 from optimum.intel import OVModelForCausalLM
-                self.model = OVModelForCausalLM.from_pretrained(self.model_config.model, export=True, compile=False, load_in_8bit=False) # need stateful because it also enables SDPA
-                patch_stateful_model(self.model.model)
+                self.model = OVModelForCausalLM.from_pretrained(self.model_config.model, export=True, compile=False, load_in_8bit=False, trust_remote_code=True) # need stateful because it also enables SDPA
+                if not hasattr(self.model, 'ov_node_factory'):
+                    from openvino.runtime.utils.node_factory import NodeFactory
+                    # Keep the factory so it is destroyed only after all other objects referencing custom nodes are destroyed
+                    self.model.ov_node_factory = NodeFactory()
+                    self.model.ov_node_factory.add_extension('libuser_ov_extensions.so')
+                patch_stateful_model(self.model.model, self.model.ov_node_factory)
                 #ov.serialize(self.model.model, 'vllm_openvino_model.xml')
                 core = ov.Core()
                 ov_compiled = core.compile_model(self.model.model, "CPU")
@@ -568,6 +573,15 @@ def load_model(self) -> None:
         else:
            self.model = get_model(self.model_config)
 
+    def __del__(self):
+        # Order is important
+        if hasattr(self.model, 'ov_node_factory'):
+            del self.model.ov_request
+            del self.model.model
+            if gc: # when app is being destroyed the module may not be available
+                gc.collect()
+            del self.model.ov_node_factory
+
     def set_block_size(self, block_size: int) -> None:
         self.block_size = block_size
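
Note (editorial, not part of the patch): the added set_names() calls matter because opset13.parameter(..., name=...) only assigns a friendly name to the node; the output tensor does not automatically carry that name, and it is the tensor names that presumably let the compiled model's inputs be addressed as 'context_lens', 'block_tables', 'position_ids', etc. at inference time. Below is a minimal standalone sketch of that pattern; the dummy graph and the model name 'named_inputs_demo' are illustrative only, not taken from the patch.

    import numpy as np
    from openvino.runtime import Model, opset13

    # Create two of the inputs the patch adds, with the same shapes/dtypes.
    context_lens = opset13.parameter(shape=[-1], dtype=np.int64, name='context_lens')
    max_context_len = opset13.parameter(shape=[], dtype=np.int64, name='max_context_len')

    # Same pattern as in patch_stateful_model: copy each friendly name onto the output tensor.
    for parameter in (context_lens, max_context_len):
        parameter.get_output_tensor(0).set_names({parameter.get_friendly_name()})

    # Dummy op so the parameters can be wrapped into a Model.
    clipped = opset13.minimum(context_lens, max_context_len)
    demo_model = Model([clipped], [context_lens, max_context_len], 'named_inputs_demo')

    for model_input in demo_model.inputs:
        print(model_input.get_any_name())  # 'context_lens', 'max_context_len'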