From 0f83539dbbd2304490b31b63189ae200ff8dc18e Mon Sep 17 00:00:00 2001
From: Luo Cheng
Date: Mon, 25 Mar 2024 01:56:09 +0100
Subject: [PATCH] avoid copying the result and force allocation

---
 vllm/executor/openvino_executor.py           | 3 +++
 vllm/model_executor/openvino_model_loader.py | 5 +++--
 2 files changed, 6 insertions(+), 2 deletions(-)

diff --git a/vllm/executor/openvino_executor.py b/vllm/executor/openvino_executor.py
index 84059eacb7a4b..1bfe4205966ab 100644
--- a/vllm/executor/openvino_executor.py
+++ b/vllm/executor/openvino_executor.py
@@ -119,6 +119,9 @@ def allocate_cpu_cache(self) -> List[OpenVINOKVCache]:
         for _ in range(self.num_layers):
             key_blocks = ov.Tensor(self.cache_dtype, key_block_shape)
             value_blocks = ov.Tensor(self.cache_dtype, value_block_shape)
+            # force allocation by touching the whole buffer
+            key_blocks.data[:] = 0
+            value_blocks.data[:] = 0
             cpu_cache.append((key_blocks, value_blocks))
         return cpu_cache
 
diff --git a/vllm/model_executor/openvino_model_loader.py b/vllm/model_executor/openvino_model_loader.py
index e4c7d1c0b99f7..a4aa289a06ef6 100644
--- a/vllm/model_executor/openvino_model_loader.py
+++ b/vllm/model_executor/openvino_model_loader.py
@@ -51,8 +51,9 @@ def ov_wrapper(self, *args, **kwargs) -> torch.Tensor:
     else:
         inputs.append(np.array(0, dtype=np.int32))
         # for optimum-based models this parameter can be used even on the first iteration
-    outputs = self._ov_request.infer(inputs, share_inputs=True, share_outputs=False)
-    return torch.from_numpy(outputs[0])
+    self._ov_request.start_async(inputs, share_inputs=True)
+    self._ov_request.wait()
+    return torch.from_numpy(self._ov_request.get_tensor("logits").data)
 
 
 def patch_stateful_model(
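
The first hunk is motivated by ov.Tensor deferring the commit of its backing memory until the buffer is first written; zeroing each block up front moves that one-time cost out of the serving path. A minimal sketch of the effect, assuming the standard openvino Python package (the shape and dtype below are illustrative, not the values vLLM uses):

import openvino as ov

# Creating the tensor records shape/dtype metadata, but the physical
# pages may not be committed until the buffer is actually written.
key_block_shape = (1024, 8, 16, 64)  # illustrative: (blocks, heads, block_size, head_size)
key_blocks = ov.Tensor(ov.Type.f32, key_block_shape)

# .data is a NumPy view over the tensor's buffer; writing through it
# touches every page, forcing allocation now instead of mid-inference.
key_blocks.data[:] = 0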
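
The second hunk replaces infer(..., share_outputs=False), which copies every output into freshly allocated NumPy arrays, with start_async/wait plus a direct read of the request's output tensor, so torch.from_numpy wraps the result buffer in place. A standalone sketch of the same pattern, assuming a compiled model whose output tensor is named "logits" as in the patch (the model path and input shape are placeholders):

import numpy as np
import torch
import openvino as ov

core = ov.Core()
compiled = core.compile_model("model.xml", "CPU")  # placeholder path
request = compiled.create_infer_request()

inputs = [np.zeros((1, 16), dtype=np.int64)]  # placeholder input

# start_async + wait runs the same inference as infer(), but nothing is
# copied out of the request when it completes.
request.start_async(inputs, share_inputs=True)
request.wait()

# torch.from_numpy shares memory with the output tensor; the view stays
# valid only until the next inference reuses this request's buffers.
logits = torch.from_numpy(request.get_tensor("logits").data)

The trade-off is aliasing: the returned tensor points into the request's internal buffer, so the caller must consume it before issuing the next inference on the same request, which presumably holds in vLLM's sequential per-step model execution.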