From 0f83539dbbd2304490b31b63189ae200ff8dc18e Mon Sep 17 00:00:00 2001
From: Luo Cheng
Date: Mon, 25 Mar 2024 01:56:09 +0100
Subject: [PATCH] avoid copying the result and force allocation

---
 vllm/executor/openvino_executor.py           | 3 +++
 vllm/model_executor/openvino_model_loader.py | 5 +++--
 2 files changed, 6 insertions(+), 2 deletions(-)

diff --git a/vllm/executor/openvino_executor.py b/vllm/executor/openvino_executor.py
index 84059eacb7a4b..1bfe4205966ab 100644
--- a/vllm/executor/openvino_executor.py
+++ b/vllm/executor/openvino_executor.py
@@ -119,6 +119,9 @@ def allocate_cpu_cache(self) -> List[OpenVINOKVCache]:
         for _ in range(self.num_layers):
             key_blocks = ov.Tensor(self.cache_dtype, key_block_shape)
             value_blocks = ov.Tensor(self.cache_dtype, value_block_shape)
+            # force allocation by touching the whole buffer
+            key_blocks.data[:] = 0
+            value_blocks.data[:] = 0
             cpu_cache.append((key_blocks, value_blocks))
         return cpu_cache
 
diff --git a/vllm/model_executor/openvino_model_loader.py b/vllm/model_executor/openvino_model_loader.py
index e4c7d1c0b99f7..a4aa289a06ef6 100644
--- a/vllm/model_executor/openvino_model_loader.py
+++ b/vllm/model_executor/openvino_model_loader.py
@@ -51,8 +51,9 @@ def ov_wrapper(self, *args, **kwargs) -> torch.Tensor:
     else:
         inputs.append(np.array(0, dtype=np.int32))
         # for optimum-based models this parameter can be used even on the first iteration
-    outputs = self._ov_request.infer(inputs, share_inputs=True, share_outputs=False)
-    return torch.from_numpy(outputs[0])
+    self._ov_request.start_async(inputs, share_inputs=True)
+    self._ov_request.wait()
+    return torch.from_numpy(self._ov_request.get_tensor("logits").data)
 
 
 def patch_stateful_model(
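
The first hunk is motivated by ov.Tensor deferring the commit of its backing memory until the buffer is first written; zeroing each block up front moves that one-time cost out of the serving path. A minimal sketch of the effect, assuming the standard openvino Python package (the shape and dtype below are illustrative, not the values vLLM uses):

import openvino as ov

# Creating the tensor records shape/dtype metadata, but the physical
# pages may not be committed until the buffer is actually written.
key_block_shape = (1024, 8, 16, 64)  # illustrative: (blocks, heads, block_size, head_size)
key_blocks = ov.Tensor(ov.Type.f32, key_block_shape)

# .data is a NumPy view over the tensor's buffer; writing through it
# touches every page, forcing allocation now instead of mid-inference.
key_blocks.data[:] = 0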
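
The second hunk replaces infer(..., share_outputs=False), which copies every output into freshly allocated NumPy arrays, with start_async/wait plus a direct read of the request's output tensor, so torch.from_numpy wraps the result buffer in place. A standalone sketch of the same pattern, assuming a compiled model whose output tensor is named "logits" as in the patch (the model path and input shape are placeholders):

import numpy as np
import torch
import openvino as ov

core = ov.Core()
compiled = core.compile_model("model.xml", "CPU")  # placeholder path
request = compiled.create_infer_request()

inputs = [np.zeros((1, 16), dtype=np.int64)]  # placeholder input

# start_async + wait runs the same inference as infer(), but nothing is
# copied out of the request when it completes.
request.start_async(inputs, share_inputs=True)
request.wait()

# torch.from_numpy shares memory with the output tensor; the view stays
# valid only until the next inference reuses this request's buffers.
logits = torch.from_numpy(request.get_tensor("logits").data)

The trade-off is aliasing: the returned tensor points into the request's internal buffer, so the caller must consume it before issuing the next inference on the same request, which presumably holds in vLLM's sequential per-step model execution.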