From 4974d0f339370cda9056c54f0e4f7b829bcf1cfb Mon Sep 17 00:00:00 2001
From: Kunshang Ji <kunshang.ji@intel.com>
Date: Fri, 19 Jul 2024 20:38:02 +0800
Subject: [PATCH 01/10] refactor ipex ops

---
 vllm/_ipex_ops.py                        | 91 +++++++-----------------
 vllm/model_executor/layers/activation.py | 15 ++--
 vllm/model_executor/layers/layernorm.py  |  5 +-
 3 files changed, 37 insertions(+), 74 deletions(-)

diff --git a/vllm/_ipex_ops.py b/vllm/_ipex_ops.py
index 2156f6b18adb6..6c57c9b545da7 100644
--- a/vllm/_ipex_ops.py
+++ b/vllm/_ipex_ops.py
@@ -27,29 +27,33 @@ def _reshape_activation_tensor(
 
     @staticmethod
     def silu_and_mul(out: torch.Tensor, x: torch.Tensor) -> None:
-        x1, x2 = ipex_ops._reshape_activation_tensor(x)
-        ipex.llm.functional.silu_mul(x1, x2, out)
+        # x1, x2 = ipex_ops._reshape_activation_tensor(x)
+        # ipex.llm.functional.silu_mul(x1, x2, out)
+        ipex.llm.functional.silu_and_mul(x, out)
 
     @staticmethod
     def gelu_and_mul(out: torch.Tensor, x: torch.Tensor) -> None:
-        x1, x2 = ipex_ops._reshape_activation_tensor(x)
-        ipex.llm.functional.gelu_mul(x1, x2, out, "none")
+        # x1, x2 = ipex_ops._reshape_activation_tensor(x)
+        # ipex.llm.functional.gelu_mul(x1, x2, out, "none")
+        ipex.llm.functional.gelu_and_mul(x, out)
 
     @staticmethod
     def gelu_tanh_and_mul(out: torch.Tensor, x: torch.Tensor) -> None:
-        x1, x2 = ipex_ops._reshape_activation_tensor(x)
-        ipex.llm.functional.gelu_mul(x1, x2, out, "tanh")
+        # x1, x2 = ipex_ops._reshape_activation_tensor(x)
+        # ipex.llm.functional.gelu_mul(x1, x2, out, "tanh")
+        ipex.llm.functional.gelu_and_mul(x, out)
 
     @staticmethod
-    def gelu_fast(out: torch.Tensor, x: torch.Tensor) -> None:
-        out.copy_(torch.nn.functional.gelu(x))
+    def gelu_fast(x: torch.Tensor) -> torch.Tensor:
+        return torch.nn.functional.gelu(x)
 
     @staticmethod
-    def gelu_new(out: torch.Tensor, x: torch.Tensor) -> None:
-        out.copy_(torch.nn.functional.gelu(x))
+    def gelu_new(x: torch.Tensor) -> torch.Tensor:
+        return torch.nn.functional.gelu(x)
 
-    # TODO add implementation of gelu_quick here
-    # def gelu_quick(out: torch.Tensor, x: torch.Tensor) -> None:
+    @staticmethod
+    def gelu_quick(out: torch.Tensor, x: torch.Tensor) -> None:
+        ipex.llm.functional.gelu_quick(x, out)
 
     @staticmethod
     def paged_attention_v1(
@@ -160,29 +164,10 @@ def rotary_embedding(
         cos_sin_cache: torch.Tensor,  # [cos_sin_dim, rot_dim]
         is_neox: bool,
     ) -> None:
-        if positions.dim() == 1:
-            positions = positions.unsqueeze(0)
-            query = query.unsqueeze(0)
-            key = key.unsqueeze(0)
-
-        rotary_dim = cos_sin_cache.size(1)
-        query = query.view(*query.shape[:-1], -1, head_size)
-        key = key.view(*key.shape[:-1], -1, head_size)
-
-        query_rot = query[..., :rotary_dim]
-        key_rot = key[..., :rotary_dim]
-
-        cos_sin = cos_sin_cache[positions.long()]
-        cos, sin = cos_sin.chunk(2, dim=-1)
-
-        if is_neox:
-            cos = cos.repeat(1, 1, 2).unsqueeze(-2)
-            sin = sin.repeat(1, 1, 2).unsqueeze(-2)
-        else:
-            cos = cos.repeat_interleave(2, dim=-1).unsqueeze(-2)
-            sin = sin.repeat_interleave(2, dim=-1).unsqueeze(-2)
-        ipex.llm.functional.rotary_embedding(query_rot, key_rot, sin, cos,
-                                             rotary_dim, is_neox, positions)
+        rot_dim = cos_sin_cache.size(1)
+        ipex.llm.functional.rotary_embedding_batched(positions, query, key,
+                                                     head_size, cos_sin_cache,
+                                                     is_neox, rot_dim)
 
     @staticmethod
     def batched_rotary_embedding(positions: torch.Tensor, query: torch.Tensor,
@@ -190,37 +175,15 @@ def batched_rotary_embedding(positions: torch.Tensor, query: torch.Tensor,
                                  cos_sin_cache: torch.Tensor, is_neox: bool,
                                  rot_dim: int,
                                  cos_sin_cache_offsets: torch.Tensor) -> None:
-        if positions.dim() == 1:
-            positions = positions.unsqueeze(0)
-            query = query.unsqueeze(0)
-            key = key.unsqueeze(0)
-        cos_sin_cache_offsets = cos_sin_cache_offsets.view_as(positions)
-        rotary_dim = cos_sin_cache.size(1)
-        query = query.view(*query.shape[:-1], -1, head_size)
-        key = key.view(*key.shape[:-1], -1, head_size)
-
-        query_rot = query[..., :rotary_dim]
-        key_rot = key[..., :rotary_dim]
-
-        cos_sin = cos_sin_cache[torch.add(positions,
-                                          cos_sin_cache_offsets).long()]
-        cos, sin = cos_sin.chunk(2, dim=-1)
-
-        if is_neox:
-            cos = cos.repeat(1, 1, 2).unsqueeze(-2)
-            sin = sin.repeat(1, 1, 2).unsqueeze(-2)
-        else:
-            cos = cos.repeat_interleave(2, dim=-1).unsqueeze(-2)
-            sin = sin.repeat_interleave(2, dim=-1).unsqueeze(-2)
-
-        ipex.llm.functional.rotary_embedding(query_rot, key_rot, sin, cos,
-                                             rotary_dim, is_neox, positions)
+        ipex.llm.functional.rotary_embedding_batched(positions, query, key,
+                                                     head_size, cos_sin_cache,
+                                                     is_neox, rot_dim,
+                                                     cos_sin_cache_offsets)
 
     @staticmethod
-    def rms_norm(out: torch.Tensor, input: torch.Tensor, weight: torch.Tensor,
-                 epsilon: float) -> None:
-        tmp = ipex.llm.functional.rms_norm(input, weight, epsilon)
-        out.copy_(tmp)
+    def rms_norm(input: torch.Tensor, weight: torch.Tensor,
+                 epsilon: float) -> torch.Tensor:
+        return ipex.llm.functional.rms_norm(input, weight, epsilon)
 
     @staticmethod
     def fused_add_rms_norm(input: torch.Tensor, residual: torch.Tensor,
diff --git a/vllm/model_executor/layers/activation.py b/vllm/model_executor/layers/activation.py
index 4c14fe476ee4a..43056786d35c9 100644
--- a/vllm/model_executor/layers/activation.py
+++ b/vllm/model_executor/layers/activation.py
@@ -114,9 +114,7 @@ def forward_cuda(self, x: torch.Tensor) -> torch.Tensor:
     def forward_xpu(self, x: torch.Tensor) -> torch.Tensor:
         from vllm._ipex_ops import ipex_ops as ops
 
-        out = torch.empty_like(x)
-        ops.gelu_new(out, x)
-        return out
+        return ops.gelu_new(x)
 
 
 class FastGELU(CustomOp):
@@ -136,9 +134,7 @@ def forward_cuda(self, x: torch.Tensor) -> torch.Tensor:
     def forward_xpu(self, x: torch.Tensor) -> torch.Tensor:
         from vllm._ipex_ops import ipex_ops as ops
 
-        out = torch.empty_like(x)
-        ops.gelu_fast(out, x)
-        return out
+        return ops.gelu_fast(x)
 
 
 class QuickGELU(CustomOp):
@@ -155,6 +151,13 @@ def forward_cuda(self, x: torch.Tensor) -> torch.Tensor:
         ops.gelu_quick(out, x)
         return out
 
+    def forward_xpu(self, x: torch.Tensor) -> torch.Tensor:
+        from vllm._ipex_ops import ipex_ops as ops
+
+        out = torch.empty_like(x)
+        ops.gelu_quick(out, x)
+        return out
+
     # TODO implement forward_xpu for QuickGELU
     # def forward_xpu(self, x: torch.Tensor) -> torch.Tensor:
 
diff --git a/vllm/model_executor/layers/layernorm.py b/vllm/model_executor/layers/layernorm.py
index e3d588efd9b6d..14f60e9172f29 100644
--- a/vllm/model_executor/layers/layernorm.py
+++ b/vllm/model_executor/layers/layernorm.py
@@ -82,14 +82,11 @@ def forward_xpu(
                 self.variance_epsilon,
             )
             return x, residual
-        out = torch.empty_like(x)
-        ops.rms_norm(
-            out,
+        return ops.rms_norm(
             x,
             self.weight.data,
             self.variance_epsilon,
         )
-        return out
 
     def extra_repr(self) -> str:
         s = f"hidden_size={self.weight.data.size(0)}"

From 608547a6278f756f2ded6fdeada5f010fc5a4483 Mon Sep 17 00:00:00 2001
From: Kunshang Ji <kunshang.ji@intel.com>
Date: Fri, 19 Jul 2024 23:54:07 +0800
Subject: [PATCH 02/10] fix varlen_attn

---
 vllm/_ipex_ops.py | 13 ++++++++-----
 1 file changed, 8 insertions(+), 5 deletions(-)

diff --git a/vllm/_ipex_ops.py b/vllm/_ipex_ops.py
index 6c57c9b545da7..bdcaa04235780 100644
--- a/vllm/_ipex_ops.py
+++ b/vllm/_ipex_ops.py
@@ -209,11 +209,14 @@ def varlen_attention(
         return_softmax: bool,
         gen_: torch.Generator,
     ) -> None:
-        ipex.llm.functional.varlen_attention(query, key, value, out, seqlen_q,
-                                             seqlen_k, max_seqlen_q,
-                                             max_seqlen_k, pdropout,
-                                             softmax_scale, zero_tensors,
-                                             is_causal, return_softmax, gen_)
+        ipex.llm.functional.varlen_attention(query.contiguous(),
+                                             key.contiguous(),
+                                             value.contiguous(), out,
+                                             seqlen_q.int(), seqlen_k.int(),
+                                             max_seqlen_q, max_seqlen_k,
+                                             pdropout, softmax_scale,
+                                             zero_tensors, is_causal,
+                                             return_softmax, gen_)
 
     @staticmethod
     def reshape_and_cache(

From c5e91281eae3a49368cfce4ef80757f78e9a6544 Mon Sep 17 00:00:00 2001
From: Kunshang Ji <kunshang.ji@intel.com>
Date: Mon, 12 Aug 2024 17:05:05 +0800
Subject: [PATCH 03/10] update dependency

---
 requirements-xpu.txt | 9 ++++-----
 1 file changed, 4 insertions(+), 5 deletions(-)

diff --git a/requirements-xpu.txt b/requirements-xpu.txt
index 48d899ec70eda..5e7ff31d205c1 100644
--- a/requirements-xpu.txt
+++ b/requirements-xpu.txt
@@ -3,9 +3,8 @@
 
 setuptools < 70.0.0 # IPEX's torch have some dependency. to be removed.
 
-torch @ https://intel-extension-for-pytorch.s3.amazonaws.com/ipex_dev/xpu/torch-2.1.0.post1%2Bcxx11.abi-cp310-cp310-linux_x86_64.whl
-intel_extension_for_pytorch @ https://intel-extension-for-pytorch.s3.amazonaws.com/ipex_dev/xpu/intel_extension_for_pytorch-2.1.30a0-cp310-cp310-linux_x86_64.whl
-oneccl_bind_pt @ https://intel-extension-for-pytorch.s3.amazonaws.com/ipex_stable/xpu/oneccl_bind_pt-2.1.200%2Bxpu-cp310-cp310-linux_x86_64.whl
-
-triton @ https://github.com/intel/intel-xpu-backend-for-triton/releases/download/v2.1.0/triton-2.1.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl
+torch @ https://ubit-artifactory-sh.intel.com/artifactory/aipc_releases-sh-local/gpu-new/validation/IPEX/weekly/PVC/2024/ww32/py310/torch-2.3.1+cxx11.abi-cp310-cp310-linux_x86_64.whl
+intel_extension_for_pytorch @ https://ubit-artifactory-sh.intel.com/artifactory/aipc_releases-sh-local/gpu-new/validation/IPEX/weekly/PVC/2024/ww32/py310/intel_extension_for_pytorch-2.3.110+xpu-cp310-cp310-linux_x86_64.whl
+oneccl_bind_pt @ https://ubit-artifactory-sh.intel.com/artifactory/aipc_releases-sh-local/gpu-new/validation/IPEX/weekly/PVC/2024/ww32/py310/oneccl_bind_pt-2.3.100+xpu-cp310-cp310-linux_x86_64.whl
 
+triton-xpu @ https://github.com/intel/intel-xpu-backend-for-triton/releases/download/v3.0.0b2/triton_xpu-3.0.0b2-cp310-cp310-linux_x86_64.whl  

From 71710ceb8171b891cbe87e7ad756d625aa2ca306 Mon Sep 17 00:00:00 2001
From: Kunshang Ji <kunshang.ji@intel.com>
Date: Tue, 13 Aug 2024 00:39:56 +0800
Subject: [PATCH 04/10] update docker file

---
 Dockerfile.xpu | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/Dockerfile.xpu b/Dockerfile.xpu
index 321da98cf6c89..d7b86512a4621 100644
--- a/Dockerfile.xpu
+++ b/Dockerfile.xpu
@@ -1,9 +1,8 @@
-FROM intel/oneapi-basekit:2024.1.0-devel-ubuntu20.04
+FROM intel/oneapi-basekit:2024.2.0-devel-ubuntu22.04
 
 RUN wget -O- https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB | gpg --dearmor | tee /usr/share/keyrings/intel-oneapi-archive-keyring.gpg > /dev/null && \
     echo "deb [signed-by=/usr/share/keyrings/intel-oneapi-archive-keyring.gpg] https://apt.repos.intel.com/oneapi all main " | tee /etc/apt/sources.list.d/oneAPI.list && \
     chmod 644 /usr/share/keyrings/intel-oneapi-archive-keyring.gpg && \
-    rm /etc/apt/sources.list.d/intel-graphics.list && \
     wget -O- https://repositories.intel.com/graphics/intel-graphics.key | gpg --dearmor | tee /usr/share/keyrings/intel-graphics.gpg > /dev/null && \
     echo "deb [arch=amd64,i386 signed-by=/usr/share/keyrings/intel-graphics.gpg] https://repositories.intel.com/graphics/ubuntu jammy arc" | tee /etc/apt/sources.list.d/intel.gpu.jammy.list && \
     chmod 644 /usr/share/keyrings/intel-graphics.gpg

From 9ff9708179193359e5fa2263268eaa8f94ae0a36 Mon Sep 17 00:00:00 2001
From: Yan Ma <yan.ma@intel.com>
Date: Thu, 15 Aug 2024 08:39:02 +0800
Subject: [PATCH 05/10] update docker file with dependency installation (#45)

---
 Dockerfile.xpu | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/Dockerfile.xpu b/Dockerfile.xpu
index d7b86512a4621..8b77e5c3f55e9 100644
--- a/Dockerfile.xpu
+++ b/Dockerfile.xpu
@@ -9,6 +9,15 @@ RUN wget -O- https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRO
 
 RUN apt-get update  -y \
 && apt-get install -y curl libicu70 lsb-release git wget vim numactl python3 python3-pip ffmpeg libsm6 libxext6 libgl1 
+
+RUN git clone https://github.com/intel/pti-gpu && \
+    cd pti-gpu/sdk && \
+    mkdir build && \
+    cd build && \
+    cmake -DCMAKE_BUILD_TYPE=Release -DCMAKE_TOOLCHAIN_FILE=../cmake/toolchains/icpx_toolchain.cmake -DBUILD_TESTING=OFF .. && \
+    make -j && \
+    cmake --install . --config Release --prefix "/usr/local"
+
 COPY ./ /workspace/vllm
 
 WORKDIR /workspace/vllm

From 85c697e4e18f27903b9d73ef0fc9750eddf7d329 Mon Sep 17 00:00:00 2001
From: Kunshang Ji <kunshang.ji@intel.com>
Date: Tue, 20 Aug 2024 17:13:59 +0800
Subject: [PATCH 06/10] use rc1 wheels, upgrade to oneapi-2024.2.1

---
 Dockerfile.xpu       | 2 +-
 requirements-xpu.txt | 6 +++---
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/Dockerfile.xpu b/Dockerfile.xpu
index 8b77e5c3f55e9..50bbd8f7dad87 100644
--- a/Dockerfile.xpu
+++ b/Dockerfile.xpu
@@ -1,4 +1,4 @@
-FROM intel/oneapi-basekit:2024.2.0-devel-ubuntu22.04
+FROM intel/oneapi-basekit:2024.2.1-0-devel-ubuntu22.04
 
 RUN wget -O- https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB | gpg --dearmor | tee /usr/share/keyrings/intel-oneapi-archive-keyring.gpg > /dev/null && \
     echo "deb [signed-by=/usr/share/keyrings/intel-oneapi-archive-keyring.gpg] https://apt.repos.intel.com/oneapi all main " | tee /etc/apt/sources.list.d/oneAPI.list && \
diff --git a/requirements-xpu.txt b/requirements-xpu.txt
index 5e7ff31d205c1..0f57d8413bd5f 100644
--- a/requirements-xpu.txt
+++ b/requirements-xpu.txt
@@ -3,8 +3,8 @@
 
 setuptools < 70.0.0 # IPEX's torch have some dependency. to be removed.
 
-torch @ https://ubit-artifactory-sh.intel.com/artifactory/aipc_releases-sh-local/gpu-new/validation/IPEX/weekly/PVC/2024/ww32/py310/torch-2.3.1+cxx11.abi-cp310-cp310-linux_x86_64.whl
-intel_extension_for_pytorch @ https://ubit-artifactory-sh.intel.com/artifactory/aipc_releases-sh-local/gpu-new/validation/IPEX/weekly/PVC/2024/ww32/py310/intel_extension_for_pytorch-2.3.110+xpu-cp310-cp310-linux_x86_64.whl
-oneccl_bind_pt @ https://ubit-artifactory-sh.intel.com/artifactory/aipc_releases-sh-local/gpu-new/validation/IPEX/weekly/PVC/2024/ww32/py310/oneccl_bind_pt-2.3.100+xpu-cp310-cp310-linux_x86_64.whl
+torch @ https://ubit-artifactory-sh.intel.com/artifactory/aipc_releases-sh-local/gpu-new/releases/2024.2/IPEX_2.3.110+xpu/RC1/py310/torch-2.3.1+cxx11.abi-cp310-cp310-linux_x86_64.whl
+intel_extension_for_pytorch @ https://ubit-artifactory-sh.intel.com/artifactory/aipc_releases-sh-local/gpu-new/releases/2024.2/IPEX_2.3.110+xpu/RC1/py310/intel_extension_for_pytorch-2.3.110+xpu-cp310-cp310-linux_x86_64.whl
+oneccl_bind_pt @ https://ubit-artifactory-sh.intel.com/artifactory/aipc_releases-sh-local/gpu-new/releases/2024.2/IPEX_2.3.110+xpu/RC1/py310/oneccl_bind_pt-2.3.100+xpu-cp310-cp310-linux_x86_64.whl
 
 triton-xpu @ https://github.com/intel/intel-xpu-backend-for-triton/releases/download/v3.0.0b2/triton_xpu-3.0.0b2-cp310-cp310-linux_x86_64.whl  

From f8f1b66133945da25fc7019623ef643bc90f52d4 Mon Sep 17 00:00:00 2001
From: Kunshang Ji <kunshang.ji@intel.com>
Date: Tue, 27 Aug 2024 21:57:04 +0800
Subject: [PATCH 07/10] update dependency to ipex ww34 weekly release

---
 requirements-xpu.txt | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/requirements-xpu.txt b/requirements-xpu.txt
index 0f57d8413bd5f..a4404306b1f7f 100644
--- a/requirements-xpu.txt
+++ b/requirements-xpu.txt
@@ -3,8 +3,8 @@
 
 setuptools < 70.0.0 # IPEX's torch have some dependency. to be removed.
 
-torch @ https://ubit-artifactory-sh.intel.com/artifactory/aipc_releases-sh-local/gpu-new/releases/2024.2/IPEX_2.3.110+xpu/RC1/py310/torch-2.3.1+cxx11.abi-cp310-cp310-linux_x86_64.whl
-intel_extension_for_pytorch @ https://ubit-artifactory-sh.intel.com/artifactory/aipc_releases-sh-local/gpu-new/releases/2024.2/IPEX_2.3.110+xpu/RC1/py310/intel_extension_for_pytorch-2.3.110+xpu-cp310-cp310-linux_x86_64.whl
-oneccl_bind_pt @ https://ubit-artifactory-sh.intel.com/artifactory/aipc_releases-sh-local/gpu-new/releases/2024.2/IPEX_2.3.110+xpu/RC1/py310/oneccl_bind_pt-2.3.100+xpu-cp310-cp310-linux_x86_64.whl
+torch @ https://ubit-artifactory-sh.intel.com/artifactory/aipc_releases-sh-local/gpu-new/validation/IPEX/weekly/PVC/2024/ww34/py310/torch-2.3.0a0+giteb3ebdc-cp310-cp310-linux_x86_64.whl
+intel_extension_for_pytorch @ https://ubit-artifactory-sh.intel.com/artifactory/aipc_releases-sh-local/gpu-new/validation/IPEX/weekly/PVC/2024/ww34/py310/intel_extension_for_pytorch-2.3.110+gited12d5e-cp310-cp310-linux_x86_64.whl
+oneccl_bind_pt @ https://ubit-artifactory-sh.intel.com/artifactory/aipc_releases-sh-local/gpu-new/validation/IPEX/weekly/PVC/2024/ww34/py310/oneccl_bind_pt-2.3.0+gpu-cp310-cp310-linux_x86_64.whl
 
 triton-xpu @ https://github.com/intel/intel-xpu-backend-for-triton/releases/download/v3.0.0b2/triton_xpu-3.0.0b2-cp310-cp310-linux_x86_64.whl  

From 3e0a8f45e412bffbee29b472467ed54eb3c70890 Mon Sep 17 00:00:00 2001
From: Kunshang Ji <kunshang.ji@intel.com>
Date: Wed, 4 Sep 2024 18:42:24 +0800
Subject: [PATCH 08/10] fix swap_blocks and copy_blocks

---
 vllm/attention/backends/ipex_attn.py | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/vllm/attention/backends/ipex_attn.py b/vllm/attention/backends/ipex_attn.py
index 64d60e4e47e48..113a2788eacd3 100644
--- a/vllm/attention/backends/ipex_attn.py
+++ b/vllm/attention/backends/ipex_attn.py
@@ -49,14 +49,18 @@ def swap_blocks(
         dst_kv_cache: torch.Tensor,
         src_to_dst: torch.Tensor,
     ) -> None:
-        PagedAttention.swap_blocks(src_kv_cache, dst_kv_cache, src_to_dst)
+        from vllm._ipex_ops import ipex_ops as ops
+        ops.swap_blocks(src_kv_cache, dst_kv_cache, src_to_dst)
 
     @staticmethod
     def copy_blocks(
         kv_caches: List[torch.Tensor],
         src_to_dists: torch.Tensor,
     ) -> None:
-        PagedAttention.copy_blocks(kv_caches, src_to_dists)
+        from vllm._ipex_ops import ipex_ops as ops
+        key_caches = [kv_cache[0] for kv_cache in kv_caches]
+        value_caches = [kv_cache[1] for kv_cache in kv_caches]
+        ops.copy_blocks(key_caches, value_caches, src_to_dists)
 
 
 @dataclass

From b6e4a5935eb3463b3725227d4a637a20c9f47bdd Mon Sep 17 00:00:00 2001
From: Kunshang Ji <kunshang.ji@intel.com>
Date: Thu, 12 Sep 2024 01:26:04 +0800
Subject: [PATCH 09/10] update ipex to 2.3.1 release

---
 requirements-xpu.txt | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/requirements-xpu.txt b/requirements-xpu.txt
index a4404306b1f7f..f07211b48b68d 100644
--- a/requirements-xpu.txt
+++ b/requirements-xpu.txt
@@ -3,8 +3,10 @@
 
 setuptools < 70.0.0 # IPEX's torch have some dependency. to be removed.
 
-torch @ https://ubit-artifactory-sh.intel.com/artifactory/aipc_releases-sh-local/gpu-new/validation/IPEX/weekly/PVC/2024/ww34/py310/torch-2.3.0a0+giteb3ebdc-cp310-cp310-linux_x86_64.whl
-intel_extension_for_pytorch @ https://ubit-artifactory-sh.intel.com/artifactory/aipc_releases-sh-local/gpu-new/validation/IPEX/weekly/PVC/2024/ww34/py310/intel_extension_for_pytorch-2.3.110+gited12d5e-cp310-cp310-linux_x86_64.whl
-oneccl_bind_pt @ https://ubit-artifactory-sh.intel.com/artifactory/aipc_releases-sh-local/gpu-new/validation/IPEX/weekly/PVC/2024/ww34/py310/oneccl_bind_pt-2.3.0+gpu-cp310-cp310-linux_x86_64.whl
+torch == 2.3.1+cxx11.abi
+intel-extension-for-pytorch == 2.3.110+xpu
+oneccl_bind_pt == 2.3.100+xpu
 
-triton-xpu @ https://github.com/intel/intel-xpu-backend-for-triton/releases/download/v3.0.0b2/triton_xpu-3.0.0b2-cp310-cp310-linux_x86_64.whl  
+triton-xpu == 3.0.0b2
+
+--extra-index-url https://pytorch-extension.intel.com/release-whl/stable/xpu/us/

From 903ba2fa863bbf94b8466579383aa1a68613e713 Mon Sep 17 00:00:00 2001
From: Kunshang Ji <kunshang.ji@intel.com>
Date: Thu, 12 Sep 2024 01:29:37 +0800
Subject: [PATCH 10/10] remove useless commented-out code

---
 vllm/_ipex_ops.py | 6 ------
 1 file changed, 6 deletions(-)

diff --git a/vllm/_ipex_ops.py b/vllm/_ipex_ops.py
index bdcaa04235780..31fcc4c3256a8 100644
--- a/vllm/_ipex_ops.py
+++ b/vllm/_ipex_ops.py
@@ -27,20 +27,14 @@ def _reshape_activation_tensor(
 
     @staticmethod
     def silu_and_mul(out: torch.Tensor, x: torch.Tensor) -> None:
-        # x1, x2 = ipex_ops._reshape_activation_tensor(x)
-        # ipex.llm.functional.silu_mul(x1, x2, out)
         ipex.llm.functional.silu_and_mul(x, out)
 
     @staticmethod
     def gelu_and_mul(out: torch.Tensor, x: torch.Tensor) -> None:
-        # x1, x2 = ipex_ops._reshape_activation_tensor(x)
-        # ipex.llm.functional.gelu_mul(x1, x2, out, "none")
         ipex.llm.functional.gelu_and_mul(x, out)
 
     @staticmethod
     def gelu_tanh_and_mul(out: torch.Tensor, x: torch.Tensor) -> None:
-        # x1, x2 = ipex_ops._reshape_activation_tensor(x)
-        # ipex.llm.functional.gelu_mul(x1, x2, out, "tanh")
         ipex.llm.functional.gelu_and_mul(x, out)
 
     @staticmethod