From c8b86546782863aa750ba2593c46edab1b7ca632 Mon Sep 17 00:00:00 2001 From: Gregory Shtrasberg Date: Tue, 28 Jan 2025 19:07:05 +0000 Subject: [PATCH] Direct call on ROCm --- vllm/attention/layer.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/vllm/attention/layer.py b/vllm/attention/layer.py index dd6d039f9e6f8..c24d8657964dd 100644 --- a/vllm/attention/layer.py +++ b/vllm/attention/layer.py @@ -117,11 +117,11 @@ def __init__( self.backend = backend_name_to_enum(attn_backend.get_name()) self.dtype = dtype - # For cuda and cpu platforms, we control how + # For cuda-alike (CUDA and ROCm) and cpu platforms, we control how # torch.compile works by registering the attention as one giant # opaque custom op. For other platforms, we directly call them # and let torch.compile handle them. - self.use_direct_call = not current_platform.is_cuda( + self.use_direct_call = not current_platform.is_cuda_alike( ) and not current_platform.is_cpu() self.use_output = attn_backend.accept_output_buffer