From c8b86546782863aa750ba2593c46edab1b7ca632 Mon Sep 17 00:00:00 2001 From: Gregory Shtrasberg Date: Tue, 28 Jan 2025 19:07:05 +0000 Subject: [PATCH] Direct call on ROCm --- vllm/attention/layer.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/vllm/attention/layer.py b/vllm/attention/layer.py index dd6d039f9e6f8..c24d8657964dd 100644 --- a/vllm/attention/layer.py +++ b/vllm/attention/layer.py @@ -117,11 +117,11 @@ def __init__( self.backend = backend_name_to_enum(attn_backend.get_name()) self.dtype = dtype - # For cuda and cpu platforms, we control how + # For cuda-alike (CUDA and ROCm) and cpu platforms, we control how # torch.compile works by registering the attention as one giant # opaque custom op. For other platforms, we directly call them # and let torch.compile handle them. - self.use_direct_call = not current_platform.is_cuda( + self.use_direct_call = not current_platform.is_cuda_alike( ) and not current_platform.is_cpu() self.use_output = attn_backend.accept_output_buffer