diff --git a/vllm/attention/backends/flash_attn.py b/vllm/attention/backends/flash_attn.py
index d538286a0dddd..ffa05e80623ac 100644
--- a/vllm/attention/backends/flash_attn.py
+++ b/vllm/attention/backends/flash_attn.py
@@ -32,7 +32,7 @@ def get_supported_head_sizes() -> List[int]:
 
     @staticmethod
     def get_name() -> str:
-        return "flash-attn"
+        return "FLASH_ATTN"
 
     @staticmethod
     def get_impl_cls() -> Type["FlashAttentionImpl"]:
diff --git a/vllm/attention/backends/flashinfer.py b/vllm/attention/backends/flashinfer.py
index 1dd2a21fdb51a..e43fb134a6a5a 100644
--- a/vllm/attention/backends/flashinfer.py
+++ b/vllm/attention/backends/flashinfer.py
@@ -40,7 +40,7 @@ class FlashInferBackend(AttentionBackend):
 
     @staticmethod
     def get_name() -> str:
-        return "flashinfer"
+        return "FLASHINFER"
 
     @staticmethod
     def get_impl_cls() -> Type["FlashInferImpl"]:
diff --git a/vllm/attention/backends/ipex_attn.py b/vllm/attention/backends/ipex_attn.py
index 7398732ddfc92..1eb5fe10d76db 100644
--- a/vllm/attention/backends/ipex_attn.py
+++ b/vllm/attention/backends/ipex_attn.py
@@ -19,7 +19,7 @@ class IpexAttnBackend(AttentionBackend):
 
     @staticmethod
     def get_name() -> str:
-        return "ipex-attn"
+        return "IPEX"
 
     @staticmethod
     def get_impl_cls() -> Type["IpexAttnBackendImpl"]:
diff --git a/vllm/attention/backends/openvino.py b/vllm/attention/backends/openvino.py
index 8b36230730380..6fddfc2002120 100644
--- a/vllm/attention/backends/openvino.py
+++ b/vllm/attention/backends/openvino.py
@@ -38,7 +38,7 @@ class OpenVINOAttentionBackend(AttentionBackend):
 
     @staticmethod
     def get_name() -> str:
-        return "openvino"
+        return "OPENVINO"
 
     @staticmethod
     def get_impl_cls():
diff --git a/vllm/attention/backends/pallas.py b/vllm/attention/backends/pallas.py
index 56d3d3b482e58..6fee81de14420 100644
--- a/vllm/attention/backends/pallas.py
+++ b/vllm/attention/backends/pallas.py
@@ -11,6 +11,10 @@ class PallasAttentionBackend(AttentionBackend):
 
+    @staticmethod
+    def get_name() -> str:
+        return "PALLAS"
+
     @staticmethod
     def get_impl_cls() -> Type["PallasAttentionBackendImpl"]:
         return PallasAttentionBackendImpl
diff --git a/vllm/attention/backends/placeholder_attn.py b/vllm/attention/backends/placeholder_attn.py
index 3987986f1786b..4116fbf00020c 100644
--- a/vllm/attention/backends/placeholder_attn.py
+++ b/vllm/attention/backends/placeholder_attn.py
@@ -20,7 +20,7 @@ class PlaceholderAttentionBackend(AttentionBackend):
 
     @staticmethod
     def get_name() -> str:
-        return "placeholder-attn"
+        return "NO_ATTENTION"
 
     @staticmethod
     def get_impl_cls() -> Type["PlaceholderAttentionImpl"]:
diff --git a/vllm/attention/backends/rocm_flash_attn.py b/vllm/attention/backends/rocm_flash_attn.py
index 682eac50126ad..c2aec4aaa74e7 100644
--- a/vllm/attention/backends/rocm_flash_attn.py
+++ b/vllm/attention/backends/rocm_flash_attn.py
@@ -28,7 +28,7 @@ class ROCmFlashAttentionBackend(AttentionBackend):
 
     @staticmethod
     def get_name() -> str:
-        return "rocm-flash-attn"
+        return "ROCM_FLASH"
 
     @staticmethod
     def get_impl_cls() -> Type["ROCmFlashAttentionImpl"]:
diff --git a/vllm/attention/backends/torch_sdpa.py b/vllm/attention/backends/torch_sdpa.py
index ef8d576616838..1fb7c37578f20 100644
--- a/vllm/attention/backends/torch_sdpa.py
+++ b/vllm/attention/backends/torch_sdpa.py
@@ -25,7 +25,7 @@ class TorchSDPABackend(AttentionBackend):
 
     @staticmethod
     def get_name() -> str:
-        return "torch-sdpa"
+        return "TORCH_SDPA"
 
     @staticmethod
     def get_impl_cls() -> Type["TorchSDPABackendImpl"]:
diff --git a/vllm/attention/backends/utils.py b/vllm/attention/backends/utils.py
index 358a223e7ed0e..d1a44f3e8bfa6 100644
--- a/vllm/attention/backends/utils.py
+++ b/vllm/attention/backends/utils.py
@@ -317,8 +317,8 @@ def graph_capture_get_metadata_for_batch(
         if is_encoder_decoder_model:
             # The encoder decoder model works only with XFormers backend.
             # Assert the same.
-            assert self.runner.attn_backend.get_name() == "xformers", \
-                f"Expected attn_backend name to be 'xformers', but "\
+            assert self.runner.attn_backend.get_name() == "XFORMERS", \
+                f"Expected attn_backend name to be 'XFORMERS', but "\
                 f" got '{self.runner.attn_backend.get_name()}'"
             self._update_captured_metadata_for_enc_dec_model(
                 batch_size=batch_size, attn_metadata=attn_metadata)
@@ -337,8 +337,8 @@ def get_graph_input_buffers(
         if is_encoder_decoder_model:
             # The encoder decoder model works only with XFormers backend.
             # Assert the same.
-            assert self.runner.attn_backend.get_name() == "xformers", \
-                f"Expected attn_backend name to be 'xformers', but "\
+            assert self.runner.attn_backend.get_name() == "XFORMERS", \
+                f"Expected attn_backend name to be 'XFORMERS', but "\
                 f" got '{self.runner.attn_backend.get_name()}'"
             self._add_additonal_input_buffers_for_enc_dec_model(
                 attn_metadata=attn_metadata, input_buffers=input_buffers)
@@ -356,8 +356,8 @@ def prepare_graph_input_buffers(
         if is_encoder_decoder_model:
             # The encoder decoder model works only with XFormers backend.
             # Assert the same.
-            assert self.runner.attn_backend.get_name() == "xformers", \
-                f"Expected attn_backend name to be 'xformers', but "\
+            assert self.runner.attn_backend.get_name() == "XFORMERS", \
+                f"Expected attn_backend name to be 'XFORMERS', but "\
                 f" got '{self.runner.attn_backend.get_name()}'"
             self._prepare_input_buffers_for_enc_dec_model(
                 attn_metadata, input_buffers)
diff --git a/vllm/attention/backends/xformers.py b/vllm/attention/backends/xformers.py
index 650bc6ec7750a..5aaf13d8ea744 100644
--- a/vllm/attention/backends/xformers.py
+++ b/vllm/attention/backends/xformers.py
@@ -24,7 +24,7 @@ class XFormersBackend(AttentionBackend):
 
     @staticmethod
     def get_name() -> str:
-        return "xformers"
+        return "XFORMERS"
 
     @staticmethod
     def get_impl_cls() -> Type["XFormersImpl"]:
diff --git a/vllm/spec_decode/draft_model_runner.py b/vllm/spec_decode/draft_model_runner.py
index aaf6ec5f508c8..3aa999fcb9ebb 100644
--- a/vllm/spec_decode/draft_model_runner.py
+++ b/vllm/spec_decode/draft_model_runner.py
@@ -179,7 +179,7 @@ def supports_gpu_multi_step(self, execute_model_req: ExecuteModelRequest):
             return False
 
         # TODO: Add support for other attn backends
-        if self.attn_backend.get_name() != "flash-attn":
+        if self.attn_backend.get_name() != "FLASH_ATTN":
            return False
 
         # TODO: Add support for LORA
diff --git a/vllm/spec_decode/spec_decode_worker.py b/vllm/spec_decode/spec_decode_worker.py
index 50d2767a03752..316db43502d3b 100644
--- a/vllm/spec_decode/spec_decode_worker.py
+++ b/vllm/spec_decode/spec_decode_worker.py
@@ -184,7 +184,7 @@ def create_worker(
 
         if not disable_mqa_scorer:
             if scorer_worker.model_runner.attn_backend.get_name(
-            ) != "flash-attn":
+            ) != "FLASH_ATTN":
                 disable_mqa_scorer = True
                 logger.info(
                     "[Speculative Decoding] Disabling MQA scorer as the "
diff --git a/vllm/worker/model_runner.py b/vllm/worker/model_runner.py
index dc1674cd1ea20..f98fb7e4f01df 100644
--- a/vllm/worker/model_runner.py
+++ b/vllm/worker/model_runner.py
@@ -1855,7 +1855,7 @@ def forward(
         self.input_buffers["input_ids"].copy_(input_ids, non_blocking=True)
         self.input_buffers["positions"].copy_(positions, non_blocking=True)
 
-        if self.backend_name != "placeholder-attn":
+        if self.backend_name != "NO_ATTENTION":
             self.input_buffers["slot_mapping"].copy_(
                 attn_metadata.slot_mapping, non_blocking=True)
diff --git a/vllm/worker/multi_step_model_runner.py b/vllm/worker/multi_step_model_runner.py
index 0cd0047bebf2d..be2f0d79154d6 100644
--- a/vllm/worker/multi_step_model_runner.py
+++ b/vllm/worker/multi_step_model_runner.py
@@ -29,8 +29,8 @@
 logger = init_logger(__name__)
 
-MULTI_STEP_ATTENTION_BACKENDS = ["flash-attn", "rocm-flash-attn", "flashinfer"]
-MULTI_STEP_CHUNKED_PREFILL_ATTENTION_BACKENDS = ["flash-attn"]
+MULTI_STEP_ATTENTION_BACKENDS = ["FLASH_ATTN", "ROCM_FLASH", "FLASHINFER"]
+MULTI_STEP_CHUNKED_PREFILL_ATTENTION_BACKENDS = ["FLASH_ATTN"]
 
 def _get_supported_attention_backends(chunked_prefill_enabled: bool) \
         -> List[str]:
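The diff replaces the ad-hoc lowercase strings returned by each backend's get_name() with uppercase identifiers, presumably so that they line up with the enum-style names used elsewhere in the attention backend selection code (an assumption; the selector itself is not part of this diff). As a rough illustration of the migration impact on out-of-tree code that still compares against the old strings, a hypothetical normalization helper is sketched below. The old-to-new mapping is taken directly from the hunks above; the helper function itself is not part of the PR.

# Hypothetical helper, NOT part of this diff: translates pre-rename backend
# names (left-hand side of the hunks above) to the new uppercase identifiers.
_OLD_TO_NEW_BACKEND_NAME = {
    "flash-attn": "FLASH_ATTN",
    "flashinfer": "FLASHINFER",
    "ipex-attn": "IPEX",
    "openvino": "OPENVINO",
    "placeholder-attn": "NO_ATTENTION",
    "rocm-flash-attn": "ROCM_FLASH",
    "torch-sdpa": "TORCH_SDPA",
    "xformers": "XFORMERS",
}


def normalize_backend_name(name: str) -> str:
    """Map an old-style backend name to its post-rename identifier.

    Names that already use the new convention are returned unchanged.
    """
    return _OLD_TO_NEW_BACKEND_NAME.get(name, name)


# Usage sketch: both the old and the new spelling resolve to the new name.
assert normalize_backend_name("flash-attn") == "FLASH_ATTN"
assert normalize_backend_name("FLASH_ATTN") == "FLASH_ATTN"

Note that PALLAS has no entry in the mapping: its get_name() is newly added by this diff rather than renamed.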