diff --git a/vllm/attention/selector.py b/vllm/attention/selector.py
index 26c6ac812a125..d30b5192f29a2 100644
--- a/vllm/attention/selector.py
+++ b/vllm/attention/selector.py
@@ -144,6 +144,20 @@ def _cached_get_attn_backend(
     if backend_by_env_var is not None:
         selected_backend = backend_name_to_enum(backend_by_env_var)
 
+    if selected_backend is not None and use_v1:
+        if (selected_backend in (_Backend.FLASH_ATTN, _Backend.FLASHINFER,
+                                 _Backend.XFORMERS)):
+            raise ValueError(
+                f"{selected_backend.name} is not compatible with vLLM V1. "
+                "Please either do `export VLLM_ATTENTION_BACKEND="
+                f"{_Backend.FLASH_ATTN_VLLM_V1.name}` or unset it to use "
+                "the default backend.")
+        elif selected_backend not in _Backend.get_v1_backends():
+            raise ValueError(
+                f"{selected_backend.name} attention backend is not compatible "
+                "with vLLM V1. Please use a different backend or unset the "
+                "VLLM_ATTENTION_BACKEND env variable.")
+
     # get device-specific attn_backend
     attention_cls = current_platform.get_attn_backend_cls(
         selected_backend, head_size, dtype, kv_cache_dtype, block_size, use_v1,
diff --git a/vllm/platforms/interface.py b/vllm/platforms/interface.py
index d6dae2e526dc6..18e9d9b92d7bb 100644
--- a/vllm/platforms/interface.py
+++ b/vllm/platforms/interface.py
@@ -42,6 +42,10 @@ class _Backend(enum.Enum):
     BLOCK_SPARSE_FLASH_ATTN = enum.auto()
     NO_ATTENTION = enum.auto()
 
+    @classmethod
+    def get_v1_backends(cls) -> Tuple["_Backend", ...]:
+        return (cls.FLASH_ATTN_VLLM_V1, cls.ROCM_FLASH, cls.PALLAS_VLLM_V1)
+
 
 class PlatformEnum(enum.Enum):
     CUDA = enum.auto()
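
For context, a minimal sketch (not part of the diff, assuming the changes above are applied) of how the new `_Backend.get_v1_backends()` helper and the env-var check are intended to behave:

```python
# Hypothetical usage sketch; assumes the enum members shown in the diff exist.
from vllm.platforms.interface import _Backend

# The helper centralizes which backends the V1 engine accepts.
v1_backends = _Backend.get_v1_backends()
assert _Backend.FLASH_ATTN_VLLM_V1 in v1_backends
assert _Backend.XFORMERS not in v1_backends

# With use_v1=True, a V0-only selection such as
# VLLM_ATTENTION_BACKEND=XFORMERS now raises ValueError inside
# _cached_get_attn_backend instead of failing later in backend setup.
```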