diff --git a/vllm/model_executor/layers/quantization/awq.py b/vllm/model_executor/layers/quantization/awq.py
index f4fc7ce020e95..a3854f70bb4fa 100644
--- a/vllm/model_executor/layers/quantization/awq.py
+++ b/vllm/model_executor/layers/quantization/awq.py
@@ -43,7 +43,8 @@ def get_name(self) -> str:
     def get_supported_act_dtypes(self) -> List[torch.dtype]:
         return [torch.half]
 
-    def get_min_capability(self) -> int:
+    @classmethod
+    def get_min_capability(cls) -> int:
         # The AWQ kernel only supports Turing or newer GPUs.
         return 75
 
diff --git a/vllm/model_executor/layers/quantization/base_config.py b/vllm/model_executor/layers/quantization/base_config.py
index e7de283b562a6..c23b66161d9b8 100644
--- a/vllm/model_executor/layers/quantization/base_config.py
+++ b/vllm/model_executor/layers/quantization/base_config.py
@@ -44,8 +44,9 @@ def get_supported_act_dtypes(self) -> List[torch.dtype]:
         """List of supported activation dtypes."""
         raise NotImplementedError
 
+    @classmethod
     @abstractmethod
-    def get_min_capability(self) -> int:
+    def get_min_capability(cls) -> int:
         """Minimum GPU capability to support the quantization method.
 
         E.g., 70 for Volta, 75 for Turing, 80 for Ampere.
diff --git a/vllm/model_executor/layers/quantization/bitsandbytes.py b/vllm/model_executor/layers/quantization/bitsandbytes.py
index 969958d9b5448..e76714a7b460c 100644
--- a/vllm/model_executor/layers/quantization/bitsandbytes.py
+++ b/vllm/model_executor/layers/quantization/bitsandbytes.py
@@ -38,7 +38,7 @@ def get_supported_act_dtypes(self) -> List[torch.dtype]:
         return [torch.float32, torch.float16, torch.bfloat16]
 
     @classmethod
-    def get_min_capability(self) -> int:
+    def get_min_capability(cls) -> int:
         return 70
 
     @staticmethod
diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py
index 0cf224cc05479..a451427ec93f2 100644
--- a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py
+++ b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py
@@ -33,10 +33,9 @@ def get_scaled_act_names(self) -> List[str]:
     def get_supported_act_dtypes(cls) -> List[torch.dtype]:
         return [torch.float16, torch.bfloat16]
 
-    # Need to figure it out
     @classmethod
     def get_min_capability(cls) -> int:
-        return 60
+        return 75
 
     def get_name(self) -> str:
         return "compressed_tensors"
@@ -84,6 +83,14 @@ def from_config(cls, config: Dict[str, Any]) -> "CompressedTensorsConfig":
     def get_config_filenames(cls) -> List[str]:
         return []
 
+    def _check_gptq_and_marlin_can_run(self):
+        capability = torch.cuda.get_device_capability()
+        capability = capability[0] * 10 + capability[1]
+        if capability < 80:
+            raise RuntimeError("The quantization config is not supported for "
+                               "the current GPU. Minimum capability: 80. "
+                               f"Current capability: {capability}.")
+
     def _is_static_tensor_w8a8(self, weight_quant: BaseModel,
                                input_quant: BaseModel) -> bool:
         is_8_bits = weight_quant.num_bits == input_quant.num_bits == 8
@@ -126,6 +133,7 @@ def _get_schema(self, weight_quant: BaseModel,
                     input_quant: BaseModel) -> "CompressedTensorsScheme":
 
         if self._is_wNa16_group_channel(weight_quant, input_quant):
+            self._check_gptq_and_marlin_can_run()
             if (self.quant_format == CompressionFormat.marlin_24.value
                     and weight_quant.num_bits in W4A16SPARSE24_SUPPORTED_BITS):
                 return CompressedTensorsW4A16Sparse24(
diff --git a/vllm/model_executor/layers/quantization/squeezellm.py b/vllm/model_executor/layers/quantization/squeezellm.py
index 207dbcee8afc5..72ba55eb1740d 100644
--- a/vllm/model_executor/layers/quantization/squeezellm.py
+++ b/vllm/model_executor/layers/quantization/squeezellm.py
@@ -39,7 +39,8 @@ def get_name(self) -> str:
     def get_supported_act_dtypes(self) -> List[torch.dtype]:
         return [torch.half]
 
-    def get_min_capability(self) -> int:
+    @classmethod
+    def get_min_capability(cls) -> int:
         return 70
 
     @staticmethod