vllm-project · robertgshaw2-redhat · Apr 1, 2024 · Mar 27, 2024 · Mar 30, 2024 · Mar 30, 2024
diff --git a/vllm/config.py b/vllm/config.py
@@ -171,26 +171,28 @@ def _verify_quantization(self) -> None:
             self.quantization = self.quantization.lower()
 
         # Parse quantization method from the HF model config, if available.
-        hf_quant_config = getattr(self.hf_config, "quantization_config", None)
-        if hf_quant_config is not None:
-            hf_quant_method = str(hf_quant_config["quant_method"]).lower()
-
-            # If the GPTQ model is serialized in marlin format, use marlin.
-            if (hf_quant_method == "gptq"
-                    and "is_marlin_format" in hf_quant_config
-                    and hf_quant_config["is_marlin_format"]):
+        quant_cfg = getattr(self.hf_config, "quantization_config", None)
+        if quant_cfg is not None:
+            quant_method = quant_cfg.get("quant_method", "").lower()
+            # compat: autogptq >=0.8.0 sets `checkpoint_format` (str)
+            # compat: autogptq <=0.7.1 sets `is_marlin_format` (bool)
+            is_format_marlin = (quant_cfg.get("checkpoint_format") == "marlin"
+                                or quant_cfg.get("is_marlin_format", False))
+
+            # Use marlin if the GPTQ model is serialized in marlin format.
+            if quant_method == "gptq" and is_format_marlin:
                 logger.info("The model is serialized in Marlin format. "
                             "Using Marlin kernel.")
-                hf_quant_method = "marlin"
+                quant_method = "marlin"
                 if self.quantization == "gptq":
-                    self.quantization = hf_quant_method
+                    self.quantization = quant_method
 
             if self.quantization is None:
-                self.quantization = hf_quant_method
-            elif self.quantization != hf_quant_method:
+                self.quantization = quant_method
+            elif self.quantization != quant_method:
                 raise ValueError(
                     "Quantization method specified in the model config "
-                    f"({hf_quant_method}) does not match the quantization "
+                    f"({quant_method}) does not match the quantization "
                     f"method specified in the `quantization` argument "
                     f"({self.quantization}).")