[Bugfix][Quantization]Fix support for non quantized visual layers in otherwise quantized mllama model, including missing scaling factors #9800

Closed
4 changes: 4 additions & 0 deletions vllm/model_executor/layers/quantization/fp8.py
@@ -206,9 +206,13 @@ def process_weights_after_loading(self, layer: Module) -> None:
         # If checkpoint is fp8, handle that there are N scales for N
         # shards in a fused module
         else:
+            layer.weight_scale.data[layer.weight_scale.data == torch.finfo(
+                torch.float32).min] = 1
Member

Could you pull out torch.finfo(torch.float32).min into a constant in this file so we can use a single reference when also using it in create_weights()? i.e. UNINITIALIZED_SCALE = torch.finfo(torch.float32).min

In what case would this happen? This seems like it may cover up failed weight loading, where we might want to raise an exception instead.
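
As a rough illustration of the suggested refactor (the constant is taken verbatim from the comment above; the helper function and its placement are assumptions, not part of the PR):

```python
import torch

# Constant suggested in the review: a sentinel meaning "no scale was loaded".
# create_weights() would initialize scale tensors to this value, and
# process_weights_after_loading() would compare against the same reference.
UNINITIALIZED_SCALE = torch.finfo(torch.float32).min


def reset_uninitialized_scales(scale: torch.Tensor) -> torch.Tensor:
    # Hypothetical helper: any scale the checkpoint never overwrote still
    # holds the sentinel and is reset to a neutral 1.0.
    scale[scale == UNINITIALIZED_SCALE] = 1.0
    return scale
```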

Contributor Author
@gshtras gshtras Oct 29, 2024

> Could you pull out torch.finfo(torch.float32).min into a constant in this file

Will do.
This is for the case when an otherwise quantized model has unquantized layers. These layers get converted to fp8 with a scale of 1.0, but that scale never gets loaded, since it is not present in the model checkpoint.
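
A toy illustration of the scenario described here (the checkpoint keys are invented; only the sentinel-to-1.0 reset mirrors the change in this commit):

```python
import torch

# In a partially quantized checkpoint, the quantized layers ship a
# weight_scale, but the unquantized (e.g. vision) layers do not, so their
# scale parameters keep the default that create_weights() gave them.
checkpoint = {
    "language_model.layers.0.mlp.down_proj.weight_scale": torch.tensor([0.02]),
    # no corresponding weight_scale entry exists for the vision layers
}

sentinel = torch.finfo(torch.float32).min
vision_weight_scale = torch.tensor([sentinel])  # never touched by the loader

# The fix in this commit: treat a still-sentinel scale as "unquantized",
# i.e. an identity scale of 1.0.
vision_weight_scale[vision_weight_scale == sentinel] = 1.0
print(vision_weight_scale)  # tensor([1.])
```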

Member

I see; in that case I think this is an incorrect fix. We have infrastructure for checking the module name against the ignore list in the quantization config. We should fix the issue if the name is not being matched or detected from the ignore list here:

if isinstance(layer, LinearBase):
    if is_layer_skipped(prefix, self.ignored_layers):
        return UnquantizedLinearMethod()
    return Fp8LinearMethod(self)
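
A simplified stand-in for that check, showing how an ignore-list entry is meant to route such layers to the unquantized path (this is not vLLM's actual is_layer_skipped implementation, and the example prefixes are invented):

```python
from typing import List


def layer_is_skipped(prefix: str, ignored_layers: List[str]) -> bool:
    # A module is left unquantized when its dotted name falls under an entry
    # in the quantization config's ignore list.
    return any(prefix == ignored or prefix.startswith(ignored + ".")
               for ignored in ignored_layers)


# With the vision tower on the ignore list, the snippet quoted above would
# return UnquantizedLinearMethod() for its linears instead of Fp8LinearMethod().
print(layer_is_skipped("vision_model.transformer.layers.0.mlp.fc1",
                       ["vision_model"]))  # True
print(layer_is_skipped("language_model.layers.0.mlp.down_proj",
                       ["vision_model"]))  # False
```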

Contributor Author

That's good to know, thanks. In that case we only need the kv-scale remap and the 3D tensor changes; I will remove this one.

             layer.weight_scale = torch.nn.Parameter(layer.weight_scale.data,
                                                     requires_grad=False)
             if self.quant_config.activation_scheme == "static":
+                layer.input_scale.data[layer.input_scale.data == torch.finfo(
+                    torch.float32).min] = 1
                 layer.input_scale = torch.nn.Parameter(layer.input_scale.data,
                                                        requires_grad=False)
             # If using marlin (w8a16), kernel uses channelwise weights,
10 changes: 7 additions & 3 deletions vllm/model_executor/layers/quantization/utils/w8a8_utils.py
@@ -118,8 +118,10 @@ def apply_fp8_linear(
         # Note: we pad the input because torch._scaled_mm is more performant
         # for matrices with batch dimension > 16.
         # This could change in the future.
+        batched = input.dim() > 2
+        inp_view = input.view(-1, input.shape[-1]) if batched else input
         qinput, x_scale = ops.scaled_fp8_quant(
-            input,
+            inp_view,
             input_scale,
             num_token_padding=17,
             use_per_token_if_dynamic=use_per_token_if_dynamic)
@@ -138,8 +140,10 @@ def apply_fp8_linear(
             # A fix for discrepancy in scaled_mm which returns tuple
             # for torch < 2.5 and a single value in torch >= 2.5
             if type(output) is tuple and len(output) == 2:
-                return torch.narrow(output[0], 0, 0, input.shape[0])
-            return torch.narrow(output, 0, 0, input.shape[0])
+                output = output[0]
+            return (torch.narrow(
+                output, 0, 0, input.shape[0]) if not batched else output.view(
+                    input.shape[0], input.shape[1], weight.shape[1]))

         else:
             # Fallback for channelwise case, where we use unfused DQ
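
The intent of the two hunks above, reduced to the shape bookkeeping alone (quantization, the 17-row padding and torch._scaled_mm are omitted here; a plain matmul stands in for the fp8 kernel):

```python
import torch


def linear_keeping_batch_dim(input: torch.Tensor,
                             weight: torch.Tensor) -> torch.Tensor:
    # Flatten a 3-D (batch, seq, hidden) input to 2-D so the GEMM sees a
    # plain matrix, then restore the batch dimension on the way out.
    batched = input.dim() > 2
    inp_view = input.view(-1, input.shape[-1]) if batched else input
    output = inp_view @ weight  # stand-in for the padded fp8 scaled matmul
    if batched:
        return output.view(input.shape[0], input.shape[1], weight.shape[1])
    # For 2-D inputs, narrow away any padding rows (a no-op in this sketch).
    return torch.narrow(output, 0, 0, input.shape[0])


w = torch.randn(8, 16)
print(linear_keeping_batch_dim(torch.randn(4, 8), w).shape)     # (4, 16)
print(linear_keeping_batch_dim(torch.randn(2, 3, 8), w).shape)  # (2, 3, 16)
```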
9 changes: 9 additions & 0 deletions vllm/model_executor/models/mllama.py
@@ -1396,6 +1396,15 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
                 weight_loader(param, loaded_weight, shard_id)
                 break
             else:
+                from vllm.model_executor.model_loader.weight_utils import (
+                    maybe_remap_kv_scale_name)
Member

Why not add this to the top import list?

+                orig_name = name
+                name = maybe_remap_kv_scale_name(name, params_dict)
+                if name is None:
+                    logger.debug("Missing name %s, orig name %s", name,
+                                 orig_name)
+                    continue

                 param = params_dict.pop(name)
                 weight_loader = getattr(param, "weight_loader",
                                         default_weight_loader)
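
For context, a sketch of what the remap fallback does: maybe_remap_kv_scale_name translates legacy checkpoint names for kv-cache scaling factors into the parameter names the model actually registers, and returns None when nothing matches so the weight can be skipped. The concrete rename below is an assumption for illustration, not the function's exact behaviour:

```python
from typing import Dict, Optional

import torch


def remap_kv_scale_name(name: str,
                        params_dict: Dict[str, torch.Tensor]) -> Optional[str]:
    # Illustrative only: map a checkpoint key like "...self_attn.kv_scale"
    # onto the attention submodule where the parameter lives, and give up
    # (return None) if no such parameter exists.
    candidate = name.replace("self_attn.kv_scale", "self_attn.attn.kv_scale")
    return candidate if candidate in params_dict else None


params = {"layers.0.self_attn.attn.kv_scale": torch.tensor([1.0])}
print(remap_kv_scale_name("layers.0.self_attn.kv_scale", params))
# layers.0.self_attn.attn.kv_scale
print(remap_kv_scale_name("layers.0.self_attn.unknown_scale", params))
# None
```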