
Commit

Update w8a8_utils.py
mgoin authored Oct 29, 2024
1 parent 1032cf3 commit ebd38af
Showing 1 changed file with 7 additions and 6 deletions.
13 changes: 7 additions & 6 deletions vllm/model_executor/layers/quantization/utils/w8a8_utils.py
@@ -110,11 +110,11 @@ def apply_fp8_linear(
 
         # Fused GEMM_DQ
         output = ops.cutlass_scaled_mm(qinput,
-                                   weight,
-                                   out_dtype=input.dtype,
-                                   scale_a=x_scale,
-                                   scale_b=weight_scale,
-                                   bias=bias)
+                                       weight,
+                                       out_dtype=input.dtype,
+                                       scale_a=x_scale,
+                                       scale_b=weight_scale,
+                                       bias=bias)
         return output.view(*output_shape)
 
     # torch.scaled_mm supports per tensor weights + activations only
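For context on the hunk above: the "Fused GEMM_DQ" comment means the CUTLASS kernel multiplies the FP8 operands and applies the dequantization scales (plus the optional bias) in a single pass. Below is a rough, unfused PyTorch sketch of that computation; the helper name is made up, and the exact scale shapes and broadcasting are my assumptions rather than anything stated in this commit.

```python
import torch

def scaled_mm_reference(qinput, weight, scale_a, scale_b, out_dtype, bias=None):
    """Unfused reference for a scaled matmul with dequantization.

    Hypothetical helper for illustration only; the fused kernel in the diff
    (ops.cutlass_scaled_mm) does the equivalent work in one kernel launch.
    Assumes scale_a is per-tensor or per-row (per-token) for qinput and
    scale_b is per-tensor or per-column (per-channel) for weight.
    """
    a = qinput.to(torch.float32) * scale_a   # dequantize activations
    b = weight.to(torch.float32) * scale_b   # dequantize weights
    out = a @ b
    if bias is not None:
        out = out + bias
    return out.to(out_dtype)

# Example with per-tensor scales; ordinary tensors stand in for FP8 data.
q = torch.randint(-8, 8, (4, 16)).float()
w = torch.randint(-8, 8, (16, 8)).float()
y = scaled_mm_reference(q, w, torch.tensor(0.1), torch.tensor(0.05), torch.bfloat16)
print(y.shape)  # torch.Size([4, 8])
```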
@@ -145,7 +145,8 @@ def apply_fp8_linear(
             if type(output) is tuple and len(output) == 2:
                 output = output[0]
 
-            return torch.narrow(output, 0, 0, input_2d.shape[0]).view(*output_shape)
+            return torch.narrow(output, 0, 0,
+                                input_2d.shape[0]).view(*output_shape)
 
         else:
             # Fallback for channelwise case, where we use unfused DQ
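On the second hunk: the tuple check in the surrounding context exists because some older torch._scaled_mm builds returned an (output, amax) pair instead of a single tensor (my understanding, not stated in this diff), and the reformatted return statement trims the GEMM result back to the caller's row count before restoring the original shape. A small standalone sketch of that trim-and-reshape step, with made-up sizes:

```python
import torch

# torch.narrow(output, 0, 0, n) keeps only the first n rows (a view, no copy),
# so any rows added by padding before the GEMM are dropped, and
# .view(*output_shape) restores the caller's original batch dimensions.
output = torch.randn(32, 4096)        # padded GEMM result (sizes are made up)
input_2d = torch.randn(17, 2048)      # flattened input: 17 real rows
output_shape = [17, 4096]

result = torch.narrow(output, 0, 0, input_2d.shape[0]).view(*output_shape)
assert result.shape == (17, 4096)
```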
