[ Misc ] Support Act Order in Compressed Tensors #6358
@@ -3,12 +3,13 @@
 import torch

 from vllm import _custom_ops as ops
+from vllm.logger import init_logger
 from vllm.model_executor.layers.quantization.compressed_tensors.schemes import (
     CompressedTensorsScheme)
 from vllm.model_executor.layers.quantization.utils.marlin_utils import (
-    apply_gptq_marlin_linear, marlin_make_empty_g_idx, marlin_make_workspace,
-    marlin_permute_scales, replace_tensor, verify_marlin_supported,
-    verify_marlin_supports_shape)
+    apply_gptq_marlin_linear, marlin_is_k_full, marlin_make_empty_g_idx,
+    marlin_make_workspace, marlin_permute_scales, marlin_sort_g_idx,
+    replace_tensor, verify_marlin_supported, verify_marlin_supports_shape)
 from vllm.model_executor.parameter import (BasevLLMParameter,
                                            ChannelQuantScaleParameter,
                                            GroupQuantScaleParameter,
@@ -22,13 +23,16 @@
 }
 WNA16_SUPPORTED_BITS = list(WNA16_SUPPORTED_TYPES_MAP.keys())

+logger = init_logger(__name__)
+

 class CompressedTensorsWNA16(CompressedTensorsScheme):

     def __init__(self,
                  strategy: str,
                  num_bits: int,
-                 group_size: Optional[int] = None):
+                 group_size: Optional[int] = None,
+                 actorder: bool = False):

         self.pack_factor = 32 // num_bits
         self.strategy = strategy
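For reference, a usage sketch of the extended constructor; the strategy and group_size values below are illustrative placeholders, not taken from this PR:

```python
# Grouped 4-bit weight-only scheme with activation ordering enabled
# (hypothetical argument values, for illustration only).
scheme = CompressedTensorsWNA16(strategy="group",
                                num_bits=4,
                                group_size=128,
                                actorder=True)
```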
@@ -46,6 +50,15 @@ def __init__(self,

         self.quant_type = WNA16_SUPPORTED_TYPES_MAP[num_bits]

+        if actorder and self.group_size == -1:
+            # In this case, actorder == True is the same as actorder == False
+            # (since we have only one group per output channel)
+            logger.warning(
+                "Model must be quantized with group_size > 0 in order to use "
+                "activation ordering")
+            actorder = False
+        self.actorder = actorder
+
         # Verify supported on platform.
         verify_marlin_supported(quant_type=self.quant_type,
                                 group_size=self.group_size)
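For intuition on why this fallback is harmless, a small standalone sketch (not part of the PR): with channelwise quantization a single group spans all input channels of a column, so permuting the reduction dimension cannot change which scale applies, whereas with grouped quantization it can:

```python
import torch

k, group_size = 8, 4
perm = torch.randperm(k)  # activation-order permutation of input channels

# Grouped case: channel i belongs to group i // group_size, so a
# permutation changes which group each position reads its scale from.
grouped_ids = torch.arange(k) // group_size
print(grouped_ids.tolist(), grouped_ids[perm].tolist())  # generally differ

# Channelwise case (group_size == -1): every channel maps to group 0,
# so any permutation leaves the scale assignment unchanged.
channelwise_ids = torch.zeros(k, dtype=torch.long)
print(channelwise_ids.tolist(), channelwise_ids[perm].tolist())  # identical
```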
@@ -62,6 +75,7 @@ def create_weights(self, layer: torch.nn.Module, input_size: int,
                        **kwargs):

         output_size_per_partition = sum(output_partition_sizes)
+        is_row_parallel = input_size != input_size_per_partition

         # If group_size is -1, we are in channelwise case.
         channelwise = (self.group_size == -1)
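The is_row_parallel flag added here later feeds marlin_is_k_full. A hedged sketch of the rule that call is assumed to implement (mirroring vLLM's GPTQ-Marlin handling, not copied from this PR): the kernel sees the full reduction dimension unless act-order is combined with a row-parallel shard of k:

```python
def is_k_full_sketch(act_order: bool, is_row_parallel: bool) -> bool:
    # With act-order, a row-parallel rank only holds a slice of the
    # permuted k dimension, so k is not "full" from the kernel's view.
    return (not act_order) or (not is_row_parallel)
```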
@@ -119,14 +133,21 @@ def create_weights(self, layer: torch.nn.Module, input_size: int,
                                                           dtype=torch.int64),
                                          weight_loader=weight_loader)

+        # G_IDX (for activation reordering)
+        g_idx = BasevLLMParameter(data=torch.empty(input_size_per_partition,
+                                                   dtype=torch.int32),
+                                  weight_loader=weight_loader)
+
Inline review thread on the new g_idx parameter (a sketch of the suggested change appears after this hunk):

- Is it okay to make this parameter in every case? What about older checkpoints that don't have this parameter?
- gptq_marlin_gemm supports passing an empty tensor for g_idx; I'd prefer that or a nullptr to avoid excess memory usage.
- I think my question was worded weirdly, sorry. I am just concerned about the weight loader trying to find this parameter in the checkpoint and it not being present.
- I regression tested using …
- I'd just update to only create the parameter if …
- This is because the g_idx passed to the kernel is conditional on the …
- If self.actorder is True, it'll use the created parameter. Otherwise, it'll create an empty one. So I don't think you need to initialize it here if …
- Yeah, that seems to be the case from this else-case later, so no need to make the parameter:
      layer.weight_g_idx = marlin_make_empty_g_idx(device)
      layer.g_idx_sort_indices = marlin_make_empty_g_idx(device)
         layer.register_parameter("weight_packed", weight)
         layer.register_parameter("weight_scale", weight_scale)
         layer.register_parameter("weight_shape", weight_shape)
+        layer.register_parameter("weight_g_idx", g_idx)

         layer.input_size_per_partition = input_size_per_partition
         layer.output_size_per_partition = output_size_per_partition
         layer.input_size = input_size
         layer.group_size = group_size
+        layer.is_k_full = marlin_is_k_full(self.actorder, is_row_parallel)

         # Checkpoints are serialized in compressed-tensors format, which is
         # different from marlin format. Handle repacking here.
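A sketch of the change the review thread above is suggesting, under the assumption that the empty-tensor else branch below covers the non-act-order path: only create and register weight_g_idx when actorder is set, so older checkpoints without a g_idx entry are never asked for one:

```python
# Sketch only (reviewer suggestion), not the code in this commit.
if self.actorder:
    g_idx = BasevLLMParameter(data=torch.empty(input_size_per_partition,
                                               dtype=torch.int32),
                              weight_loader=weight_loader)
    layer.register_parameter("weight_g_idx", g_idx)
# When actorder is False, process_weights_after_loading assigns
# layer.weight_g_idx = marlin_make_empty_g_idx(device), so no checkpoint
# lookup for g_idx ever happens.
```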
@@ -137,9 +158,14 @@ def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
         layer.workspace = marlin_make_workspace(
             layer.output_size_per_partition, device)

-        # Act-order not supported in compressed-tensors yet, so set to empty.
-        layer.g_idx = marlin_make_empty_g_idx(device)
-        layer.g_idx_sort_indices = marlin_make_empty_g_idx(device)
+        # Handle sorting for activation reordering if needed.
+        if self.actorder:
+            g_idx, g_idx_sort_indices = marlin_sort_g_idx(layer.weight_g_idx)
+            layer.g_idx_sort_indices = g_idx_sort_indices
+            replace_tensor(layer, "weight_g_idx", g_idx)
+        else:
+            layer.weight_g_idx = marlin_make_empty_g_idx(device)
+            layer.g_idx_sort_indices = marlin_make_empty_g_idx(device)

         # No zero-point
         layer.weight_zp = marlin_make_empty_g_idx(device)
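For reference, a minimal sketch of what the sort step is assumed to produce (inferred from the call site, not from the helper's source): g_idx reordered into ascending group order plus the permutation that achieves it, which the kernel uses to reorder the k dimension:

```python
import torch

def sort_g_idx_sketch(g_idx: torch.Tensor):
    # Assumed behavior of marlin_sort_g_idx: argsort the per-channel group
    # indices and return both the sorted g_idx and the sort permutation.
    sort_indices = torch.argsort(g_idx).to(torch.int32)
    return g_idx[sort_indices], sort_indices
```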
@@ -161,7 +187,8 @@ def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
         # Permute scales from compressed-tensors format to marlin format.
         marlin_scales = marlin_permute_scales(
             layer.weight_scale,
-            size_k=layer.input_size_per_partition,
+            size_k=(layer.input_size
+                    if self.actorder else layer.input_size_per_partition),
             size_n=layer.output_size_per_partition,
             group_size=layer.group_size)
         replace_tensor(layer, "weight_scale", marlin_scales)
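The size_k switch follows from how act-order scales are stored (hedged reasoning, analogous to the GPTQ-Marlin path): g_idx indexes groups of the full, unsharded k dimension, so each partition loads scales for every group rather than only its local slice:

```python
def expected_scale_rows(input_size: int, input_size_per_partition: int,
                        group_size: int, actorder: bool) -> int:
    # Hypothetical helper for illustration: number of scale rows a
    # partition holds before permutation into marlin format.
    size_k = input_size if actorder else input_size_per_partition
    return size_k // group_size
```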
@@ -174,7 +201,7 @@ def apply_weights(self, layer: torch.nn.Module, x: torch.Tensor,
             weight=layer.weight_packed,
             weight_scale=layer.weight_scale,
             weight_zp=layer.weight_zp,
-            g_idx=layer.g_idx,
+            g_idx=layer.weight_g_idx,
             g_idx_sort_indices=layer.g_idx_sort_indices,
             workspace=layer.workspace,
             wtype=self.quant_type,
Review comment: Can you add a test case for this case?
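A possible shape for such a test, hedged: the model name below is a placeholder for an act-order checkpoint quantized via compressed-tensors with group_size > 0; it is not an existing artifact referenced by this PR:

```python
from vllm import LLM, SamplingParams

def test_wna16_actorder_generates():
    # Placeholder checkpoint: 4-bit, grouped, quantized with actorder=True.
    llm = LLM(model="<org>/<wna16-actorder-model>", max_model_len=1024)
    outputs = llm.generate(["The capital of France is"],
                           SamplingParams(max_tokens=8, temperature=0.0))
    assert outputs[0].outputs[0].text.strip()
```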