Deprecate obcq path in favor of sparsegpt path #1148

Open · wants to merge 2 commits into main
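For anyone migrating, the user-facing change in this PR is the import path; the modifier itself is essentially unchanged. A minimal before/after sketch (the old path keeps working through the deprecation shim added below, but now emits a `DeprecationWarning`):

```python
# Before this PR (deprecated path, still re-exported by the shim):
# from llmcompressor.modifiers.obcq import SparseGPTModifier

# After this PR, SparseGPTModifier lives under the pruning namespace:
from llmcompressor.modifiers.pruning import SparseGPTModifier
# or, fully qualified:
# from llmcompressor.modifiers.pruning.sparsegpt import SparseGPTModifier
```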
4 changes: 2 additions & 2 deletions examples/finetuning/example_alternating_recipe.yaml
@@ -1,6 +1,6 @@
initial_sparsity_stage:
run_type: oneshot
obcq_modifiers:
modifiers:
SparseGPTModifier:
sparsity: 0.5
block_size: 128
@@ -16,7 +16,7 @@ initial_training_stage:
start: 0
next_sparsity_stage:
run_type: oneshot
obcq_modifiers:
modifiers:
SparseGPTModifier:
sparsity: 0.7
block_size: 128
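The only change in this recipe is the stage key rename from `obcq_modifiers` to `modifiers`. A hedged sketch of applying a recipe that uses the renamed key through the `oneshot` entrypoint shown elsewhere in this diff; the model stub, dataset, and calibration settings are placeholders, and the recipe could equally be a path to a YAML file:

```python
from llmcompressor.transformers import oneshot

# Recipe string using the renamed "modifiers" key (previously "obcq_modifiers").
recipe = """
test_stage:
  modifiers:
    SparseGPTModifier:
      sparsity: 0.5
      mask_structure: "2:4"
      targets: ['Linear']
      ignore: ['re:.*lm_head']
"""

oneshot(
    model="MODEL_STUB",            # placeholder model id
    dataset="open_platypus",       # placeholder calibration dataset
    recipe=recipe,
    num_calibration_samples=64,    # illustrative setting
)
```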
3 changes: 1 addition & 2 deletions examples/sparse_2of4_quantization_fp8/llama3_8b_2of4.py
@@ -3,8 +3,7 @@
from datasets import load_dataset
from transformers import AutoModelForCausalLM, AutoTokenizer

from llmcompressor.modifiers.obcq import SparseGPTModifier
from llmcompressor.modifiers.pruning import ConstantPruningModifier
from llmcompressor.modifiers.pruning import ConstantPruningModifier, SparseGPTModifier
from llmcompressor.modifiers.quantization import QuantizationModifier
from llmcompressor.transformers import oneshot

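With the consolidated import, the pruning-related modifiers for this example all come from `llmcompressor.modifiers.pruning`. A hedged sketch of the kind of staged setup the example builds; the argument values and target patterns below are illustrative placeholders, not the example's actual settings:

```python
from llmcompressor.modifiers.pruning import ConstantPruningModifier, SparseGPTModifier
from llmcompressor.modifiers.quantization import QuantizationModifier

# Stage 1: prune Linear layers to 2:4 sparsity with SparseGPT.
sparsify = SparseGPTModifier(
    sparsity=0.5,
    mask_structure="2:4",
    targets=["Linear"],
    ignore=["re:.*lm_head"],
)

# Stage 2: keep the pruned mask fixed during any subsequent finetuning.
hold_mask = ConstantPruningModifier(targets=["re:.*weight"])  # placeholder target pattern

# Stage 3: quantize to FP8 (weights static, activations dynamic).
quantize = QuantizationModifier(scheme="FP8_DYNAMIC", targets="Linear", ignore=["lm_head"])
```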
2 changes: 1 addition & 1 deletion src/llmcompressor/modifiers/README.md
@@ -8,7 +8,7 @@ are relevant only during training. Below is a summary of the key modifiers avail

Modifiers that introduce sparsity into a model

### [SparseGPT](./obcq/base.py)
### [SparseGPT](./pruning/sparsegpt/base.py)
One-shot algorithm that uses calibration data to introduce unstructured or structured
sparsity into weights. Implementation based on [SparseGPT: Massive Language Models Can Be Accurately Pruned in One-Shot](https://arxiv.org/abs/2301.00774). A small amount of calibration data is used
to calculate a Hessian for each layer's input activations; this Hessian is then used to
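For context on the description above: the per-layer Hessian is, up to a constant factor, the damped Gram matrix of the calibration inputs seen by that layer. A minimal sketch of that computation using a hypothetical `input_hessian` helper; this is not the library's `accumulate_hessian`/`sparsify_weight` implementation, just the quantity it builds on:

```python
import torch

def input_hessian(calib_inputs: torch.Tensor, dampening_frac: float = 0.01) -> torch.Tensor:
    """calib_inputs: (num_samples, in_features) activations feeding a Linear layer."""
    X = calib_inputs.float()
    H = X.T @ X                                   # Gram matrix of the layer's inputs
    damp = dampening_frac * torch.mean(torch.diag(H))
    H += damp * torch.eye(H.shape[0], dtype=H.dtype, device=H.device)  # dampening term
    return H
```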
165 changes: 7 additions & 158 deletions src/llmcompressor/modifiers/obcq/base.py
@@ -1,163 +1,12 @@
import contextlib
from typing import Dict, Optional, Tuple
import warnings

import torch
from compressed_tensors.utils import (
align_module_device,
get_execution_device,
update_offload_parameter,
)
from loguru import logger
from pydantic import PrivateAttr
from llmcompressor.modifiers.pruning.sparsegpt import SparseGPTModifier

Collaborator review comment: nice!
from llmcompressor.core import State
from llmcompressor.modifiers import Modifier
from llmcompressor.modifiers.obcq.sgpt_mixin import SparsityModifierMixin
from llmcompressor.modifiers.obcq.sgpt_sparsify import (
accumulate_hessian,
make_empty_hessian,
sparsify_weight,
warnings.warn(
"llmcompressor.modifiers.obcq has been moved to "
"llmcompressor.modifiers.pruning.sparsegpt. Please update your paths",
DeprecationWarning,
)
from llmcompressor.utils.metric_logging import CompressionLogger

__all__ = ["SparseGPTModifier"]


class SparseGPTModifier(SparsityModifierMixin, Modifier):
"""
Modifier for applying the one-shot SparseGPT algorithm to a model

| Sample yaml:
| test_stage:
| obcq_modifiers:
| SparseGPTModifier:
| sparsity: 0.5
| mask_structure: "2:4"
| dampening_frac: 0.001
| block_size: 128
| targets: ['Linear']
| ignore: ['re:.*lm_head']

Lifecycle:
- on_initialize
- register_hook(module, calibrate_module, "forward")
- run_sequential / run_layer_sequential / run_basic
- make_empty_hessian
- accumulate_hessian
- on_sequential_batch_end
- sparsify_weight
- on_finalize
- remove_hooks()

:param sparsity: Sparsity to compress model to
:param sparsity_profile: Can be set to 'owl' to use Outlier Weighed
Layerwise Sparsity (OWL), more information can be found
in the paper https://arxiv.org/pdf/2310.05175
:param mask_structure: String to define the structure of the mask to apply.
Must be of the form N:M where N, M are integers that define a custom block
shape. Defaults to 0:0 which represents an unstructured mask.
:param owl_m: Number of outliers to use for OWL
:param owl_lmbda: Lambda value to use for OWL
:param block_size: Used to determine number of columns to compress in one pass
:param dampening_frac: Amount of dampening to apply to H, as a fraction of the
diagonal norm
:param preserve_sparsity_mask: Whether or not to preserve the sparsity mask
during when applying sparsegpt, this becomes useful when starting from a
previously pruned model, defaults to False.
:param offload_hessians: Set to True for decreased memory usage but increased
runtime.
:param sequential_targets: list of layer names to compress during OBCQ, or '__ALL__'
to compress every layer in the model. Alias for `targets`
:param targets: list of layer names to compress during OBCQ, or '__ALL__'
to compress every layer in the model. Alias for `sequential_targets`
:param ignore: optional list of module class names or submodule names to not
quantize even if they match a target. Defaults to empty list.
"""

# modifier arguments
block_size: int = 128
dampening_frac: Optional[float] = 0.01
preserve_sparsity_mask: bool = False
offload_hessians: bool = False

# private variables
_num_samples: Dict[torch.nn.Module, int] = PrivateAttr(default_factory=dict)
_hessians: Dict[torch.nn.Module, torch.Tensor] = PrivateAttr(default_factory=dict)

def calibrate_module(
self,
module: torch.nn.Module,
args: Tuple[torch.Tensor, ...],
_output: torch.Tensor,
):
# Assume that the first argument is the input
inp = args[0]

# Initialize hessian if not present
if module not in self._num_samples:
device = get_execution_device(module)
self._hessians[module] = make_empty_hessian(module, device=device)
self._num_samples[module] = 0

# Accumulate hessian with input with optional offloading
with self._maybe_onload_hessian(module):
self._hessians[module], self._num_samples[module] = accumulate_hessian(
inp,
module,
self._hessians[module],
self._num_samples[module],
)

def on_sequential_batch_end(self):
"""
Sparsify modules
TODO: implement with event callback
"""
for module in list(self._num_samples.keys()):
name = self._module_names[module]
sparsity = self._module_sparsities[module]
num_samples = self._num_samples[module]

logger.info(f"Sparsifying {name} using {num_samples} samples")
with (
torch.no_grad(),
align_module_device(module),
CompressionLogger(module) as comp_logger,
):
loss, sparsified_weight = sparsify_weight(
module=module,
hessians_dict=self._hessians,
sparsity=sparsity,
prune_n=self._prune_n,
prune_m=self._prune_m,
block_size=self.block_size,
dampening_frac=self.dampening_frac,
preserve_sparsity_mask=self.preserve_sparsity_mask,
)
comp_logger.set_loss(loss)

update_offload_parameter(module, "weight", sparsified_weight)

# self._hessians[module] already deleted by sparsify_weight
del self._num_samples[module]

@contextlib.contextmanager
def _maybe_onload_hessian(self, module: torch.nn.Module):
if self.offload_hessians:
device = get_execution_device(module)
self._hessians[module] = self._hessians[module].to(device=device)

yield

if self.offload_hessians:
if module in self._hessians: # may have been deleted in context
self._hessians[module] = self._hessians[module].to(device="cpu")

def on_finalize(self, state: State, **kwargs) -> bool:
self.remove_hooks()
self._hessians = dict()
self._num_samples = dict()
self._module_names = dict()
self._module_sparsities = dict()

return True
__all__ = ["SparseGPTModifier"]
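As a sanity check on the shim above, the old module path should still resolve to the same class while emitting a `DeprecationWarning`. A hedged sketch, run in a fresh interpreter since the module-level warning only fires the first time the module is imported:

```python
import warnings

with warnings.catch_warnings(record=True) as caught:
    warnings.simplefilter("always")
    # Importing the deprecated shim module triggers its module-level warning.
    from llmcompressor.modifiers.obcq.base import SparseGPTModifier as OldPath

from llmcompressor.modifiers.pruning.sparsegpt import SparseGPTModifier as NewPath

assert OldPath is NewPath                     # the shim re-exports the same class
assert any(issubclass(w.category, DeprecationWarning) for w in caught)
```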
1 change: 1 addition & 0 deletions src/llmcompressor/modifiers/pruning/__init__.py
@@ -2,4 +2,5 @@

from .constant import *
from .magnitude import *
from .sparsegpt import *
from .wanda import *
3 changes: 3 additions & 0 deletions src/llmcompressor/modifiers/pruning/sparsegpt/__init__.py
@@ -0,0 +1,3 @@
# flake8: noqa

from .base import *
163 changes: 163 additions & 0 deletions src/llmcompressor/modifiers/pruning/sparsegpt/base.py
@@ -0,0 +1,163 @@
import contextlib
from typing import Dict, Optional, Tuple

import torch
from compressed_tensors.utils import (
align_module_device,
get_execution_device,
update_offload_parameter,
)
from loguru import logger
from pydantic import PrivateAttr

from llmcompressor.core import State
from llmcompressor.modifiers import Modifier
from llmcompressor.modifiers.pruning.sparsegpt.sgpt_mixin import SparsityModifierMixin
from llmcompressor.modifiers.pruning.sparsegpt.sgpt_sparsify import (
accumulate_hessian,
make_empty_hessian,
sparsify_weight,
)
from llmcompressor.utils.metric_logging import CompressionLogger

__all__ = ["SparseGPTModifier"]


class SparseGPTModifier(SparsityModifierMixin, Modifier):
"""
Modifier for applying the one-shot SparseGPT algorithm to a model

| Sample yaml:
| test_stage:
| modifiers:
| SparseGPTModifier:
| sparsity: 0.5
| mask_structure: "2:4"
| dampening_frac: 0.001
| block_size: 128
| targets: ['Linear']
| ignore: ['re:.*lm_head']

Lifecycle:
- on_initialize
- register_hook(module, calibrate_module, "forward")
- run_sequential / run_layer_sequential / run_basic
- make_empty_hessian
- accumulate_hessian
- on_sequential_batch_end
- sparsify_weight
- on_finalize
- remove_hooks()

:param sparsity: Sparsity to compress model to
:param sparsity_profile: Can be set to 'owl' to use Outlier Weighed
Layerwise Sparsity (OWL); more information can be found
in the paper https://arxiv.org/pdf/2310.05175
:param mask_structure: String to define the structure of the mask to apply.
Must be of the form N:M where N, M are integers that define a custom block
shape. Defaults to 0:0 which represents an unstructured mask.
:param owl_m: Number of outliers to use for OWL
:param owl_lmbda: Lambda value to use for OWL
:param block_size: Used to determine number of columns to compress in one pass
:param dampening_frac: Amount of dampening to apply to H, as a fraction of the
diagonal norm
:param preserve_sparsity_mask: Whether or not to preserve the sparsity mask
when applying SparseGPT; this is useful when starting from a
previously pruned model. Defaults to False.
:param offload_hessians: Set to True for decreased memory usage but increased
runtime.
:param sequential_targets: list of layer names to compress during OBCQ, or '__ALL__'
to compress every layer in the model. Alias for `targets`
:param targets: list of layer names to compress during OBCQ, or '__ALL__'
to compress every layer in the model. Alias for `sequential_targets`
:param ignore: optional list of module class names or submodule names to not
quantize even if they match a target. Defaults to empty list.
"""

# modifier arguments
block_size: int = 128
dampening_frac: Optional[float] = 0.01
preserve_sparsity_mask: bool = False
offload_hessians: bool = False

# private variables
_num_samples: Dict[torch.nn.Module, int] = PrivateAttr(default_factory=dict)
_hessians: Dict[torch.nn.Module, torch.Tensor] = PrivateAttr(default_factory=dict)

def calibrate_module(
self,
module: torch.nn.Module,
args: Tuple[torch.Tensor, ...],
_output: torch.Tensor,
):
# Assume that the first argument is the input
inp = args[0]

# Initialize hessian if not present
if module not in self._num_samples:
device = get_execution_device(module)
self._hessians[module] = make_empty_hessian(module, device=device)
self._num_samples[module] = 0

# Accumulate hessian with input with optional offloading
with self._maybe_onload_hessian(module):
self._hessians[module], self._num_samples[module] = accumulate_hessian(
inp,
module,
self._hessians[module],
self._num_samples[module],
)

def on_sequential_batch_end(self):
"""
Sparsify modules
TODO: implement with event callback
"""
for module in list(self._num_samples.keys()):
name = self._module_names[module]
sparsity = self._module_sparsities[module]
num_samples = self._num_samples[module]

logger.info(f"Sparsifying {name} using {num_samples} samples")
with (
torch.no_grad(),
align_module_device(module),
CompressionLogger(module) as comp_logger,
):
loss, sparsified_weight = sparsify_weight(
module=module,
hessians_dict=self._hessians,
sparsity=sparsity,
prune_n=self._prune_n,
prune_m=self._prune_m,
block_size=self.block_size,
dampening_frac=self.dampening_frac,
preserve_sparsity_mask=self.preserve_sparsity_mask,
)
comp_logger.set_loss(loss)

update_offload_parameter(module, "weight", sparsified_weight)

# self._hessians[module] already deleted by sparsify_weight
del self._num_samples[module]

@contextlib.contextmanager
def _maybe_onload_hessian(self, module: torch.nn.Module):
if self.offload_hessians:
device = get_execution_device(module)
self._hessians[module] = self._hessians[module].to(device=device)

yield

if self.offload_hessians:
if module in self._hessians: # may have been deleted in context
self._hessians[module] = self._hessians[module].to(device="cpu")

def on_finalize(self, state: State, **kwargs) -> bool:
self.remove_hooks()
self._hessians = dict()
self._num_samples = dict()
self._module_names = dict()
self._module_sparsities = dict()

return True
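The `mask_structure` parameter documented above uses the N:M convention: within every contiguous group of M weights along the input dimension, only a fixed number stay non-zero (two of every four for "2:4"). A minimal magnitude-based illustration with a hypothetical `two_of_four_mask` helper; SparseGPT itself selects which weights to drop using the Hessian, not raw magnitude:

```python
import torch

def two_of_four_mask(weight: torch.Tensor) -> torch.Tensor:
    """Boolean mask keeping the 2 largest-magnitude weights in each group of 4."""
    out_features, in_features = weight.shape
    groups = weight.abs().reshape(out_features, in_features // 4, 4)
    keep = groups.topk(k=2, dim=-1).indices       # indices of the 2 kept weights per group
    mask = torch.zeros_like(groups, dtype=torch.bool)
    mask.scatter_(-1, keep, True)
    return mask.reshape(out_features, in_features)

w = torch.randn(8, 16)
mask = two_of_four_mask(w)
assert mask.sum(dim=-1).eq(8).all()               # exactly 50% of each row is kept
```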
2 changes: 1 addition & 1 deletion src/llmcompressor/modifiers/pruning/wanda/base.py
@@ -11,7 +11,7 @@

from llmcompressor.core import State
from llmcompressor.modifiers import Modifier
from llmcompressor.modifiers.obcq.sgpt_mixin import SparsityModifierMixin
from llmcompressor.modifiers.pruning.sparsegpt.sgpt_mixin import SparsityModifierMixin
from llmcompressor.modifiers.pruning.wanda.wanda_sparsify import (
accumulate_row_scalars,
make_empty_row_scalars,
2 changes: 1 addition & 1 deletion src/llmcompressor/modifiers/quantization/gptq/base.py
@@ -42,7 +42,7 @@ class GPTQModifier(Modifier, HooksMixin):

| Sample yaml:
| test_stage:
| obcq_modifiers:
| modifiers:
| GPTQModifier:
| block_size: 128
| dampening_frac: 0.001