[Test Patch] Remove redundant code for "Fix/update test_run_compressed" (#1072)

SUMMARY: Removed breakpoints and addressed review comments for #970.

TEST PLAN: Ran pytest for the two test files (#970).

ORIGINAL PR DESCRIPTION:
~~Contingent on merge of huggingface/transformers#34719~~ (merged, but not yet released)

SUMMARY: Update the run_compressed tests from decompression tests to true run_compressed tests, i.e. verify that models loaded with run_compressed=True and run_compressed=False generate the same output. Add decompression tests that copy attributes from the model at the source directory path onto the target model.

TEST PLAN: Ran the tests against transformers main. tests/llmcompressor/transformers/compression/test_decompress.py and tests/llmcompressor/transformers/compression/test_run_compressed.py must pass.
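For orientation, here is a minimal, hedged sketch of the comparison these tests perform (not the committed test code itself): load a compressed checkpoint once through HFQuantizer decompression (CompressedTensorsConfig(run_compressed=False)) and once by manually decompressing into a dense "skeleton" model with ModelCompressor, then check that both paths generate identical text. The two model stubs are placeholders taken from the configs in this commit.

# Hedged sketch of the decompression comparison; stubs below are placeholders.
from compressed_tensors import QUANTIZATION_CONFIG_NAME
from compressed_tensors.compressors import ModelCompressor
from compressed_tensors.quantization import QuantizationStatus
from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer
from transformers.utils.quantization_config import CompressedTensorsConfig

COMPRESSED_STUB = "nm-testing/tinyllama-fp8-dynamic-compressed"  # placeholder
SKELETON_STUB = "TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T"  # placeholder

tokenizer = AutoTokenizer.from_pretrained(COMPRESSED_STUB)

# Path 1: let HFQuantizer decompress the checkpoint at load time
hf_decompressed = AutoModelForCausalLM.from_pretrained(
    COMPRESSED_STUB,
    torch_dtype="auto",
    device_map="auto",
    quantization_config=CompressedTensorsConfig(run_compressed=False),
)

# Path 2: manually decompress the checkpoint into a dense skeleton model
dense = AutoModelForCausalLM.from_pretrained(
    SKELETON_STUB,
    torch_dtype=hf_decompressed.dtype,
    device_map=hf_decompressed.device,
)
config = AutoConfig.from_pretrained(COMPRESSED_STUB)
compressor = ModelCompressor.from_compression_config(
    getattr(config, QUANTIZATION_CONFIG_NAME, None)
)
compressor.quantization_config.quantization_status = QuantizationStatus.FROZEN
compressor.decompress(model_path=COMPRESSED_STUB, model=dense)

# Both decompression paths should produce the same generations
prompt = tokenizer("What is the capital of France?", return_tensors="pt").to(dense.device)
dense_out = tokenizer.batch_decode(dense.generate(**prompt, max_length=50))
hf_out = tokenizer.batch_decode(hf_decompressed.generate(**prompt, max_length=50))
assert dense_out == hf_out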
Showing 11 changed files with 256 additions and 53 deletions.
4 changes: 4 additions & 0 deletions
tests/llmcompressor/transformers/compression/decompression_configs/fp8_dynamic.yaml
@@ -0,0 +1,4 @@
cadence: "commit"
test_type: "regression"
compressed_model_stub: "nm-testing/tinyllama-fp8-dynamic-compressed"
skeleton_model_stub: "TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T"
4 changes: 4 additions & 0 deletions
tests/llmcompressor/transformers/compression/decompression_configs/w4a16.yaml
@@ -0,0 +1,4 @@
cadence: "commit"
test_type: "regression"
compressed_model_stub: "nm-testing/tinyllama-w4a16-compressed"
skeleton_model_stub: "TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T"
4 changes: 4 additions & 0 deletions
tests/llmcompressor/transformers/compression/decompression_configs/w8a16_dense.yaml
@@ -0,0 +1,4 @@
cadence: "commit"
test_type: "regression"
compressed_model_stub: "nm-testing/tinyllama-w8a16-dense"
skeleton_model_stub: "TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T"
4 changes: 4 additions & 0 deletions
tests/llmcompressor/transformers/compression/decompression_configs/w8a8.yaml
@@ -0,0 +1,4 @@
cadence: "commit"
test_type: "regression"
compressed_model_stub: "nm-testing/tinyllama-w8a8-compressed"
skeleton_model_stub: "TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T"
4 changes: 2 additions & 2 deletions
tests/llmcompressor/transformers/compression/run_compressed_configs/fp8_dynamic.yaml
@@ -1,4 +1,4 @@
cadence: "commit"
test_type: "regression"
-model_stub: "nm-testing/tinyllama-fp8-dynamic-compressed"
-empty_model: "TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T"
+compressed_model_stub: nm-testing/TinyLlama-1.1B-Chat-v1.0-FP8-Dynamic-compressed
+uncompressed_model_stub: nm-testing/TinyLlama-1.1B-Chat-v1.0-FP8-Dynamic-uncompressed
4 changes: 2 additions & 2 deletions
tests/llmcompressor/transformers/compression/run_compressed_configs/w4a16.yaml
@@ -1,4 +1,4 @@
cadence: "commit"
test_type: "regression"
-model_stub: "nm-testing/tinyllama-w4a16-compressed"
-empty_model: "TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T"
+compressed_model_stub: nm-testing/TinyLlama-1.1B-Chat-v1.0-W4A16-G128-compressed
+uncompressed_model_stub: nm-testing/TinyLlama-1.1B-Chat-v1.0-W4A16-G128-uncompressed
4 changes: 4 additions & 0 deletions
tests/llmcompressor/transformers/compression/run_compressed_configs/w8a16.yaml
@@ -0,0 +1,4 @@
cadence: "commit"
test_type: "regression"
compressed_model_stub: nm-testing/TinyLlama-1.1B-Chat-v1.0-W8A16-G128-compressed
uncompressed_model_stub: nm-testing/TinyLlama-1.1B-Chat-v1.0-W8A16-G128-uncompressed
4 changes: 0 additions & 4 deletions
tests/llmcompressor/transformers/compression/run_compressed_configs/w8a16_dense.yaml
This file was deleted.
4 changes: 2 additions & 2 deletions
tests/llmcompressor/transformers/compression/run_compressed_configs/w8a8.yaml
@@ -1,4 +1,4 @@
cadence: "commit"
test_type: "regression"
-model_stub: "nm-testing/tinyllama-w8a8-compressed"
-empty_model: "TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T"
+compressed_model_stub: nm-testing/TinyLlama-1.1B-Chat-v1.0-W8A8-Dynamic-Per-Token-compressed
+uncompressed_model_stub: nm-testing/TinyLlama-1.1B-Chat-v1.0-W8A8-Dynamic-Per-Token-uncompressed
133 changes: 133 additions & 0 deletions
tests/llmcompressor/transformers/compression/test_decompress.py
@@ -0,0 +1,133 @@
import copy
import shutil
import tempfile
import unittest

from compressed_tensors import QUANTIZATION_CONFIG_NAME
from compressed_tensors.compressors import ModelCompressor
from compressed_tensors.quantization import QuantizationStatus
from parameterized import parameterized_class
from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer
from transformers.utils.quantization_config import CompressedTensorsConfig

from tests.testing_utils import parse_params, requires_gpu

CONFIG_DIR = "tests/llmcompressor/transformers/compression/decompression_configs"


@requires_gpu
@parameterized_class(parse_params(CONFIG_DIR))
class TestDecompression(unittest.TestCase):
    """
    Check that HFQuantizer decompression is working as expected.
    Manually decompress a compressed model and compare the generations

    Decompression:
    Given a skeleton model and path to the optimized model,
    write the optimized model's safetensors to the skeleton model and decompress
    Ex. write weight_scale to the skeleton model and then convert from fp4 to fp16
    """

    compressed_model_stub = None
    skeleton_model_stub = None

    SAMPLE_INPUTS = [
        "I love 4-bit quantization because",
        "What is the capital of France?",
        "def fibonacci(n):",
    ]

    @classmethod
    def setUpClass(self):
        self.test_dir = tempfile.mkdtemp()
        self.tokenizer = AutoTokenizer.from_pretrained(self.compressed_model_stub)

        # Decompress using HFQuantizer from AutoModelForCausalLM
        self.decompressed_model_hf_quantizer = AutoModelForCausalLM.from_pretrained(
            self.compressed_model_stub,
            torch_dtype="auto",
            device_map="auto",
            quantization_config=CompressedTensorsConfig(run_compressed=False),
        )

        # Manually decompress this model
        self.dense_model = AutoModelForCausalLM.from_pretrained(
            self.skeleton_model_stub,
            torch_dtype=self.decompressed_model_hf_quantizer.dtype,
            device_map=self.decompressed_model_hf_quantizer.device,
        )

        # decompression from HFQuantizer should populate weight_scale
        assert hasattr(
            self.decompressed_model_hf_quantizer.model.layers[0].self_attn.q_proj,
            "weight_scale",
        )

        # dense model should not have weight_scale populated
        assert not hasattr(
            self.dense_model.model.layers[0].self_attn.q_proj, "weight_scale"
        )

        config = AutoConfig.from_pretrained(self.compressed_model_stub)

        compression_config = getattr(config, QUANTIZATION_CONFIG_NAME, None)
        self.compressor = ModelCompressor.from_compression_config(compression_config)
        self.compressor.quantization_config.quantization_status = (
            QuantizationStatus.FROZEN
        )

        # use the model_path to load the decompressed weights into dense_model
        dense_model = copy.deepcopy(self.dense_model)

        # overwrite the weights of the dense model
        self.compressor.decompress(
            model_path=self.compressed_model_stub,
            model=self.dense_model,
        )

        # self.dense_model should be decompressed
        assert dense_model is not self.dense_model

        self.decompressed_model_manual = self.dense_model

        assert hasattr(
            self.decompressed_model_manual.model.layers[0].self_attn.q_proj,
            "weight_scale",
        )

    def test_hf_quantizer_decompress_match_manual_decompress(self):
        manual_device = self.decompressed_model_manual.device
        decompressed_model_hf_quantizer = self.decompressed_model_hf_quantizer.device

        self.decompressed_model_manual = self.decompressed_model_manual.to(
            manual_device
        )
        self.decompressed_model_hf_quantizer = self.decompressed_model_hf_quantizer.to(
            decompressed_model_hf_quantizer
        )

        for input in self.SAMPLE_INPUTS:
            inputs = self.tokenizer(input, return_tensors="pt", padding=True).to(
                self.decompressed_model_manual.device
            )
            inputs = inputs.to(self.decompressed_model_manual.device)

            decompressed_model_manual_output = self.tokenizer.batch_decode(
                self.decompressed_model_manual.generate(**inputs, max_length=50)
            )

            decompressed_model_hf_quantizer_out = self.tokenizer.batch_decode(
                self.decompressed_model_hf_quantizer.generate(**inputs, max_length=50)
            )

            assert (
                decompressed_model_hf_quantizer_out == decompressed_model_manual_output
            )

    @classmethod
    def tearDownClass(self):
        shutil.rmtree(self.test_dir)
        del self.dense_model
        del self.decompressed_model_hf_quantizer
        del self.decompressed_model_manual
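The YAML configs added above feed this test class through parse_params and parameterized_class. As a rough, hedged illustration (parse_params is the repo's own helper in tests.testing_utils; the assumption here is that it yields one dict per YAML file), the wiring is roughly equivalent to:

# Hedged sketch: inline equivalent of one decompression_configs/*.yaml entry
# driving the test class via parameterized_class.
import unittest

from parameterized import parameterized_class

CONFIGS = [
    {
        "compressed_model_stub": "nm-testing/tinyllama-fp8-dynamic-compressed",
        "skeleton_model_stub": "TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T",
    },
]


@parameterized_class(CONFIGS)
class TestDecompressionWiring(unittest.TestCase):
    # Overwritten per generated class by parameterized_class
    compressed_model_stub = None
    skeleton_model_stub = None

    def test_stubs_are_populated(self):
        # Each generated class sees the attributes from its config dict
        self.assertIsNotNone(self.compressed_model_stub)
        self.assertIsNotNone(self.skeleton_model_stub)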
140 changes: 97 additions & 43 deletions
tests/llmcompressor/transformers/compression/test_run_compressed.py
@@ -1,79 +1,133 @@
import copy
import shutil
import tempfile
import unittest

import torch
from compressed_tensors import QUANTIZATION_CONFIG_NAME
from compressed_tensors.compressors import ModelCompressor
from compressed_tensors.quantization import QuantizationStatus
from parameterized import parameterized_class
from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer
from transformers.utils.quantization_config import CompressedTensorsConfig

from tests.testing_utils import parse_params, requires_gpu

CONFIG_DIR = "tests/llmcompressor/transformers/compression/run_compressed_configs"
CONFIG_DIR = "tests/llmcompressor/transformers/compression/decompression_configs"


@requires_gpu
@parameterized_class(parse_params(CONFIG_DIR))
class TestQuantizationMatches(unittest.TestCase):
    model_stub = None
    empty_model = None
class TestDecompression(unittest.TestCase):
    """
    Check that HFQuantizer decompression is working as expected.
    Manually decompress a compressed model and compare the generations

    Decompression:
    Given a skeleton model and path to the optimized model,
    write the optimized model's safetensors to the skeleton model and decompress
    Ex. write weight_scale to the skeleton model and then convert from fp4 to fp16
    """

    compressed_model_stub = None
    skeleton_model_stub = None

    SAMPLE_INPUTS = [
        "I love 4-bit quantization because",
        "What is the capital of France?",
        "def fibonacci(n):",
    ]

    @classmethod
    def setUpClass(cls):
        cls.test_dir = tempfile.mkdtemp()
    def setUpClass(self):
        self.test_dir = tempfile.mkdtemp()
        self.tokenizer = AutoTokenizer.from_pretrained(self.compressed_model_stub)

        # TODO: Give option on HFQuantizer to run run_compressed True/False
        # currently hardcoded to True
        cls.compressed_model = AutoModelForCausalLM.from_pretrained(
            cls.model_stub,
        # Decompress using HFQuantizer from AutoModelForCausalLM
        self.decompressed_model_hf_quantizer = AutoModelForCausalLM.from_pretrained(
            self.compressed_model_stub,
            torch_dtype="auto",
            device_map="auto",
            # run_compressed=True, # TODO: Give option on HFQuantizer
            quantization_config=CompressedTensorsConfig(run_compressed=False),
        )
        # TODO: Use ModelCompressor until decompression is supported through
        # HFQuant/run_compressed can be turned off.
        cls.uncompressed_model = AutoModelForCausalLM.from_pretrained(
            cls.empty_model,
            torch_dtype=cls.compressed_model.dtype,
            device_map=cls.compressed_model.device,

        # Manually decompress this model
        self.dense_model = AutoModelForCausalLM.from_pretrained(
            self.skeleton_model_stub,
            torch_dtype=self.decompressed_model_hf_quantizer.dtype,
            device_map=self.decompressed_model_hf_quantizer.device,
        )

        # decompression from HFQuantizer should populate weight_scale
        assert hasattr(
            self.decompressed_model_hf_quantizer.model.layers[0].self_attn.q_proj,
            "weight_scale",
        )

        # dense model should not have weight_scale populated
        assert not hasattr(
            self.dense_model.model.layers[0].self_attn.q_proj, "weight_scale"
        )
        config = AutoConfig.from_pretrained(cls.model_stub)

        config = AutoConfig.from_pretrained(self.compressed_model_stub)

        compression_config = getattr(config, QUANTIZATION_CONFIG_NAME, None)
        cls.compressor = ModelCompressor.from_compression_config(compression_config)
        cls.compressor.quantization_config.quantization_status = (
        self.compressor = ModelCompressor.from_compression_config(compression_config)
        self.compressor.quantization_config.quantization_status = (
            QuantizationStatus.FROZEN
        )
        cls.compressor.decompress(
            model_path=cls.model_stub, model=cls.uncompressed_model

        # use the model_path to load the decompressed weights into dense_model
        dense_model = copy.deepcopy(self.dense_model)

        # overwrite the weights of the dense model
        self.compressor.decompress(
            model_path=self.compressed_model_stub,
            model=self.dense_model,
        )

        cls.tokenizer = AutoTokenizer.from_pretrained(cls.model_stub)
        # self.dense_model should be decompressed
        assert dense_model is not self.dense_model

    def test_compressed_matches_uncompressed(self):
        SAMPLE_INPUT = [
            "I love 4-bit quantization because",
            "What is the capital of France?",
            "def fibonacci(n):",
        ]
        self.decompressed_model_manual = self.dense_model

        inputs = self.tokenizer(SAMPLE_INPUT, return_tensors="pt", padding=True).to(
            self.compressed_model.device
        assert hasattr(
            self.decompressed_model_manual.model.layers[0].self_attn.q_proj,
            "weight_scale",
        )
        compressed_output = self.tokenizer.batch_decode(
            self.compressed_model.generate(**inputs, max_length=50)

    def test_hf_quantizer_decompress_match_manual_decompress(self):
        manual_device = self.decompressed_model_manual.device
        decompressed_model_hf_quantizer = self.decompressed_model_hf_quantizer.device

        self.decompressed_model_manual = self.decompressed_model_manual.to(
            manual_device
        )
        uncompressed_output = self.tokenizer.batch_decode(
            self.uncompressed_model.generate(**inputs, max_length=50)
        self.decompressed_model_hf_quantizer = self.decompressed_model_hf_quantizer.to(
            decompressed_model_hf_quantizer
        )

        for idx in range(len(SAMPLE_INPUT)):
            assert compressed_output[idx] == uncompressed_output[idx]
        for input in self.SAMPLE_INPUTS:
            inputs = self.tokenizer(input, return_tensors="pt", padding=True).to(
                self.decompressed_model_manual.device
            )
            inputs = inputs.to(self.decompressed_model_manual.device)

            decompressed_model_manual_output = self.tokenizer.batch_decode(
                self.decompressed_model_manual.generate(**inputs, max_length=50)
            )

            decompressed_model_hf_quantizer_out = self.tokenizer.batch_decode(
                self.decompressed_model_hf_quantizer.generate(**inputs, max_length=50)
            )

            assert (
                decompressed_model_hf_quantizer_out == decompressed_model_manual_output
            )

    @classmethod
    def tearDownClass(cls):
        shutil.rmtree(cls.test_dir)
        del cls.compressed_model
        del cls.uncompressed_model
        torch.cuda.empty_cache()
    def tearDownClass(self):
        shutil.rmtree(self.test_dir)
        del self.dense_model
        del self.decompressed_model_hf_quantizer
        del self.decompressed_model_manual
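For the test plan above, a minimal local run might look like the following sketch; it assumes a GPU is available (both classes are gated by @requires_gpu) and that the nm-testing model stubs can be pulled from the Hugging Face Hub.

# Hedged sketch: run only the two test files touched by this commit.
import sys

import pytest

if __name__ == "__main__":
    sys.exit(
        pytest.main(
            [
                "tests/llmcompressor/transformers/compression/test_decompress.py",
                "tests/llmcompressor/transformers/compression/test_run_compressed.py",
                "-v",
            ]
        )
    )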