[Test Patch] Remove redundant code for "Fix/update test_run_compressed" (#1072)

SUMMARY: Removed breakpoints and addressed review comments for #970.

TEST PLAN: Ran pytest for the two test files (#970).

ORIGINAL PR DESCRIPTION:
~~Contingent on merge of huggingface/transformers#34719~~ (merged, but not yet released)

SUMMARY: Update the run_compressed tests from decompression tests to true run_compressed tests, i.e. verify that models loaded with run_compressed=True and run_compressed=False generate the same output. Add decompression tests that copy attributes from the model at the source directory path onto the target model.

TEST PLAN: Ran the tests against transformers main. tests/llmcompressor/transformers/compression/test_decompress.py and tests/llmcompressor/transformers/compression/test_run_compressed.py must pass.
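For orientation, here is a minimal, hedged sketch of the comparison these tests perform (not the committed test code itself): load a compressed checkpoint once through HFQuantizer decompression (CompressedTensorsConfig(run_compressed=False)) and once by manually decompressing into a dense "skeleton" model with ModelCompressor, then check that both paths generate identical text. The two model stubs are placeholders taken from the configs in this commit.

# Hedged sketch of the decompression comparison; stubs below are placeholders.
from compressed_tensors import QUANTIZATION_CONFIG_NAME
from compressed_tensors.compressors import ModelCompressor
from compressed_tensors.quantization import QuantizationStatus
from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer
from transformers.utils.quantization_config import CompressedTensorsConfig

COMPRESSED_STUB = "nm-testing/tinyllama-fp8-dynamic-compressed"  # placeholder
SKELETON_STUB = "TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T"  # placeholder

tokenizer = AutoTokenizer.from_pretrained(COMPRESSED_STUB)

# Path 1: let HFQuantizer decompress the checkpoint at load time
hf_decompressed = AutoModelForCausalLM.from_pretrained(
    COMPRESSED_STUB,
    torch_dtype="auto",
    device_map="auto",
    quantization_config=CompressedTensorsConfig(run_compressed=False),
)

# Path 2: manually decompress the checkpoint into a dense skeleton model
dense = AutoModelForCausalLM.from_pretrained(
    SKELETON_STUB,
    torch_dtype=hf_decompressed.dtype,
    device_map=hf_decompressed.device,
)
config = AutoConfig.from_pretrained(COMPRESSED_STUB)
compressor = ModelCompressor.from_compression_config(
    getattr(config, QUANTIZATION_CONFIG_NAME, None)
)
compressor.quantization_config.quantization_status = QuantizationStatus.FROZEN
compressor.decompress(model_path=COMPRESSED_STUB, model=dense)

# Both decompression paths should produce the same generations
prompt = tokenizer("What is the capital of France?", return_tensors="pt").to(dense.device)
dense_out = tokenizer.batch_decode(dense.generate(**prompt, max_length=50))
hf_out = tokenizer.batch_decode(hf_decompressed.generate(**prompt, max_length=50))
assert dense_out == hf_out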
Showing 11 changed files with 256 additions and 53 deletions.
4 changes: 4 additions & 0 deletions
tests/llmcompressor/transformers/compression/decompression_configs/fp8_dynamic.yaml
@@ -0,0 +1,4 @@
cadence: "commit"
test_type: "regression"
compressed_model_stub: "nm-testing/tinyllama-fp8-dynamic-compressed"
skeleton_model_stub: "TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T"
4 changes: 4 additions & 0 deletions
tests/llmcompressor/transformers/compression/decompression_configs/w4a16.yaml
@@ -0,0 +1,4 @@
cadence: "commit"
test_type: "regression"
compressed_model_stub: "nm-testing/tinyllama-w4a16-compressed"
skeleton_model_stub: "TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T"
4 changes: 4 additions & 0 deletions
tests/llmcompressor/transformers/compression/decompression_configs/w8a16_dense.yaml
@@ -0,0 +1,4 @@
cadence: "commit"
test_type: "regression"
compressed_model_stub: "nm-testing/tinyllama-w8a16-dense"
skeleton_model_stub: "TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T"
4 changes: 4 additions & 0 deletions
tests/llmcompressor/transformers/compression/decompression_configs/w8a8.yaml
@@ -0,0 +1,4 @@
cadence: "commit"
test_type: "regression"
compressed_model_stub: "nm-testing/tinyllama-w8a8-compressed"
skeleton_model_stub: "TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T"
4 changes: 2 additions & 2 deletions
tests/llmcompressor/transformers/compression/run_compressed_configs/fp8_dynamic.yaml
@@ -1,4 +1,4 @@
cadence: "commit"
test_type: "regression"
-model_stub: "nm-testing/tinyllama-fp8-dynamic-compressed"
-empty_model: "TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T"
+compressed_model_stub: nm-testing/TinyLlama-1.1B-Chat-v1.0-FP8-Dynamic-compressed
+uncompressed_model_stub: nm-testing/TinyLlama-1.1B-Chat-v1.0-FP8-Dynamic-uncompressed
4 changes: 2 additions & 2 deletions
tests/llmcompressor/transformers/compression/run_compressed_configs/w4a16.yaml
@@ -1,4 +1,4 @@
cadence: "commit"
test_type: "regression"
-model_stub: "nm-testing/tinyllama-w4a16-compressed"
-empty_model: "TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T"
+compressed_model_stub: nm-testing/TinyLlama-1.1B-Chat-v1.0-W4A16-G128-compressed
+uncompressed_model_stub: nm-testing/TinyLlama-1.1B-Chat-v1.0-W4A16-G128-uncompressed
4 changes: 4 additions & 0 deletions
tests/llmcompressor/transformers/compression/run_compressed_configs/w8a16.yaml
@@ -0,0 +1,4 @@
cadence: "commit"
test_type: "regression"
compressed_model_stub: nm-testing/TinyLlama-1.1B-Chat-v1.0-W8A16-G128-compressed
uncompressed_model_stub: nm-testing/TinyLlama-1.1B-Chat-v1.0-W8A16-G128-uncompressed
4 changes: 0 additions & 4 deletions
tests/llmcompressor/transformers/compression/run_compressed_configs/w8a16_dense.yaml
This file was deleted.
4 changes: 2 additions & 2 deletions
tests/llmcompressor/transformers/compression/run_compressed_configs/w8a8.yaml
@@ -1,4 +1,4 @@
cadence: "commit"
test_type: "regression"
-model_stub: "nm-testing/tinyllama-w8a8-compressed"
-empty_model: "TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T"
+compressed_model_stub: nm-testing/TinyLlama-1.1B-Chat-v1.0-W8A8-Dynamic-Per-Token-compressed
+uncompressed_model_stub: nm-testing/TinyLlama-1.1B-Chat-v1.0-W8A8-Dynamic-Per-Token-uncompressed
133 changes: 133 additions & 0 deletions
tests/llmcompressor/transformers/compression/test_decompress.py
@@ -0,0 +1,133 @@
import copy
import shutil
import tempfile
import unittest

from compressed_tensors import QUANTIZATION_CONFIG_NAME
from compressed_tensors.compressors import ModelCompressor
from compressed_tensors.quantization import QuantizationStatus
from parameterized import parameterized_class
from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer
from transformers.utils.quantization_config import CompressedTensorsConfig

from tests.testing_utils import parse_params, requires_gpu

CONFIG_DIR = "tests/llmcompressor/transformers/compression/decompression_configs"


@requires_gpu
@parameterized_class(parse_params(CONFIG_DIR))
class TestDecompression(unittest.TestCase):
    """
    Check that HFQuantizer decompression is working as expected.
    Manually decompress a compressed model and compare the generations

    Decompression:
    Given a skeleton model and path to the optimized model,
    write the optimized model's safetensors to the skeleton model and decompress
    Ex. write weight_scale to the skeleton model and then convert from fp4 to fp16
    """

    compressed_model_stub = None
    skeleton_model_stub = None

    SAMPLE_INPUTS = [
        "I love 4-bit quantization because",
        "What is the capital of France?",
        "def fibonacci(n):",
    ]

    @classmethod
    def setUpClass(self):
        self.test_dir = tempfile.mkdtemp()
        self.tokenizer = AutoTokenizer.from_pretrained(self.compressed_model_stub)

        # Decompress using HFQuantizer from AutoModelForCausalLM
        self.decompressed_model_hf_quantizer = AutoModelForCausalLM.from_pretrained(
            self.compressed_model_stub,
            torch_dtype="auto",
            device_map="auto",
            quantization_config=CompressedTensorsConfig(run_compressed=False),
        )

        # Manually decompress this model
        self.dense_model = AutoModelForCausalLM.from_pretrained(
            self.skeleton_model_stub,
            torch_dtype=self.decompressed_model_hf_quantizer.dtype,
            device_map=self.decompressed_model_hf_quantizer.device,
        )

        # decompression from HFQuantizer should populate weight_scale
        assert hasattr(
            self.decompressed_model_hf_quantizer.model.layers[0].self_attn.q_proj,
            "weight_scale",
        )

        # dense model should not have weight_scale populated
        assert not hasattr(
            self.dense_model.model.layers[0].self_attn.q_proj, "weight_scale"
        )

        config = AutoConfig.from_pretrained(self.compressed_model_stub)

        compression_config = getattr(config, QUANTIZATION_CONFIG_NAME, None)
        self.compressor = ModelCompressor.from_compression_config(compression_config)
        self.compressor.quantization_config.quantization_status = (
            QuantizationStatus.FROZEN
        )

        # use the model_path to load the decompressed weights into dense_model
        dense_model = copy.deepcopy(self.dense_model)

        # overwrite the weights of the dense model
        self.compressor.decompress(
            model_path=self.compressed_model_stub,
            model=self.dense_model,
        )

        # self.dense_model should be decompressed
        assert dense_model is not self.dense_model

        self.decompressed_model_manual = self.dense_model

        assert hasattr(
            self.decompressed_model_manual.model.layers[0].self_attn.q_proj,
            "weight_scale",
        )

    def test_hf_quantizer_decompress_match_manual_decompress(self):
        manual_device = self.decompressed_model_manual.device
        decompressed_model_hf_quantizer = self.decompressed_model_hf_quantizer.device

        self.decompressed_model_manual = self.decompressed_model_manual.to(
            manual_device
        )
        self.decompressed_model_hf_quantizer = self.decompressed_model_hf_quantizer.to(
            decompressed_model_hf_quantizer
        )

        for input in self.SAMPLE_INPUTS:
            inputs = self.tokenizer(input, return_tensors="pt", padding=True).to(
                self.decompressed_model_manual.device
            )
            inputs = inputs.to(self.decompressed_model_manual.device)

            decompressed_model_manual_output = self.tokenizer.batch_decode(
                self.decompressed_model_manual.generate(**inputs, max_length=50)
            )

            decompressed_model_hf_quantizer_out = self.tokenizer.batch_decode(
                self.decompressed_model_hf_quantizer.generate(**inputs, max_length=50)
            )

            assert (
                decompressed_model_hf_quantizer_out == decompressed_model_manual_output
            )

    @classmethod
    def tearDownClass(self):
        shutil.rmtree(self.test_dir)
        del self.dense_model
        del self.decompressed_model_hf_quantizer
        del self.decompressed_model_manual
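The YAML configs added above feed this test class through parse_params and parameterized_class. As a rough, hedged illustration (parse_params is the repo's own helper in tests.testing_utils; the assumption here is that it yields one dict per YAML file), the wiring is roughly equivalent to:

# Hedged sketch: inline equivalent of one decompression_configs/*.yaml entry
# driving the test class via parameterized_class.
import unittest

from parameterized import parameterized_class

CONFIGS = [
    {
        "compressed_model_stub": "nm-testing/tinyllama-fp8-dynamic-compressed",
        "skeleton_model_stub": "TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T",
    },
]


@parameterized_class(CONFIGS)
class TestDecompressionWiring(unittest.TestCase):
    # Overwritten per generated class by parameterized_class
    compressed_model_stub = None
    skeleton_model_stub = None

    def test_stubs_are_populated(self):
        # Each generated class sees the attributes from its config dict
        self.assertIsNotNone(self.compressed_model_stub)
        self.assertIsNotNone(self.skeleton_model_stub)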
140 changes: 97 additions & 43 deletions
tests/llmcompressor/transformers/compression/test_run_compressed.py
@@ -1,79 +1,133 @@
import copy
import shutil
import tempfile
import unittest

import torch
from compressed_tensors import QUANTIZATION_CONFIG_NAME
from compressed_tensors.compressors import ModelCompressor
from compressed_tensors.quantization import QuantizationStatus
from parameterized import parameterized_class
from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer
from transformers.utils.quantization_config import CompressedTensorsConfig

from tests.testing_utils import parse_params, requires_gpu

CONFIG_DIR = "tests/llmcompressor/transformers/compression/run_compressed_configs"
CONFIG_DIR = "tests/llmcompressor/transformers/compression/decompression_configs"


@requires_gpu
@parameterized_class(parse_params(CONFIG_DIR))
class TestQuantizationMatches(unittest.TestCase):
    model_stub = None
    empty_model = None
class TestDecompression(unittest.TestCase):
    """
    Check that HFQuantizer decompression is working as expected.
    Manually decompress a compressed model and compare the generations

    Decompression:
    Given a skeleton model and path to the optimized model,
    write the optimized model's safetensors to the skeleton model and decompress
    Ex. write weight_scale to the skeleton model and then convert from fp4 to fp16
    """

    compressed_model_stub = None
    skeleton_model_stub = None

    SAMPLE_INPUTS = [
        "I love 4-bit quantization because",
        "What is the capital of France?",
        "def fibonacci(n):",
    ]

    @classmethod
    def setUpClass(cls):
        cls.test_dir = tempfile.mkdtemp()
    def setUpClass(self):
        self.test_dir = tempfile.mkdtemp()
        self.tokenizer = AutoTokenizer.from_pretrained(self.compressed_model_stub)

        # TODO: Give option on HFQuantizer to run run_compressed True/False
        # currently hardcoded to True
        cls.compressed_model = AutoModelForCausalLM.from_pretrained(
            cls.model_stub,
        # Decompress using HFQuantizer from AutoModelForCausalLM
        self.decompressed_model_hf_quantizer = AutoModelForCausalLM.from_pretrained(
            self.compressed_model_stub,
            torch_dtype="auto",
            device_map="auto",
            # run_compressed=True, # TODO: Give option on HFQuantizer
            quantization_config=CompressedTensorsConfig(run_compressed=False),
        )
        # TODO: Use ModelCompressor until decompression is supported through
        # HFQuant/run_compressed can be turned off.
        cls.uncompressed_model = AutoModelForCausalLM.from_pretrained(
            cls.empty_model,
            torch_dtype=cls.compressed_model.dtype,
            device_map=cls.compressed_model.device,

        # Manually decompress this model
        self.dense_model = AutoModelForCausalLM.from_pretrained(
            self.skeleton_model_stub,
            torch_dtype=self.decompressed_model_hf_quantizer.dtype,
            device_map=self.decompressed_model_hf_quantizer.device,
        )

        # decompression from HFQuantizer should populate weight_scale
        assert hasattr(
            self.decompressed_model_hf_quantizer.model.layers[0].self_attn.q_proj,
            "weight_scale",
        )

        # dense model should not have weight_scale populated
        assert not hasattr(
            self.dense_model.model.layers[0].self_attn.q_proj, "weight_scale"
        )
        config = AutoConfig.from_pretrained(cls.model_stub)

        config = AutoConfig.from_pretrained(self.compressed_model_stub)

        compression_config = getattr(config, QUANTIZATION_CONFIG_NAME, None)
        cls.compressor = ModelCompressor.from_compression_config(compression_config)
        cls.compressor.quantization_config.quantization_status = (
        self.compressor = ModelCompressor.from_compression_config(compression_config)
        self.compressor.quantization_config.quantization_status = (
            QuantizationStatus.FROZEN
        )
        cls.compressor.decompress(
            model_path=cls.model_stub, model=cls.uncompressed_model

        # use the model_path to load the decompressed weights into dense_model
        dense_model = copy.deepcopy(self.dense_model)

        # overwrite the weights of the dense model
        self.compressor.decompress(
            model_path=self.compressed_model_stub,
            model=self.dense_model,
        )

        cls.tokenizer = AutoTokenizer.from_pretrained(cls.model_stub)
        # self.dense_model should be decompressed
        assert dense_model is not self.dense_model

    def test_compressed_matches_uncompressed(self):
        SAMPLE_INPUT = [
            "I love 4-bit quantization because",
            "What is the capital of France?",
            "def fibonacci(n):",
        ]
        self.decompressed_model_manual = self.dense_model

        inputs = self.tokenizer(SAMPLE_INPUT, return_tensors="pt", padding=True).to(
            self.compressed_model.device
        assert hasattr(
            self.decompressed_model_manual.model.layers[0].self_attn.q_proj,
            "weight_scale",
        )
        compressed_output = self.tokenizer.batch_decode(
            self.compressed_model.generate(**inputs, max_length=50)

    def test_hf_quantizer_decompress_match_manual_decompress(self):
        manual_device = self.decompressed_model_manual.device
        decompressed_model_hf_quantizer = self.decompressed_model_hf_quantizer.device

        self.decompressed_model_manual = self.decompressed_model_manual.to(
            manual_device
        )
        uncompressed_output = self.tokenizer.batch_decode(
            self.uncompressed_model.generate(**inputs, max_length=50)
        self.decompressed_model_hf_quantizer = self.decompressed_model_hf_quantizer.to(
            decompressed_model_hf_quantizer
        )

        for idx in range(len(SAMPLE_INPUT)):
            assert compressed_output[idx] == uncompressed_output[idx]
        for input in self.SAMPLE_INPUTS:
            inputs = self.tokenizer(input, return_tensors="pt", padding=True).to(
                self.decompressed_model_manual.device
            )
            inputs = inputs.to(self.decompressed_model_manual.device)

            decompressed_model_manual_output = self.tokenizer.batch_decode(
                self.decompressed_model_manual.generate(**inputs, max_length=50)
            )

            decompressed_model_hf_quantizer_out = self.tokenizer.batch_decode(
                self.decompressed_model_hf_quantizer.generate(**inputs, max_length=50)
            )

            assert (
                decompressed_model_hf_quantizer_out == decompressed_model_manual_output
            )

    @classmethod
    def tearDownClass(cls):
        shutil.rmtree(cls.test_dir)
        del cls.compressed_model
        del cls.uncompressed_model
        torch.cuda.empty_cache()
    def tearDownClass(self):
        shutil.rmtree(self.test_dir)
        del self.dense_model
        del self.decompressed_model_hf_quantizer
        del self.decompressed_model_manual
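For the test plan above, a minimal local run might look like the following sketch; it assumes a GPU is available (both classes are gated by @requires_gpu) and that the nm-testing model stubs can be pulled from the Hugging Face Hub.

# Hedged sketch: run only the two test files touched by this commit.
import sys

import pytest

if __name__ == "__main__":
    sys.exit(
        pytest.main(
            [
                "tests/llmcompressor/transformers/compression/test_decompress.py",
                "tests/llmcompressor/transformers/compression/test_run_compressed.py",
                "-v",
            ]
        )
    )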