
Commit

[Test Patch] Remove redundant code for "Fix/update test_run_compressed" (#1072)

SUMMARY:
Removed breakpoints and addressed comments for #970

TEST PLAN:
Ran pytest for the two test files


ORIGINAL PR DESCRIPTION (#970):

~~Contingent on merge of huggingface/transformers#34719~~
^ has been merged but not yet released


SUMMARY:
Update the run_compressed tests from decompression tests to run_compressed
tests: verify that models loaded with run_compressed=True and with
run_compressed=False generate the same output.
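
For context, a minimal sketch of that run_compressed=True vs. run_compressed=False comparison, assuming the run_compressed flag from huggingface/transformers#34719 is present in the installed transformers; the model stub is just one of the stubs used in the test configs:

```python
from transformers import AutoModelForCausalLM, AutoTokenizer
from transformers.utils.quantization_config import CompressedTensorsConfig

stub = "nm-testing/tinyllama-w4a16-compressed"  # one of the config stubs
tokenizer = AutoTokenizer.from_pretrained(stub)

# Keep the weights compressed and run compressed inference directly.
compressed = AutoModelForCausalLM.from_pretrained(
    stub,
    torch_dtype="auto",
    device_map="auto",
    quantization_config=CompressedTensorsConfig(run_compressed=True),
)

# Decompress the weights at load time and run the dense model.
decompressed = AutoModelForCausalLM.from_pretrained(
    stub,
    torch_dtype="auto",
    device_map="auto",
    quantization_config=CompressedTensorsConfig(run_compressed=False),
)

prompt = "I love 4-bit quantization because"
inputs_c = tokenizer(prompt, return_tensors="pt").to(compressed.device)
inputs_d = tokenizer(prompt, return_tensors="pt").to(decompressed.device)

out_compressed = tokenizer.batch_decode(compressed.generate(**inputs_c, max_length=50))
out_decompressed = tokenizer.batch_decode(decompressed.generate(**inputs_d, max_length=50))

# Both loading paths should produce identical generations.
assert out_compressed == out_decompressed
```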

Add decompress tests that copy attributes from the model at the source
directory path onto the target (skeleton) model.
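
The manual decompression path mirrors the setUpClass in the new test_decompress.py; roughly (a condensed sketch, reusing one compressed/skeleton stub pair from the added decompression configs):

```python
from compressed_tensors import QUANTIZATION_CONFIG_NAME
from compressed_tensors.compressors import ModelCompressor
from compressed_tensors.quantization import QuantizationStatus
from transformers import AutoConfig, AutoModelForCausalLM

compressed_stub = "nm-testing/tinyllama-w4a16-compressed"  # source dir path / hub stub
skeleton_stub = "TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T"

# Dense "skeleton" model that will receive the decompressed weights.
dense_model = AutoModelForCausalLM.from_pretrained(
    skeleton_stub, torch_dtype="auto", device_map="auto"
)

# Build a ModelCompressor from the compression config stored with the compressed model.
config = AutoConfig.from_pretrained(compressed_stub)
compression_config = getattr(config, QUANTIZATION_CONFIG_NAME, None)
compressor = ModelCompressor.from_compression_config(compression_config)
compressor.quantization_config.quantization_status = QuantizationStatus.FROZEN

# Copy quantization attrs (e.g. weight_scale) from the source path's safetensors
# onto the target model and decompress its weights in place.
compressor.decompress(model_path=compressed_stub, model=dense_model)
```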

TEST PLAN:
Ran the tests against transformers main. The following must pass:
tests/llmcompressor/transformers/compression/test_decompress.py and
tests/llmcompressor/transformers/compression/test_run_compressed.py
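
For reference, a minimal sketch of invoking those two files from Python (both suites are gated by @requires_gpu, so a GPU is assumed):

```python
import pytest

# Run the two updated compression test files.
pytest.main([
    "tests/llmcompressor/transformers/compression/test_decompress.py",
    "tests/llmcompressor/transformers/compression/test_run_compressed.py",
])
```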
horheynm authored Jan 15, 2025
1 parent 0755398 commit 806da33
Showing 11 changed files with 256 additions and 53 deletions.
@@ -0,0 +1,4 @@
cadence: "commit"
test_type: "regression"
compressed_model_stub: "nm-testing/tinyllama-fp8-dynamic-compressed"
skeleton_model_stub: "TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T"
@@ -0,0 +1,4 @@
cadence: "commit"
test_type: "regression"
compressed_model_stub: "nm-testing/tinyllama-w4a16-compressed"
skeleton_model_stub: "TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T"
@@ -0,0 +1,4 @@
cadence: "commit"
test_type: "regression"
compressed_model_stub: "nm-testing/tinyllama-w8a16-dense"
skeleton_model_stub: "TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T"
@@ -0,0 +1,4 @@
cadence: "commit"
test_type: "regression"
compressed_model_stub: "nm-testing/tinyllama-w8a8-compressed"
skeleton_model_stub: "TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T"
@@ -1,4 +1,4 @@
cadence: "commit"
test_type: "regression"
model_stub: "nm-testing/tinyllama-fp8-dynamic-compressed"
empty_model: "TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T"
compressed_model_stub: nm-testing/TinyLlama-1.1B-Chat-v1.0-FP8-Dynamic-compressed
uncompressed_model_stub: nm-testing/TinyLlama-1.1B-Chat-v1.0-FP8-Dynamic-uncompressed
@@ -1,4 +1,4 @@
cadence: "commit"
test_type: "regression"
model_stub: "nm-testing/tinyllama-w4a16-compressed"
empty_model: "TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T"
compressed_model_stub: nm-testing/TinyLlama-1.1B-Chat-v1.0-W4A16-G128-compressed
uncompressed_model_stub: nm-testing/TinyLlama-1.1B-Chat-v1.0-W4A16-G128-uncompressed
@@ -0,0 +1,4 @@
cadence: "commit"
test_type: "regression"
compressed_model_stub: nm-testing/TinyLlama-1.1B-Chat-v1.0-W8A16-G128-compressed
uncompressed_model_stub: nm-testing/TinyLlama-1.1B-Chat-v1.0-W8A16-G128-uncompressed

This file was deleted.

@@ -1,4 +1,4 @@
cadence: "commit"
test_type: "regression"
model_stub: "nm-testing/tinyllama-w8a8-compressed"
empty_model: "TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T"
compressed_model_stub: nm-testing/TinyLlama-1.1B-Chat-v1.0-W8A8-Dynamic-Per-Token-compressed
uncompressed_model_stub: nm-testing/TinyLlama-1.1B-Chat-v1.0-W8A8-Dynamic-Per-Token-uncompressed
133 changes: 133 additions & 0 deletions tests/llmcompressor/transformers/compression/test_decompress.py
@@ -0,0 +1,133 @@
import copy
import shutil
import tempfile
import unittest

from compressed_tensors import QUANTIZATION_CONFIG_NAME
from compressed_tensors.compressors import ModelCompressor
from compressed_tensors.quantization import QuantizationStatus
from parameterized import parameterized_class
from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer
from transformers.utils.quantization_config import CompressedTensorsConfig

from tests.testing_utils import parse_params, requires_gpu

CONFIG_DIR = "tests/llmcompressor/transformers/compression/decompression_configs"


@requires_gpu
@parameterized_class(parse_params(CONFIG_DIR))
class TestDecompression(unittest.TestCase):
"""
Check that HFQuantizer decompression is working as expected.
Manually decompress a compressed model and compare the generations
Decompression:
Given a skeleton model and path to the optimized model,
write the optimized model's safetensors to the skeleton model and decompress
Ex. write weight_scale to the skeleton model and then convert from fp4 to fp16
"""

compressed_model_stub = None
skeleton_model_stub = None

SAMPLE_INPUTS = [
"I love 4-bit quantization because",
"What is the capital of France?",
"def fibonacci(n):",
]

@classmethod
def setUpClass(self):
self.test_dir = tempfile.mkdtemp()
self.tokenizer = AutoTokenizer.from_pretrained(self.compressed_model_stub)

# Decompress using HFQuantizer from AutoModelForCausalLM
self.decompressed_model_hf_quantizer = AutoModelForCausalLM.from_pretrained(
self.compressed_model_stub,
torch_dtype="auto",
device_map="auto",
quantization_config=CompressedTensorsConfig(run_compressed=False),
)

# Manually decompress this model
self.dense_model = AutoModelForCausalLM.from_pretrained(
self.skeleton_model_stub,
torch_dtype=self.decompressed_model_hf_quantizer.dtype,
device_map=self.decompressed_model_hf_quantizer.device,
)

# decompression from HFQuantizer should populate weight_scale
assert hasattr(
self.decompressed_model_hf_quantizer.model.layers[0].self_attn.q_proj,
"weight_scale",
)

# dense model should not have weight_scale populated
assert not hasattr(
self.dense_model.model.layers[0].self_attn.q_proj, "weight_scale"
)

config = AutoConfig.from_pretrained(self.compressed_model_stub)

compression_config = getattr(config, QUANTIZATION_CONFIG_NAME, None)
self.compressor = ModelCompressor.from_compression_config(compression_config)
self.compressor.quantization_config.quantization_status = (
QuantizationStatus.FROZEN
)

# use the model_path to load the decompressed weights into dense_model
dense_model = copy.deepcopy(self.dense_model)

# overwrite the weights of the dense model
self.compressor.decompress(
model_path=self.compressed_model_stub,
model=self.dense_model,
)

# self.dense_model should be decompressed
assert dense_model is not self.dense_model

self.decompressed_model_manual = self.dense_model

assert hasattr(
self.decompressed_model_manual.model.layers[0].self_attn.q_proj,
"weight_scale",
)

def test_hf_quantizer_decompress_match_manual_decompress(self):
manual_device = self.decompressed_model_manual.device
decompressed_model_hf_quantizer = self.decompressed_model_hf_quantizer.device

self.decompressed_model_manual = self.decompressed_model_manual.to(
manual_device
)
self.decompressed_model_hf_quantizer = self.decompressed_model_hf_quantizer.to(
decompressed_model_hf_quantizer
)

for input in self.SAMPLE_INPUTS:
inputs = self.tokenizer(input, return_tensors="pt", padding=True).to(
self.decompressed_model_manual.device
)
inputs = inputs.to(self.decompressed_model_manual.device)

decompressed_model_manual_output = self.tokenizer.batch_decode(
self.decompressed_model_manual.generate(**inputs, max_length=50)
)

decompressed_model_hf_quantizer_out = self.tokenizer.batch_decode(
self.decompressed_model_hf_quantizer.generate(**inputs, max_length=50)
)

assert (
decompressed_model_hf_quantizer_out == decompressed_model_manual_output
)

@classmethod
def tearDownClass(self):
shutil.rmtree(self.test_dir)
del self.dense_model
del self.decompressed_model_hf_quantizer
del self.decompressed_model_manual
140 changes: 97 additions & 43 deletions tests/llmcompressor/transformers/compression/test_run_compressed.py
@@ -1,79 +1,133 @@
import copy
import shutil
import tempfile
import unittest

import torch
from compressed_tensors import QUANTIZATION_CONFIG_NAME
from compressed_tensors.compressors import ModelCompressor
from compressed_tensors.quantization import QuantizationStatus
from parameterized import parameterized_class
from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer
from transformers.utils.quantization_config import CompressedTensorsConfig

from tests.testing_utils import parse_params, requires_gpu

CONFIG_DIR = "tests/llmcompressor/transformers/compression/run_compressed_configs"
CONFIG_DIR = "tests/llmcompressor/transformers/compression/decompression_configs"


@requires_gpu
@parameterized_class(parse_params(CONFIG_DIR))
class TestQuantizationMatches(unittest.TestCase):
model_stub = None
empty_model = None
class TestDecompression(unittest.TestCase):
"""
Check that HFQuantizer decompression is working as expected.
Manually decompress a compressed model and compare the generations
Decompression:
Given a skeleton model and path to the optimized model,
write the optimized model's safetensors to the skeleton model and decompress
Ex. write weight_scale to the skeleton model and then convert from fp4 to fp16
"""

compressed_model_stub = None
skeleton_model_stub = None

SAMPLE_INPUTS = [
"I love 4-bit quantization because",
"What is the capital of France?",
"def fibonacci(n):",
]

@classmethod
def setUpClass(cls):
cls.test_dir = tempfile.mkdtemp()
def setUpClass(self):
self.test_dir = tempfile.mkdtemp()
self.tokenizer = AutoTokenizer.from_pretrained(self.compressed_model_stub)

# TODO: Give option on HFQuantizer to run run_compressed True/False
# currently hardcoded to True
cls.compressed_model = AutoModelForCausalLM.from_pretrained(
cls.model_stub,
# Decompress using HFQuantizer from AutoModelForCausalLM
self.decompressed_model_hf_quantizer = AutoModelForCausalLM.from_pretrained(
self.compressed_model_stub,
torch_dtype="auto",
device_map="auto",
# run_compressed=True, # TODO: Give option on HFQuantizer
quantization_config=CompressedTensorsConfig(run_compressed=False),
)
# TODO: Use ModelCompressor until decompression is supported through
# HFQuant/run_compressed can be turned off.
cls.uncompressed_model = AutoModelForCausalLM.from_pretrained(
cls.empty_model,
torch_dtype=cls.compressed_model.dtype,
device_map=cls.compressed_model.device,

# Manually decompress this model
self.dense_model = AutoModelForCausalLM.from_pretrained(
self.skeleton_model_stub,
torch_dtype=self.decompressed_model_hf_quantizer.dtype,
device_map=self.decompressed_model_hf_quantizer.device,
)

# decompression from HFQuantizer should populate weight_scale
assert hasattr(
self.decompressed_model_hf_quantizer.model.layers[0].self_attn.q_proj,
"weight_scale",
)

# dense model should not have weight_scale populated
assert not hasattr(
self.dense_model.model.layers[0].self_attn.q_proj, "weight_scale"
)
config = AutoConfig.from_pretrained(cls.model_stub)

config = AutoConfig.from_pretrained(self.compressed_model_stub)

compression_config = getattr(config, QUANTIZATION_CONFIG_NAME, None)
cls.compressor = ModelCompressor.from_compression_config(compression_config)
cls.compressor.quantization_config.quantization_status = (
self.compressor = ModelCompressor.from_compression_config(compression_config)
self.compressor.quantization_config.quantization_status = (
QuantizationStatus.FROZEN
)
cls.compressor.decompress(
model_path=cls.model_stub, model=cls.uncompressed_model

# use the model_path to load the decompressed weights into dense_model
dense_model = copy.deepcopy(self.dense_model)

# overwrite the weights of the dense model
self.compressor.decompress(
model_path=self.compressed_model_stub,
model=self.dense_model,
)

cls.tokenizer = AutoTokenizer.from_pretrained(cls.model_stub)
# self.dense_model should be decompressed
assert dense_model is not self.dense_model

def test_compressed_matches_uncompressed(self):
SAMPLE_INPUT = [
"I love 4-bit quantization because",
"What is the capital of France?",
"def fibonacci(n):",
]
self.decompressed_model_manual = self.dense_model

inputs = self.tokenizer(SAMPLE_INPUT, return_tensors="pt", padding=True).to(
self.compressed_model.device
assert hasattr(
self.decompressed_model_manual.model.layers[0].self_attn.q_proj,
"weight_scale",
)
compressed_output = self.tokenizer.batch_decode(
self.compressed_model.generate(**inputs, max_length=50)

def test_hf_quantizer_decompress_match_manual_decompress(self):
manual_device = self.decompressed_model_manual.device
decompressed_model_hf_quantizer = self.decompressed_model_hf_quantizer.device

self.decompressed_model_manual = self.decompressed_model_manual.to(
manual_device
)
uncompressed_output = self.tokenizer.batch_decode(
self.uncompressed_model.generate(**inputs, max_length=50)
self.decompressed_model_hf_quantizer = self.decompressed_model_hf_quantizer.to(
decompressed_model_hf_quantizer
)

for idx in range(len(SAMPLE_INPUT)):
assert compressed_output[idx] == uncompressed_output[idx]
for input in self.SAMPLE_INPUTS:
inputs = self.tokenizer(input, return_tensors="pt", padding=True).to(
self.decompressed_model_manual.device
)
inputs = inputs.to(self.decompressed_model_manual.device)

decompressed_model_manual_output = self.tokenizer.batch_decode(
self.decompressed_model_manual.generate(**inputs, max_length=50)
)

decompressed_model_hf_quantizer_out = self.tokenizer.batch_decode(
self.decompressed_model_hf_quantizer.generate(**inputs, max_length=50)
)

assert (
decompressed_model_hf_quantizer_out == decompressed_model_manual_output
)

@classmethod
def tearDownClass(cls):
shutil.rmtree(cls.test_dir)
del cls.compressed_model
del cls.uncompressed_model
torch.cuda.empty_cache()
def tearDownClass(self):
shutil.rmtree(self.test_dir)
del self.dense_model
del self.decompressed_model_hf_quantizer
del self.decompressed_model_manual
