diff --git a/src/llmcompressor/transformers/finetune/text_generation.py b/src/llmcompressor/transformers/finetune/text_generation.py
index 65747f71d..61e6441bb 100644
--- a/src/llmcompressor/transformers/finetune/text_generation.py
+++ b/src/llmcompressor/transformers/finetune/text_generation.py
@@ -30,6 +30,7 @@
     PreTrainedModel,
     set_seed,
 )
+from transformers.utils.quantization_config import CompressedTensorsConfig
 
 from llmcompressor.core import pre_initialize_structure, reset_session
 from llmcompressor.pytorch.model_load.helpers import (
@@ -52,7 +53,10 @@
 from llmcompressor.transformers.sparsification.sparse_model import (
     get_shared_processor_src,
 )
-from llmcompressor.transformers.utils.helpers import detect_last_checkpoint
+from llmcompressor.transformers.utils.helpers import (
+    detect_last_checkpoint,
+    is_model_ct_quantized_from_path,
+)
 from llmcompressor.typing import Processor
 from llmcompressor.utils.fsdp.helpers import is_fsdp_model
 
@@ -224,6 +228,13 @@ def initialize_model_from_path(
         "trust_remote_code": model_args.trust_remote_code_model,
     }
     # this calls from_pretrained under the hood so should be FSDP safe
+
+    # quantized (compressed-tensors) models must be decompressed to carry out
+    # oneshot/train/etc
+    if is_model_ct_quantized_from_path(model_path):
+        model_kwargs["quantization_config"] = CompressedTensorsConfig(
+            run_compressed=False
+        )
+
     model = AutoModelForCausalLM.from_pretrained(
         model_path,
         **model_kwargs,
diff --git a/src/llmcompressor/transformers/utils/helpers.py b/src/llmcompressor/transformers/utils/helpers.py
index 1263bb004..c1dcef119 100644
--- a/src/llmcompressor/transformers/utils/helpers.py
+++ b/src/llmcompressor/transformers/utils/helpers.py
@@ -4,9 +4,13 @@
 """
 
 import os
-from typing import TYPE_CHECKING, Optional
+from pathlib import Path
+from typing import TYPE_CHECKING, Optional, Union
 
+import requests
+from huggingface_hub import HUGGINGFACE_CO_URL_HOME, hf_hub_download
 from loguru import logger
+from transformers import AutoConfig
 from transformers.trainer_utils import get_last_checkpoint
 
 if TYPE_CHECKING:
@@ -15,6 +19,7 @@
 __all__ = [
     "RECIPE_FILE_NAME",
     "detect_last_checkpoint",
+    "is_model_ct_quantized_from_path",
 ]
 
 RECIPE_FILE_NAME = "recipe.yaml"
@@ -54,3 +59,101 @@ def detect_last_checkpoint(
         )
 
     return last_checkpoint
+
+
+def is_model_ct_quantized_from_path(path: str) -> bool:
+    """
+    Determine whether the model at the given path is quantized with
+    compressed-tensors, based on its config.
+
+    :param path: path to the model directory or HF model stub
+    :return: True if the config at the given path contains a
+        compressed-tensors quantization_config, False otherwise
+    """
+    config = AutoConfig.from_pretrained(path)
+    if config is not None:
+        if (
+            hasattr(config, "quantization_config")
+            and config.quantization_config["quant_method"] == "compressed-tensors"
+        ):
+            return True
+    return False
+
+
+def infer_recipe_from_model_path(model_path: Union[str, Path]) -> Optional[str]:
+    """
+    Infer the recipe from the model_path.
+
+    :param model_path: The path to the model to load. It can be one of the following:
+        - a path to the model directory
+        - a path to the model file
+        - a Hugging Face model ID
+    :return: The path to the recipe file if found, None otherwise.
+    """
+    model_path = model_path.as_posix() if isinstance(model_path, Path) else model_path
+
+    if os.path.isdir(model_path) or os.path.isfile(model_path):
+        # model_path is a local path to the model directory or file
+        model_path = (
+            os.path.dirname(model_path) if os.path.isfile(model_path) else model_path
+        )
+        recipe = os.path.join(model_path, RECIPE_FILE_NAME)
+
+        if os.path.isfile(recipe):
+            logger.info(f"Found recipe in the model_path: {recipe}")
+            return recipe
+        logger.debug(f"No recipe found in the model_path: {model_path}")
+        return None
+
+    # otherwise, treat the model path as a Hugging Face model ID
+    recipe = recipe_from_huggingface_model_id(hf_stub=model_path)
+
+    if recipe is None:
+        logger.info("Failed to infer the recipe from the model_path")
+
+    return recipe
+
+
+def recipe_from_huggingface_model_id(
+    hf_stub: str, recipe_file_name: str = RECIPE_FILE_NAME
+) -> Optional[str]:
+    """
+    Attempt to download the recipe file for the given Hugging Face model ID.
+
+    :param hf_stub: Assumed to be the Hugging Face model ID.
+    :param recipe_file_name: The name of the recipe file to download.
+        Defaults to RECIPE_FILE_NAME.
+    :return: The local path to the downloaded recipe file if found, None
+        otherwise (e.g. if hf_stub is not a valid Hugging Face model ID or
+        the repository does not contain a recipe file).
+    """
+    model_id_url = os.path.join(HUGGINGFACE_CO_URL_HOME, hf_stub)
+    request = requests.head(model_id_url)
+
+    if request.status_code != 200:
+        logger.debug(
+            (
+                "hf_stub is not a valid Hugging Face model ID. "
+                "Skipping recipe resolution."
+            )
+        )
+        return None
+
+    try:
+        logger.info(
+            "Attempting to download a recipe for "
+            f"{hf_stub} " f"from {HUGGINGFACE_CO_URL_HOME}"
+        )
+        recipe = hf_hub_download(repo_id=hf_stub, filename=recipe_file_name)
+        logger.info(f"Found recipe: {recipe_file_name} for model ID: {hf_stub}.")
+    except Exception as e:
+        logger.error(
+            (
+                f"Unable to find recipe {recipe_file_name} "
+                f"for model ID: {hf_stub}: {e}. "
+                "Skipping recipe resolution."
+            )
+        )
+        recipe = None
+
+    return recipe
diff --git a/tests/llmcompressor/transformers/obcq/test_consecutive_runs.py b/tests/llmcompressor/transformers/obcq/test_consecutive_runs.py
index 2f6c51ebb..f70e7769f 100644
--- a/tests/llmcompressor/transformers/obcq/test_consecutive_runs.py
+++ b/tests/llmcompressor/transformers/obcq/test_consecutive_runs.py
@@ -5,7 +5,10 @@
 import pytest
 import yaml
 from parameterized import parameterized_class
+from transformers import AutoModelForCausalLM
+from transformers.utils.quantization_config import CompressedTensorsConfig
 
+from llmcompressor.transformers.utils.helpers import infer_recipe_from_model_path
 from tests.testing_utils import parse_params, requires_gpu
 
 CONFIGS_DIRECTORY = "tests/llmcompressor/transformers/obcq/obcq_configs/consec_runs"
@@ -15,13 +18,15 @@
 
 
 class TestConsecutiveRuns(unittest.TestCase):
+    quantization_config = CompressedTensorsConfig(run_compressed=False)
+
     def _test_consecutive_runs(
         self, tolerance: float, num_calibration_samples: int = 16
     ):
         import math
 
         from llmcompressor.core import active_session
-        from llmcompressor.pytorch.model_load.helpers import get_session_model
+        from llmcompressor.pytorch.model_load.helpers import initialize_recipe
         from llmcompressor.pytorch.utils.helpers import tensor_sparsity
         from llmcompressor.transformers import oneshot
         from llmcompressor.utils.pytorch import qat_active
@@ -36,12 +41,18 @@ def _test_consecutive_runs(
             oneshot_device=self.device,
             clear_sparse_session=False,
         )
-        first_tiny_model = get_session_model()
+
+        first_model = AutoModelForCausalLM.from_pretrained(
+            self.output_first,
+            device_map="auto",
+            quantization_config=self.quantization_config,
+        )
+
         layer_0_sparse = tensor_sparsity(
-            first_tiny_model.model.layers[0].self_attn.k_proj.weight
+            first_model.model.layers[0].self_attn.k_proj.weight
         )
         assert math.isclose(layer_0_sparse.item(), 0.5, rel_tol=tolerance)
-        assert qat_active(first_tiny_model)
+        assert qat_active(first_model)
 
         session = active_session()
         session_recipe = session.lifecycle.recipe_container.compiled_recipe
@@ -49,6 +60,10 @@ def _test_consecutive_runs(
         self.assertEqual(len(stages), 1)
 
         session.reset()
+        recipe = infer_recipe_from_model_path(model_path=self.output_first)
+        if recipe:
+            initialize_recipe(model=first_model, recipe_path=recipe)
+
         # reload saved model and up sparsity to 0.7
         oneshot(
             model=self.output_first,
@@ -57,15 +72,19 @@ def _test_consecutive_runs(
             recipe=self.second_recipe,
             output_dir=self.output_second,
             oneshot_device=self.device,
-            clear_sparse_session=False,
         )
-        second_tiny_model = get_session_model()
+        second_model = AutoModelForCausalLM.from_pretrained(
+            self.output_second,
+            device_map="auto",
+            quantization_config=self.quantization_config,
+        )
+
         layer_0_sparse = tensor_sparsity(
-            second_tiny_model.model.layers[0].self_attn.k_proj.weight
+            second_model.model.layers[0].self_attn.k_proj.weight
        )
         assert math.isclose(layer_0_sparse.item(), 0.7, rel_tol=tolerance)
-        assert qat_active(second_tiny_model)
+        assert qat_active(second_model)
 
         session = active_session()
         session_recipe = session.lifecycle.recipe_container.compiled_recipe
@@ -119,7 +138,9 @@ def setUp(self):
         from transformers import AutoModelForCausalLM
 
         self.model = AutoModelForCausalLM.from_pretrained(
-            self.model, device_map=self.device
+            self.model,
+            device_map=self.device,
+            quantization_config=self.quantization_config,
         )
 
         self.output = "./oneshot_output"
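Taken together, the helpers added above let a caller detect a compressed-tensors checkpoint, reload it decompressed, and re-apply any recipe saved next to it. A minimal sketch of that flow, using only the names introduced in this diff (the `./oneshot_output` path is a hypothetical example, not part of the change):

```python
from transformers import AutoModelForCausalLM
from transformers.utils.quantization_config import CompressedTensorsConfig

from llmcompressor.pytorch.model_load.helpers import initialize_recipe
from llmcompressor.transformers.utils.helpers import (
    infer_recipe_from_model_path,
    is_model_ct_quantized_from_path,
)

model_path = "./oneshot_output"  # placeholder: a saved oneshot output or HF stub

# decompress compressed-tensors checkpoints so later oneshot/train runs can modify them
model_kwargs = {}
if is_model_ct_quantized_from_path(model_path):
    model_kwargs["quantization_config"] = CompressedTensorsConfig(run_compressed=False)

model = AutoModelForCausalLM.from_pretrained(
    model_path, device_map="auto", **model_kwargs
)

# re-apply the recipe saved alongside the checkpoint, if one exists
recipe = infer_recipe_from_model_path(model_path=model_path)
if recipe:
    initialize_recipe(model=model, recipe_path=recipe)
```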