From ce34562b6c98da29a1b518467c2e7432ffa664ad Mon Sep 17 00:00:00 2001 From: eaidova Date: Tue, 28 Jan 2025 17:49:27 +0400 Subject: [PATCH] align rt_info int8 models compressed by default and via config --- optimum/commands/export/openvino.py | 2 +- optimum/exporters/openvino/__main__.py | 6 ++++++ 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/optimum/commands/export/openvino.py b/optimum/commands/export/openvino.py index 8d272a693..a099d8a0c 100644 --- a/optimum/commands/export/openvino.py +++ b/optimum/commands/export/openvino.py @@ -345,7 +345,7 @@ def run(self): is_int8 = self.args.weight_format == "int8" quantization_config = { "bits": 8 if is_int8 else 4, - "ratio": 1 if is_int8 else (self.args.ratio or _DEFAULT_4BIT_CONFIG["ratio"]), + "ratio": 1.0 if is_int8 else (self.args.ratio or _DEFAULT_4BIT_CONFIG["ratio"]), "sym": self.args.sym or False, "group_size": -1 if is_int8 else self.args.group_size, "all_layers": None if is_int8 else self.args.all_layers, diff --git a/optimum/exporters/openvino/__main__.py b/optimum/exporters/openvino/__main__.py index 4047ab64a..73b16bbc7 100644 --- a/optimum/exporters/openvino/__main__.py +++ b/optimum/exporters/openvino/__main__.py @@ -488,6 +488,12 @@ class StoreAttr(object): from optimum.intel.openvino.quantization import _weight_only_quantization _weight_only_quantization(submodel, quantization_config) + # KV cache compression is disabled when no quantization config is provided. + # To keep the result of automatic int8 compression aligned with compression via an explicit config, drop the KV_CACHE_PRECISION runtime option. + if submodel.has_rt_info(["runtime_options", "KV_CACHE_PRECISION"]): + prev_rt_info = submodel.get_rt_info("runtime_options").value + prev_rt_info.pop("KV_CACHE_PRECISION") + submodel.set_rt_info(prev_rt_info, "runtime_options") compressed_submodel_path = submodel_path.parent / f"{submodel_path.stem}_compressed.xml" save_model(submodel, compressed_submodel_path, compress_to_fp16=False) del submodel