[GPU] Applying activations scaling only to layers whose 1st input precision is the same as infer precision (openvinotoolkit#28850)

### Details:
- The input precisions of some layers can differ from the inference precision.
  - `IncreasePositionIdsPrecision` updates some `MatMul` layers to be executed in f32 precision.
  - Other examples: hybrid-quantized models, precision-sensitive layers, and so on.
- Activations scaling should not be applied in these cases:
  - In the f32 case, activations scaling is not needed.
  - In the int8 case, activations scaling cannot be applied.
- So this PR updates activations scaling to be applied only to layers whose 1st input precision is the same as the inference precision (see the sketch below).
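For illustration, a minimal sketch of the gating predicate, written as a free function instead of the `pass_config` callback the diff actually registers (the function name is hypothetical):

```cpp
#include <memory>

#include "openvino/core/node.hpp"
#include "openvino/core/type/element_type.hpp"

// Mirrors the PR's callback: returns true when activations scaling should
// skip a layer, i.e. when the layer's 1st input precision differs from the
// inference precision (e.g. a MatMul upgraded to f32, or an int8 input,
// while inference runs in f16).
bool skip_activations_scaling(const std::shared_ptr<const ov::Node>& node,
                              ov::element::Type infer_precision) {
    return node->input(0).get_element_type() != infer_precision;
}
```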

### Tickets:
 - 161681
e-ddykim authored Feb 13, 2025
1 parent 09ed323 commit 6eed0fb
Showing 2 changed files with 14 additions and 6 deletions.
@@ -17,6 +17,7 @@
#include "openvino/op/gelu.hpp"
#include "openvino/op/reshape.hpp"
#include "openvino/op/squeeze.hpp"
#include "openvino/op/transpose.hpp"
#include "openvino/op/unsqueeze.hpp"
#include "openvino/pass/pattern/op/or.hpp"
#include "openvino/pass/pattern/op/wrap_type.hpp"
@@ -29,7 +30,9 @@ IncreasePositionIdsPrecision::IncreasePositionIdsPrecision() {
     using ov::pass::pattern::op::Or;
 
     auto gemm_or_matmul = wrap_type<ov::intel_gpu::op::Gemm, ov::op::v0::MatMul>();
-    auto concat = wrap_type<ov::op::v0::Concat>({gemm_or_matmul, gemm_or_matmul});
+    auto transpose_m = wrap_type<ov::op::v1::Transpose>({gemm_or_matmul, any_input()});
+    auto concat_input = std::make_shared<Or>(OutputVector{gemm_or_matmul, transpose_m});
+    auto concat = wrap_type<ov::op::v0::Concat>({concat_input, concat_input});
     auto sin = wrap_type<ov::op::v0::Sin>({concat});
     auto cos = wrap_type<ov::op::v0::Cos>({concat});
 
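For context on the hunk above: the old pattern required both `Concat` inputs to come directly from the `Gemm`/`MatMul`, while the new `Or` input also accepts a `Transpose` in between. A sketch (op choices follow the pattern; shapes, names, and the transpose order are made up) of a toy RoPE-style subgraph the extended pattern can now match:

```cpp
#include <memory>

#include "openvino/op/concat.hpp"
#include "openvino/op/constant.hpp"
#include "openvino/op/cos.hpp"
#include "openvino/op/matmul.hpp"
#include "openvino/op/sin.hpp"
#include "openvino/op/transpose.hpp"

// MatMul -> Transpose -> Concat -> Sin/Cos: the Transpose between the MatMul
// and the Concat is what the old {gemm_or_matmul, gemm_or_matmul} pattern missed.
ov::OutputVector build_rope_freqs(const ov::Output<ov::Node>& position_ids,
                                  const ov::Output<ov::Node>& inv_freq) {
    auto matmul = std::make_shared<ov::op::v0::MatMul>(position_ids, inv_freq);
    auto order = ov::op::v0::Constant::create(ov::element::i64, ov::Shape{3}, {0, 2, 1});
    auto transposed = std::make_shared<ov::op::v1::Transpose>(matmul, order);
    auto emb = std::make_shared<ov::op::v0::Concat>(ov::OutputVector{transposed, transposed}, -1);
    return {std::make_shared<ov::op::v0::Sin>(emb), std::make_shared<ov::op::v0::Cos>(emb)};
}
```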
15 changes: 10 additions & 5 deletions src/plugins/intel_gpu/src/plugin/transformations_pipeline.cpp
@@ -975,9 +975,13 @@ void TransformationsPipeline::apply(std::shared_ptr<ov::Model> func) {
     pass_config->disable<ov::pass::RoPEFusionIOSlicing>();
     pass_config->disable<ov::pass::RoPEShareCosSin>();
 
+    manager.register_pass<ov::intel_gpu::IncreasePositionIdsPrecision>();
+    // This Validate is needed for proper data type propagation after applying IncreasePositionIdsPrecision pass
+    manager.register_pass<ov::pass::Validate>();
+
     float activations_scale_factor = config.get_property(ov::hint::activations_scale_factor);
 
-    if (activations_scale_factor > 0.f && infer_precision == ov::element::f16 && !enableInt8) {
+    if (activations_scale_factor > 0.f && infer_precision == ov::element::f16) {
         using namespace ov::pass::low_precision;
 
         auto supportedPrecisions = std::vector<PrecisionsRestriction>({});
@@ -1005,6 +1009,11 @@ void TransformationsPipeline::apply(std::shared_ptr<ov::Model> func) {
         manager.register_pass<ov::pass::activations_scaling::ScaleDownSingleLayer>(activations_scale_factor, infer_precision);
         manager.register_pass<ov::pass::SharedOpOptimization>();
 
+        pass_config->set_callback<ov::pass::activations_scaling::ScaleDownSingleLayer>(
+            [&infer_precision](const std::shared_ptr<const ov::Node> &node) -> bool {
+                return (node->input(0).get_element_type() != infer_precision);
+            });
+
         // Move down scalar-multiply layers as much as possible
         auto params = LayerTransformation::Params(false, infer_precision, {infer_precision}, true, true);
         auto lpt_pass = manager.register_pass<LowPrecision>(supportedPrecisions, perTensorQuantization, params);
@@ -1094,10 +1103,6 @@ void TransformationsPipeline::apply(std::shared_ptr<ov::Model> func) {

     manager.register_pass<ov::intel_gpu::OptimizeSubsequentReshapes>();
 
-    manager.register_pass<ov::intel_gpu::IncreasePositionIdsPrecision>();
-    // This Validate is needed for proper data type propagation after applying IncreasePositionIdsPrecision pass
-    manager.register_pass<ov::pass::Validate>();
-
     manager.register_pass<ov::intel_gpu::SinkReshape>();
 
     if (device_info.supports_immad) {
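As a usage note, the skip logic above relies on OpenVINO's generic `PassConfig` callback mechanism, so the pass is gated per node without modifying the pass itself. A condensed sketch of the wiring (the include path for the activations-scaling passes is an assumption; everything else follows the diff):

```cpp
#include "openvino/pass/manager.hpp"

// Assumed header location for ov::pass::activations_scaling; verify against the repo.
#include "transformations/common_optimizations/activations_scaling.hpp"

// Register the scaling pass, then gate it so it skips any layer whose
// 1st input precision is not the inference precision.
void register_activations_scaling(ov::pass::Manager& manager,
                                  float activations_scale_factor,
                                  ov::element::Type infer_precision) {
    using ov::pass::activations_scaling::ScaleDownSingleLayer;
    manager.register_pass<ScaleDownSingleLayer>(activations_scale_factor, infer_precision);
    manager.get_pass_config()->set_callback<ScaleDownSingleLayer>(
        [infer_precision](const std::shared_ptr<const ov::Node>& node) -> bool {
            // Returning true asks the pass to skip this node.
            return node->input(0).get_element_type() != infer_precision;
        });
}
```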
