[IE CLDNN] Mixed precision scale support #1848

Merged
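
Summary: adds an optional output data type to the clDNN scale primitive so ScaleShift can emit a different precision than its inputs. The CLDNN plugin now forwards the IR layer's output precision into the primitive, calc_output_layout honors the forced type, the fused-scale JIT codegen accumulates in the widest floating-point type involved rather than the output type, and new fusing plus standalone scale tests cover fp32/fp16 mixes.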
3 changes: 2 additions & 1 deletion inference-engine/src/cldnn_engine/cldnn_program.cpp
@@ -1354,7 +1354,8 @@ void Program::CreateScaleShiftPrimitive(cldnn::topology& topology, InferenceEngi
scaleShiftLayerName,
inputPrimitives[0],
scalePrimID,
biasPrimID);
biasPrimID,
cldnn::optional_data_type{DataTypeFromPrecision(layer->outData[0]->getPrecision())});

topology.add(scaleShiftPrim);
AddPrimitiveToProfiler(scaleShiftLayerName, layer);
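On the plugin side this is pure plumbing: the ScaleShift layer's output precision from the IR is translated into a clDNN data type and handed to the primitive. A minimal sketch of the mapping this call relies on; the real DataTypeFromPrecision helper already exists in the plugin, so the body, header paths, and error handling below are assumptions:

#include <stdexcept>
#include <ie_precision.hpp>   // InferenceEngine::Precision (path assumed)
#include <api/layout.hpp>     // cldnn::data_types (path assumed)

// Assumed behavior of the plugin's DataTypeFromPrecision helper:
cldnn::data_types data_type_from_precision(const InferenceEngine::Precision& p) {
    if (p == InferenceEngine::Precision::FP32) return cldnn::data_types::f32;
    if (p == InferenceEngine::Precision::FP16) return cldnn::data_types::f16;
    if (p == InferenceEngine::Precision::I8)   return cldnn::data_types::i8;
    if (p == InferenceEngine::Precision::U8)   return cldnn::data_types::u8;
    throw std::runtime_error("unsupported precision");  // fallback assumed
}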
8 changes: 5 additions & 3 deletions inference-engine/thirdparty/clDNN/api/scale.hpp
@@ -1,5 +1,5 @@
/*
// Copyright (c) 2016 Intel Corporation
// Copyright (c) 2016-2020 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
@@ -51,8 +51,9 @@ struct scale : public primitive_base<scale> {
const primitive_id& input,
const primitive_id& scale_input, // should be bfyx or yxfb, where each dimension can be 1, if all dimensions
// are 1 then this is scalar
const optional_data_type& output_dt = {},
const padding& output_padding = padding())
: primitive_base(id, {input, scale_input}, output_padding), bias("") {}
: primitive_base(id, {input, scale_input}, output_padding, output_dt), bias("") {}

/// @brief Constructs scale primitive with optional adding bias.
/// @param id This primitive id.
@@ -64,8 +65,9 @@ struct scale : public primitive_base<scale> {
const primitive_id& scale_input, // should be bfyx or yxfb, where each dimension can be 1, if all dimensions
// are 1 then this is scalar
const primitive_id& bias, // should be same size as scale_input
const optional_data_type& output_dt = {},
const padding& output_padding = padding())
: primitive_base(id, {input, scale_input}, output_padding), bias(bias) {}
: primitive_base(id, {input, scale_input}, output_padding, output_dt), bias(bias) {}

/// @brief Primitive id containing bias data.
primitive_id bias;
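The new output_dt parameter slots in before output_padding in both overloads and defaults to an empty optional, so the output type is only forced when a caller asks for it. A hedged usage sketch (primitive ids are made up; compare the tests further down):

#include <api/scale.hpp>
#include <api/topology.hpp>

void add_scales(cldnn::topology& topology) {
    using namespace cldnn;
    // No bias, output forced to fp16 regardless of the input precisions:
    topology.add(scale("scale_fp16", "conv", "scale_data",
                       optional_data_type{data_types::f16}));
    // With bias, output type left to the default inference rules:
    topology.add(scale("scale_bias", "conv", "scale_data", "bias_data"));
}

One side effect: callers that passed output_padding positionally must now supply an explicit {} for output_dt first, which is the small churn visible in the padding test at the bottom of this diff.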
@@ -1146,11 +1146,43 @@ JitConstants FusedOpsCodeGenerator::MakeOpJitConstants(const FusedOpsConfigurati

switch (desc.GetType()) {
case KernelType::SCALE: {
op_decls += "\\\n\t" + GetOutputType(vec_size) + " " + out_var + " = " +
in_vars_converted[0] + " * " + ConvertToOutputType(in_var, vec_size) + ";";
auto get_acc_t = [&]() -> Datatype {
std::vector<Datatype> tensor_types = {desc.output_tensor.GetDType()};
for (auto& in : desc.tensors) {
tensor_types.push_back(in.GetDType());
}

std::vector<Datatype> types_prioritized = { Datatype::F32, Datatype::F16 };

for (auto& type : types_prioritized) {
if (std::any_of(tensor_types.begin(), tensor_types.end(), [=](const Datatype& t) -> bool { return t == type; })) {
return type;
}
}

return Datatype::F32;
};

auto get_input = [&](size_t index) -> std::string {
auto in_name = index == 0 ? in_var : GetInputVarName(index - 1, is_shuffled, shuffle_var);
auto tensor_type = index == 0 ? in_type : desc.tensors[index - 1].GetDType();
auto acc_t = get_acc_t();

if (tensor_type != acc_t)
return ConvertToType(in_name, acc_t, vec_size);
else
return in_name;
};

auto tmp_var = out_var + "_tmp";
if (desc.tensors.size() > 1) {
op_decls += "\\\n\t" + out_var + " += " + in_vars_converted[1] + ";";
op_decls += "\\\n\t" + GetType(get_acc_t(), vec_size) + " " + tmp_var + " = "
+ get_input(0) + " * " + get_input(1) + " + " + get_input(2) + ";";
} else {
op_decls += "\\\n\t" + GetType(get_acc_t(), vec_size) + " " + tmp_var + " = "
+ get_input(0) + " * " + get_input(1) + ";";
}
op_decls += "\\\n\t" + GetOutputType(vec_size) + " " + out_var + " = " + ConvertToOutputType(tmp_var, vec_size) + ";";
break;
}
case KernelType::ELTWISE: {
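The codegen change is the heart of the PR. Previously the fused scale converted its operands straight to the output type and multiplied there, which throws away precision when, say, fp32 tensors are fused into an fp16-output kernel. Now an accumulation type is picked first, the multiply (and optional bias add) runs in that type in a _tmp variable, and only the final assignment converts to the output type. A standalone restatement of the selection rule, assuming only the float types matter for priority:

#include <algorithm>
#include <initializer_list>
#include <vector>

enum class Datatype { F32, F16, INT8, UINT8 };  // reduced enum for illustration

// Prefer fp32 if any participating tensor (output included) is fp32, then
// fp16; anything else (e.g. a pure integer chain) also accumulates in fp32.
Datatype get_acc_type(const std::vector<Datatype>& tensor_types) {
    for (Datatype type : {Datatype::F32, Datatype::F16}) {
        if (std::any_of(tensor_types.begin(), tensor_types.end(),
                        [=](Datatype t) { return t == type; }))
            return type;
    }
    return Datatype::F32;
}

For fp32 inputs with a forced fp16 output, the emitted OpenCL now reads roughly as: float out_tmp = in * scale + shift; half out = convert_half(out_tmp); (illustrative, not verbatim codegen output).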
8 changes: 5 additions & 3 deletions inference-engine/thirdparty/clDNN/src/scale.cpp
@@ -1,5 +1,5 @@
/*
// Copyright (c) 2016-2019 Intel Corporation
// Copyright (c) 2016-2020 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
@@ -27,8 +27,7 @@ primitive_type_id scale::type_id() {
}

layout scale_inst::calc_output_layout(scale_node const& node) {
assert(static_cast<bool>(node.get_primitive()->output_data_type) == false &&
"Output data type forcing is not supported for scale_node!");
auto desc = node.get_primitive();
auto result = node.input().get_non_padded_output_layout();

auto scale_sizes = node.scale_in().get_non_padded_output_layout().size;
@@ -47,6 +46,9 @@ layout scale_inst::calc_output_layout(scale_node const& node) {
node.scale_in().get_non_padded_output_layout().data_type == data_types::f16))
result.data_type = node.scale_in().get_non_padded_output_layout().data_type;

if (desc->output_data_type)
result.data_type = *desc->output_data_type;

if (scale_x_size != 1) {
CLDNN_ERROR_NOT_EQUAL(node.id(), "Scale x size", scale_x_size, "input x size", input_x_size, "");
}
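With the assert removed, output type forcing now participates in layout calculation, and it deliberately takes precedence over the older fp32/fp16 promotion rule a few lines above it. A minimal sketch of the resulting precedence, assuming the legacy condition simplifies to "fp32 input with fp16 scale input" (the full condition is partly outside this hunk):

#include <optional>

enum class dt { f16, f32 };  // reduced for illustration

dt resolve_output_type(dt input_dt, dt scale_dt, std::optional<dt> forced_dt) {
    dt result = input_dt;
    if (input_dt == dt::f32 && scale_dt == dt::f16)
        result = scale_dt;    // legacy promotion rule (condition simplified)
    if (forced_dt)
        result = *forced_dt;  // new: explicit output_data_type wins
    return result;
}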
@@ -1296,6 +1296,21 @@ TEST_P(conv_int8_scale, basic) {
execute(p);
}

TEST_P(conv_int8_scale, fp16_scale_out) {
auto p = GetParam();
create_topologies(input_layout("input", get_input_layout(p)),
data("weights", get_mem(get_weights_layout(p))),
data("bias", get_mem(get_bias_layout(p))),
data("scale_data", get_mem(get_per_channel_layout(p), 1.0f/p.kernel.count())),
convolution("conv_prim", "input", {"weights"}, {"bias"}, p.groups, p.stride, p.pad, p.dilation),
scale("scale", "conv_prim", "scale_data", optional_data_type{data_types::f16}),
reorder("reorder_bfyx", "scale", p.default_format, data_types::f32)
);

tolerance = 1e-5f;
execute(p);
}
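
This variant reuses the conv_int8_scale graph but forces the fused scale to emit fp16 before the final fp32 reorder, so it exercises the new optional_data_type argument along the in-kernel fusion path rather than the standalone scale kernel.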

INSTANTIATE_TEST_CASE_P(fusings_gpu, conv_int8_scale,
::testing::ValuesIn(std::vector<bc_test_params>{
bc_test_params{CASE_CONV_U8S8_1, 2, 3},
@@ -24,12 +24,154 @@
#include <api/engine.hpp>
#include "test_utils/test_utils.h"
#include "api/reorder.hpp"
#include "api/data.hpp"

#include <iostream>

using namespace cldnn;
using namespace tests;

TEST(scale_gpu, basic_in2x3x2x2_mixed_types_in_fp32_out_fp16) {
const auto& engine = get_test_engine();

auto input = memory::allocate(engine, { data_types::f32, format::bfyx, { 2, 2, 3, 2 } });
auto scale_input = memory::allocate(engine, { data_types::f32, format::bfyx, { 1, 2, 1, 1 } });
auto shift_input = memory::allocate(engine, { data_types::f32, format::bfyx, { 1, 2, 1, 1 } });

std::vector<float> input_vec = { 1.0f, 0.0f, 5.0f, 1.5f, 2.0f, 0.0f,
6.0f, 5.0f, -10.0f, -11.0f, -12.0f, -13.0f,

3.0f, 0.5f, 7.0f, 12.0f, 4.0f, -0.5f,
8.0f, 8.0f, -14.0f, -15.0f, -16.0f, -17.0f };
set_values(input, input_vec);
set_values(scale_input, { 2.0f, -1.0f });
set_values(shift_input, { -5.0f, 10.0f });

std::vector<float> result_vec = { -3.0f, -5.0f, 5.0f, -2.0f, -1.0f, -5.0f,
4.0f, 5.0f, 20.0f, 21.0f, 22.0f, 23.0f,

1.0f, -4.0f, 9.0f, 19.0f , 3.0f, -6.0f,
2.0f, 2.0f, 24.0f, 25.0f, 26.0f, 27.0f };

topology topology;
topology.add(input_layout("input", input.get_layout()));
topology.add(data("scale_input", scale_input));
topology.add(data("shift_input", shift_input));
topology.add(scale("scale", "input", "scale_input", "shift_input", optional_data_type{data_types::f16}));
topology.add(reorder("reorder", "scale", format::bfyx, data_types::f32));

build_options bo;
bo.set_option(build_option::optimize_data(true));
network network(engine, topology, bo);

network.set_input_data("input", input);

auto outputs = network.execute();

auto output = outputs.at("reorder").get_memory();
auto output_ptr = output.pointer<float>();

ASSERT_EQ(result_vec.size(), output.count());

for (unsigned int i = 0; i < result_vec.size(); ++i) {
EXPECT_NEAR(output_ptr[i], result_vec[i], 1e-05F);
}
}
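
Sanity check on the expected values: each output element is input * scale_f + shift_f for its feature, e.g. the first feature-0 element is 1.0 * 2.0 + (-5.0) = -3.0 and the first feature-1 element is 6.0 * (-1.0) + 10.0 = 4.0.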

TEST(scale_gpu, basic_in2x3x2x2_mixed_types_in_fp16_out_fp32) {
const auto& engine = get_test_engine();

auto input = memory::allocate(engine, { data_types::f16, format::bfyx, { 2, 2, 3, 2 } });
auto scale_input = memory::allocate(engine, { data_types::f16, format::bfyx, { 1, 2, 1, 1 } });
auto shift_input = memory::allocate(engine, { data_types::f16, format::bfyx, { 1, 2, 1, 1 } });

std::vector<half_t> input_vec = { half_t(1.0f), half_t(0.0f), half_t(5.0f), half_t(1.5f), half_t(2.0f), half_t(0.0f),
half_t(6.0f), half_t(5.0f), half_t(-10.0f), half_t(-11.0f), half_t(-12.0f), half_t(-13.0f),

half_t(3.0f), half_t(0.5f), half_t( 7.0f), half_t(12.0f), half_t(4.0f), half_t(-0.5f),
half_t(8.0f), half_t(8.0f), half_t(-14.0f), half_t(-15.0f), half_t(-16.0f), half_t(-17.0f) };
set_values(input, input_vec);
set_values(scale_input, { half_t(2.0f), half_t(-1.0f) });
set_values(shift_input, { half_t(-5.0f), half_t(10.0f) });

std::vector<float> result_vec = { -3.0f, -5.0f, 5.0f, -2.0f, -1.0f, -5.0f,
4.0f, 5.0f, 20.0f, 21.0f, 22.0f, 23.0f,

1.0f, -4.0f, 9.0f, 19.0f , 3.0f, -6.0f,
2.0f, 2.0f, 24.0f, 25.0f, 26.0f, 27.0f };

topology topology;
topology.add(input_layout("input", input.get_layout()));
topology.add(data("scale_input", scale_input));
topology.add(data("shift_input", shift_input));
topology.add(scale("scale", "input", "scale_input", "shift_input", optional_data_type{data_types::f32}));
topology.add(reorder("reorder", "scale", format::bfyx, data_types::f32));

build_options bo;
bo.set_option(build_option::optimize_data(true));
network network(engine, topology, bo);

network.set_input_data("input", input);

auto outputs = network.execute();

auto output = outputs.at("reorder").get_memory();
auto output_ptr = output.pointer<float>();

ASSERT_EQ(result_vec.size(), output.count());

for (unsigned int i = 0; i < result_vec.size(); ++i) {
EXPECT_NEAR(output_ptr[i], result_vec[i], 1e-05F);
}
}

TEST(scale_gpu, basic_in2x3x2x2_mixed_types_in_fp32_scale_fp16_out_fp16) {
const auto& engine = get_test_engine();

auto input = memory::allocate(engine, { data_types::f32, format::bfyx, { 2, 2, 3, 2 } });
auto scale_input = memory::allocate(engine, { data_types::f16, format::bfyx, { 1, 2, 1, 1 } });
auto shift_input = memory::allocate(engine, { data_types::f16, format::bfyx, { 1, 2, 1, 1 } });

std::vector<float> input_vec = { 1.0f, 0.0f, 5.0f, 1.5f, 2.0f, 0.0f,
6.0f, 5.0f, -10.0f, -11.0f, -12.0f, -13.0f,

3.0f, 0.5f, 7.0f, 12.0f, 4.0f, -0.5f,
8.0f, 8.0f, -14.0f, -15.0f, -16.0f, -17.0f };
set_values(input, input_vec);
set_values(scale_input, { half_t(2.0f), half_t(-1.0f) });
set_values(shift_input, { half_t(-5.0f), half_t(10.0f) });

std::vector<float> result_vec = { -3.0f, -5.0f, 5.0f, -2.0f, -1.0f, -5.0f,
4.0f, 5.0f, 20.0f, 21.0f, 22.0f, 23.0f,

1.0f, -4.0f, 9.0f, 19.0f , 3.0f, -6.0f,
2.0f, 2.0f, 24.0f, 25.0f, 26.0f, 27.0f };

topology topology;
topology.add(input_layout("input", input.get_layout()));
topology.add(data("scale_input", scale_input));
topology.add(data("shift_input", shift_input));
topology.add(scale("scale", "input", "scale_input", "shift_input", optional_data_type{data_types::f16}));
topology.add(reorder("reorder", "scale", format::bfyx, data_types::f32));

build_options bo;
bo.set_option(build_option::optimize_data(true));
network network(engine, topology, bo);

network.set_input_data("input", input);

auto outputs = network.execute();

auto output = outputs.at("reorder").get_memory();
auto output_ptr = output.pointer<float>();

ASSERT_EQ(result_vec.size(), output.count());

for (unsigned int i = 0; i < result_vec.size(); ++i) {
EXPECT_NEAR(output_ptr[i], result_vec[i], 1e-05F);
}
}
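
All three mixed-type tests share one set of expected values by design: every input, scale, shift, and result above is exactly representable in fp16, so routing any of the tensors through half precision loses nothing and the fp32 comparison can keep the tight 1e-05 tolerance.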

TEST(scale_gpu, basic_in2x3x2x2_scale_same_size) {
// Scale : 2x3x2x2
// Input : 2x3x2x2
@@ -1155,7 +1297,7 @@ TEST(scale_gpu, basic_in2x3x2x2_scale_yxfb_bfyx_same_size_padding) {
topology.add(input_layout("input", input.get_layout()));
topology.add(reorder("reorder", "input", input.get_layout().with_padding(padding{ { 0, 0, 1, 2 }, 0 })));
topology.add(input_layout("scale_input", scale_input.get_layout()));
topology.add(scale("scale", "reorder", "scale_input", padding( { 0, 0, 2, 2 }, 0 )));
topology.add(scale("scale", "reorder", "scale_input", {}, padding( { 0, 0, 2, 2 }, 0 )));

std::vector<float> input_vec = { 1.f, 2.f, 3.f, 4.f };
set_values(input, input_vec);