[IE CLDNN] Add config key to enforce fp32 precision for quantized models
vladimir-paramuzov committed Aug 26, 2020
1 parent 936b38d commit aa96de5
Showing 4 changed files with 22 additions and 4 deletions.
7 changes: 5 additions & 2 deletions inference-engine/include/cldnn/cldnn_config.hpp
@@ -59,9 +59,12 @@ DECLARE_CLDNN_CONFIG_KEY(GRAPH_DUMPS_DIR);
DECLARE_CLDNN_CONFIG_KEY(SOURCES_DUMPS_DIR);

/**
* @brief This key turns usage of int8 optimizations and qunatized models on.
* @brief This key enables FP16 precision for quantized models.
* By default the model is converted to FP32 precision before running LPT. If this key is enabled (default), then non-quantized layers
* will be converted back to FP16 after LPT, which might improve the performance if a model has a lot of compute operations in
* the non-quantized path. This key has no effect if the current device doesn't have INT8 optimization capabilities.
*/
DECLARE_CLDNN_CONFIG_KEY(INT8_ENABLED);
DECLARE_CLDNN_CONFIG_KEY(ENABLE_FP16_FOR_QUANTIZED_MODELS);

/**
* @brief This key should be set to correctly handle NV12 input without pre-processing.
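
The new key can be passed like any other clDNN plugin config option when loading a network. Below is a minimal usage sketch (not part of this commit), assuming the Inference Engine 2020.x Core API and a placeholder model path; setting the key to NO keeps the whole quantized model in FP32, which is the behavior this commit makes configurable.

// Hedged usage sketch: force full FP32 execution of a quantized model on the GPU (clDNN) plugin.
// "model_int8.xml" is a placeholder; the key and value constants are the ones declared in this commit.
#include <inference_engine.hpp>
#include <cldnn/cldnn_config.hpp>
#include <map>
#include <string>

int main() {
    using namespace InferenceEngine;
    Core ie;
    CNNNetwork network = ie.ReadNetwork("model_int8.xml");  // quantized (FakeQuantize) model

    // NO -> non-quantized layers are NOT converted back to FP16 after LPT,
    // so the whole model runs in FP32 on the device.
    std::map<std::string, std::string> config = {
        {CLDNNConfigParams::KEY_CLDNN_ENABLE_FP16_FOR_QUANTIZED_MODELS, PluginConfigParams::NO}};

    ExecutableNetwork exeNet = ie.LoadNetwork(network, "GPU", config);
    (void)exeNet;
    return 0;
}
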
13 changes: 13 additions & 0 deletions inference-engine/src/cldnn_engine/cldnn_config.cpp
@@ -189,6 +189,14 @@ void Config::UpdateFromMap(const std::map<std::string, std::string>& configMap)
} else {
THROW_IE_EXCEPTION << NOT_FOUND_str << "Unsupported NV12 flag value: " << val;
}
} else if (key.compare(CLDNNConfigParams::KEY_CLDNN_ENABLE_FP16_FOR_QUANTIZED_MODELS) == 0) {
if (val.compare(PluginConfigParams::YES) == 0) {
enable_fp16_for_quantized_models = true;
} else if (val.compare(PluginConfigParams::NO) == 0) {
enable_fp16_for_quantized_models = false;
} else {
THROW_IE_EXCEPTION << NOT_FOUND_str << "Unsupported KEY_CLDNN_ENABLE_FP16_FOR_QUANTIZED_MODELS flag value: " << val;
}
} else {
THROW_IE_EXCEPTION << NOT_FOUND_str << "Unsupported property key by plugin: " << key;
}
@@ -228,6 +236,11 @@ void Config::adjustKeyMapValues() {
else
key_config_map[CLDNNConfigParams::KEY_CLDNN_NV12_TWO_INPUTS] = PluginConfigParams::NO;

if (enable_fp16_for_quantized_models)
key_config_map[CLDNNConfigParams::KEY_CLDNN_ENABLE_FP16_FOR_QUANTIZED_MODELS] = PluginConfigParams::YES;
else
key_config_map[CLDNNConfigParams::KEY_CLDNN_ENABLE_FP16_FOR_QUANTIZED_MODELS] = PluginConfigParams::NO;

{
std::string qp = "0";
switch (queuePriority) {
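
For illustration, here is a small standalone sketch (hypothetical ToyConfig type, not plugin code) of the parse/reflect pattern used above by Config::UpdateFromMap and Config::adjustKeyMapValues: a YES/NO string is validated into a bool member, and the bool is later mirrored back into key_config_map so config queries report the effective value.

// Standalone sketch of the parse/reflect pattern; names are simplified stand-ins.
#include <map>
#include <stdexcept>
#include <string>

struct ToyConfig {
    bool enable_fp16_for_quantized_models = true;       // default matches the commit
    std::map<std::string, std::string> key_config_map;

    void updateFromMap(const std::map<std::string, std::string>& configMap) {
        auto it = configMap.find("CLDNN_ENABLE_FP16_FOR_QUANTIZED_MODELS");
        if (it == configMap.end()) return;
        if (it->second == "YES")       enable_fp16_for_quantized_models = true;
        else if (it->second == "NO")   enable_fp16_for_quantized_models = false;
        else throw std::invalid_argument("Unsupported flag value: " + it->second);
    }

    void adjustKeyMapValues() {
        // Mirror the effective bool back into the string map used for config queries.
        key_config_map["CLDNN_ENABLE_FP16_FOR_QUANTIZED_MODELS"] =
            enable_fp16_for_quantized_models ? "YES" : "NO";
    }
};
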
2 changes: 2 additions & 0 deletions inference-engine/src/cldnn_engine/cldnn_config.h
@@ -27,6 +27,7 @@ struct Config {
enableDynamicBatch(false),
enableInt8(true),
nv12_two_inputs(false),
enable_fp16_for_quantized_models(true),
queuePriority(cldnn::priority_mode_types::disabled),
queueThrottle(cldnn::throttle_mode_types::disabled),
max_dynamic_batch(1),
@@ -49,6 +50,7 @@ struct Config {
bool enableDynamicBatch;
bool enableInt8;
bool nv12_two_inputs;
bool enable_fp16_for_quantized_models;
cldnn::priority_mode_types queuePriority;
cldnn::throttle_mode_types queueThrottle;
int max_dynamic_batch;
4 changes: 2 additions & 2 deletions inference-engine/src/cldnn_engine/cldnn_program.cpp
@@ -230,6 +230,7 @@ void convertLayerPrecision(const CNNLayerPtr& layer, bool isOutput = false) {
out_data->setPrecision(PREC_TO);
}
}

for (auto &in_data : layer->insData) {
auto data = in_data.lock();
if (PREC_FROM == data->getPrecision())
@@ -246,7 +247,6 @@ void convertLayerPrecision(const CNNLayerPtr& layer, bool isOutput = false) {
if (layer->precision == PREC_FROM)
layer->precision = PREC_TO;


auto wLayer = dynamic_cast<InferenceEngine::WeightableLayer *>(layer.get());
if (wLayer) {
if (wLayer->_weights && wLayer->_weights->getTensorDesc().getPrecision() == PREC_FROM) {
@@ -445,7 +445,7 @@ Program::Program(InferenceEngine::ICNNNetwork& network, std::shared_ptr<const cl
transformer.transform(network);

// [WA part2] Try to find non-quantized layers and convert them back to FP16
if (fqFound && baselineIsFP16) {
if (fqFound && baselineIsFP16 && config.enable_fp16_for_quantized_models) {
auto layersSorted = BFSSort(network);

for (auto& layer : layersSorted) {
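
The gating added to [WA part2] can be illustrated with a simplified, self-contained sketch (toy Layer type, not the real CNNLayer/BFSSort API): non-quantized layers are converted back to FP16 only when a FakeQuantize was found, the original network was FP16, and the new config flag is enabled; with the flag set to NO the graph keeps FP32 precision.

// Minimal standalone sketch of the [WA part2] gating idea; types are hypothetical stand-ins.
#include <iostream>
#include <string>
#include <vector>

enum class Precision { FP16, FP32 };

struct Layer {                       // toy stand-in for CNNLayer
    std::string type;
    Precision precision = Precision::FP32;
    bool quantized = false;          // true if the layer sits on a quantized path
};

void restoreFP16ForNonQuantized(std::vector<Layer>& layers,
                                bool fqFound,
                                bool baselineIsFP16,
                                bool enable_fp16_for_quantized_models) {
    // Mirrors: if (fqFound && baselineIsFP16 && config.enable_fp16_for_quantized_models)
    if (!(fqFound && baselineIsFP16 && enable_fp16_for_quantized_models))
        return;                      // with the key set to NO the whole graph stays FP32

    for (auto& layer : layers) {
        if (!layer.quantized && layer.precision == Precision::FP32)
            layer.precision = Precision::FP16;   // convert non-quantized layers back to FP16
    }
}

int main() {
    std::vector<Layer> net = {{"Convolution", Precision::FP32, true},
                              {"Sigmoid",     Precision::FP32, false}};
    restoreFP16ForNonQuantized(net, /*fqFound=*/true, /*baselineIsFP16=*/true,
                               /*enable_fp16_for_quantized_models=*/true);
    for (const auto& l : net)
        std::cout << l.type << ": "
                  << (l.precision == Precision::FP16 ? "FP16" : "FP32") << "\n";
}
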
