diff --git a/cdeosplayer/cdeosplayer-lib/src/main/java/cdeos/media/player/CDEUtils.java b/cdeosplayer/cdeosplayer-lib/src/main/java/cdeos/media/player/CDEUtils.java index 462eabc45..37b2f325c 100755 --- a/cdeosplayer/cdeosplayer-lib/src/main/java/cdeos/media/player/CDEUtils.java +++ b/cdeosplayer/cdeosplayer-lib/src/main/java/cdeos/media/player/CDEUtils.java @@ -260,19 +260,20 @@ public class CDEUtils { public static final int ASR_MODE_TRANSCRIPTION_RECORD = 3; // transcription + audio record //keep sync with ggml-jni.h - public static final int BECHMARK_ASR = 0; - public static final int BECHMARK_MEMCPY = 1; - public static final int BECHMARK_MULMAT = 2; - public static final int BECHMARK_FULL = 3; - public static final int BENCHMARK_MATRIX = 4; - public static final int BENCHMARK_LLM = 5; - public static final int BENCHMARK_STABLEDIFFUSION= 6; - public static final int BENCHMARK_QNN_SAMPLE = 7; - public static final int BENCHMARK_QNN_SAVER = 8; - public static final int BENCHMARK_QNN_MATRIX = 9; - public static final int BENCHMARK_QNN_GGML = 10; - public static final int BENCHMARK_QNN_COMPLEX = 11; - public static final int BENCHMARK_QNN_GGML_OP = 12; + public static final int BENCHMARK_ASR = 0; + public static final int BENCHMARK_MEMCPY = 1; + public static final int BENCHMARK_MULMAT = 2; + public static final int BENCHMARK_FULL = 3; + //public static final int BENCHMARK_MATRIX = 4;//not used since 04-20-2024 + public static final int BENCHMARK_LLM = 4; + public static final int BENCHMARK_STABLEDIFFUSION= 5; + public static final int BENCHMARK_QNN_SAMPLE = 6; + public static final int BENCHMARK_QNN_SAVER = 7; + public static final int BENCHMARK_QNN_MATRIX = 8; + public static final int BENCHMARK_QNN_GGML = 9; + public static final int BENCHMARK_QNN_COMPLEX = 10; + public static final int BENCHMARK_QNN_GGML_OP = 11; + public static final int BENCHMARK_QNN_AUTO_UT = 12; //keep sync with ggml-qnn.h public static final int QNN_BACKEND_CPU = 0; @@ -3916,20 +3917,22 @@ public void onClick(DialogInterface dialog, int which) { public static String getBenchmarkDesc(int benchmarkIndex) { switch (benchmarkIndex) { - case BECHMARK_FULL: - return "GGML whisper_encode"; + case BENCHMARK_FULL: + return "GGML whisper full"; - case BECHMARK_MEMCPY: - return "GGML memcopy"; + case BENCHMARK_MEMCPY: + return "GGML memcpy"; - case BECHMARK_MULMAT: + case BENCHMARK_MULMAT: return "GGML matrix multiply"; - case BECHMARK_ASR: - return "GGML ASR inference"; + case BENCHMARK_ASR: + return "GGML whisper ASR"; + /*//not used since 04-20-2024 case BENCHMARK_MATRIX: return "GGML matrix"; + */ case BENCHMARK_LLM: return "GGML LLAMA"; @@ -3943,17 +3946,21 @@ public static String getBenchmarkDesc(int benchmarkIndex) { case BENCHMARK_QNN_SAVER: return "GGML QNN saver"; + case BENCHMARK_QNN_MATRIX: - return "GGML QNN matrix manipulate"; + return "GGML QNN matrix addition"; case BENCHMARK_QNN_GGML: - return "GGML QNN ggml"; + return "GGML QNN mapping ggml tensor"; case BENCHMARK_QNN_COMPLEX: return "GGML QNN complex graph"; case BENCHMARK_QNN_GGML_OP: - return "GGML QNN OP"; //for PoC-S49: implementation of other GGML OP(non-mulmat) using QNN API + return "GGML QNN OP UT"; //UT for PoC-S49: implementation of GGML OPs using QNN API + + case BENCHMARK_QNN_AUTO_UT: + return "GGML QNN OP UT automation"; //automation UT for PoC-S49: implementation of GGML OPs using QNN API } return "unknown"; diff --git a/cdeosplayer/cdeosplayer-lib/src/main/java/org/ggml/ggmljava.java 
b/cdeosplayer/cdeosplayer-lib/src/main/java/org/ggml/ggmljava.java index c7187f7c4..8a45511ae 100755 --- a/cdeosplayer/cdeosplayer-lib/src/main/java/org/ggml/ggmljava.java +++ b/cdeosplayer/cdeosplayer-lib/src/main/java/org/ggml/ggmljava.java @@ -119,7 +119,7 @@ public enum ggml_op { /** * @param modelPath /sdcard/kantv/ggml-xxxxxx.bin or /sdcard/kantv/xxxxxx.gguf or qualcomm's prebuilt dedicated model.so or "" * @param audioPath /sdcard/kantv/jfk.wav - * @param nBenchType 0: asr(transcription) 1: memcpy 2: mulmat 3: full/whisper_encode 4: matrix 5: LLAMA 6: stable diffusion 7: QNN sample 8: QNN saver 9: QNN matrix 10: QNN GGML 11: QNN complex 12: QNN GGML OP + * @param nBenchType 0: whisper asr 1: memcpy 2: mulmat 3: whisper full 4: LLAMA 5: stable diffusion 6: QNN sample 7: QNN saver 8: QNN matrix 9: QNN GGML 10: QNN complex 11: QNN GGML OP(QNN UT) 12: QNN UT automation * @param nThreadCounts 1 - 8 * @param nBackendType 0: CPU 1: GPU 2: DSP 3: ggml("fake" QNN backend, just for compare performance) * @param nOpType type of matrix manipulate / GGML OP / type of various complex/complicated computation graph diff --git a/cdeosplayer/kantv/src/main/java/com/cdeos/kantv/ui/fragment/ASRResearchFragment.java b/cdeosplayer/kantv/src/main/java/com/cdeos/kantv/ui/fragment/ASRResearchFragment.java index 83c6bd802..d015cc26e 100755 --- a/cdeosplayer/kantv/src/main/java/com/cdeos/kantv/ui/fragment/ASRResearchFragment.java +++ b/cdeosplayer/kantv/src/main/java/com/cdeos/kantv/ui/fragment/ASRResearchFragment.java @@ -255,7 +255,7 @@ public void onNothingSelected(AdapterView parent) { } }); - spinnerBenchType.setSelection(CDEUtils.BENCHMARK_QNN_GGML_OP); + spinnerBenchType.setSelection(CDEUtils.BENCHMARK_ASR); Spinner spinnerThreadsCounts = mActivity.findViewById(R.id.spinnerThreadCounts); String[] arrayThreadCounts = getResources().getStringArray(R.array.threadCounts); diff --git a/cdeosplayer/kantv/src/main/res/layout/fragment_asr.xml b/cdeosplayer/kantv/src/main/res/layout/fragment_asr.xml index 9310dcdff..5f0c9e413 100755 --- a/cdeosplayer/kantv/src/main/res/layout/fragment_asr.xml +++ b/cdeosplayer/kantv/src/main/res/layout/fragment_asr.xml @@ -43,7 +43,7 @@ + android:text="Bench" /> + android:text="Threads" /> - asr - mempcpy - mulmat - full - matrix - llama - stablediffusion - - qnn-sample - qnn-saver - qnn-matrix - qnn-ggml - qnn-complex - qnn-ggml-op + whisper asr + memcpy + whisper mulmat + whisper full + llama + stablediffusion + + qnn-sample + qnn-saver + qnn-matrix + qnn-ggml + qnn-complex + qnn-ggml-op + qnn-auto-ut diff --git a/core/ggml/jni/ggml-jni-impl-external.cpp b/core/ggml/jni/ggml-jni-impl-external.cpp index f3d725464..453f27d9d 100644 --- a/core/ggml/jni/ggml-jni-impl-external.cpp +++ b/core/ggml/jni/ggml-jni-impl-external.cpp @@ -117,7 +117,12 @@ extern "C" { static const char * get_qnn_backend_name(int n_backend_type); static float tensor_sum_elements(const ggml_tensor * tensor); -static void ggml_graph_compute_helper(std::vector & buf, ggml_cgraph * graph, int n_threads); +static bool ggml_graph_compute_helper( + struct ggml_cgraph * graph, + std::vector & buf, + int n_threads, + ggml_abort_callback abort_callback, + void * abort_callback_data); static void tensor_dump(const ggml_tensor * tensor, const char * name); #define TENSOR_DUMP(tensor) tensor_dump(tensor, #tensor) @@ -885,7 +890,7 @@ void ggml_bench_matrix(int num_threads, int backend_type) { std::vector work_buffer; - ggml_graph_compute_helper(work_buffer, gf, benchmark_params.n_threads); + 
ggml_graph_compute_helper(gf,work_buffer, benchmark_params.n_threads, nullptr, nullptr); if (get_tensor_data_size(m11) < 100) { TENSOR_DUMP(m11); @@ -963,7 +968,7 @@ void ggml_bench_matrix(int num_threads, int backend_type) { long long int start = ggml_time_us(); //GGML_JNI_NOTIFY("Running ggml_graph_compute\n"); - ggml_graph_compute_helper(work_buffer, gf31, benchmark_params.n_threads); + ggml_graph_compute_helper(gf31,work_buffer, benchmark_params.n_threads, nullptr, nullptr); long long int stop = ggml_time_us(); long long int usec = stop-start; @@ -996,7 +1001,7 @@ void ggml_bench_matrix(int num_threads, int backend_type) { } // Running a different graph computation to make sure we override the CPU cache lines - ggml_graph_compute_helper(work_buffer, gf32, benchmark_params.n_threads); + ggml_graph_compute_helper(gf,work_buffer, benchmark_params.n_threads, nullptr, nullptr); } GGML_JNI_NOTIFY("\n"); GGML_JNI_NOTIFY("Average%78.2f\n",gflops_sum/((double)benchmark_params.n_iterations)); @@ -3298,19 +3303,26 @@ static intptr_t alignTo(size_t alignment, intptr_t offset) { } -static void ggml_graph_compute_helper(std::vector & buf, ggml_cgraph * graph, int n_threads) { + +static bool ggml_graph_compute_helper( + struct ggml_cgraph * graph, + std::vector & buf, + int n_threads, + ggml_abort_callback abort_callback, + void * abort_callback_data) { struct ggml_cplan plan = ggml_graph_plan(graph, n_threads); + plan.abort_callback = abort_callback; + plan.abort_callback_data = abort_callback_data; + if (plan.work_size > 0) { buf.resize(plan.work_size); plan.work_data = buf.data(); } - ggml_graph_compute(graph, &plan); + return ggml_graph_compute(graph, &plan); } - - static float tensor_sum_elements(const ggml_tensor * tensor) { double sum = 0; float value = 0; @@ -6854,7 +6866,7 @@ int qnn_matrix(int n_backend_type, int n_op_type) { m2 = ggml_add(ctx, m0, m1); // GGML_OP_ADD gf = ggml_new_graph(ctx); ggml_build_forward_expand(gf, m2); - ggml_graph_compute_helper(work_buffer, gf, 4); + ggml_graph_compute_helper(gf, work_buffer, 4, nullptr, nullptr); TENSOR_DUMP(m0); TENSOR_DUMP(m1); TENSOR_DUMP(m2); @@ -7254,7 +7266,7 @@ int qnn_ggml(int n_backend_type, int n_ggml_op_type) { gf = ggml_new_graph(ctx); ggml_set_f32(m2, 0.0f); ggml_build_forward_expand(gf, m2); - ggml_graph_compute_helper(work_buffer, gf, 4); + ggml_graph_compute_helper(gf,work_buffer, 4, nullptr, nullptr); TENSOR_DUMP(m0); TENSOR_DUMP(m1); @@ -8194,6 +8206,8 @@ int qnn_ggml_op(const char * model_path, int num_threads, int n_backend_type, in GGML_JNI_NOTIFY("starting qnn_ggml_op UT(unit test)\n"); #if 0 // for performance comparison between QNN backend and original GGML + // on Xiaomi14, 9x performance gain + // on low-end phone, 3x performance gain const int sizey = 4096; const int sizex = 4096; const int sizez = 128; @@ -8267,7 +8281,7 @@ int qnn_ggml_op(const char * model_path, int num_threads, int n_backend_type, in GGML_JNI_NOTIFY("creating compute graph\n"); gf = ggml_new_graph(ctx); ggml_build_forward_expand(gf, dst); - ggml_graph_compute_helper(work_buffer, gf, num_threads); + ggml_graph_compute_helper(gf,work_buffer, num_threads, nullptr, nullptr); if (get_tensor_data_size(dst) < 100) { TENSOR_DUMP(src0); @@ -8297,5 +8311,228 @@ int qnn_ggml_op(const char * model_path, int num_threads, int n_backend_type, in GGML_JNI_NOTIFY("duration of qnn_ggml_op %d(%s) with backend %d(%s) is: %lld milliseconds\n", n_ggml_op_type, ggml_op_name((enum ggml_op)n_ggml_op_type), n_backend_type, get_qnn_backend_name(n_backend_type), n_durtion); 
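For reference, a minimal, hedged usage sketch of the reworked ggml_graph_compute_helper(): graph first, then the work buffer, the thread count, and the optional abort-callback pair. It assumes the work buffer is std::vector<uint8_t> (the element type is not visible in this hunk) and simply mirrors the helper added in this patch; the tiny FP32 addition graph follows the 2x2 GGML_OP_ADD graph built in qnn_matrix().

// Hedged sketch, not part of the patch: drives the new helper signature on a tiny
// FP32 addition graph. Assumes the buffer type is std::vector<uint8_t>.
#include "ggml.h"
#include <cstdint>
#include <cstdio>
#include <vector>

// mirrors the helper added in ggml-jni-impl-external.cpp by this patch
static bool ggml_graph_compute_helper(struct ggml_cgraph * graph,
                                      std::vector<uint8_t> & buf,
                                      int n_threads,
                                      ggml_abort_callback abort_callback,
                                      void * abort_callback_data) {
    struct ggml_cplan plan = ggml_graph_plan(graph, n_threads);

    plan.abort_callback      = abort_callback;
    plan.abort_callback_data = abort_callback_data;

    if (plan.work_size > 0) {
        buf.resize(plan.work_size);
        plan.work_data = buf.data();
    }

    return ggml_graph_compute(graph, &plan);
}

static void demo_compute_helper(void) {
    std::vector<uint8_t> work_buffer;

    struct ggml_init_params params = {
        /*.mem_size   =*/ 16 * 1024 * 1024,
        /*.mem_buffer =*/ nullptr,
        /*.no_alloc   =*/ false,
    };
    struct ggml_context * ctx = ggml_init(params);

    struct ggml_tensor * m0 = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 2, 2);
    struct ggml_tensor * m1 = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 2, 2);
    ggml_set_f32(m0, 1.0f);
    ggml_set_f32(m1, 2.0f);
    struct ggml_tensor * m2 = ggml_add(ctx, m0, m1); // GGML_OP_ADD, as in qnn_matrix()

    struct ggml_cgraph * gf = ggml_new_graph(ctx);
    ggml_build_forward_expand(gf, m2);

    // graph first, then work buffer, thread count, and the (optional) abort callback pair
    ggml_graph_compute_helper(gf, work_buffer, 4, nullptr, nullptr);

    printf("m2[0] = %f, expected 3.0\n", ggml_get_f32_1d(m2, 0));
    ggml_free(ctx);
}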
LOGGD("leave qnn_ggml_op UT(unit test)\n"); + return 0; +} + + +/** + * similar to qnn_ggml_op, but an automation UT for a specify GGML OP with a specify backend + * + * this function borrow from whisper.cpp + */ +//TODO: 1. only support GGML_OP_ADD, GGML_OP_MUL, GGMPL_OP_MULMAT +// 2. works with FP32 +int qnn_ggml_op_automation_ut(const char *model_path, int num_threads, int n_backend_type, + int n_ggml_op_type) { + int result = 0; + int64_t n_begin_time = 0LL; + int64_t n_end_time = 0LL; + int64_t n_durtion = 0LL; + + + LOGGD("enter qnn_ggml_op_automation_ut\n"); + LOGGI("mode path:%s", model_path); + LOGGI("num_threads:%d", num_threads); + LOGGI("backend_type:%d(%s)", n_backend_type, get_qnn_backend_name(n_backend_type)); + LOGGI("ggml op:%d(%s)", n_ggml_op_type, ggml_op_name((enum ggml_op) n_ggml_op_type)); + GGML_JNI_NOTIFY("starting qnn_ggml_op_automation_ut(automation unit test)\n"); + + n_begin_time = ggml_time_us(); + + srand(time(NULL)); + + bool support_ops = (n_ggml_op_type == GGML_OP_MUL_MAT || n_ggml_op_type == GGML_OP_MUL || n_ggml_op_type == GGML_OP_ADD); + if (!support_ops) { + LOGGD("ggml op %d(%s) not supported with backend %d(%s)", n_ggml_op_type, + ggml_op_name((enum ggml_op) n_ggml_op_type), n_backend_type, + get_qnn_backend_name(n_backend_type)); + GGML_JNI_NOTIFY("ggml op %d(%s) not supported with backend %d(%s)", n_ggml_op_type, + ggml_op_name((enum ggml_op) n_ggml_op_type), n_backend_type, + get_qnn_backend_name(n_backend_type)); + LOGGD("leave qnn_ggml_op UT(unit test)\n"); + + return 1; + } + + + char strbuf[256]; + std::string tipString; + tipString = ""; + + const int n_max = 128; + + const std::vector sizes = { + 64, 128, 256, 512, 1024, 2048, 4096, + }; + + const size_t N_max = sizes.back(); + + // a: N*N*sizeof(float) + // b: N*N*sizeof(float) + // c: N*N*sizeof(float) + // when F16 is used, there is an extra work buffer of size N*N*sizeof(float) + std::vector buf( + 3llu * N_max * N_max * sizeof(float) + 3 * ggml_tensor_overhead() + + ggml_graph_overhead()); + std::vector work; + + + tipString += "\nprepare matrix"; + kantv_asr_notify_benchmark_c("prepare matrix\n"); + + for (size_t i = 0; i < buf.size(); i++) buf[i] = i; + + for (int j = 0; j < (int) sizes.size(); j++) { + int n_q4_0 = 0; + int n_q4_1 = 0; + int n_q5_0 = 0; + int n_q5_1 = 0; + int n_q8_0 = 0; + int n_fp16 = 0; + int n_fp32 = 0; + + // GFLOPS/s + double s_q4_0 = 0.0; + double s_q4_1 = 0.0; + double s_q5_0 = 0.0; + double s_q5_1 = 0.0; + double s_q8_0 = 0.0; + double s_fp16 = 0.0; + double s_fp32 = 0.0; + + const size_t N = sizes[j]; +#if 0 + for (int k = 0; k < 7; ++k) { + const ggml_type wtype = + k == 0 ? GGML_TYPE_Q4_0 : + k == 1 ? GGML_TYPE_Q4_1 : + k == 2 ? GGML_TYPE_Q5_0 : + k == 3 ? GGML_TYPE_Q5_1 : + k == 4 ? GGML_TYPE_Q8_0 : + k == 5 ? GGML_TYPE_F16 : GGML_TYPE_F32; +#else + for (int k = 0; k < 1; ++k) { + const ggml_type wtype = GGML_TYPE_F32; //TODO: only f16&f32 supported with QNN backend + k = 6; //hardcode to 6 make following code happy +#endif + + + double &s = + k == 0 ? s_q4_0 : k == 1 ? s_q4_1 : k == 2 ? s_q5_0 : k == 3 ? s_q5_1 : + k == 4 ? s_q8_0 : + k == 5 ? s_fp16 + : /*k == 6*/ s_fp32; + int &n = k == 0 ? n_q4_0 : k == 1 ? n_q4_1 : k == 2 ? n_q5_0 : k == 3 ? n_q5_1 : + k == 4 ? n_q8_0 : + k == 5 ? 
n_fp16 + : /*k == 6*/ n_fp32; + + struct ggml_init_params gparams = { + /*.mem_size =*/ buf.size(), + /*.mem_buffer =*/ buf.data(), + /*.no_alloc =*/ false, + }; +#ifdef GGML_USE_QNN + if (n_backend_type != + 3) //3 is fake QNN backend "ggml", just used to compare performance between QNN backend and original GGML + gparams.use_hwaccel = true; +#endif + struct ggml_context *ctx0 = ggml_init(gparams); + + struct ggml_tensor *a = ggml_new_tensor_2d(ctx0, wtype, N, N); + struct ggml_tensor *b = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, N, N); + + struct ggml_tensor *c = nullptr; + + switch (n_ggml_op_type) { + case GGML_OP_ADD: + c = ggml_add(ctx0, a, b); + break; + case GGML_OP_MUL: + c = ggml_mul(ctx0, a, b); + break; + case GGML_OP_MUL_MAT: + c = ggml_mul_mat(ctx0, a, b); + break; + } + + + struct ggml_cgraph *gf = ggml_new_graph(ctx0); + + ggml_build_forward_expand(gf, c); + + double tsum = 0.0; + + // heat-up + ggml_graph_compute_helper(gf, work, num_threads, nullptr, nullptr); + + for (int i = 0; i < n_max; ++i) { + const int64_t t0 = ggml_time_us(); + + kantv_asr_notify_benchmark_c("reset"); + tipString = "calling ggml_graphic_compute_helper:\n"; + tipString += "j= " + std::to_string(j) + "(matrix dimension = " + + std::to_string(N) + ",n_max=" + std::to_string(n_max) + ")" + + ",k=" + std::to_string(k) + "(ggml quant type=" + + std::string(whisper_get_ggml_type_str( + static_cast(wtype))) + ")" + + ",i=" + std::to_string(i) + "\n"; + + kantv_asr_notify_benchmark(tipString); + + ggml_graph_compute_helper(gf, work, num_threads, nullptr, nullptr); + + const int64_t t1 = ggml_time_us(); + + tsum += (t1 - t0) * 1e-6; + n++; + + if (tsum > 1.0 && n >= 3) { + break; + } + } + + ggml_free(ctx0); + + s = ((2.0 * N * N * N * n) / tsum) * 1e-9; + } + + kantv_asr_notify_benchmark_c("reset"); + tipString = ""; + // Q4_0 | Q4_1 + snprintf(strbuf, sizeof(strbuf), + "%4zu x %4zu: Q4_0 %7.1f GFLOPS (%3d runs) | Q4_1 %7.1f GFLOPS (%3d runs)\n", + N, N, s_q4_0, n_q4_0, s_q4_1, n_q4_1); + tipString += strbuf; + + // Q5_0 | Q5_1 | Q8_0 + snprintf(strbuf, sizeof(strbuf), + "%4zu x %4zu: Q5_0 %7.1f GFLOPS (%3d runs) | Q5_1 %7.1f GFLOPS (%3d runs) | Q8_0 %7.1f GFLOPS (%3d runs)\n", + N, N, s_q5_0, n_q5_0, s_q5_1, n_q5_1, s_q8_0, n_q8_0); + tipString += strbuf; + + // F16 | F32 + snprintf(strbuf, sizeof(strbuf), + "%4zu x %4zu: F16 %7.1f GFLOPS (%3d runs) | F32 %7.1f GFLOPS (%3d runs)\n", + N, N, s_fp16, n_fp16, s_fp32, n_fp32); + tipString += strbuf; + + + kantv_asr_notify_benchmark(tipString); + LOGGD("%s\n", tipString.c_str()); + } + + + n_end_time = ggml_time_us(); + n_durtion = (n_end_time - n_begin_time) / 1000; + LOGGD("duration of qnn_ggml_op_automation_ut %d(%s) with backend %d(%s) is: %lld milliseconds\n", + n_ggml_op_type, ggml_op_name((enum ggml_op) n_ggml_op_type), n_backend_type, + get_qnn_backend_name(n_backend_type), n_durtion); + GGML_JNI_NOTIFY( + "duration of qnn_ggml_op_automation_ut %d(%s) with backend %d(%s) is: %lld milliseconds\n", + n_ggml_op_type, ggml_op_name((enum ggml_op) n_ggml_op_type), n_backend_type, + get_qnn_backend_name(n_backend_type), n_durtion); + LOGGD("leave qnn_ggml_op_automation_ut(automation unit test)\n"); + return 0; } \ No newline at end of file diff --git a/core/ggml/jni/ggml-jni-impl.cpp b/core/ggml/jni/ggml-jni-impl.cpp index 9b9092716..721fa4b8c 100644 --- a/core/ggml/jni/ggml-jni-impl.cpp +++ b/core/ggml/jni/ggml-jni-impl.cpp @@ -642,7 +642,8 @@ static const char * whisper_transcribe_from_file(const char * sz_model_path, con context = 
whisper_init_from_file_with_params(sz_model_path, wcp); if (nullptr == context) { LOGGW("whisper_init_from_file_with_params failure, pls check why\n"); - GGML_JNI_NOTIFY("whisper_init_from_file_with_params failure, pls check why(pls check whether whispercpp model is valid)\n"); + GGML_JNI_NOTIFY("whisper_init_from_file_with_params failure(%s), pls check why? whether whispercpp model %s is valid ?\n", + whisper_get_internal_error(), sz_model_path); result = -1; goto failure; } @@ -803,7 +804,7 @@ void whisper_set_benchmark_status(int b_exit_benchmark) { * * @param sz_model_path /sdcard/kantv/ggml-xxxxxx.bin or /sdcard/kantv/xxxxxx.gguf or qualcomm's prebuilt dedicated model.so or "" * @param sz_audio_path /sdcard/kantv/jfk.wav - * @param n_bench_type 0: asr(transcription) 1: memcpy 2: mulmat 3: full/whisper_encode 4: matrix 5: LLAMA 6: stable diffusion 7: QNN sample 8: QNN saver 9: QNN matrix 10: QNN GGML 11: QNN complex 12: QNN GGML OP + * @param n_bench_type 0: whisper asr 1: memcpy 2: mulmat 3: whisper full 4: LLAMA 5: stable diffusion 6: QNN sample 7: QNN saver 8: QNN matrix 9: QNN GGML 10: QNN complex 11: QNN GGML OP(QNN UT) 12: QNN UT automation * @param n_threads 1 - 8 * @param n_backend_type 0: CPU 1: GPU 2: DSP 3: ggml("fake" QNN backend, just for compare performance) * @param n_op_type type of matrix manipulate / GGML OP / type of various complex/complicated compute graph @@ -855,16 +856,17 @@ void ggml_jni_bench(const char * sz_model_path, const char *sz_audio_path, int n break; case BECHMARK_MULMAT: - whisper_bench_ggml_mul_mat(n_threads); + whisper_bench_ggml_mul_mat(n_threads, n_backend_type); break; case BECHMARK_FULL: whisper_bench_full(); break; - + /* not used since 04-20-2024 case BENCHMARK_MATRIX: ggml_bench_matrix(n_backend_type, n_threads); break; + */ case BENCHMAKR_LLAMA: ggml_bench_llama(sz_model_path, n_threads, n_backend_type); @@ -919,22 +921,26 @@ void ggml_jni_bench(const char * sz_model_path, const char *sz_audio_path, int n } break; - case BENCHMARK_QNN_MATRIX: + case BENCHMARK_QNN_MATRIX: //offload a simple fp32 2x2 matrix addition operation to QNN qnn_matrix(n_backend_type, n_op_type); break; - case BENCHMARK_QNN_GGML: + case BENCHMARK_QNN_GGML: //mapping ggml tensor to QNN tensor qnn_ggml(n_backend_type, n_op_type); break; - case BENCHMARK_QNN_COMPLEX: + case BENCHMARK_QNN_COMPLEX: //complicated computation graph in C/C++ or GGML and then offload them to QNN qnn_complex_graph(n_backend_type, n_op_type); break; - case BENCHMARK_QNN_GGML_OP: + case BENCHMARK_QNN_GGML_OP: //UT for PoC-S49: implementation of GGML OPs using QNN API qnn_ggml_op(sz_model_path, n_threads, n_backend_type, n_op_type); break; + case BENCHMARK_QNN_AUTO_UT://automation UT for PoC-S49: implementation of GGML OPs using QNN API + qnn_ggml_op_automation_ut(sz_model_path, n_threads, n_backend_type, n_op_type); + break; + default: break; } diff --git a/core/ggml/jni/ggml-jni.h b/core/ggml/jni/ggml-jni.h index 03b704af6..16a051b7c 100644 --- a/core/ggml/jni/ggml-jni.h +++ b/core/ggml/jni/ggml-jni.h @@ -53,22 +53,23 @@ extern "C" { #define JNI_BUF_LEN 4096 #define JNI_TMP_LEN 256 -#define BECHMARK_ASR 0 -#define BECHMARK_MEMCPY 1 -#define BECHMARK_MULMAT 2 -#define BECHMARK_FULL 3 -#define BENCHMARK_MATRIX 4 -#define BENCHMAKR_LLAMA 5 -#define BENCHMAKR_STABLEDIFFUSION 6 //not work on Xiaomi 14 currently -// I think there are three killer/heavyweight AI applications based on GGML currently: whisper.cpp, llama.cpp, stablediffusion.cpp, so they are here +#define BECHMARK_ASR 0 //whisper.cpp ASR 
benchmark +#define BECHMARK_MEMCPY 1 //memcpy benchmark +#define BECHMARK_MULMAT 2 //mulmat benchmark +#define BECHMARK_FULL 3 //whisper.cpp full benchmark +/*#define BENCHMARK_MATRIX 4*/ //not used since 04-20-2024 +#define BENCHMAKR_LLAMA 4 //llama.cpp benchmark +#define BENCHMAKR_STABLEDIFFUSION 5 //stable diffusion benchmark, not work on Xiaomi 14 currently +// there are three killer/heavyweight AI applications based on GGML currently: whisper.cpp, llama.cpp, stablediffusion.cpp, so they are here // then // step-by-step for PoC:Add Qualcomm mobile SoC native backend for GGML https://github.com/zhouwg/kantv/issues/121 -#define BENCHMAKR_QNN_SAMPLE 7 //"play with /say hello to" QNN Sample -#define BENCHMAKR_QNN_SAVER 8 //study QNN SDK mechanism by QNN Saver -#define BENCHMARK_QNN_MATRIX 9 //offload a simple fp32 2x2 matrix addition operation to QNN -#define BENCHMARK_QNN_GGML 10 //mapping ggml tensor to QNN tensor -#define BENCHMARK_QNN_COMPLEX 11 //complex/complicated computation graph in C/C++ or GGML and then offload them to QNN -#define BENCHMARK_QNN_GGML_OP 12 //for PoC-S49: implementation of other GGML OP(non-mulmat) using QNN API +#define BENCHMAKR_QNN_SAMPLE 6 //"play with /say hello to" QNN Sample +#define BENCHMAKR_QNN_SAVER 7 //study QNN SDK mechanism by QNN Saver +#define BENCHMARK_QNN_MATRIX 8 //offload a simple fp32 2x2 matrix addition operation to QNN +#define BENCHMARK_QNN_GGML 9 //mapping ggml tensor to QNN tensor +#define BENCHMARK_QNN_COMPLEX 10 //complex/complicated computation graph in C/C++ or GGML and then offload them to QNN +#define BENCHMARK_QNN_GGML_OP 11 //UT for PoC-S49: implementation of GGML OPs using QNN API +#define BENCHMARK_QNN_AUTO_UT 12 //automation UT for PoC-S49: implementation of GGML OPs using QNN API #define BENCHMAKR_MAX 12 #define BACKEND_CPU 0 @@ -88,7 +89,7 @@ extern "C" { * * @param sz_model_path /sdcard/kantv/ggml-xxxxxx.bin or /sdcard/kantv/xxxxxx.gguf or qualcomm's prebuilt dedicated model.so or "" * @param sz_audio_path /sdcard/kantv/jfk.wav - * @param n_bench_type 0: asr(transcription) 1: memcpy 2: mulmat 3: full/whisper_encode 4: matrix 5: LLAMA 6: stable diffusion 7: QNN sample 8: QNN saver 9: QNN matrix 10: QNN GGML 11: QNN complex 12: QNN GGML OP + * @param n_bench_type 0: whisper asr 1: memcpy 2: mulmat 3: whisper full 4: LLAMA 5: stable diffusion 6: QNN sample 7: QNN saver 8: QNN matrix 9: QNN GGML 10: QNN complex 11: QNN GGML OP(QNN UT) 12: QNN UT automation * @param n_threads 1 - 8 * @param n_backend_type 0: CPU 1: GPU 2: DSP 3: ggml("fake" QNN backend, just for compare performance) * @param n_op_type type of matrix manipulate / GGML OP / type of various complex/complicated computation graph @@ -204,6 +205,11 @@ extern "C" { */ int qnn_ggml_op(const char * model_path, int num_threads, int n_backend_type, int n_ggml_op_type); + /** + * similar to qnn_ggml_op, but an automation UT for a specify GGML OP with a specify backend + */ + int qnn_ggml_op_automation_ut(const char * model_path, int num_threads, int n_backend_type, int n_ggml_op_type); + // ================================================================================================= // trying to integrate stablediffusion.cpp on 04-06-2024(Apri,6,2024) diff --git a/core/ggml/llamacpp/ggml-backend.c b/core/ggml/llamacpp/ggml-backend.c index 777fe1ff9..e5feb9ff5 100644 --- a/core/ggml/llamacpp/ggml-backend.c +++ b/core/ggml/llamacpp/ggml-backend.c @@ -809,14 +809,14 @@ GGML_CALL static enum ggml_status ggml_backend_cpu_graph_plan_compute(ggml_backe } GGML_CALL static enum 
ggml_status ggml_backend_cpu_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) { - ENTER_FUNC(); + //ENTER_FUNC(); struct ggml_backend_cpu_context * cpu_ctx = (struct ggml_backend_cpu_context *)backend->context; struct ggml_cplan cplan = ggml_graph_plan(cgraph, cpu_ctx->n_threads); if (cpu_ctx->work_size < cplan.work_size) { free(cpu_ctx->work_data); - LOGGI("here"); + //LOGGI("here"); cpu_ctx->work_data = malloc(cplan.work_size); if (cpu_ctx->work_data == NULL) { cpu_ctx->work_size = 0; @@ -824,13 +824,13 @@ GGML_CALL static enum ggml_status ggml_backend_cpu_graph_compute(ggml_backend_t } cpu_ctx->work_size = cplan.work_size; } - LOGGI("here"); + //LOGGI("here"); cplan.work_data = cpu_ctx->work_data; cplan.abort_callback = cpu_ctx->abort_callback; cplan.abort_callback_data = cpu_ctx->abort_callback_data; - LEAVE_FUNC(); + //LEAVE_FUNC(); return ggml_graph_compute(cgraph, &cplan); } diff --git a/core/ggml/llamacpp/ggml-qnn.cpp b/core/ggml/llamacpp/ggml-qnn.cpp index 838e6bbbd..68d96b981 100644 --- a/core/ggml/llamacpp/ggml-qnn.cpp +++ b/core/ggml/llamacpp/ggml-qnn.cpp @@ -20,7 +20,7 @@ * * 5. lack of resource management of internal QNN resources and toggle between different backend(QNN CPU backend, QNN GPU backend, ggml...) * - * 6. only support FP32 / FP16 and many strict limitation(depend on QNN SDK) + * 6. only support FP32 / FP16(other data type not used currently) * * 7. QNN's RPC feature not used currently * @@ -180,6 +180,7 @@ struct qnn_buf_s struct ggml_backend_qnn_context { int device; + int threads; char name[GGML_MAX_NAME]; char lib[GGML_MAX_NAME]; qnn_instance * instance; @@ -260,9 +261,9 @@ static void ggml_setup_op_has_task_pass(void) { //QNN cDSP and HTA backend would not be used currently, just focus on QNN CPU/GPU/HTP(aka DSP) backend currently static struct ggml_backend_qnn_context g_qnn_mgr[GGML_QNN_MAX_DEVICES] = { - [QNN_CPU] = {.device = 0, .name = "qnn-cpu", .lib = "libQnnCpu.so", .instance = nullptr, .buffer_pool = nullptr, .backend = nullptr, .raw_interface = nullptr, .raw_system_interface = nullptr}, - [QNN_GPU] = {.device = 1, .name = "qnn-gpu", .lib = "libQnnGpu.so", .instance = nullptr, .buffer_pool = nullptr, .backend = nullptr, .raw_interface = nullptr, .raw_system_interface = nullptr}, - [QNN_HTP] = {.device = 2, .name = "qnn-htp", .lib = "libQnnHtp.so", .instance = nullptr, .buffer_pool = nullptr, .backend = nullptr, .raw_interface = nullptr, .raw_system_interface = nullptr}, + [QNN_CPU] = {.device = 0, .threads = 1, .name = "qnn-cpu", .lib = "libQnnCpu.so", .instance = nullptr, .buffer_pool = nullptr, .backend = nullptr, .raw_interface = nullptr, .raw_system_interface = nullptr}, + [QNN_GPU] = {.device = 1, .threads = 1, .name = "qnn-gpu", .lib = "libQnnGpu.so", .instance = nullptr, .buffer_pool = nullptr, .backend = nullptr, .raw_interface = nullptr, .raw_system_interface = nullptr}, + [QNN_HTP] = {.device = 2, .threads = 1, .name = "qnn-htp(aka dsp)", .lib = "libQnnHtp.so", .instance = nullptr, .buffer_pool = nullptr, .backend = nullptr, .raw_interface = nullptr, .raw_system_interface = nullptr}, }; @@ -1469,7 +1470,7 @@ static void ggml_qnn_logcallback(const char * fmt, int len_content = 0; memset(s_ggml_qnn_logbuf, 0, GGML_QNN_LOGBUF_LEN); len_content = vsnprintf(reinterpret_cast(s_ggml_qnn_logbuf), GGML_QNN_LOGBUF_LEN, fmt, argp); - //LOGGD("%8.1fms [%-7s] %s ", ms, levelStr, s_ggml_qnn_logbuf); + LOGGD("%8.1fms [%-7s] %s ", ms, levelStr, s_ggml_qnn_logbuf); } } @@ -1760,33 +1761,27 @@ static bool ggml_qnn_can_handle_op(const struct 
ggml_tensor * src0, const struct return false; } - const int64_t ne10 = src0->ne[0]; - const int64_t ne11 = src0->ne[1]; - const int64_t ne20 = src1->ne[0]; - const int64_t ne21 = src1->ne[1]; + const int64_t ne00 = src0->ne[0]; + const int64_t ne01 = src0->ne[1]; + const int64_t ne10 = src1->ne[0]; + const int64_t ne11 = src1->ne[1]; const int64_t ne0 = dst->ne[0]; const int64_t ne1 = dst->ne[1]; - /* - return (src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16) && - (src1->type == GGML_TYPE_F32 || src1->type == GGML_TYPE_F16) && - (dst->type == GGML_TYPE_F32); - */ - //make QNN SDK happy if (dst->op == GGML_OP_ADD) { return (src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16) && (src1->type == GGML_TYPE_F32 || src1->type == GGML_TYPE_F16) && - (dst->type == GGML_TYPE_F32 || dst->type == GGML_TYPE_F16) && ((ne10 > 1 && ne11 > 1 && ne20 > 1 && ne21 > 1)) && + (dst->type == GGML_TYPE_F32 || dst->type == GGML_TYPE_F16) && ((ne00 > 1 && ne01 > 1 && ne10 > 1 && ne11 > 1)) && (src0->rank == src1->rank); } //make QNN SDK happy return (src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16) && (src1->type == GGML_TYPE_F32 || src1->type == GGML_TYPE_F16) && - (src0->type == src1->type) && (src0->type == dst->type) && ((ne10 > 1 && ne11 > 1 && ne20 > 1 && ne21 > 1)); + (src0->type == src1->type) && (src0->type == dst->type) && ((ne00 > 1 && ne01 > 1 && ne10 > 1 && ne11 > 1)); } @@ -3520,7 +3515,7 @@ static ggml_status ggml_backend_qnn_graph_compute(ggml_backend_t backend, ggml_c int task_phase = GGML_TASK_TYPE_FINALIZE; ggml_backend_qnn_context * ctx = (ggml_backend_qnn_context *) backend->context; - struct ggml_cplan plan = ggml_graph_plan(cgraph, 1); + struct ggml_cplan plan = ggml_graph_plan(cgraph, 1);//TODO: multithread support in QNN backend buf_element_t * qnn_buf = nullptr; @@ -3654,6 +3649,13 @@ bool ggml_backend_is_qnn(ggml_backend_t backend) { } +void ggml_backend_qnn_set_n_threads(ggml_backend_t backend, int n_threads) { + GGML_ASSERT(ggml_backend_is_qnn(backend)); + + struct ggml_backend_qnn_context * ctx = (struct ggml_backend_qnn_context *)backend->context; + ctx->threads = n_threads; +} + const char * ggml_backend_qnn_get_name(ggml_backend_t backend) { return backend->iface.get_name(backend); } diff --git a/core/ggml/llamacpp/ggml-qnn.h b/core/ggml/llamacpp/ggml-qnn.h index 3ed0121a1..88d3014aa 100644 --- a/core/ggml/llamacpp/ggml-qnn.h +++ b/core/ggml/llamacpp/ggml-qnn.h @@ -31,6 +31,8 @@ GGML_API ggml_backend_t ggml_backend_qnn_init(size_t dev_num); GGML_API bool ggml_backend_is_qnn(ggml_backend_t backend); +GGML_API void ggml_backend_qnn_set_n_threads(ggml_backend_t backend, int n_threads); + GGML_API int ggml_backend_qnn_get_device_count(void); GGML_API void ggml_backend_qnn_get_device_description(int device, char * description, size_t description_size); diff --git a/core/ggml/llamacpp/ggml.c b/core/ggml/llamacpp/ggml.c index 32c65a07b..64f9f36cb 100644 --- a/core/ggml/llamacpp/ggml.c +++ b/core/ggml/llamacpp/ggml.c @@ -18321,7 +18321,7 @@ static thread_ret_t ggml_graph_compute_thread(void * data) { // distribute new work or execute it direct if 1T while (++node_n < cgraph->n_nodes) { - GGML_PRINT_DEBUG_5("%s: %d/%d\n", __func__, node_n, cgraph->n_nodes); + //LOGGI("%s: %d/%d\n", __func__, node_n, cgraph->n_nodes); struct ggml_tensor * node = cgraph->nodes[node_n]; const int n_tasks = ggml_get_n_tasks(node, n_threads, state->shared->n_threads); diff --git a/core/ggml/whispercpp/whisper.cpp b/core/ggml/whispercpp/whisper.cpp index b232e5f64..672e7d934 100644 
--- a/core/ggml/whispercpp/whisper.cpp +++ b/core/ggml/whispercpp/whisper.cpp @@ -146,6 +146,11 @@ extern "C" int __android_log_print(int prio, const char * tag, const char * fmt, __attribute__((__format__(printf, 3, 4))); #endif +static std::string _s_internal_error_info = "unknown"; + +const char * whisper_get_internal_error() { + return _s_internal_error_info.c_str(); +} WHISPER_ATTRIBUTE_FORMAT(5, 6) static void whisper_log_internal (ggml_log_level level, const char * file, const char * func, int line, const char * format, ...); @@ -223,10 +228,9 @@ static bool ggml_graph_compute_helper( return false; } } - return ggml_graph_compute(graph, &plan) == GGML_STATUS_SUCCESS; //04-17-2024, works perfectly with whisper.cpp + return ggml_graph_compute(graph, &plan) == GGML_STATUS_SUCCESS; #else - //04-17-2024, refer to PoC-S49: implementation of other GGML OP(non-mulmat) using QNN API,https://github.com/zhouwg/kantv/issues/121 - return ggml_backend_graph_compute(backend, graph) == GGML_STATUS_SUCCESS; + ggml_backend_qnn_set_n_threads(backend, n_threads); #endif } #endif @@ -1292,7 +1296,11 @@ static ggml_backend_t whisper_backend_init(const whisper_context_params & params WHISPER_LOG_INFO("%s: using QNN backend\n", __func__); backend_gpu = ggml_backend_qnn_init(params.gpu_device); if (!backend_gpu) { - WHISPER_LOG_ERROR("%s: ggml_backend_qnn_init() failed\n", __func__); + char device_name[GGML_MAX_NAME]; + ggml_backend_qnn_get_device_description(params.gpu_device, device_name, GGML_MAX_NAME); + WHISPER_LOG_ERROR("%s: ggml_backend_qnn_init() failed with device %d(%s)\n", __func__, params.gpu_device, device_name); + _s_internal_error_info = "ggml_backend_qnn_init() failed with device (" + std::to_string(params.gpu_device) + ") " + device_name; + return nullptr; //04-20-2024, do not fall into the default GGML CPU backend, so the upper layer code/UI could know correct feedback } } } @@ -3559,6 +3567,7 @@ struct whisper_context * whisper_init_from_file_with_params(const char * path_mo return nullptr; } + _s_internal_error_info = "unknown"; ctx->state = whisper_init_state(ctx); if (!ctx->state) { WHISPER_LOG_INFO("whisper init failure\n"); @@ -6572,12 +6581,12 @@ WHISPER_API const char * whisper_bench_memcpy_str(int n_threads) { return s.c_str(); } -WHISPER_API int whisper_bench_ggml_mul_mat(int n_threads) { - fputs(whisper_bench_ggml_mul_mat_str(n_threads), stderr); +WHISPER_API int whisper_bench_ggml_mul_mat(int n_threads, int n_backend) { + fputs(whisper_bench_ggml_mul_mat_str(n_threads, n_backend), stderr); return 0; } -WHISPER_API const char * whisper_bench_ggml_mul_mat_str(int n_threads) { +WHISPER_API const char * whisper_bench_ggml_mul_mat_str(int n_threads, int n_backend) { static std::string s; s = ""; char strbuf[256]; @@ -6658,10 +6667,13 @@ WHISPER_API const char * whisper_bench_ggml_mul_mat_str(int n_threads) { /*.mem_buffer =*/ buf.data(), /*.no_alloc =*/ false, }; + #ifdef GGML_USE_QNN - gparams.use_hwaccel = true; + if (n_backend != 3) //3 is fake QNN backend "ggml", just used to compare performance between QNN backend and original GGML + gparams.use_hwaccel = true; #endif + struct ggml_context * ctx0 = ggml_init(gparams); struct ggml_tensor * a = ggml_new_tensor_2d(ctx0, wtype, N, N); diff --git a/core/ggml/whispercpp/whisper.h b/core/ggml/whispercpp/whisper.h index bd8d8df82..becbcd075 100644 --- a/core/ggml/whispercpp/whisper.h +++ b/core/ggml/whispercpp/whisper.h @@ -654,8 +654,10 @@ extern "C" { WHISPER_API int whisper_bench_memcpy (int n_threads); WHISPER_API const char * 
whisper_bench_memcpy_str      (int n_threads);
-    WHISPER_API int          whisper_bench_ggml_mul_mat    (int n_threads);
-    WHISPER_API const char * whisper_bench_ggml_mul_mat_str(int n_threads);
+    WHISPER_API int          whisper_bench_ggml_mul_mat    (int n_threads, int n_backend);
+    WHISPER_API const char * whisper_bench_ggml_mul_mat_str(int n_threads, int n_backend);
+
+    WHISPER_API const char * whisper_get_internal_error(void);
 
     // Control logging output; default behavior is to print to stderr
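For the new whisper_get_internal_error() entry point, a hedged sketch of the intended calling pattern on the JNI side (standard whisper.cpp init API assumed): the internal error string is reset to "unknown" at the start of each whisper_init_from_file_with_params() call and only carries detail when whisper_backend_init() rejects the requested QNN device, so it is safe to read right after a failed init.

// Hedged sketch, not part of the patch: surface the QNN init failure reason to the
// upper layer instead of the generic "failure, pls check why" message.
#include "whisper.h"
#include <cstdio>

static struct whisper_context * init_whisper_or_report(const char * sz_model_path, int gpu_device) {
    struct whisper_context_params wcp = whisper_context_default_params();
    wcp.use_gpu    = true;
    wcp.gpu_device = gpu_device; // 0: QNN CPU, 1: QNN GPU, 2: QNN HTP(aka DSP)

    struct whisper_context * context = whisper_init_from_file_with_params(sz_model_path, wcp);
    if (context == nullptr) {
        // whisper_get_internal_error() now explains why, e.g.
        // "ggml_backend_qnn_init() failed with device (2) ..."
        printf("whisper_init_from_file_with_params failure(%s), model %s\n",
               whisper_get_internal_error(), sz_model_path);
        return nullptr;
    }
    return context;
}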
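ggml_backend_qnn_set_n_threads() is also new in this patch; a hedged sketch of how a caller might drive it, assuming the device numbering used by g_qnn_mgr[] (0 = QNN CPU, 1 = QNN GPU, 2 = QNN HTP aka DSP). Note the thread count is currently only recorded in the backend context; ggml_backend_qnn_graph_compute() still plans with a single thread, as flagged by the TODO above.

// Hedged usage sketch, not part of the patch: bring up a QNN backend, record the
// requested thread count, and report the device. Device ids follow g_qnn_mgr[]:
// 0 = QNN CPU, 1 = QNN GPU, 2 = QNN HTP(aka DSP).
#include "ggml.h"
#include "ggml-backend.h"
#include "ggml-qnn.h"
#include <cstdio>

static ggml_backend_t init_qnn_backend(size_t dev_num, int n_threads) {
    ggml_backend_t backend = ggml_backend_qnn_init(dev_num);
    if (backend == nullptr) {
        printf("ggml_backend_qnn_init(%zu) failed\n", dev_num);
        return nullptr;
    }

    if (ggml_backend_is_qnn(backend)) {
        // only stored in ggml_backend_qnn_context::threads for now;
        // the QNN graph compute path still plans with a single thread
        ggml_backend_qnn_set_n_threads(backend, n_threads);
    }

    char desc[GGML_MAX_NAME];
    ggml_backend_qnn_get_device_description((int) dev_num, desc, sizeof(desc));
    printf("using QNN backend %s: %s\n", ggml_backend_name(backend), desc);

    return backend; // caller releases it with ggml_backend_free()
}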
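The GFLOPS figure accumulated in qnn_ggml_op_automation_ut() uses the usual matrix-multiply convention: an N x N by N x N multiply costs N^3 multiply-add pairs, i.e. 2*N^3 floating-point operations, so the per-size score is (2*N^3*n_runs / tsum) * 1e-9. A small sketch of that arithmetic, with illustrative numbers only:

// Hedged restatement of the benchmark arithmetic above (assumption: square N x N
// matrices, n_runs timed iterations accumulated into tsum seconds).
#include <cstdio>

static double matmul_gflops(size_t N, int n_runs, double tsum_seconds) {
    const double flops_per_run = 2.0 * (double) N * (double) N * (double) N; // N^3 mul + N^3 add
    return (flops_per_run * n_runs / tsum_seconds) * 1e-9;                   // GFLOPS
}

// e.g. a 4096 x 4096 FP32 mulmat averaging 0.5 s per run over 3 runs:
// matmul_gflops(4096, 3, 1.5) ~= 275 GFLOPS (illustrative, not a measured result)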