diff --git a/cdeosplayer/cdeosplayer-lib/src/main/java/cdeos/media/player/CDEUtils.java b/cdeosplayer/cdeosplayer-lib/src/main/java/cdeos/media/player/CDEUtils.java
index 462eabc45..37b2f325c 100755
--- a/cdeosplayer/cdeosplayer-lib/src/main/java/cdeos/media/player/CDEUtils.java
+++ b/cdeosplayer/cdeosplayer-lib/src/main/java/cdeos/media/player/CDEUtils.java
@@ -260,19 +260,20 @@ public class CDEUtils {
public static final int ASR_MODE_TRANSCRIPTION_RECORD = 3; // transcription + audio record
//keep sync with ggml-jni.h
- public static final int BECHMARK_ASR = 0;
- public static final int BECHMARK_MEMCPY = 1;
- public static final int BECHMARK_MULMAT = 2;
- public static final int BECHMARK_FULL = 3;
- public static final int BENCHMARK_MATRIX = 4;
- public static final int BENCHMARK_LLM = 5;
- public static final int BENCHMARK_STABLEDIFFUSION= 6;
- public static final int BENCHMARK_QNN_SAMPLE = 7;
- public static final int BENCHMARK_QNN_SAVER = 8;
- public static final int BENCHMARK_QNN_MATRIX = 9;
- public static final int BENCHMARK_QNN_GGML = 10;
- public static final int BENCHMARK_QNN_COMPLEX = 11;
- public static final int BENCHMARK_QNN_GGML_OP = 12;
+ public static final int BENCHMARK_ASR = 0;
+ public static final int BENCHMARK_MEMCPY = 1;
+ public static final int BENCHMARK_MULMAT = 2;
+ public static final int BENCHMARK_FULL = 3;
+ //public static final int BENCHMARK_MATRIX = 4;//not used since 04-20-2024
+ public static final int BENCHMARK_LLM = 4;
+ public static final int BENCHMARK_STABLEDIFFUSION= 5;
+ public static final int BENCHMARK_QNN_SAMPLE = 6;
+ public static final int BENCHMARK_QNN_SAVER = 7;
+ public static final int BENCHMARK_QNN_MATRIX = 8;
+ public static final int BENCHMARK_QNN_GGML = 9;
+ public static final int BENCHMARK_QNN_COMPLEX = 10;
+ public static final int BENCHMARK_QNN_GGML_OP = 11;
+ public static final int BENCHMARK_QNN_AUTO_UT = 12;
//keep sync with ggml-qnn.h
public static final int QNN_BACKEND_CPU = 0;
@@ -3916,20 +3917,22 @@ public void onClick(DialogInterface dialog, int which) {
public static String getBenchmarkDesc(int benchmarkIndex) {
switch (benchmarkIndex) {
- case BECHMARK_FULL:
- return "GGML whisper_encode";
+ case BENCHMARK_FULL:
+ return "GGML whisper full";
- case BECHMARK_MEMCPY:
- return "GGML memcopy";
+ case BENCHMARK_MEMCPY:
+ return "GGML memcpy";
- case BECHMARK_MULMAT:
+ case BENCHMARK_MULMAT:
return "GGML matrix multiply";
- case BECHMARK_ASR:
- return "GGML ASR inference";
+ case BENCHMARK_ASR:
+ return "GGML whisper ASR";
+ /*//not used since 04-20-2024
case BENCHMARK_MATRIX:
return "GGML matrix";
+ */
case BENCHMARK_LLM:
return "GGML LLAMA";
@@ -3943,17 +3946,21 @@ public static String getBenchmarkDesc(int benchmarkIndex) {
case BENCHMARK_QNN_SAVER:
return "GGML QNN saver";
+
case BENCHMARK_QNN_MATRIX:
- return "GGML QNN matrix manipulate";
+ return "GGML QNN matrix addition";
case BENCHMARK_QNN_GGML:
- return "GGML QNN ggml";
+ return "GGML QNN mapping ggml tensor";
case BENCHMARK_QNN_COMPLEX:
return "GGML QNN complex graph";
case BENCHMARK_QNN_GGML_OP:
- return "GGML QNN OP"; //for PoC-S49: implementation of other GGML OP(non-mulmat) using QNN API
+ return "GGML QNN OP UT"; //UT for PoC-S49: implementation of GGML OPs using QNN API
+
+ case BENCHMARK_QNN_AUTO_UT:
+ return "GGML QNN OP UT automation"; //automation UT for PoC-S49: implementation of GGML OPs using QNN API
}
return "unknown";
diff --git a/cdeosplayer/cdeosplayer-lib/src/main/java/org/ggml/ggmljava.java b/cdeosplayer/cdeosplayer-lib/src/main/java/org/ggml/ggmljava.java
index c7187f7c4..8a45511ae 100755
--- a/cdeosplayer/cdeosplayer-lib/src/main/java/org/ggml/ggmljava.java
+++ b/cdeosplayer/cdeosplayer-lib/src/main/java/org/ggml/ggmljava.java
@@ -119,7 +119,7 @@ public enum ggml_op {
/**
* @param modelPath /sdcard/kantv/ggml-xxxxxx.bin or /sdcard/kantv/xxxxxx.gguf or qualcomm's prebuilt dedicated model.so or ""
* @param audioPath /sdcard/kantv/jfk.wav
- * @param nBenchType 0: asr(transcription) 1: memcpy 2: mulmat 3: full/whisper_encode 4: matrix 5: LLAMA 6: stable diffusion 7: QNN sample 8: QNN saver 9: QNN matrix 10: QNN GGML 11: QNN complex 12: QNN GGML OP
+ * @param nBenchType 0: whisper asr 1: memcpy 2: mulmat 3: whisper full 4: LLAMA 5: stable diffusion 6: QNN sample 7: QNN saver 8: QNN matrix 9: QNN GGML 10: QNN complex 11: QNN GGML OP(QNN UT) 12: QNN UT automation
* @param nThreadCounts 1 - 8
* @param nBackendType 0: CPU 1: GPU 2: DSP 3: ggml("fake" QNN backend, just for compare performance)
* @param nOpType type of matrix manipulate / GGML OP / type of various complex/complicated computation graph
diff --git a/cdeosplayer/kantv/src/main/java/com/cdeos/kantv/ui/fragment/ASRResearchFragment.java b/cdeosplayer/kantv/src/main/java/com/cdeos/kantv/ui/fragment/ASRResearchFragment.java
index 83c6bd802..d015cc26e 100755
--- a/cdeosplayer/kantv/src/main/java/com/cdeos/kantv/ui/fragment/ASRResearchFragment.java
+++ b/cdeosplayer/kantv/src/main/java/com/cdeos/kantv/ui/fragment/ASRResearchFragment.java
@@ -255,7 +255,7 @@ public void onNothingSelected(AdapterView<?> parent) {
}
});
- spinnerBenchType.setSelection(CDEUtils.BENCHMARK_QNN_GGML_OP);
+ spinnerBenchType.setSelection(CDEUtils.BENCHMARK_ASR);
Spinner spinnerThreadsCounts = mActivity.findViewById(R.id.spinnerThreadCounts);
String[] arrayThreadCounts = getResources().getStringArray(R.array.threadCounts);
diff --git a/cdeosplayer/kantv/src/main/res/layout/fragment_asr.xml b/cdeosplayer/kantv/src/main/res/layout/fragment_asr.xml
index 9310dcdff..5f0c9e413 100755
--- a/cdeosplayer/kantv/src/main/res/layout/fragment_asr.xml
+++ b/cdeosplayer/kantv/src/main/res/layout/fragment_asr.xml
@@ -43,7 +43,7 @@
+ android:text="Bench" />
+ android:text="Threads" />
- <item>asr</item>
- <item>mempcpy</item>
- <item>mulmat</item>
- <item>full</item>
- <item>matrix</item>
- <item>llama</item>
- <item>stablediffusion</item>
-
- <item>qnn-sample</item>
- <item>qnn-saver</item>
- <item>qnn-matrix</item>
- <item>qnn-ggml</item>
- <item>qnn-complex</item>
- <item>qnn-ggml-op</item>
+ <item>whisper asr</item>
+ <item>memcpy</item>
+ <item>whisper mulmat</item>
+ <item>whisper full</item>
+ <item>llama</item>
+ <item>stablediffusion</item>
+
+ <item>qnn-sample</item>
+ <item>qnn-saver</item>
+ <item>qnn-matrix</item>
+ <item>qnn-ggml</item>
+ <item>qnn-complex</item>
+ <item>qnn-ggml-op</item>
+ <item>qnn-auto-ut</item>
diff --git a/core/ggml/jni/ggml-jni-impl-external.cpp b/core/ggml/jni/ggml-jni-impl-external.cpp
index f3d725464..453f27d9d 100644
--- a/core/ggml/jni/ggml-jni-impl-external.cpp
+++ b/core/ggml/jni/ggml-jni-impl-external.cpp
@@ -117,7 +117,12 @@ extern "C" {
static const char * get_qnn_backend_name(int n_backend_type);
static float tensor_sum_elements(const ggml_tensor * tensor);
-static void ggml_graph_compute_helper(std::vector<uint8_t> & buf, ggml_cgraph * graph, int n_threads);
+static bool ggml_graph_compute_helper(
+ struct ggml_cgraph * graph,
+ std::vector<uint8_t> & buf,
+ int n_threads,
+ ggml_abort_callback abort_callback,
+ void * abort_callback_data);
static void tensor_dump(const ggml_tensor * tensor, const char * name);
#define TENSOR_DUMP(tensor) tensor_dump(tensor, #tensor)
@@ -885,7 +890,7 @@ void ggml_bench_matrix(int num_threads, int backend_type) {
std::vector<uint8_t> work_buffer;
- ggml_graph_compute_helper(work_buffer, gf, benchmark_params.n_threads);
+ ggml_graph_compute_helper(gf,work_buffer, benchmark_params.n_threads, nullptr, nullptr);
if (get_tensor_data_size(m11) < 100) {
TENSOR_DUMP(m11);
@@ -963,7 +968,7 @@ void ggml_bench_matrix(int num_threads, int backend_type) {
long long int start = ggml_time_us();
//GGML_JNI_NOTIFY("Running ggml_graph_compute\n");
- ggml_graph_compute_helper(work_buffer, gf31, benchmark_params.n_threads);
+ ggml_graph_compute_helper(gf31,work_buffer, benchmark_params.n_threads, nullptr, nullptr);
long long int stop = ggml_time_us();
long long int usec = stop-start;
@@ -996,7 +1001,7 @@ void ggml_bench_matrix(int num_threads, int backend_type) {
}
// Running a different graph computation to make sure we override the CPU cache lines
- ggml_graph_compute_helper(work_buffer, gf32, benchmark_params.n_threads);
+ ggml_graph_compute_helper(gf32, work_buffer, benchmark_params.n_threads, nullptr, nullptr);
}
GGML_JNI_NOTIFY("\n");
GGML_JNI_NOTIFY("Average%78.2f\n",gflops_sum/((double)benchmark_params.n_iterations));
@@ -3298,19 +3303,26 @@ static intptr_t alignTo(size_t alignment, intptr_t offset) {
}
-static void ggml_graph_compute_helper(std::vector<uint8_t> & buf, ggml_cgraph * graph, int n_threads) {
+
+static bool ggml_graph_compute_helper(
+ struct ggml_cgraph * graph,
+ std::vector<uint8_t> & buf,
+ int n_threads,
+ ggml_abort_callback abort_callback,
+ void * abort_callback_data) {
struct ggml_cplan plan = ggml_graph_plan(graph, n_threads);
+ plan.abort_callback = abort_callback;
+ plan.abort_callback_data = abort_callback_data;
+
if (plan.work_size > 0) {
buf.resize(plan.work_size);
plan.work_data = buf.data();
}
- ggml_graph_compute(graph, &plan);
+ return ggml_graph_compute(graph, &plan);
}
-
-
static float tensor_sum_elements(const ggml_tensor * tensor) {
double sum = 0;
float value = 0;
@@ -6854,7 +6866,7 @@ int qnn_matrix(int n_backend_type, int n_op_type) {
m2 = ggml_add(ctx, m0, m1); // GGML_OP_ADD
gf = ggml_new_graph(ctx);
ggml_build_forward_expand(gf, m2);
- ggml_graph_compute_helper(work_buffer, gf, 4);
+ ggml_graph_compute_helper(gf, work_buffer, 4, nullptr, nullptr);
TENSOR_DUMP(m0);
TENSOR_DUMP(m1);
TENSOR_DUMP(m2);
@@ -7254,7 +7266,7 @@ int qnn_ggml(int n_backend_type, int n_ggml_op_type) {
gf = ggml_new_graph(ctx);
ggml_set_f32(m2, 0.0f);
ggml_build_forward_expand(gf, m2);
- ggml_graph_compute_helper(work_buffer, gf, 4);
+ ggml_graph_compute_helper(gf,work_buffer, 4, nullptr, nullptr);
TENSOR_DUMP(m0);
TENSOR_DUMP(m1);
@@ -8194,6 +8206,8 @@ int qnn_ggml_op(const char * model_path, int num_threads, int n_backend_type, in
GGML_JNI_NOTIFY("starting qnn_ggml_op UT(unit test)\n");
#if 0 // for performance comparison between QNN backend and original GGML
+ // on Xiaomi14, 9x performance gain
+ // on low-end phone, 3x performance gain
const int sizey = 4096;
const int sizex = 4096;
const int sizez = 128;
@@ -8267,7 +8281,7 @@ int qnn_ggml_op(const char * model_path, int num_threads, int n_backend_type, in
GGML_JNI_NOTIFY("creating compute graph\n");
gf = ggml_new_graph(ctx);
ggml_build_forward_expand(gf, dst);
- ggml_graph_compute_helper(work_buffer, gf, num_threads);
+ ggml_graph_compute_helper(gf,work_buffer, num_threads, nullptr, nullptr);
if (get_tensor_data_size(dst) < 100) {
TENSOR_DUMP(src0);
@@ -8297,5 +8311,228 @@ int qnn_ggml_op(const char * model_path, int num_threads, int n_backend_type, in
GGML_JNI_NOTIFY("duration of qnn_ggml_op %d(%s) with backend %d(%s) is: %lld milliseconds\n", n_ggml_op_type, ggml_op_name((enum ggml_op)n_ggml_op_type), n_backend_type, get_qnn_backend_name(n_backend_type), n_durtion);
LOGGD("leave qnn_ggml_op UT(unit test)\n");
+ return 0;
+}
+
+
+/**
+ * similar to qnn_ggml_op, but an automation UT for a specific GGML OP with a specific backend
+ *
+ * this function is borrowed from whisper.cpp
+ */
+//TODO: 1. only supports GGML_OP_ADD, GGML_OP_MUL, GGML_OP_MUL_MAT
+//      2. only works with FP32
+int qnn_ggml_op_automation_ut(const char *model_path, int num_threads, int n_backend_type,
+ int n_ggml_op_type) {
+ int result = 0;
+ int64_t n_begin_time = 0LL;
+ int64_t n_end_time = 0LL;
+ int64_t n_durtion = 0LL;
+
+
+ LOGGD("enter qnn_ggml_op_automation_ut\n");
+ LOGGI("mode path:%s", model_path);
+ LOGGI("num_threads:%d", num_threads);
+ LOGGI("backend_type:%d(%s)", n_backend_type, get_qnn_backend_name(n_backend_type));
+ LOGGI("ggml op:%d(%s)", n_ggml_op_type, ggml_op_name((enum ggml_op) n_ggml_op_type));
+ GGML_JNI_NOTIFY("starting qnn_ggml_op_automation_ut(automation unit test)\n");
+
+ n_begin_time = ggml_time_us();
+
+ srand(time(NULL));
+
+ bool support_ops = (n_ggml_op_type == GGML_OP_MUL_MAT || n_ggml_op_type == GGML_OP_MUL || n_ggml_op_type == GGML_OP_ADD);
+ if (!support_ops) {
+ LOGGD("ggml op %d(%s) not supported with backend %d(%s)", n_ggml_op_type,
+ ggml_op_name((enum ggml_op) n_ggml_op_type), n_backend_type,
+ get_qnn_backend_name(n_backend_type));
+ GGML_JNI_NOTIFY("ggml op %d(%s) not supported with backend %d(%s)", n_ggml_op_type,
+ ggml_op_name((enum ggml_op) n_ggml_op_type), n_backend_type,
+ get_qnn_backend_name(n_backend_type));
+ LOGGD("leave qnn_ggml_op UT(unit test)\n");
+
+ return 1;
+ }
+
+
+ char strbuf[256];
+ std::string tipString;
+ tipString = "";
+
+ const int n_max = 128;
+
+ const std::vector<size_t> sizes = {
+ 64, 128, 256, 512, 1024, 2048, 4096,
+ };
+
+ const size_t N_max = sizes.back();
+
+ // a: N*N*sizeof(float)
+ // b: N*N*sizeof(float)
+ // c: N*N*sizeof(float)
+ // when F16 is used, there is an extra work buffer of size N*N*sizeof(float)
+ std::vector<char> buf(
+ 3llu * N_max * N_max * sizeof(float) + 3 * ggml_tensor_overhead() +
+ ggml_graph_overhead());
+ std::vector<uint8_t> work;
+
+
+ tipString += "\nprepare matrix";
+ kantv_asr_notify_benchmark_c("prepare matrix\n");
+
+ for (size_t i = 0; i < buf.size(); i++) buf[i] = i;
+
+ for (int j = 0; j < (int) sizes.size(); j++) {
+ int n_q4_0 = 0;
+ int n_q4_1 = 0;
+ int n_q5_0 = 0;
+ int n_q5_1 = 0;
+ int n_q8_0 = 0;
+ int n_fp16 = 0;
+ int n_fp32 = 0;
+
+ // GFLOPS/s
+ double s_q4_0 = 0.0;
+ double s_q4_1 = 0.0;
+ double s_q5_0 = 0.0;
+ double s_q5_1 = 0.0;
+ double s_q8_0 = 0.0;
+ double s_fp16 = 0.0;
+ double s_fp32 = 0.0;
+
+ const size_t N = sizes[j];
+#if 0
+ for (int k = 0; k < 7; ++k) {
+ const ggml_type wtype =
+ k == 0 ? GGML_TYPE_Q4_0 :
+ k == 1 ? GGML_TYPE_Q4_1 :
+ k == 2 ? GGML_TYPE_Q5_0 :
+ k == 3 ? GGML_TYPE_Q5_1 :
+ k == 4 ? GGML_TYPE_Q8_0 :
+ k == 5 ? GGML_TYPE_F16 : GGML_TYPE_F32;
+#else
+ for (int k = 0; k < 1; ++k) {
+ const ggml_type wtype = GGML_TYPE_F32; //TODO: only f16&f32 supported with QNN backend
+ k = 6; //hardcode to 6 to make the following code happy
+#endif
+
+
+ double &s =
+ k == 0 ? s_q4_0 : k == 1 ? s_q4_1 : k == 2 ? s_q5_0 : k == 3 ? s_q5_1 :
+ k == 4 ? s_q8_0 :
+ k == 5 ? s_fp16
+ : /*k == 6*/ s_fp32;
+ int &n = k == 0 ? n_q4_0 : k == 1 ? n_q4_1 : k == 2 ? n_q5_0 : k == 3 ? n_q5_1 :
+ k == 4 ? n_q8_0 :
+ k == 5 ? n_fp16
+ : /*k == 6*/ n_fp32;
+
+ struct ggml_init_params gparams = {
+ /*.mem_size =*/ buf.size(),
+ /*.mem_buffer =*/ buf.data(),
+ /*.no_alloc =*/ false,
+ };
+#ifdef GGML_USE_QNN
+ if (n_backend_type !=
+ 3) //3 is fake QNN backend "ggml", just used to compare performance between QNN backend and original GGML
+ gparams.use_hwaccel = true;
+#endif
+ struct ggml_context *ctx0 = ggml_init(gparams);
+
+ struct ggml_tensor *a = ggml_new_tensor_2d(ctx0, wtype, N, N);
+ struct ggml_tensor *b = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, N, N);
+
+ struct ggml_tensor *c = nullptr;
+
+ switch (n_ggml_op_type) {
+ case GGML_OP_ADD:
+ c = ggml_add(ctx0, a, b);
+ break;
+ case GGML_OP_MUL:
+ c = ggml_mul(ctx0, a, b);
+ break;
+ case GGML_OP_MUL_MAT:
+ c = ggml_mul_mat(ctx0, a, b);
+ break;
+ }
+
+
+ struct ggml_cgraph *gf = ggml_new_graph(ctx0);
+
+ ggml_build_forward_expand(gf, c);
+
+ double tsum = 0.0;
+
+ // heat-up
+ ggml_graph_compute_helper(gf, work, num_threads, nullptr, nullptr);
+
+ for (int i = 0; i < n_max; ++i) {
+ const int64_t t0 = ggml_time_us();
+
+ kantv_asr_notify_benchmark_c("reset");
+ tipString = "calling ggml_graphic_compute_helper:\n";
+ tipString += "j= " + std::to_string(j) + "(matrix dimension = " +
+ std::to_string(N) + ",n_max=" + std::to_string(n_max) + ")"
+ + ",k=" + std::to_string(k) + "(ggml quant type=" +
+ std::string(whisper_get_ggml_type_str(
+ static_cast<ggml_type>(wtype))) + ")"
+ + ",i=" + std::to_string(i) + "\n";
+
+ kantv_asr_notify_benchmark(tipString);
+
+ ggml_graph_compute_helper(gf, work, num_threads, nullptr, nullptr);
+
+ const int64_t t1 = ggml_time_us();
+
+ tsum += (t1 - t0) * 1e-6;
+ n++;
+
+ if (tsum > 1.0 && n >= 3) {
+ break;
+ }
+ }
+
+ ggml_free(ctx0);
+
+ s = ((2.0 * N * N * N * n) / tsum) * 1e-9;
+ }
+
+ kantv_asr_notify_benchmark_c("reset");
+ tipString = "";
+ // Q4_0 | Q4_1
+ snprintf(strbuf, sizeof(strbuf),
+ "%4zu x %4zu: Q4_0 %7.1f GFLOPS (%3d runs) | Q4_1 %7.1f GFLOPS (%3d runs)\n",
+ N, N, s_q4_0, n_q4_0, s_q4_1, n_q4_1);
+ tipString += strbuf;
+
+ // Q5_0 | Q5_1 | Q8_0
+ snprintf(strbuf, sizeof(strbuf),
+ "%4zu x %4zu: Q5_0 %7.1f GFLOPS (%3d runs) | Q5_1 %7.1f GFLOPS (%3d runs) | Q8_0 %7.1f GFLOPS (%3d runs)\n",
+ N, N, s_q5_0, n_q5_0, s_q5_1, n_q5_1, s_q8_0, n_q8_0);
+ tipString += strbuf;
+
+ // F16 | F32
+ snprintf(strbuf, sizeof(strbuf),
+ "%4zu x %4zu: F16 %7.1f GFLOPS (%3d runs) | F32 %7.1f GFLOPS (%3d runs)\n",
+ N, N, s_fp16, n_fp16, s_fp32, n_fp32);
+ tipString += strbuf;
+
+
+ kantv_asr_notify_benchmark(tipString);
+ LOGGD("%s\n", tipString.c_str());
+ }
+
+
+ n_end_time = ggml_time_us();
+ n_durtion = (n_end_time - n_begin_time) / 1000;
+ LOGGD("duration of qnn_ggml_op_automation_ut %d(%s) with backend %d(%s) is: %lld milliseconds\n",
+ n_ggml_op_type, ggml_op_name((enum ggml_op) n_ggml_op_type), n_backend_type,
+ get_qnn_backend_name(n_backend_type), n_durtion);
+ GGML_JNI_NOTIFY(
+ "duration of qnn_ggml_op_automation_ut %d(%s) with backend %d(%s) is: %lld milliseconds\n",
+ n_ggml_op_type, ggml_op_name((enum ggml_op) n_ggml_op_type), n_backend_type,
+ get_qnn_backend_name(n_backend_type), n_durtion);
+ LOGGD("leave qnn_ggml_op_automation_ut(automation unit test)\n");
+
return 0;
}
\ No newline at end of file
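
Note on the refactored ggml_graph_compute_helper() above: it now threads an abort callback into ggml_cplan before computing. The following is a minimal sketch (not part of the patch) of how a caller could wire the same plumbing directly; the 500 ms budget, tensor sizes and thread count are illustrative assumptions only.

// sketch: compute a tiny ADD graph with a time-budget abort callback
#include "ggml.h"
#include <cstdint>
#include <vector>

struct time_budget {
    int64_t start_us;
    int64_t budget_us;
};

// ggml_abort_callback: returning true asks ggml_graph_compute to stop early
static bool abort_after_budget(void * data) {
    auto * tb = static_cast<time_budget *>(data);
    return (ggml_time_us() - tb->start_us) > tb->budget_us;
}

static void compute_with_budget_example(void) {
    ggml_time_init();
    struct ggml_init_params params = { /*.mem_size =*/ 16 * 1024 * 1024, /*.mem_buffer =*/ nullptr, /*.no_alloc =*/ false };
    struct ggml_context * ctx = ggml_init(params);

    struct ggml_tensor * a = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 256, 256);
    struct ggml_tensor * b = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 256, 256);
    struct ggml_tensor * c = ggml_add(ctx, a, b);

    struct ggml_cgraph * gf = ggml_new_graph(ctx);
    ggml_build_forward_expand(gf, c);

    // same plumbing the updated ggml_graph_compute_helper() performs internally
    struct ggml_cplan plan = ggml_graph_plan(gf, /*n_threads=*/ 4);
    time_budget tb = { ggml_time_us(), 500 * 1000 }; // 500 ms, illustrative value
    plan.abort_callback      = abort_after_budget;
    plan.abort_callback_data = &tb;

    std::vector<uint8_t> work;
    if (plan.work_size > 0) {
        work.resize(plan.work_size);
        plan.work_data = work.data();
    }
    ggml_graph_compute(gf, &plan);
    ggml_free(ctx);
}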
diff --git a/core/ggml/jni/ggml-jni-impl.cpp b/core/ggml/jni/ggml-jni-impl.cpp
index 9b9092716..721fa4b8c 100644
--- a/core/ggml/jni/ggml-jni-impl.cpp
+++ b/core/ggml/jni/ggml-jni-impl.cpp
@@ -642,7 +642,8 @@ static const char * whisper_transcribe_from_file(const char * sz_model_path, con
context = whisper_init_from_file_with_params(sz_model_path, wcp);
if (nullptr == context) {
LOGGW("whisper_init_from_file_with_params failure, pls check why\n");
- GGML_JNI_NOTIFY("whisper_init_from_file_with_params failure, pls check why(pls check whether whispercpp model is valid)\n");
+ GGML_JNI_NOTIFY("whisper_init_from_file_with_params failure(%s), pls check why? whether whispercpp model %s is valid ?\n",
+ whisper_get_internal_error(), sz_model_path);
result = -1;
goto failure;
}
@@ -803,7 +804,7 @@ void whisper_set_benchmark_status(int b_exit_benchmark) {
*
* @param sz_model_path /sdcard/kantv/ggml-xxxxxx.bin or /sdcard/kantv/xxxxxx.gguf or qualcomm's prebuilt dedicated model.so or ""
* @param sz_audio_path /sdcard/kantv/jfk.wav
- * @param n_bench_type 0: asr(transcription) 1: memcpy 2: mulmat 3: full/whisper_encode 4: matrix 5: LLAMA 6: stable diffusion 7: QNN sample 8: QNN saver 9: QNN matrix 10: QNN GGML 11: QNN complex 12: QNN GGML OP
+ * @param n_bench_type 0: whisper asr 1: memcpy 2: mulmat 3: whisper full 4: LLAMA 5: stable diffusion 6: QNN sample 7: QNN saver 8: QNN matrix 9: QNN GGML 10: QNN complex 11: QNN GGML OP(QNN UT) 12: QNN UT automation
* @param n_threads 1 - 8
* @param n_backend_type 0: CPU 1: GPU 2: DSP 3: ggml("fake" QNN backend, just for compare performance)
* @param n_op_type type of matrix manipulate / GGML OP / type of various complex/complicated compute graph
@@ -855,16 +856,17 @@ void ggml_jni_bench(const char * sz_model_path, const char *sz_audio_path, int n
break;
case BECHMARK_MULMAT:
- whisper_bench_ggml_mul_mat(n_threads);
+ whisper_bench_ggml_mul_mat(n_threads, n_backend_type);
break;
case BECHMARK_FULL:
whisper_bench_full();
break;
-
+ /* not used since 04-20-2024
case BENCHMARK_MATRIX:
ggml_bench_matrix(n_backend_type, n_threads);
break;
+ */
case BENCHMAKR_LLAMA:
ggml_bench_llama(sz_model_path, n_threads, n_backend_type);
@@ -919,22 +921,26 @@ void ggml_jni_bench(const char * sz_model_path, const char *sz_audio_path, int n
}
break;
- case BENCHMARK_QNN_MATRIX:
+ case BENCHMARK_QNN_MATRIX: //offload a simple fp32 2x2 matrix addition operation to QNN
qnn_matrix(n_backend_type, n_op_type);
break;
- case BENCHMARK_QNN_GGML:
+ case BENCHMARK_QNN_GGML: //mapping ggml tensor to QNN tensor
qnn_ggml(n_backend_type, n_op_type);
break;
- case BENCHMARK_QNN_COMPLEX:
+ case BENCHMARK_QNN_COMPLEX: //complicated computation graph in C/C++ or GGML and then offload them to QNN
qnn_complex_graph(n_backend_type, n_op_type);
break;
- case BENCHMARK_QNN_GGML_OP:
+ case BENCHMARK_QNN_GGML_OP: //UT for PoC-S49: implementation of GGML OPs using QNN API
qnn_ggml_op(sz_model_path, n_threads, n_backend_type, n_op_type);
break;
+ case BENCHMARK_QNN_AUTO_UT://automation UT for PoC-S49: implementation of GGML OPs using QNN API
+ qnn_ggml_op_automation_ut(sz_model_path, n_threads, n_backend_type, n_op_type);
+ break;
+
default:
break;
}
diff --git a/core/ggml/jni/ggml-jni.h b/core/ggml/jni/ggml-jni.h
index 03b704af6..16a051b7c 100644
--- a/core/ggml/jni/ggml-jni.h
+++ b/core/ggml/jni/ggml-jni.h
@@ -53,22 +53,23 @@ extern "C" {
#define JNI_BUF_LEN 4096
#define JNI_TMP_LEN 256
-#define BECHMARK_ASR 0
-#define BECHMARK_MEMCPY 1
-#define BECHMARK_MULMAT 2
-#define BECHMARK_FULL 3
-#define BENCHMARK_MATRIX 4
-#define BENCHMAKR_LLAMA 5
-#define BENCHMAKR_STABLEDIFFUSION 6 //not work on Xiaomi 14 currently
-// I think there are three killer/heavyweight AI applications based on GGML currently: whisper.cpp, llama.cpp, stablediffusion.cpp, so they are here
+#define BECHMARK_ASR 0 //whisper.cpp ASR benchmark
+#define BECHMARK_MEMCPY 1 //memcpy benchmark
+#define BECHMARK_MULMAT 2 //mulmat benchmark
+#define BECHMARK_FULL 3 //whisper.cpp full benchmark
+/*#define BENCHMARK_MATRIX 4*/ //not used since 04-20-2024
+#define BENCHMAKR_LLAMA 4 //llama.cpp benchmark
+#define BENCHMAKR_STABLEDIFFUSION 5 //stable diffusion benchmark, does not work on Xiaomi 14 currently
+// there are three killer/heavyweight AI applications based on GGML currently: whisper.cpp, llama.cpp, stablediffusion.cpp, so they are here
// then
// step-by-step for PoC:Add Qualcomm mobile SoC native backend for GGML https://github.com/zhouwg/kantv/issues/121
-#define BENCHMAKR_QNN_SAMPLE 7 //"play with /say hello to" QNN Sample
-#define BENCHMAKR_QNN_SAVER 8 //study QNN SDK mechanism by QNN Saver
-#define BENCHMARK_QNN_MATRIX 9 //offload a simple fp32 2x2 matrix addition operation to QNN
-#define BENCHMARK_QNN_GGML 10 //mapping ggml tensor to QNN tensor
-#define BENCHMARK_QNN_COMPLEX 11 //complex/complicated computation graph in C/C++ or GGML and then offload them to QNN
-#define BENCHMARK_QNN_GGML_OP 12 //for PoC-S49: implementation of other GGML OP(non-mulmat) using QNN API
+#define BENCHMAKR_QNN_SAMPLE 6 //"play with /say hello to" QNN Sample
+#define BENCHMAKR_QNN_SAVER 7 //study QNN SDK mechanism by QNN Saver
+#define BENCHMARK_QNN_MATRIX 8 //offload a simple fp32 2x2 matrix addition operation to QNN
+#define BENCHMARK_QNN_GGML 9 //mapping ggml tensor to QNN tensor
+#define BENCHMARK_QNN_COMPLEX 10 //complex/complicated computation graph in C/C++ or GGML and then offload them to QNN
+#define BENCHMARK_QNN_GGML_OP 11 //UT for PoC-S49: implementation of GGML OPs using QNN API
+#define BENCHMARK_QNN_AUTO_UT 12 //automation UT for PoC-S49: implementation of GGML OPs using QNN API
#define BENCHMAKR_MAX 12
#define BACKEND_CPU 0
@@ -88,7 +89,7 @@ extern "C" {
*
* @param sz_model_path /sdcard/kantv/ggml-xxxxxx.bin or /sdcard/kantv/xxxxxx.gguf or qualcomm's prebuilt dedicated model.so or ""
* @param sz_audio_path /sdcard/kantv/jfk.wav
- * @param n_bench_type 0: asr(transcription) 1: memcpy 2: mulmat 3: full/whisper_encode 4: matrix 5: LLAMA 6: stable diffusion 7: QNN sample 8: QNN saver 9: QNN matrix 10: QNN GGML 11: QNN complex 12: QNN GGML OP
+ * @param n_bench_type 0: whisper asr 1: memcpy 2: mulmat 3: whisper full 4: LLAMA 5: stable diffusion 6: QNN sample 7: QNN saver 8: QNN matrix 9: QNN GGML 10: QNN complex 11: QNN GGML OP(QNN UT) 12: QNN UT automation
* @param n_threads 1 - 8
* @param n_backend_type 0: CPU 1: GPU 2: DSP 3: ggml("fake" QNN backend, just for compare performance)
* @param n_op_type type of matrix manipulate / GGML OP / type of various complex/complicated computation graph
@@ -204,6 +205,11 @@ extern "C" {
*/
int qnn_ggml_op(const char * model_path, int num_threads, int n_backend_type, int n_ggml_op_type);
+ /**
+ * similar to qnn_ggml_op, but an automation UT for a specific GGML OP with a specific backend
+ */
+ int qnn_ggml_op_automation_ut(const char * model_path, int num_threads, int n_backend_type, int n_ggml_op_type);
+
// =================================================================================================
// trying to integrate stablediffusion.cpp on 04-06-2024(Apri,6,2024)
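
With the renumbered bench-type macros and the new qnn_ggml_op_automation_ut() declaration in ggml-jni.h above, native code could drive the automation UT roughly as follows. This is a hedged sketch, not code from the patch: the model path, thread count and choice of GGML_OP_ADD are placeholders; BACKEND_CPU comes from the same header.

// sketch: invoke the new automation UT directly from the native layer
#include "ggml-jni.h"
#include "ggml.h"   // for GGML_OP_ADD

static void run_qnn_auto_ut_example(void) {
    const char * model_path = "";          // "" is accepted, see the @param docs above
    const int    n_threads  = 4;           // 1 - 8 per the documented range
    const int    n_backend  = BACKEND_CPU; // 0: CPU, 1: GPU, 2: DSP, 3: "fake" QNN backend

    // exercises GGML_OP_ADD across the matrix sizes hard-coded in the UT
    qnn_ggml_op_automation_ut(model_path, n_threads, n_backend, GGML_OP_ADD);
}

In ggml_jni_bench() the same call is reached through the BENCHMARK_QNN_AUTO_UT case added in ggml-jni-impl.cpp.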
diff --git a/core/ggml/llamacpp/ggml-backend.c b/core/ggml/llamacpp/ggml-backend.c
index 777fe1ff9..e5feb9ff5 100644
--- a/core/ggml/llamacpp/ggml-backend.c
+++ b/core/ggml/llamacpp/ggml-backend.c
@@ -809,14 +809,14 @@ GGML_CALL static enum ggml_status ggml_backend_cpu_graph_plan_compute(ggml_backe
}
GGML_CALL static enum ggml_status ggml_backend_cpu_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
- ENTER_FUNC();
+ //ENTER_FUNC();
struct ggml_backend_cpu_context * cpu_ctx = (struct ggml_backend_cpu_context *)backend->context;
struct ggml_cplan cplan = ggml_graph_plan(cgraph, cpu_ctx->n_threads);
if (cpu_ctx->work_size < cplan.work_size) {
free(cpu_ctx->work_data);
- LOGGI("here");
+ //LOGGI("here");
cpu_ctx->work_data = malloc(cplan.work_size);
if (cpu_ctx->work_data == NULL) {
cpu_ctx->work_size = 0;
@@ -824,13 +824,13 @@ GGML_CALL static enum ggml_status ggml_backend_cpu_graph_compute(ggml_backend_t
}
cpu_ctx->work_size = cplan.work_size;
}
- LOGGI("here");
+ //LOGGI("here");
cplan.work_data = cpu_ctx->work_data;
cplan.abort_callback = cpu_ctx->abort_callback;
cplan.abort_callback_data = cpu_ctx->abort_callback_data;
- LEAVE_FUNC();
+ //LEAVE_FUNC();
return ggml_graph_compute(cgraph, &cplan);
}
diff --git a/core/ggml/llamacpp/ggml-qnn.cpp b/core/ggml/llamacpp/ggml-qnn.cpp
index 838e6bbbd..68d96b981 100644
--- a/core/ggml/llamacpp/ggml-qnn.cpp
+++ b/core/ggml/llamacpp/ggml-qnn.cpp
@@ -20,7 +20,7 @@
*
* 5. lack of resource management of internal QNN resources and toggle between different backend(QNN CPU backend, QNN GPU backend, ggml...)
*
- * 6. only support FP32 / FP16 and many strict limitation(depend on QNN SDK)
+ * 6. only supports FP32 / FP16 (other data types not used currently)
*
* 7. QNN's RPC feature not used currently
*
@@ -180,6 +180,7 @@ struct qnn_buf_s
struct ggml_backend_qnn_context {
int device;
+ int threads;
char name[GGML_MAX_NAME];
char lib[GGML_MAX_NAME];
qnn_instance * instance;
@@ -260,9 +261,9 @@ static void ggml_setup_op_has_task_pass(void) {
//QNN cDSP and HTA backend would not be used currently, just focus on QNN CPU/GPU/HTP(aka DSP) backend currently
static struct ggml_backend_qnn_context g_qnn_mgr[GGML_QNN_MAX_DEVICES] = {
- [QNN_CPU] = {.device = 0, .name = "qnn-cpu", .lib = "libQnnCpu.so", .instance = nullptr, .buffer_pool = nullptr, .backend = nullptr, .raw_interface = nullptr, .raw_system_interface = nullptr},
- [QNN_GPU] = {.device = 1, .name = "qnn-gpu", .lib = "libQnnGpu.so", .instance = nullptr, .buffer_pool = nullptr, .backend = nullptr, .raw_interface = nullptr, .raw_system_interface = nullptr},
- [QNN_HTP] = {.device = 2, .name = "qnn-htp", .lib = "libQnnHtp.so", .instance = nullptr, .buffer_pool = nullptr, .backend = nullptr, .raw_interface = nullptr, .raw_system_interface = nullptr},
+ [QNN_CPU] = {.device = 0, .threads = 1, .name = "qnn-cpu", .lib = "libQnnCpu.so", .instance = nullptr, .buffer_pool = nullptr, .backend = nullptr, .raw_interface = nullptr, .raw_system_interface = nullptr},
+ [QNN_GPU] = {.device = 1, .threads = 1, .name = "qnn-gpu", .lib = "libQnnGpu.so", .instance = nullptr, .buffer_pool = nullptr, .backend = nullptr, .raw_interface = nullptr, .raw_system_interface = nullptr},
+ [QNN_HTP] = {.device = 2, .threads = 1, .name = "qnn-htp(aka dsp)", .lib = "libQnnHtp.so", .instance = nullptr, .buffer_pool = nullptr, .backend = nullptr, .raw_interface = nullptr, .raw_system_interface = nullptr},
};
@@ -1469,7 +1470,7 @@ static void ggml_qnn_logcallback(const char * fmt,
int len_content = 0;
memset(s_ggml_qnn_logbuf, 0, GGML_QNN_LOGBUF_LEN);
len_content = vsnprintf(reinterpret_cast<char *>(s_ggml_qnn_logbuf), GGML_QNN_LOGBUF_LEN, fmt, argp);
- //LOGGD("%8.1fms [%-7s] %s ", ms, levelStr, s_ggml_qnn_logbuf);
+ LOGGD("%8.1fms [%-7s] %s ", ms, levelStr, s_ggml_qnn_logbuf);
}
}
@@ -1760,33 +1761,27 @@ static bool ggml_qnn_can_handle_op(const struct ggml_tensor * src0, const struct
return false;
}
- const int64_t ne10 = src0->ne[0];
- const int64_t ne11 = src0->ne[1];
- const int64_t ne20 = src1->ne[0];
- const int64_t ne21 = src1->ne[1];
+ const int64_t ne00 = src0->ne[0];
+ const int64_t ne01 = src0->ne[1];
+ const int64_t ne10 = src1->ne[0];
+ const int64_t ne11 = src1->ne[1];
const int64_t ne0 = dst->ne[0];
const int64_t ne1 = dst->ne[1];
- /*
- return (src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16) &&
- (src1->type == GGML_TYPE_F32 || src1->type == GGML_TYPE_F16) &&
- (dst->type == GGML_TYPE_F32);
- */
-
//make QNN SDK happy
if (dst->op == GGML_OP_ADD) {
return (src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16) &&
(src1->type == GGML_TYPE_F32 || src1->type == GGML_TYPE_F16) &&
- (dst->type == GGML_TYPE_F32 || dst->type == GGML_TYPE_F16) && ((ne10 > 1 && ne11 > 1 && ne20 > 1 && ne21 > 1)) &&
+ (dst->type == GGML_TYPE_F32 || dst->type == GGML_TYPE_F16) && ((ne00 > 1 && ne01 > 1 && ne10 > 1 && ne11 > 1)) &&
(src0->rank == src1->rank);
}
//make QNN SDK happy
return (src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16) &&
(src1->type == GGML_TYPE_F32 || src1->type == GGML_TYPE_F16) &&
- (src0->type == src1->type) && (src0->type == dst->type) && ((ne10 > 1 && ne11 > 1 && ne20 > 1 && ne21 > 1));
+ (src0->type == src1->type) && (src0->type == dst->type) && ((ne00 > 1 && ne01 > 1 && ne10 > 1 && ne11 > 1));
}
@@ -3520,7 +3515,7 @@ static ggml_status ggml_backend_qnn_graph_compute(ggml_backend_t backend, ggml_c
int task_phase = GGML_TASK_TYPE_FINALIZE;
ggml_backend_qnn_context * ctx = (ggml_backend_qnn_context *) backend->context;
- struct ggml_cplan plan = ggml_graph_plan(cgraph, 1);
+ struct ggml_cplan plan = ggml_graph_plan(cgraph, 1);//TODO: multithread support in QNN backend
buf_element_t * qnn_buf = nullptr;
@@ -3654,6 +3649,13 @@ bool ggml_backend_is_qnn(ggml_backend_t backend) {
}
+void ggml_backend_qnn_set_n_threads(ggml_backend_t backend, int n_threads) {
+ GGML_ASSERT(ggml_backend_is_qnn(backend));
+
+ struct ggml_backend_qnn_context * ctx = (struct ggml_backend_qnn_context *)backend->context;
+ ctx->threads = n_threads;
+}
+
const char * ggml_backend_qnn_get_name(ggml_backend_t backend) {
return backend->iface.get_name(backend);
}
diff --git a/core/ggml/llamacpp/ggml-qnn.h b/core/ggml/llamacpp/ggml-qnn.h
index 3ed0121a1..88d3014aa 100644
--- a/core/ggml/llamacpp/ggml-qnn.h
+++ b/core/ggml/llamacpp/ggml-qnn.h
@@ -31,6 +31,8 @@ GGML_API ggml_backend_t ggml_backend_qnn_init(size_t dev_num);
GGML_API bool ggml_backend_is_qnn(ggml_backend_t backend);
+GGML_API void ggml_backend_qnn_set_n_threads(ggml_backend_t backend, int n_threads);
+
GGML_API int ggml_backend_qnn_get_device_count(void);
GGML_API void ggml_backend_qnn_get_device_description(int device, char * description, size_t description_size);
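
One possible call sequence for the newly exported ggml_backend_qnn_set_n_threads(); this is an illustrative sketch only (device index 0 and the thread count are assumptions), mirroring what the patched whisper.cpp helper does before computing a graph.

// sketch: create a QNN backend and record the desired thread count on its context
#include "ggml-qnn.h"
#include "ggml-backend.h"

static ggml_backend_t init_qnn_backend_example(int n_threads) {
    ggml_backend_t backend = ggml_backend_qnn_init(/*dev_num=*/ 0); // 0: QNN CPU backend
    if (backend == nullptr) {
        return nullptr;
    }
    // stores n_threads in ggml_backend_qnn_context::threads (added in ggml-qnn.cpp above)
    ggml_backend_qnn_set_n_threads(backend, n_threads);
    return backend;
}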
diff --git a/core/ggml/llamacpp/ggml.c b/core/ggml/llamacpp/ggml.c
index 32c65a07b..64f9f36cb 100644
--- a/core/ggml/llamacpp/ggml.c
+++ b/core/ggml/llamacpp/ggml.c
@@ -18321,7 +18321,7 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
// distribute new work or execute it direct if 1T
while (++node_n < cgraph->n_nodes) {
- GGML_PRINT_DEBUG_5("%s: %d/%d\n", __func__, node_n, cgraph->n_nodes);
+ //LOGGI("%s: %d/%d\n", __func__, node_n, cgraph->n_nodes);
struct ggml_tensor * node = cgraph->nodes[node_n];
const int n_tasks = ggml_get_n_tasks(node, n_threads, state->shared->n_threads);
diff --git a/core/ggml/whispercpp/whisper.cpp b/core/ggml/whispercpp/whisper.cpp
index b232e5f64..672e7d934 100644
--- a/core/ggml/whispercpp/whisper.cpp
+++ b/core/ggml/whispercpp/whisper.cpp
@@ -146,6 +146,11 @@ extern "C" int __android_log_print(int prio, const char * tag, const char * fmt,
__attribute__((__format__(printf, 3, 4)));
#endif
+static std::string _s_internal_error_info = "unknown";
+
+const char * whisper_get_internal_error() {
+ return _s_internal_error_info.c_str();
+}
WHISPER_ATTRIBUTE_FORMAT(5, 6)
static void whisper_log_internal (ggml_log_level level, const char * file, const char * func, int line, const char * format, ...);
@@ -223,10 +228,9 @@ static bool ggml_graph_compute_helper(
return false;
}
}
- return ggml_graph_compute(graph, &plan) == GGML_STATUS_SUCCESS; //04-17-2024, works perfectly with whisper.cpp
+ return ggml_graph_compute(graph, &plan) == GGML_STATUS_SUCCESS;
#else
- //04-17-2024, refer to PoC-S49: implementation of other GGML OP(non-mulmat) using QNN API,https://github.com/zhouwg/kantv/issues/121
- return ggml_backend_graph_compute(backend, graph) == GGML_STATUS_SUCCESS;
+ ggml_backend_qnn_set_n_threads(backend, n_threads);
+ return ggml_backend_graph_compute(backend, graph) == GGML_STATUS_SUCCESS;
#endif
}
#endif
@@ -1292,7 +1296,11 @@ static ggml_backend_t whisper_backend_init(const whisper_context_params & params
WHISPER_LOG_INFO("%s: using QNN backend\n", __func__);
backend_gpu = ggml_backend_qnn_init(params.gpu_device);
if (!backend_gpu) {
- WHISPER_LOG_ERROR("%s: ggml_backend_qnn_init() failed\n", __func__);
+ char device_name[GGML_MAX_NAME];
+ ggml_backend_qnn_get_device_description(params.gpu_device, device_name, GGML_MAX_NAME);
+ WHISPER_LOG_ERROR("%s: ggml_backend_qnn_init() failed with device %d(%s)\n", __func__, params.gpu_device, device_name);
+ _s_internal_error_info = "ggml_backend_qnn_init() failed with device (" + std::to_string(params.gpu_device) + ") " + device_name;
+ return nullptr; //04-20-2024, do not fall back to the default GGML CPU backend, so the upper-layer code/UI gets correct feedback
}
}
}
@@ -3559,6 +3567,7 @@ struct whisper_context * whisper_init_from_file_with_params(const char * path_mo
return nullptr;
}
+ _s_internal_error_info = "unknown";
ctx->state = whisper_init_state(ctx);
if (!ctx->state) {
WHISPER_LOG_INFO("whisper init failure\n");
@@ -6572,12 +6581,12 @@ WHISPER_API const char * whisper_bench_memcpy_str(int n_threads) {
return s.c_str();
}
-WHISPER_API int whisper_bench_ggml_mul_mat(int n_threads) {
- fputs(whisper_bench_ggml_mul_mat_str(n_threads), stderr);
+WHISPER_API int whisper_bench_ggml_mul_mat(int n_threads, int n_backend) {
+ fputs(whisper_bench_ggml_mul_mat_str(n_threads, n_backend), stderr);
return 0;
}
-WHISPER_API const char * whisper_bench_ggml_mul_mat_str(int n_threads) {
+WHISPER_API const char * whisper_bench_ggml_mul_mat_str(int n_threads, int n_backend) {
static std::string s;
s = "";
char strbuf[256];
@@ -6658,10 +6667,13 @@ WHISPER_API const char * whisper_bench_ggml_mul_mat_str(int n_threads) {
/*.mem_buffer =*/ buf.data(),
/*.no_alloc =*/ false,
};
+
#ifdef GGML_USE_QNN
- gparams.use_hwaccel = true;
+ if (n_backend != 3) //3 is fake QNN backend "ggml", just used to compare performance between QNN backend and original GGML
+ gparams.use_hwaccel = true;
#endif
+
struct ggml_context * ctx0 = ggml_init(gparams);
struct ggml_tensor * a = ggml_new_tensor_2d(ctx0, wtype, N, N);
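
A hedged usage sketch for the re-signatured mul_mat benchmark above: per the #ifdef, n_backend = 3 keeps use_hwaccel off (plain GGML path), while other values route through the QNN-accelerated path. The thread count is arbitrary.

// sketch: compare the plain GGML path against the QNN path with the new 2-arg API
#include "whisper.h"

static void bench_mul_mat_example(void) {
    // n_backend = 3: "fake" QNN backend (ggml), i.e. the original GGML CPU path
    whisper_bench_ggml_mul_mat(/*n_threads=*/ 4, /*n_backend=*/ 3);
    // n_backend = 0: same benchmark with use_hwaccel enabled (QNN CPU backend)
    whisper_bench_ggml_mul_mat(/*n_threads=*/ 4, /*n_backend=*/ 0);
}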
diff --git a/core/ggml/whispercpp/whisper.h b/core/ggml/whispercpp/whisper.h
index bd8d8df82..becbcd075 100644
--- a/core/ggml/whispercpp/whisper.h
+++ b/core/ggml/whispercpp/whisper.h
@@ -654,8 +654,10 @@ extern "C" {
WHISPER_API int whisper_bench_memcpy (int n_threads);
WHISPER_API const char * whisper_bench_memcpy_str (int n_threads);
- WHISPER_API int whisper_bench_ggml_mul_mat (int n_threads);
- WHISPER_API const char * whisper_bench_ggml_mul_mat_str(int n_threads);
+ WHISPER_API int whisper_bench_ggml_mul_mat (int n_threads, int n_backend);
+ WHISPER_API const char * whisper_bench_ggml_mul_mat_str(int n_threads, int n_backend);
+
+ WHISPER_API const char * whisper_get_internal_error(void);
// Control logging output; default behavior is to print to stderr
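
A short sketch of how a caller might consume the new whisper_get_internal_error() declaration above; the model path and QNN device number are placeholders, and the error string in the comment is only an example of what the patched whisper.cpp records on QNN init failure.

// sketch: surface the recorded init-failure reason instead of a generic message
#include "whisper.h"
#include <cstdio>

static struct whisper_context * open_model_example(const char * model_path, int qnn_device) {
    struct whisper_context_params wcp = whisper_context_default_params();
    wcp.gpu_device = qnn_device; // interpreted as the QNN device index in this tree

    struct whisper_context * ctx = whisper_init_from_file_with_params(model_path, wcp);
    if (ctx == nullptr) {
        // e.g. "ggml_backend_qnn_init() failed with device (...)"
        fprintf(stderr, "whisper init failed: %s\n", whisper_get_internal_error());
    }
    return ctx;
}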