diff --git a/cdeosplayer/cdeosplayer-lib/src/main/java/cdeos/media/player/CDEUtils.java b/cdeosplayer/cdeosplayer-lib/src/main/java/cdeos/media/player/CDEUtils.java index 462eabc45..37b2f325c 100755 --- a/cdeosplayer/cdeosplayer-lib/src/main/java/cdeos/media/player/CDEUtils.java +++ b/cdeosplayer/cdeosplayer-lib/src/main/java/cdeos/media/player/CDEUtils.java @@ -260,19 +260,20 @@ public class CDEUtils { public static final int ASR_MODE_TRANSCRIPTION_RECORD = 3; // transcription + audio record //keep sync with ggml-jni.h - public static final int BECHMARK_ASR = 0; - public static final int BECHMARK_MEMCPY = 1; - public static final int BECHMARK_MULMAT = 2; - public static final int BECHMARK_FULL = 3; - public static final int BENCHMARK_MATRIX = 4; - public static final int BENCHMARK_LLM = 5; - public static final int BENCHMARK_STABLEDIFFUSION= 6; - public static final int BENCHMARK_QNN_SAMPLE = 7; - public static final int BENCHMARK_QNN_SAVER = 8; - public static final int BENCHMARK_QNN_MATRIX = 9; - public static final int BENCHMARK_QNN_GGML = 10; - public static final int BENCHMARK_QNN_COMPLEX = 11; - public static final int BENCHMARK_QNN_GGML_OP = 12; + public static final int BENCHMARK_ASR = 0; + public static final int BENCHMARK_MEMCPY = 1; + public static final int BENCHMARK_MULMAT = 2; + public static final int BENCHMARK_FULL = 3; + //public static final int BENCHMARK_MATRIX = 4;//not used since 04-20-2024 + public static final int BENCHMARK_LLM = 4; + public static final int BENCHMARK_STABLEDIFFUSION= 5; + public static final int BENCHMARK_QNN_SAMPLE = 6; + public static final int BENCHMARK_QNN_SAVER = 7; + public static final int BENCHMARK_QNN_MATRIX = 8; + public static final int BENCHMARK_QNN_GGML = 9; + public static final int BENCHMARK_QNN_COMPLEX = 10; + public static final int BENCHMARK_QNN_GGML_OP = 11; + public static final int BENCHMARK_QNN_AUTO_UT = 12; //keep sync with ggml-qnn.h public static final int QNN_BACKEND_CPU = 0; @@ -3916,20 +3917,22 @@ public void onClick(DialogInterface dialog, int which) { public static String getBenchmarkDesc(int benchmarkIndex) { switch (benchmarkIndex) { - case BECHMARK_FULL: - return "GGML whisper_encode"; + case BENCHMARK_FULL: + return "GGML whisper full"; - case BECHMARK_MEMCPY: - return "GGML memcopy"; + case BENCHMARK_MEMCPY: + return "GGML memcpy"; - case BECHMARK_MULMAT: + case BENCHMARK_MULMAT: return "GGML matrix multiply"; - case BECHMARK_ASR: - return "GGML ASR inference"; + case BENCHMARK_ASR: + return "GGML whisper ASR"; + /*//not used since 04-20-2024 case BENCHMARK_MATRIX: return "GGML matrix"; + */ case BENCHMARK_LLM: return "GGML LLAMA"; @@ -3943,17 +3946,21 @@ public static String getBenchmarkDesc(int benchmarkIndex) { case BENCHMARK_QNN_SAVER: return "GGML QNN saver"; + case BENCHMARK_QNN_MATRIX: - return "GGML QNN matrix manipulate"; + return "GGML QNN matrix addition"; case BENCHMARK_QNN_GGML: - return "GGML QNN ggml"; + return "GGML QNN mapping ggml tensor"; case BENCHMARK_QNN_COMPLEX: return "GGML QNN complex graph"; case BENCHMARK_QNN_GGML_OP: - return "GGML QNN OP"; //for PoC-S49: implementation of other GGML OP(non-mulmat) using QNN API + return "GGML QNN OP UT"; //UT for PoC-S49: implementation of GGML OPs using QNN API + + case BENCHMARK_QNN_AUTO_UT: + return "GGML QNN OP UT automation"; //automation UT for PoC-S49: implementation of GGML OPs using QNN API } return "unknown"; diff --git a/cdeosplayer/cdeosplayer-lib/src/main/java/org/ggml/ggmljava.java 
b/cdeosplayer/cdeosplayer-lib/src/main/java/org/ggml/ggmljava.java index c7187f7c4..8a45511ae 100755 --- a/cdeosplayer/cdeosplayer-lib/src/main/java/org/ggml/ggmljava.java +++ b/cdeosplayer/cdeosplayer-lib/src/main/java/org/ggml/ggmljava.java @@ -119,7 +119,7 @@ public enum ggml_op { /** * @param modelPath /sdcard/kantv/ggml-xxxxxx.bin or /sdcard/kantv/xxxxxx.gguf or qualcomm's prebuilt dedicated model.so or "" * @param audioPath /sdcard/kantv/jfk.wav - * @param nBenchType 0: asr(transcription) 1: memcpy 2: mulmat 3: full/whisper_encode 4: matrix 5: LLAMA 6: stable diffusion 7: QNN sample 8: QNN saver 9: QNN matrix 10: QNN GGML 11: QNN complex 12: QNN GGML OP + * @param nBenchType 0: whisper asr 1: memcpy 2: mulmat 3: whisper full 4: LLAMA 5: stable diffusion 6: QNN sample 7: QNN saver 8: QNN matrix 9: QNN GGML 10: QNN complex 11: QNN GGML OP(QNN UT) 12: QNN UT automation * @param nThreadCounts 1 - 8 * @param nBackendType 0: CPU 1: GPU 2: DSP 3: ggml("fake" QNN backend, just for compare performance) * @param nOpType type of matrix manipulate / GGML OP / type of various complex/complicated computation graph diff --git a/cdeosplayer/kantv/src/main/java/com/cdeos/kantv/ui/fragment/ASRResearchFragment.java b/cdeosplayer/kantv/src/main/java/com/cdeos/kantv/ui/fragment/ASRResearchFragment.java index 83c6bd802..d015cc26e 100755 --- a/cdeosplayer/kantv/src/main/java/com/cdeos/kantv/ui/fragment/ASRResearchFragment.java +++ b/cdeosplayer/kantv/src/main/java/com/cdeos/kantv/ui/fragment/ASRResearchFragment.java @@ -255,7 +255,7 @@ public void onNothingSelected(AdapterView parent) { } }); - spinnerBenchType.setSelection(CDEUtils.BENCHMARK_QNN_GGML_OP); + spinnerBenchType.setSelection(CDEUtils.BENCHMARK_ASR); Spinner spinnerThreadsCounts = mActivity.findViewById(R.id.spinnerThreadCounts); String[] arrayThreadCounts = getResources().getStringArray(R.array.threadCounts); diff --git a/cdeosplayer/kantv/src/main/res/layout/fragment_asr.xml b/cdeosplayer/kantv/src/main/res/layout/fragment_asr.xml index 9310dcdff..5f0c9e413 100755 --- a/cdeosplayer/kantv/src/main/res/layout/fragment_asr.xml +++ b/cdeosplayer/kantv/src/main/res/layout/fragment_asr.xml @@ -43,7 +43,7 @@ + android:text="Bench" /> + android:text="Threads" /> - asr - mempcpy - mulmat - full - matrix - llama - stablediffusion - - qnn-sample - qnn-saver - qnn-matrix - qnn-ggml - qnn-complex - qnn-ggml-op + whisper asr + memcpy + whisper mulmat + whisper full + llama + stablediffusion + + qnn-sample + qnn-saver + qnn-matrix + qnn-ggml + qnn-complex + qnn-ggml-op + qnn-auto-ut diff --git a/core/ggml/jni/ggml-jni-impl-external.cpp b/core/ggml/jni/ggml-jni-impl-external.cpp index f3d725464..453f27d9d 100644 --- a/core/ggml/jni/ggml-jni-impl-external.cpp +++ b/core/ggml/jni/ggml-jni-impl-external.cpp @@ -117,7 +117,12 @@ extern "C" { static const char * get_qnn_backend_name(int n_backend_type); static float tensor_sum_elements(const ggml_tensor * tensor); -static void ggml_graph_compute_helper(std::vector & buf, ggml_cgraph * graph, int n_threads); +static bool ggml_graph_compute_helper( + struct ggml_cgraph * graph, + std::vector & buf, + int n_threads, + ggml_abort_callback abort_callback, + void * abort_callback_data); static void tensor_dump(const ggml_tensor * tensor, const char * name); #define TENSOR_DUMP(tensor) tensor_dump(tensor, #tensor) @@ -885,7 +890,7 @@ void ggml_bench_matrix(int num_threads, int backend_type) { std::vector work_buffer; - ggml_graph_compute_helper(work_buffer, gf, benchmark_params.n_threads); + 
ggml_graph_compute_helper(gf,work_buffer, benchmark_params.n_threads, nullptr, nullptr); if (get_tensor_data_size(m11) < 100) { TENSOR_DUMP(m11); @@ -963,7 +968,7 @@ void ggml_bench_matrix(int num_threads, int backend_type) { long long int start = ggml_time_us(); //GGML_JNI_NOTIFY("Running ggml_graph_compute\n"); - ggml_graph_compute_helper(work_buffer, gf31, benchmark_params.n_threads); + ggml_graph_compute_helper(gf31,work_buffer, benchmark_params.n_threads, nullptr, nullptr); long long int stop = ggml_time_us(); long long int usec = stop-start; @@ -996,7 +1001,7 @@ void ggml_bench_matrix(int num_threads, int backend_type) { } // Running a different graph computation to make sure we override the CPU cache lines - ggml_graph_compute_helper(work_buffer, gf32, benchmark_params.n_threads); + ggml_graph_compute_helper(gf,work_buffer, benchmark_params.n_threads, nullptr, nullptr); } GGML_JNI_NOTIFY("\n"); GGML_JNI_NOTIFY("Average%78.2f\n",gflops_sum/((double)benchmark_params.n_iterations)); @@ -3298,19 +3303,26 @@ static intptr_t alignTo(size_t alignment, intptr_t offset) { } -static void ggml_graph_compute_helper(std::vector & buf, ggml_cgraph * graph, int n_threads) { + +static bool ggml_graph_compute_helper( + struct ggml_cgraph * graph, + std::vector & buf, + int n_threads, + ggml_abort_callback abort_callback, + void * abort_callback_data) { struct ggml_cplan plan = ggml_graph_plan(graph, n_threads); + plan.abort_callback = abort_callback; + plan.abort_callback_data = abort_callback_data; + if (plan.work_size > 0) { buf.resize(plan.work_size); plan.work_data = buf.data(); } - ggml_graph_compute(graph, &plan); + return ggml_graph_compute(graph, &plan); } - - static float tensor_sum_elements(const ggml_tensor * tensor) { double sum = 0; float value = 0; @@ -6854,7 +6866,7 @@ int qnn_matrix(int n_backend_type, int n_op_type) { m2 = ggml_add(ctx, m0, m1); // GGML_OP_ADD gf = ggml_new_graph(ctx); ggml_build_forward_expand(gf, m2); - ggml_graph_compute_helper(work_buffer, gf, 4); + ggml_graph_compute_helper(gf, work_buffer, 4, nullptr, nullptr); TENSOR_DUMP(m0); TENSOR_DUMP(m1); TENSOR_DUMP(m2); @@ -7254,7 +7266,7 @@ int qnn_ggml(int n_backend_type, int n_ggml_op_type) { gf = ggml_new_graph(ctx); ggml_set_f32(m2, 0.0f); ggml_build_forward_expand(gf, m2); - ggml_graph_compute_helper(work_buffer, gf, 4); + ggml_graph_compute_helper(gf,work_buffer, 4, nullptr, nullptr); TENSOR_DUMP(m0); TENSOR_DUMP(m1); @@ -8194,6 +8206,8 @@ int qnn_ggml_op(const char * model_path, int num_threads, int n_backend_type, in GGML_JNI_NOTIFY("starting qnn_ggml_op UT(unit test)\n"); #if 0 // for performance comparison between QNN backend and original GGML + // on Xiaomi14, 9x performance gain + // on low-end phone, 3x performance gain const int sizey = 4096; const int sizex = 4096; const int sizez = 128; @@ -8267,7 +8281,7 @@ int qnn_ggml_op(const char * model_path, int num_threads, int n_backend_type, in GGML_JNI_NOTIFY("creating compute graph\n"); gf = ggml_new_graph(ctx); ggml_build_forward_expand(gf, dst); - ggml_graph_compute_helper(work_buffer, gf, num_threads); + ggml_graph_compute_helper(gf,work_buffer, num_threads, nullptr, nullptr); if (get_tensor_data_size(dst) < 100) { TENSOR_DUMP(src0); @@ -8297,5 +8311,228 @@ int qnn_ggml_op(const char * model_path, int num_threads, int n_backend_type, in GGML_JNI_NOTIFY("duration of qnn_ggml_op %d(%s) with backend %d(%s) is: %lld milliseconds\n", n_ggml_op_type, ggml_op_name((enum ggml_op)n_ggml_op_type), n_backend_type, get_qnn_backend_name(n_backend_type), n_durtion); 
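For reference, a minimal, hedged usage sketch of the reworked ggml_graph_compute_helper(): graph first, then the work buffer, the thread count, and the optional abort-callback pair. It assumes the work buffer is std::vector<uint8_t> (the element type is not visible in this hunk) and simply mirrors the helper added in this patch; the tiny FP32 addition graph follows the 2x2 GGML_OP_ADD graph built in qnn_matrix().

// Hedged sketch, not part of the patch: drives the new helper signature on a tiny
// FP32 addition graph. Assumes the buffer type is std::vector<uint8_t>.
#include "ggml.h"
#include <cstdint>
#include <cstdio>
#include <vector>

// mirrors the helper added in ggml-jni-impl-external.cpp by this patch
static bool ggml_graph_compute_helper(struct ggml_cgraph * graph,
                                      std::vector<uint8_t> & buf,
                                      int n_threads,
                                      ggml_abort_callback abort_callback,
                                      void * abort_callback_data) {
    struct ggml_cplan plan = ggml_graph_plan(graph, n_threads);

    plan.abort_callback      = abort_callback;
    plan.abort_callback_data = abort_callback_data;

    if (plan.work_size > 0) {
        buf.resize(plan.work_size);
        plan.work_data = buf.data();
    }

    return ggml_graph_compute(graph, &plan);
}

static void demo_compute_helper(void) {
    std::vector<uint8_t> work_buffer;

    struct ggml_init_params params = {
        /*.mem_size   =*/ 16 * 1024 * 1024,
        /*.mem_buffer =*/ nullptr,
        /*.no_alloc   =*/ false,
    };
    struct ggml_context * ctx = ggml_init(params);

    struct ggml_tensor * m0 = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 2, 2);
    struct ggml_tensor * m1 = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 2, 2);
    ggml_set_f32(m0, 1.0f);
    ggml_set_f32(m1, 2.0f);
    struct ggml_tensor * m2 = ggml_add(ctx, m0, m1); // GGML_OP_ADD, as in qnn_matrix()

    struct ggml_cgraph * gf = ggml_new_graph(ctx);
    ggml_build_forward_expand(gf, m2);

    // graph first, then work buffer, thread count, and the (optional) abort callback pair
    ggml_graph_compute_helper(gf, work_buffer, 4, nullptr, nullptr);

    printf("m2[0] = %f, expected 3.0\n", ggml_get_f32_1d(m2, 0));
    ggml_free(ctx);
}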
LOGGD("leave qnn_ggml_op UT(unit test)\n"); + return 0; +} + + +/** + * similar to qnn_ggml_op, but an automation UT for a specify GGML OP with a specify backend + * + * this function borrow from whisper.cpp + */ +//TODO: 1. only support GGML_OP_ADD, GGML_OP_MUL, GGMPL_OP_MULMAT +// 2. works with FP32 +int qnn_ggml_op_automation_ut(const char *model_path, int num_threads, int n_backend_type, + int n_ggml_op_type) { + int result = 0; + int64_t n_begin_time = 0LL; + int64_t n_end_time = 0LL; + int64_t n_durtion = 0LL; + + + LOGGD("enter qnn_ggml_op_automation_ut\n"); + LOGGI("mode path:%s", model_path); + LOGGI("num_threads:%d", num_threads); + LOGGI("backend_type:%d(%s)", n_backend_type, get_qnn_backend_name(n_backend_type)); + LOGGI("ggml op:%d(%s)", n_ggml_op_type, ggml_op_name((enum ggml_op) n_ggml_op_type)); + GGML_JNI_NOTIFY("starting qnn_ggml_op_automation_ut(automation unit test)\n"); + + n_begin_time = ggml_time_us(); + + srand(time(NULL)); + + bool support_ops = (n_ggml_op_type == GGML_OP_MUL_MAT || n_ggml_op_type == GGML_OP_MUL || n_ggml_op_type == GGML_OP_ADD); + if (!support_ops) { + LOGGD("ggml op %d(%s) not supported with backend %d(%s)", n_ggml_op_type, + ggml_op_name((enum ggml_op) n_ggml_op_type), n_backend_type, + get_qnn_backend_name(n_backend_type)); + GGML_JNI_NOTIFY("ggml op %d(%s) not supported with backend %d(%s)", n_ggml_op_type, + ggml_op_name((enum ggml_op) n_ggml_op_type), n_backend_type, + get_qnn_backend_name(n_backend_type)); + LOGGD("leave qnn_ggml_op UT(unit test)\n"); + + return 1; + } + + + char strbuf[256]; + std::string tipString; + tipString = ""; + + const int n_max = 128; + + const std::vector sizes = { + 64, 128, 256, 512, 1024, 2048, 4096, + }; + + const size_t N_max = sizes.back(); + + // a: N*N*sizeof(float) + // b: N*N*sizeof(float) + // c: N*N*sizeof(float) + // when F16 is used, there is an extra work buffer of size N*N*sizeof(float) + std::vector buf( + 3llu * N_max * N_max * sizeof(float) + 3 * ggml_tensor_overhead() + + ggml_graph_overhead()); + std::vector work; + + + tipString += "\nprepare matrix"; + kantv_asr_notify_benchmark_c("prepare matrix\n"); + + for (size_t i = 0; i < buf.size(); i++) buf[i] = i; + + for (int j = 0; j < (int) sizes.size(); j++) { + int n_q4_0 = 0; + int n_q4_1 = 0; + int n_q5_0 = 0; + int n_q5_1 = 0; + int n_q8_0 = 0; + int n_fp16 = 0; + int n_fp32 = 0; + + // GFLOPS/s + double s_q4_0 = 0.0; + double s_q4_1 = 0.0; + double s_q5_0 = 0.0; + double s_q5_1 = 0.0; + double s_q8_0 = 0.0; + double s_fp16 = 0.0; + double s_fp32 = 0.0; + + const size_t N = sizes[j]; +#if 0 + for (int k = 0; k < 7; ++k) { + const ggml_type wtype = + k == 0 ? GGML_TYPE_Q4_0 : + k == 1 ? GGML_TYPE_Q4_1 : + k == 2 ? GGML_TYPE_Q5_0 : + k == 3 ? GGML_TYPE_Q5_1 : + k == 4 ? GGML_TYPE_Q8_0 : + k == 5 ? GGML_TYPE_F16 : GGML_TYPE_F32; +#else + for (int k = 0; k < 1; ++k) { + const ggml_type wtype = GGML_TYPE_F32; //TODO: only f16&f32 supported with QNN backend + k = 6; //hardcode to 6 make following code happy +#endif + + + double &s = + k == 0 ? s_q4_0 : k == 1 ? s_q4_1 : k == 2 ? s_q5_0 : k == 3 ? s_q5_1 : + k == 4 ? s_q8_0 : + k == 5 ? s_fp16 + : /*k == 6*/ s_fp32; + int &n = k == 0 ? n_q4_0 : k == 1 ? n_q4_1 : k == 2 ? n_q5_0 : k == 3 ? n_q5_1 : + k == 4 ? n_q8_0 : + k == 5 ? 
n_fp16 + : /*k == 6*/ n_fp32; + + struct ggml_init_params gparams = { + /*.mem_size =*/ buf.size(), + /*.mem_buffer =*/ buf.data(), + /*.no_alloc =*/ false, + }; +#ifdef GGML_USE_QNN + if (n_backend_type != + 3) //3 is fake QNN backend "ggml", just used to compare performance between QNN backend and original GGML + gparams.use_hwaccel = true; +#endif + struct ggml_context *ctx0 = ggml_init(gparams); + + struct ggml_tensor *a = ggml_new_tensor_2d(ctx0, wtype, N, N); + struct ggml_tensor *b = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, N, N); + + struct ggml_tensor *c = nullptr; + + switch (n_ggml_op_type) { + case GGML_OP_ADD: + c = ggml_add(ctx0, a, b); + break; + case GGML_OP_MUL: + c = ggml_mul(ctx0, a, b); + break; + case GGML_OP_MUL_MAT: + c = ggml_mul_mat(ctx0, a, b); + break; + } + + + struct ggml_cgraph *gf = ggml_new_graph(ctx0); + + ggml_build_forward_expand(gf, c); + + double tsum = 0.0; + + // heat-up + ggml_graph_compute_helper(gf, work, num_threads, nullptr, nullptr); + + for (int i = 0; i < n_max; ++i) { + const int64_t t0 = ggml_time_us(); + + kantv_asr_notify_benchmark_c("reset"); + tipString = "calling ggml_graphic_compute_helper:\n"; + tipString += "j= " + std::to_string(j) + "(matrix dimension = " + + std::to_string(N) + ",n_max=" + std::to_string(n_max) + ")" + + ",k=" + std::to_string(k) + "(ggml quant type=" + + std::string(whisper_get_ggml_type_str( + static_cast(wtype))) + ")" + + ",i=" + std::to_string(i) + "\n"; + + kantv_asr_notify_benchmark(tipString); + + ggml_graph_compute_helper(gf, work, num_threads, nullptr, nullptr); + + const int64_t t1 = ggml_time_us(); + + tsum += (t1 - t0) * 1e-6; + n++; + + if (tsum > 1.0 && n >= 3) { + break; + } + } + + ggml_free(ctx0); + + s = ((2.0 * N * N * N * n) / tsum) * 1e-9; + } + + kantv_asr_notify_benchmark_c("reset"); + tipString = ""; + // Q4_0 | Q4_1 + snprintf(strbuf, sizeof(strbuf), + "%4zu x %4zu: Q4_0 %7.1f GFLOPS (%3d runs) | Q4_1 %7.1f GFLOPS (%3d runs)\n", + N, N, s_q4_0, n_q4_0, s_q4_1, n_q4_1); + tipString += strbuf; + + // Q5_0 | Q5_1 | Q8_0 + snprintf(strbuf, sizeof(strbuf), + "%4zu x %4zu: Q5_0 %7.1f GFLOPS (%3d runs) | Q5_1 %7.1f GFLOPS (%3d runs) | Q8_0 %7.1f GFLOPS (%3d runs)\n", + N, N, s_q5_0, n_q5_0, s_q5_1, n_q5_1, s_q8_0, n_q8_0); + tipString += strbuf; + + // F16 | F32 + snprintf(strbuf, sizeof(strbuf), + "%4zu x %4zu: F16 %7.1f GFLOPS (%3d runs) | F32 %7.1f GFLOPS (%3d runs)\n", + N, N, s_fp16, n_fp16, s_fp32, n_fp32); + tipString += strbuf; + + + kantv_asr_notify_benchmark(tipString); + LOGGD("%s\n", tipString.c_str()); + } + + + n_end_time = ggml_time_us(); + n_durtion = (n_end_time - n_begin_time) / 1000; + LOGGD("duration of qnn_ggml_op_automation_ut %d(%s) with backend %d(%s) is: %lld milliseconds\n", + n_ggml_op_type, ggml_op_name((enum ggml_op) n_ggml_op_type), n_backend_type, + get_qnn_backend_name(n_backend_type), n_durtion); + GGML_JNI_NOTIFY( + "duration of qnn_ggml_op_automation_ut %d(%s) with backend %d(%s) is: %lld milliseconds\n", + n_ggml_op_type, ggml_op_name((enum ggml_op) n_ggml_op_type), n_backend_type, + get_qnn_backend_name(n_backend_type), n_durtion); + LOGGD("leave qnn_ggml_op_automation_ut(automation unit test)\n"); + return 0; } \ No newline at end of file diff --git a/core/ggml/jni/ggml-jni-impl.cpp b/core/ggml/jni/ggml-jni-impl.cpp index 9b9092716..721fa4b8c 100644 --- a/core/ggml/jni/ggml-jni-impl.cpp +++ b/core/ggml/jni/ggml-jni-impl.cpp @@ -642,7 +642,8 @@ static const char * whisper_transcribe_from_file(const char * sz_model_path, con context = 
whisper_init_from_file_with_params(sz_model_path, wcp); if (nullptr == context) { LOGGW("whisper_init_from_file_with_params failure, pls check why\n"); - GGML_JNI_NOTIFY("whisper_init_from_file_with_params failure, pls check why(pls check whether whispercpp model is valid)\n"); + GGML_JNI_NOTIFY("whisper_init_from_file_with_params failure(%s), pls check why? whether whispercpp model %s is valid ?\n", + whisper_get_internal_error(), sz_model_path); result = -1; goto failure; } @@ -803,7 +804,7 @@ void whisper_set_benchmark_status(int b_exit_benchmark) { * * @param sz_model_path /sdcard/kantv/ggml-xxxxxx.bin or /sdcard/kantv/xxxxxx.gguf or qualcomm's prebuilt dedicated model.so or "" * @param sz_audio_path /sdcard/kantv/jfk.wav - * @param n_bench_type 0: asr(transcription) 1: memcpy 2: mulmat 3: full/whisper_encode 4: matrix 5: LLAMA 6: stable diffusion 7: QNN sample 8: QNN saver 9: QNN matrix 10: QNN GGML 11: QNN complex 12: QNN GGML OP + * @param n_bench_type 0: whisper asr 1: memcpy 2: mulmat 3: whisper full 4: LLAMA 5: stable diffusion 6: QNN sample 7: QNN saver 8: QNN matrix 9: QNN GGML 10: QNN complex 11: QNN GGML OP(QNN UT) 12: QNN UT automation * @param n_threads 1 - 8 * @param n_backend_type 0: CPU 1: GPU 2: DSP 3: ggml("fake" QNN backend, just for compare performance) * @param n_op_type type of matrix manipulate / GGML OP / type of various complex/complicated compute graph @@ -855,16 +856,17 @@ void ggml_jni_bench(const char * sz_model_path, const char *sz_audio_path, int n break; case BECHMARK_MULMAT: - whisper_bench_ggml_mul_mat(n_threads); + whisper_bench_ggml_mul_mat(n_threads, n_backend_type); break; case BECHMARK_FULL: whisper_bench_full(); break; - + /* not used since 04-20-2024 case BENCHMARK_MATRIX: ggml_bench_matrix(n_backend_type, n_threads); break; + */ case BENCHMAKR_LLAMA: ggml_bench_llama(sz_model_path, n_threads, n_backend_type); @@ -919,22 +921,26 @@ void ggml_jni_bench(const char * sz_model_path, const char *sz_audio_path, int n } break; - case BENCHMARK_QNN_MATRIX: + case BENCHMARK_QNN_MATRIX: //offload a simple fp32 2x2 matrix addition operation to QNN qnn_matrix(n_backend_type, n_op_type); break; - case BENCHMARK_QNN_GGML: + case BENCHMARK_QNN_GGML: //mapping ggml tensor to QNN tensor qnn_ggml(n_backend_type, n_op_type); break; - case BENCHMARK_QNN_COMPLEX: + case BENCHMARK_QNN_COMPLEX: //complicated computation graph in C/C++ or GGML and then offload them to QNN qnn_complex_graph(n_backend_type, n_op_type); break; - case BENCHMARK_QNN_GGML_OP: + case BENCHMARK_QNN_GGML_OP: //UT for PoC-S49: implementation of GGML OPs using QNN API qnn_ggml_op(sz_model_path, n_threads, n_backend_type, n_op_type); break; + case BENCHMARK_QNN_AUTO_UT://automation UT for PoC-S49: implementation of GGML OPs using QNN API + qnn_ggml_op_automation_ut(sz_model_path, n_threads, n_backend_type, n_op_type); + break; + default: break; } diff --git a/core/ggml/jni/ggml-jni.h b/core/ggml/jni/ggml-jni.h index 03b704af6..16a051b7c 100644 --- a/core/ggml/jni/ggml-jni.h +++ b/core/ggml/jni/ggml-jni.h @@ -53,22 +53,23 @@ extern "C" { #define JNI_BUF_LEN 4096 #define JNI_TMP_LEN 256 -#define BECHMARK_ASR 0 -#define BECHMARK_MEMCPY 1 -#define BECHMARK_MULMAT 2 -#define BECHMARK_FULL 3 -#define BENCHMARK_MATRIX 4 -#define BENCHMAKR_LLAMA 5 -#define BENCHMAKR_STABLEDIFFUSION 6 //not work on Xiaomi 14 currently -// I think there are three killer/heavyweight AI applications based on GGML currently: whisper.cpp, llama.cpp, stablediffusion.cpp, so they are here +#define BECHMARK_ASR 0 //whisper.cpp ASR 
benchmark +#define BECHMARK_MEMCPY 1 //memcpy benchmark +#define BECHMARK_MULMAT 2 //mulmat benchmark +#define BECHMARK_FULL 3 //whisper.cpp full benchmark +/*#define BENCHMARK_MATRIX 4*/ //not used since 04-20-2024 +#define BENCHMAKR_LLAMA 4 //llama.cpp benchmark +#define BENCHMAKR_STABLEDIFFUSION 5 //stable diffusion benchmark, not work on Xiaomi 14 currently +// there are three killer/heavyweight AI applications based on GGML currently: whisper.cpp, llama.cpp, stablediffusion.cpp, so they are here // then // step-by-step for PoC:Add Qualcomm mobile SoC native backend for GGML https://github.com/zhouwg/kantv/issues/121 -#define BENCHMAKR_QNN_SAMPLE 7 //"play with /say hello to" QNN Sample -#define BENCHMAKR_QNN_SAVER 8 //study QNN SDK mechanism by QNN Saver -#define BENCHMARK_QNN_MATRIX 9 //offload a simple fp32 2x2 matrix addition operation to QNN -#define BENCHMARK_QNN_GGML 10 //mapping ggml tensor to QNN tensor -#define BENCHMARK_QNN_COMPLEX 11 //complex/complicated computation graph in C/C++ or GGML and then offload them to QNN -#define BENCHMARK_QNN_GGML_OP 12 //for PoC-S49: implementation of other GGML OP(non-mulmat) using QNN API +#define BENCHMAKR_QNN_SAMPLE 6 //"play with /say hello to" QNN Sample +#define BENCHMAKR_QNN_SAVER 7 //study QNN SDK mechanism by QNN Saver +#define BENCHMARK_QNN_MATRIX 8 //offload a simple fp32 2x2 matrix addition operation to QNN +#define BENCHMARK_QNN_GGML 9 //mapping ggml tensor to QNN tensor +#define BENCHMARK_QNN_COMPLEX 10 //complex/complicated computation graph in C/C++ or GGML and then offload them to QNN +#define BENCHMARK_QNN_GGML_OP 11 //UT for PoC-S49: implementation of GGML OPs using QNN API +#define BENCHMARK_QNN_AUTO_UT 12 //automation UT for PoC-S49: implementation of GGML OPs using QNN API #define BENCHMAKR_MAX 12 #define BACKEND_CPU 0 @@ -88,7 +89,7 @@ extern "C" { * * @param sz_model_path /sdcard/kantv/ggml-xxxxxx.bin or /sdcard/kantv/xxxxxx.gguf or qualcomm's prebuilt dedicated model.so or "" * @param sz_audio_path /sdcard/kantv/jfk.wav - * @param n_bench_type 0: asr(transcription) 1: memcpy 2: mulmat 3: full/whisper_encode 4: matrix 5: LLAMA 6: stable diffusion 7: QNN sample 8: QNN saver 9: QNN matrix 10: QNN GGML 11: QNN complex 12: QNN GGML OP + * @param n_bench_type 0: whisper asr 1: memcpy 2: mulmat 3: whisper full 4: LLAMA 5: stable diffusion 6: QNN sample 7: QNN saver 8: QNN matrix 9: QNN GGML 10: QNN complex 11: QNN GGML OP(QNN UT) 12: QNN UT automation * @param n_threads 1 - 8 * @param n_backend_type 0: CPU 1: GPU 2: DSP 3: ggml("fake" QNN backend, just for compare performance) * @param n_op_type type of matrix manipulate / GGML OP / type of various complex/complicated computation graph @@ -204,6 +205,11 @@ extern "C" { */ int qnn_ggml_op(const char * model_path, int num_threads, int n_backend_type, int n_ggml_op_type); + /** + * similar to qnn_ggml_op, but an automation UT for a specify GGML OP with a specify backend + */ + int qnn_ggml_op_automation_ut(const char * model_path, int num_threads, int n_backend_type, int n_ggml_op_type); + // ================================================================================================= // trying to integrate stablediffusion.cpp on 04-06-2024(Apri,6,2024) diff --git a/core/ggml/llamacpp/ggml-backend.c b/core/ggml/llamacpp/ggml-backend.c index 777fe1ff9..e5feb9ff5 100644 --- a/core/ggml/llamacpp/ggml-backend.c +++ b/core/ggml/llamacpp/ggml-backend.c @@ -809,14 +809,14 @@ GGML_CALL static enum ggml_status ggml_backend_cpu_graph_plan_compute(ggml_backe } GGML_CALL static enum 
ggml_status ggml_backend_cpu_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) { - ENTER_FUNC(); + //ENTER_FUNC(); struct ggml_backend_cpu_context * cpu_ctx = (struct ggml_backend_cpu_context *)backend->context; struct ggml_cplan cplan = ggml_graph_plan(cgraph, cpu_ctx->n_threads); if (cpu_ctx->work_size < cplan.work_size) { free(cpu_ctx->work_data); - LOGGI("here"); + //LOGGI("here"); cpu_ctx->work_data = malloc(cplan.work_size); if (cpu_ctx->work_data == NULL) { cpu_ctx->work_size = 0; @@ -824,13 +824,13 @@ GGML_CALL static enum ggml_status ggml_backend_cpu_graph_compute(ggml_backend_t } cpu_ctx->work_size = cplan.work_size; } - LOGGI("here"); + //LOGGI("here"); cplan.work_data = cpu_ctx->work_data; cplan.abort_callback = cpu_ctx->abort_callback; cplan.abort_callback_data = cpu_ctx->abort_callback_data; - LEAVE_FUNC(); + //LEAVE_FUNC(); return ggml_graph_compute(cgraph, &cplan); } diff --git a/core/ggml/llamacpp/ggml-qnn.cpp b/core/ggml/llamacpp/ggml-qnn.cpp index 838e6bbbd..68d96b981 100644 --- a/core/ggml/llamacpp/ggml-qnn.cpp +++ b/core/ggml/llamacpp/ggml-qnn.cpp @@ -20,7 +20,7 @@ * * 5. lack of resource management of internal QNN resources and toggle between different backend(QNN CPU backend, QNN GPU backend, ggml...) * - * 6. only support FP32 / FP16 and many strict limitation(depend on QNN SDK) + * 6. only support FP32 / FP16(other data type not used currently) * * 7. QNN's RPC feature not used currently * @@ -180,6 +180,7 @@ struct qnn_buf_s struct ggml_backend_qnn_context { int device; + int threads; char name[GGML_MAX_NAME]; char lib[GGML_MAX_NAME]; qnn_instance * instance; @@ -260,9 +261,9 @@ static void ggml_setup_op_has_task_pass(void) { //QNN cDSP and HTA backend would not be used currently, just focus on QNN CPU/GPU/HTP(aka DSP) backend currently static struct ggml_backend_qnn_context g_qnn_mgr[GGML_QNN_MAX_DEVICES] = { - [QNN_CPU] = {.device = 0, .name = "qnn-cpu", .lib = "libQnnCpu.so", .instance = nullptr, .buffer_pool = nullptr, .backend = nullptr, .raw_interface = nullptr, .raw_system_interface = nullptr}, - [QNN_GPU] = {.device = 1, .name = "qnn-gpu", .lib = "libQnnGpu.so", .instance = nullptr, .buffer_pool = nullptr, .backend = nullptr, .raw_interface = nullptr, .raw_system_interface = nullptr}, - [QNN_HTP] = {.device = 2, .name = "qnn-htp", .lib = "libQnnHtp.so", .instance = nullptr, .buffer_pool = nullptr, .backend = nullptr, .raw_interface = nullptr, .raw_system_interface = nullptr}, + [QNN_CPU] = {.device = 0, .threads = 1, .name = "qnn-cpu", .lib = "libQnnCpu.so", .instance = nullptr, .buffer_pool = nullptr, .backend = nullptr, .raw_interface = nullptr, .raw_system_interface = nullptr}, + [QNN_GPU] = {.device = 1, .threads = 1, .name = "qnn-gpu", .lib = "libQnnGpu.so", .instance = nullptr, .buffer_pool = nullptr, .backend = nullptr, .raw_interface = nullptr, .raw_system_interface = nullptr}, + [QNN_HTP] = {.device = 2, .threads = 1, .name = "qnn-htp(aka dsp)", .lib = "libQnnHtp.so", .instance = nullptr, .buffer_pool = nullptr, .backend = nullptr, .raw_interface = nullptr, .raw_system_interface = nullptr}, }; @@ -1469,7 +1470,7 @@ static void ggml_qnn_logcallback(const char * fmt, int len_content = 0; memset(s_ggml_qnn_logbuf, 0, GGML_QNN_LOGBUF_LEN); len_content = vsnprintf(reinterpret_cast(s_ggml_qnn_logbuf), GGML_QNN_LOGBUF_LEN, fmt, argp); - //LOGGD("%8.1fms [%-7s] %s ", ms, levelStr, s_ggml_qnn_logbuf); + LOGGD("%8.1fms [%-7s] %s ", ms, levelStr, s_ggml_qnn_logbuf); } } @@ -1760,33 +1761,27 @@ static bool ggml_qnn_can_handle_op(const struct 
ggml_tensor * src0, const struct return false; } - const int64_t ne10 = src0->ne[0]; - const int64_t ne11 = src0->ne[1]; - const int64_t ne20 = src1->ne[0]; - const int64_t ne21 = src1->ne[1]; + const int64_t ne00 = src0->ne[0]; + const int64_t ne01 = src0->ne[1]; + const int64_t ne10 = src1->ne[0]; + const int64_t ne11 = src1->ne[1]; const int64_t ne0 = dst->ne[0]; const int64_t ne1 = dst->ne[1]; - /* - return (src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16) && - (src1->type == GGML_TYPE_F32 || src1->type == GGML_TYPE_F16) && - (dst->type == GGML_TYPE_F32); - */ - //make QNN SDK happy if (dst->op == GGML_OP_ADD) { return (src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16) && (src1->type == GGML_TYPE_F32 || src1->type == GGML_TYPE_F16) && - (dst->type == GGML_TYPE_F32 || dst->type == GGML_TYPE_F16) && ((ne10 > 1 && ne11 > 1 && ne20 > 1 && ne21 > 1)) && + (dst->type == GGML_TYPE_F32 || dst->type == GGML_TYPE_F16) && ((ne00 > 1 && ne01 > 1 && ne10 > 1 && ne11 > 1)) && (src0->rank == src1->rank); } //make QNN SDK happy return (src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16) && (src1->type == GGML_TYPE_F32 || src1->type == GGML_TYPE_F16) && - (src0->type == src1->type) && (src0->type == dst->type) && ((ne10 > 1 && ne11 > 1 && ne20 > 1 && ne21 > 1)); + (src0->type == src1->type) && (src0->type == dst->type) && ((ne00 > 1 && ne01 > 1 && ne10 > 1 && ne11 > 1)); } @@ -3520,7 +3515,7 @@ static ggml_status ggml_backend_qnn_graph_compute(ggml_backend_t backend, ggml_c int task_phase = GGML_TASK_TYPE_FINALIZE; ggml_backend_qnn_context * ctx = (ggml_backend_qnn_context *) backend->context; - struct ggml_cplan plan = ggml_graph_plan(cgraph, 1); + struct ggml_cplan plan = ggml_graph_plan(cgraph, 1);//TODO: multithread support in QNN backend buf_element_t * qnn_buf = nullptr; @@ -3654,6 +3649,13 @@ bool ggml_backend_is_qnn(ggml_backend_t backend) { } +void ggml_backend_qnn_set_n_threads(ggml_backend_t backend, int n_threads) { + GGML_ASSERT(ggml_backend_is_qnn(backend)); + + struct ggml_backend_qnn_context * ctx = (struct ggml_backend_qnn_context *)backend->context; + ctx->threads = n_threads; +} + const char * ggml_backend_qnn_get_name(ggml_backend_t backend) { return backend->iface.get_name(backend); } diff --git a/core/ggml/llamacpp/ggml-qnn.h b/core/ggml/llamacpp/ggml-qnn.h index 3ed0121a1..88d3014aa 100644 --- a/core/ggml/llamacpp/ggml-qnn.h +++ b/core/ggml/llamacpp/ggml-qnn.h @@ -31,6 +31,8 @@ GGML_API ggml_backend_t ggml_backend_qnn_init(size_t dev_num); GGML_API bool ggml_backend_is_qnn(ggml_backend_t backend); +GGML_API void ggml_backend_qnn_set_n_threads(ggml_backend_t backend, int n_threads); + GGML_API int ggml_backend_qnn_get_device_count(void); GGML_API void ggml_backend_qnn_get_device_description(int device, char * description, size_t description_size); diff --git a/core/ggml/llamacpp/ggml.c b/core/ggml/llamacpp/ggml.c index 32c65a07b..64f9f36cb 100644 --- a/core/ggml/llamacpp/ggml.c +++ b/core/ggml/llamacpp/ggml.c @@ -18321,7 +18321,7 @@ static thread_ret_t ggml_graph_compute_thread(void * data) { // distribute new work or execute it direct if 1T while (++node_n < cgraph->n_nodes) { - GGML_PRINT_DEBUG_5("%s: %d/%d\n", __func__, node_n, cgraph->n_nodes); + //LOGGI("%s: %d/%d\n", __func__, node_n, cgraph->n_nodes); struct ggml_tensor * node = cgraph->nodes[node_n]; const int n_tasks = ggml_get_n_tasks(node, n_threads, state->shared->n_threads); diff --git a/core/ggml/whispercpp/whisper.cpp b/core/ggml/whispercpp/whisper.cpp index b232e5f64..672e7d934 100644 
--- a/core/ggml/whispercpp/whisper.cpp +++ b/core/ggml/whispercpp/whisper.cpp @@ -146,6 +146,11 @@ extern "C" int __android_log_print(int prio, const char * tag, const char * fmt, __attribute__((__format__(printf, 3, 4))); #endif +static std::string _s_internal_error_info = "unknown"; + +const char * whisper_get_internal_error() { + return _s_internal_error_info.c_str(); +} WHISPER_ATTRIBUTE_FORMAT(5, 6) static void whisper_log_internal (ggml_log_level level, const char * file, const char * func, int line, const char * format, ...); @@ -223,10 +228,9 @@ static bool ggml_graph_compute_helper( return false; } } - return ggml_graph_compute(graph, &plan) == GGML_STATUS_SUCCESS; //04-17-2024, works perfectly with whisper.cpp + return ggml_graph_compute(graph, &plan) == GGML_STATUS_SUCCESS; #else - //04-17-2024, refer to PoC-S49: implementation of other GGML OP(non-mulmat) using QNN API,https://github.com/zhouwg/kantv/issues/121 - return ggml_backend_graph_compute(backend, graph) == GGML_STATUS_SUCCESS; + ggml_backend_qnn_set_n_threads(backend, n_threads); #endif } #endif @@ -1292,7 +1296,11 @@ static ggml_backend_t whisper_backend_init(const whisper_context_params & params WHISPER_LOG_INFO("%s: using QNN backend\n", __func__); backend_gpu = ggml_backend_qnn_init(params.gpu_device); if (!backend_gpu) { - WHISPER_LOG_ERROR("%s: ggml_backend_qnn_init() failed\n", __func__); + char device_name[GGML_MAX_NAME]; + ggml_backend_qnn_get_device_description(params.gpu_device, device_name, GGML_MAX_NAME); + WHISPER_LOG_ERROR("%s: ggml_backend_qnn_init() failed with device %d(%s)\n", __func__, params.gpu_device, device_name); + _s_internal_error_info = "ggml_backend_qnn_init() failed with device (" + std::to_string(params.gpu_device) + ") " + device_name; + return nullptr; //04-20-2024, do not fall into the default GGML CPU backend, so the upper layer code/UI could know correct feedback } } } @@ -3559,6 +3567,7 @@ struct whisper_context * whisper_init_from_file_with_params(const char * path_mo return nullptr; } + _s_internal_error_info = "unknown"; ctx->state = whisper_init_state(ctx); if (!ctx->state) { WHISPER_LOG_INFO("whisper init failure\n"); @@ -6572,12 +6581,12 @@ WHISPER_API const char * whisper_bench_memcpy_str(int n_threads) { return s.c_str(); } -WHISPER_API int whisper_bench_ggml_mul_mat(int n_threads) { - fputs(whisper_bench_ggml_mul_mat_str(n_threads), stderr); +WHISPER_API int whisper_bench_ggml_mul_mat(int n_threads, int n_backend) { + fputs(whisper_bench_ggml_mul_mat_str(n_threads, n_backend), stderr); return 0; } -WHISPER_API const char * whisper_bench_ggml_mul_mat_str(int n_threads) { +WHISPER_API const char * whisper_bench_ggml_mul_mat_str(int n_threads, int n_backend) { static std::string s; s = ""; char strbuf[256]; @@ -6658,10 +6667,13 @@ WHISPER_API const char * whisper_bench_ggml_mul_mat_str(int n_threads) { /*.mem_buffer =*/ buf.data(), /*.no_alloc =*/ false, }; + #ifdef GGML_USE_QNN - gparams.use_hwaccel = true; + if (n_backend != 3) //3 is fake QNN backend "ggml", just used to compare performance between QNN backend and original GGML + gparams.use_hwaccel = true; #endif + struct ggml_context * ctx0 = ggml_init(gparams); struct ggml_tensor * a = ggml_new_tensor_2d(ctx0, wtype, N, N); diff --git a/core/ggml/whispercpp/whisper.h b/core/ggml/whispercpp/whisper.h index bd8d8df82..becbcd075 100644 --- a/core/ggml/whispercpp/whisper.h +++ b/core/ggml/whispercpp/whisper.h @@ -654,8 +654,10 @@ extern "C" { WHISPER_API int whisper_bench_memcpy (int n_threads); WHISPER_API const char * 
whisper_bench_memcpy_str      (int n_threads);
-    WHISPER_API int          whisper_bench_ggml_mul_mat    (int n_threads);
-    WHISPER_API const char * whisper_bench_ggml_mul_mat_str(int n_threads);
+    WHISPER_API int          whisper_bench_ggml_mul_mat    (int n_threads, int n_backend);
+    WHISPER_API const char * whisper_bench_ggml_mul_mat_str(int n_threads, int n_backend);
+
+    WHISPER_API const char * whisper_get_internal_error(void);
 
     // Control logging output; default behavior is to print to stderr
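For the new whisper_get_internal_error() entry point, a hedged sketch of the intended calling pattern on the JNI side (standard whisper.cpp init API assumed): the internal error string is reset to "unknown" at the start of each whisper_init_from_file_with_params() call and only carries detail when whisper_backend_init() rejects the requested QNN device, so it is safe to read right after a failed init.

// Hedged sketch, not part of the patch: surface the QNN init failure reason to the
// upper layer instead of the generic "failure, pls check why" message.
#include "whisper.h"
#include <cstdio>

static struct whisper_context * init_whisper_or_report(const char * sz_model_path, int gpu_device) {
    struct whisper_context_params wcp = whisper_context_default_params();
    wcp.use_gpu    = true;
    wcp.gpu_device = gpu_device; // 0: QNN CPU, 1: QNN GPU, 2: QNN HTP(aka DSP)

    struct whisper_context * context = whisper_init_from_file_with_params(sz_model_path, wcp);
    if (context == nullptr) {
        // whisper_get_internal_error() now explains why, e.g.
        // "ggml_backend_qnn_init() failed with device (2) ..."
        printf("whisper_init_from_file_with_params failure(%s), model %s\n",
               whisper_get_internal_error(), sz_model_path);
        return nullptr;
    }
    return context;
}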
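ggml_backend_qnn_set_n_threads() is also new in this patch; a hedged sketch of how a caller might drive it, assuming the device numbering used by g_qnn_mgr[] (0 = QNN CPU, 1 = QNN GPU, 2 = QNN HTP aka DSP). Note the thread count is currently only recorded in the backend context; ggml_backend_qnn_graph_compute() still plans with a single thread, as flagged by the TODO above.

// Hedged usage sketch, not part of the patch: bring up a QNN backend, record the
// requested thread count, and report the device. Device ids follow g_qnn_mgr[]:
// 0 = QNN CPU, 1 = QNN GPU, 2 = QNN HTP(aka DSP).
#include "ggml.h"
#include "ggml-backend.h"
#include "ggml-qnn.h"
#include <cstdio>

static ggml_backend_t init_qnn_backend(size_t dev_num, int n_threads) {
    ggml_backend_t backend = ggml_backend_qnn_init(dev_num);
    if (backend == nullptr) {
        printf("ggml_backend_qnn_init(%zu) failed\n", dev_num);
        return nullptr;
    }

    if (ggml_backend_is_qnn(backend)) {
        // only stored in ggml_backend_qnn_context::threads for now;
        // the QNN graph compute path still plans with a single thread
        ggml_backend_qnn_set_n_threads(backend, n_threads);
    }

    char desc[GGML_MAX_NAME];
    ggml_backend_qnn_get_device_description((int) dev_num, desc, sizeof(desc));
    printf("using QNN backend %s: %s\n", ggml_backend_name(backend), desc);

    return backend; // caller releases it with ggml_backend_free()
}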
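The GFLOPS figure accumulated in qnn_ggml_op_automation_ut() uses the usual matrix-multiply convention: an N x N by N x N multiply costs N^3 multiply-add pairs, i.e. 2*N^3 floating-point operations, so the per-size score is (2*N^3*n_runs / tsum) * 1e-9. A small sketch of that arithmetic, with illustrative numbers only:

// Hedged restatement of the benchmark arithmetic above (assumption: square N x N
// matrices, n_runs timed iterations accumulated into tsum seconds).
#include <cstdio>

static double matmul_gflops(size_t N, int n_runs, double tsum_seconds) {
    const double flops_per_run = 2.0 * (double) N * (double) N * (double) N; // N^3 mul + N^3 add
    return (flops_per_run * n_runs / tsum_seconds) * 1e-9;                   // GFLOPS
}

// e.g. a 4096 x 4096 FP32 mulmat averaging 0.5 s per run over 3 runs:
// matmul_gflops(4096, 3, 1.5) ~= 275 GFLOPS (illustrative, not a measured result)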