ggml-qnn: refine ggml backend subsystem
zhouwg committed May 30, 2024
1 parent fcf5338 commit 70835aa
Showing 3 changed files with 160 additions and 131 deletions.
133 changes: 131 additions & 2 deletions core/ggml/llamacpp/ggml-backend.c
@@ -280,21 +280,148 @@ enum ggml_status ggml_backend_graph_plan_compute(ggml_backend_t backend, ggml_ba
return backend->iface.graph_plan_compute(backend, plan);
}

static ggml_backend_t g_cpu_backend = NULL;
static bool GGML_OP_HAS_INIT [GGML_OP_COUNT] = { 0 };
static bool GGML_OP_HAS_FINALIZE[GGML_OP_COUNT] = { 0 };
static void ggml_setup_op_has_task_pass(void) {
{ // INIT
bool * p = GGML_OP_HAS_INIT;

p[GGML_OP_ACC ] = true;
p[GGML_OP_MUL_MAT ] = true;
p[GGML_OP_MUL_MAT_ID ] = true;
p[GGML_OP_OUT_PROD ] = true;
p[GGML_OP_SET ] = true;
p[GGML_OP_GET_ROWS_BACK ] = true;
p[GGML_OP_DIAG_MASK_INF ] = true;
p[GGML_OP_DIAG_MASK_ZERO ] = true;
p[GGML_OP_CONV_TRANSPOSE_1D ] = true;
p[GGML_OP_CONV_TRANSPOSE_2D ] = true;
p[GGML_OP_FLASH_ATTN_BACK ] = true;
p[GGML_OP_CROSS_ENTROPY_LOSS ] = true;
p[GGML_OP_ADD_REL_POS ] = true;
}

{ // FINALIZE
bool * p = GGML_OP_HAS_FINALIZE;

p[GGML_OP_CROSS_ENTROPY_LOSS ] = true;
}
}


extern void ggml_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor, struct ggml_compute_state * state);
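// Mixed execution strategy for the function below: walk the graph once; nodes the
// backend reports as supported are pushed to it through iface.offload_op(), and
// every other node is executed on the host with the CPU reference kernel
// ggml_compute_forward(), running its INIT and FINALIZE passes where the op needs them.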
static enum ggml_status ggml_backend_graph_compute_mixed(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
enum ggml_status result = GGML_STATUS_SUCCESS;
int node_n = -1;

static bool is_first_call = true;
if (is_first_call) {
ggml_setup_op_has_task_pass();
is_first_call = false;
}

struct ggml_cplan plan = ggml_graph_plan(cgraph, 1);
if (plan.work_size > 0) {
plan.work_data = (uint8_t *)(malloc(plan.work_size));
if (NULL == plan.work_data) {
return GGML_STATUS_ALLOC_FAILED;
}
}

struct ggml_compute_params params = {
/*.type =*/ GGML_TASK_TYPE_FINALIZE,
/*.ith =*/ 0,
/*.nth =*/ 0,
/*.wsize =*/ plan.work_size,
/*.wdata =*/ plan.work_data
};
while (++node_n < cgraph->n_nodes) {
struct ggml_tensor * node = cgraph->nodes[node_n];
params.nth = 1;

if (ggml_is_empty(node) || node->op == GGML_OP_RESHAPE || node->op == GGML_OP_TRANSPOSE || node->op == GGML_OP_VIEW || node->op == GGML_OP_PERMUTE || node->op == GGML_OP_NONE) {
continue;
}

if (ggml_backend_supports_op(backend, node)) {
//LOGGD("%s: running op %s (%s) with backend %s\n", __func__, node->name, ggml_op_name(node->op), ggml_backend_name(backend));
if (backend->iface.offload_op != NULL) {
backend->iface.offload_op(backend, node);
}
} else {
//LOGGD("%s: error: op not supported %s (%s) with backend %s\n", __func__, node->name, ggml_op_name(node->op), ggml_backend_name(backend));
if (GGML_OP_HAS_INIT[node->op]) {
params.type = GGML_TASK_TYPE_INIT;
ggml_compute_forward(&params, node, NULL);
}
params.type = GGML_TASK_TYPE_COMPUTE;
ggml_compute_forward(&params, node, NULL);
if (GGML_OP_HAS_FINALIZE[node->op]) {
params.type = GGML_TASK_TYPE_FINALIZE;
ggml_compute_forward(&params, node, NULL);
}
}
}

if (NULL != plan.work_data) {
free(plan.work_data);
}

return result;
}

extern bool ggml_backend_is_qnn(ggml_backend_t backend);
enum ggml_status ggml_backend_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
enum ggml_status err = ggml_backend_graph_compute_async(backend, cgraph);
enum ggml_status err = GGML_STATUS_SUCCESS;

if (NULL == g_cpu_backend) {
ggml_backend_cpu_init();
}
if (backend != g_cpu_backend) {
if (ggml_backend_is_qnn(backend)) { // could also check (backend->iface.offload_op != NULL), but the SYCL backend's iface.offload_op is non-NULL as well
err = ggml_backend_graph_compute_mixed(backend, cgraph);
} else { // keep compatibility with the SYCL backend and other existing backends
err = backend->iface.graph_compute(backend, cgraph);
}
} else {
// keep compatibility with existing backends
err = backend->iface.graph_compute(backend, cgraph);
}
ggml_backend_synchronize(backend);
return err;
}


enum ggml_status ggml_backend_graph_compute_async(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
return backend->iface.graph_compute(backend, cgraph);
enum ggml_status err = GGML_STATUS_SUCCESS;

if (NULL == g_cpu_backend) {
ggml_backend_cpu_init();
}
if (backend != g_cpu_backend) {
if (ggml_backend_is_qnn(backend)) { // could also check (backend->iface.offload_op != NULL), but the SYCL backend's iface.offload_op is non-NULL as well
err = ggml_backend_graph_compute_mixed(backend, cgraph);
} else { // keep compatibility with the SYCL backend and other existing backends
err = backend->iface.graph_compute(backend, cgraph);
}
} else {
// keep compatibility with existing backends
err = backend->iface.graph_compute(backend, cgraph);
}
ggml_backend_synchronize(backend);
return err;
}

bool ggml_backend_supports_op(ggml_backend_t backend, const struct ggml_tensor * op) {
return backend->iface.supports_op(backend, op);
}

bool ggml_backend_offload_op(ggml_backend_t backend, const struct ggml_tensor * op) {
if (ggml_backend_is_qnn(backend)) { // QNN offload is handled through the mixed compute path above; keep the existing offload_op behavior for other backends (e.g. SYCL)
return false;
}

if (backend->iface.offload_op != NULL) {
return backend->iface.offload_op(backend, op);
}
@@ -899,6 +1026,8 @@ ggml_backend_t ggml_backend_cpu_init(void) {
/* .interface = */ cpu_backend_i,
/* .context = */ ctx
};
g_cpu_backend = cpu_backend;

return cpu_backend;
}

140 changes: 29 additions & 111 deletions core/ggml/llamacpp/ggml-qnn.cpp
@@ -311,35 +311,7 @@ struct ggml_backend_qnn_context {
// =================================================================================================
static ggml_backend_t g_qnn_backend = nullptr;

static int g_current_device = QNN_BACKEND_GGML; // QNN_BACKEND_GGML is the default ggml backend

static bool GGML_OP_HAS_INIT [GGML_OP_COUNT] = { 0 };
static bool GGML_OP_HAS_FINALIZE[GGML_OP_COUNT] = { 0 };
static void ggml_setup_op_has_task_pass(void) {
{ // INIT
bool * p = GGML_OP_HAS_INIT;

p[GGML_OP_ACC ] = true;
p[GGML_OP_MUL_MAT ] = true;
p[GGML_OP_MUL_MAT_ID ] = true;
p[GGML_OP_OUT_PROD ] = true;
p[GGML_OP_SET ] = true;
p[GGML_OP_GET_ROWS_BACK ] = true;
p[GGML_OP_DIAG_MASK_INF ] = true;
p[GGML_OP_DIAG_MASK_ZERO ] = true;
p[GGML_OP_CONV_TRANSPOSE_1D ] = true;
p[GGML_OP_CONV_TRANSPOSE_2D ] = true;
p[GGML_OP_FLASH_ATTN_BACK ] = true;
p[GGML_OP_CROSS_ENTROPY_LOSS ] = true;
p[GGML_OP_ADD_REL_POS ] = true;
}

{ // FINALIZE
bool * p = GGML_OP_HAS_FINALIZE;

p[GGML_OP_CROSS_ENTROPY_LOSS ] = true;
}
}
static int g_current_device = QNN_BACKEND_GGML;


//the QNN cDSP and HTA backends are not used for now; focus is on the QNN CPU/GPU/NPU (aka HTP/DSP) backends
@@ -1420,11 +1392,9 @@ static void ggml_qnn_log_internal(ggml_log_level level, const char * file, const
#else
__android_log_print(level, "ggml-qnn", "%s\n", s_ggml_qnn_log_internal_buf);
#endif
//for Android command line application
printf("%s\n", s_ggml_qnn_log_internal_buf);
#else
printf("%s\n", s_ggml_qnn_log_internal_buf);
#endif
//for Android command line application or WoA
printf("%s\n", s_ggml_qnn_log_internal_buf);
}
va_end(args);
}
@@ -2483,10 +2453,6 @@ static bool ggml_qnn_can_handle_op(const struct ggml_tensor * tensor, bool b_dum
if (nullptr == tensor)
return false;

if (ggml_is_empty(tensor) || tensor->op == GGML_OP_RESHAPE || tensor->op == GGML_OP_TRANSPOSE || tensor->op == GGML_OP_VIEW || tensor->op == GGML_OP_PERMUTE || tensor->op == GGML_OP_NONE) {
return false;
}

if (b_dump_tensor_info) {
QNN_LOG_DEBUG("op name:%s, tensor type:%s", ggml_op_name(tensor->op),
ggml_type_name(tensor->type));
@@ -2498,6 +2464,10 @@
}
}

if (ggml_is_empty(tensor) || tensor->op == GGML_OP_RESHAPE || tensor->op == GGML_OP_TRANSPOSE || tensor->op == GGML_OP_VIEW || tensor->op == GGML_OP_PERMUTE || tensor->op == GGML_OP_NONE) {
return false;
}

//ensure tensor->src[0] and tensor->src[1] are not nullptr
bool supported_op = ((tensor->op == GGML_OP_ADD) || (tensor->op == GGML_OP_MUL) || (tensor->op == GGML_OP_MUL_MAT));
if (!supported_op) {
@@ -2550,6 +2520,10 @@
tensor->nb[1], tensor->nb[2]);
}

if (tensor->ne[1] < 32) { // GPU/NPU inference will be slower than CPU inference when tensor->ne[1] is below the minimum batch size
return false;
}

}

//TODO: this is limitation
Expand Down Expand Up @@ -3410,20 +3384,6 @@ bool ggml_qnn_compute_forward(struct ggml_compute_params * params, struct ggml_t
ggml_qnn_func_t func = nullptr;
ggml_qnn_func_common_t func_common = nullptr;

#if 1// NOT_IN_PR // not in PR, should be removed before PR because this is a workaround method during development stage
bool use_hwaccel = false;
use_hwaccel = (tensor->src[0]->backend == GGML_BACKEND_TYPE_GPU);
bool supported_op = ((tensor->op == GGML_OP_ADD) || (tensor->op == GGML_OP_MUL) || (tensor->op == GGML_OP_MUL_MAT));
if (!use_hwaccel && !supported_op) {
ggml_compute_forward(params, tensor, nullptr);
return false;
}
if ((!use_hwaccel) && (!ggml_qnn_can_handle_op(tensor, false))) {
ggml_compute_forward(params, tensor, nullptr);
return false;
}
#endif

switch (tensor->op) {
case GGML_OP_ADD:
func = ggml_qnn_add;
@@ -3847,76 +3807,40 @@ static ggml_backend_buffer_type_t ggml_backend_qnn_get_default_buffer_type(ggml_
}


static bool ggml_backend_qnn_supports_op(ggml_backend_t backend, const ggml_tensor * op) {
GGML_UNUSED(backend);

switch (op->op) {
case GGML_OP_MUL_MAT:
case GGML_OP_MUL:
case GGML_OP_ADD:
return true;
default:
return false;
}
}


static ggml_status ggml_backend_qnn_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) {
enum ggml_status result = GGML_STATUS_SUCCESS;
int node_n = -1;
ggml_backend_qnn_context * ctx = (ggml_backend_qnn_context *) backend->context;

struct ggml_cplan plan = ggml_graph_plan(cgraph, 1);
if (plan.work_size > 0) {
#if NOT_IN_PR
plan.work_data = static_cast<uint8_t *>(ctx->buffer_pool->buffer_pool_base);
#else
plan.work_data = static_cast<uint8_t *>(malloc(plan.work_size));
if (nullptr == plan.work_data) {
QNN_LOG_ERROR("malloc failed");
return result;
ggml_compute_params params = {};
params.type = GGML_TASK_TYPE_COMPUTE;
params.ith = 0;
for (int i = 0; i < cgraph->n_nodes; i++) {
ggml_tensor * node = cgraph->nodes[i];
if (ggml_is_empty(node) || node->op == GGML_OP_RESHAPE || node->op == GGML_OP_TRANSPOSE || node->op == GGML_OP_VIEW || node->op == GGML_OP_PERMUTE || node->op == GGML_OP_NONE) {
continue;
}
#endif
}

struct ggml_compute_params params = {
/*.type =*/ GGML_TASK_TYPE_FINALIZE,
/*.ith =*/ 0,
/*.nth =*/ 0,
/*.wsize =*/ plan.work_size,
/*.wdata =*/ plan.work_data,
};
while (++node_n < cgraph->n_nodes) {
struct ggml_tensor * node = cgraph->nodes[node_n];
params.nth = 1;
if (GGML_OP_HAS_INIT[node->op]) {
params.type = GGML_TASK_TYPE_INIT;
ggml_qnn_compute_forward(&params, node);
}
params.type = GGML_TASK_TYPE_COMPUTE;
ggml_qnn_compute_forward(&params, node);
if (GGML_OP_HAS_FINALIZE[node->op]) {
params.type = GGML_TASK_TYPE_FINALIZE;
ggml_qnn_compute_forward(&params, node);
bool ok = ggml_qnn_compute_forward(&params, node);
if (!ok) {
QNN_LOG_DEBUG("%s: error: op not supported %s (%s)\n", __func__, node->name, ggml_op_name(node->op));
}
}

#if NOT_IN_PR
free(plan.work_data);
#endif

return result;
}


static bool ggml_backend_qnn_supports_op(ggml_backend_t backend, const ggml_tensor * op) {
GGML_UNUSED(backend);

return (ggml_qnn_can_handle_op(op, false));
}


//note: this function will be used in the new/refined ggml backend subsystem (to be submitted in a standalone PR)
static bool ggml_backend_qnn_offload_op(ggml_backend_t backend, const ggml_tensor * tensor) {
GGML_UNUSED(backend);

if (ggml_qnn_can_handle_op(tensor, false))
return true;
else
return false;
return ggml_qnn_compute_forward(nullptr, (ggml_tensor*)tensor);
}


@@ -4054,12 +3978,6 @@ ggml_backend_t ggml_backend_qnn_init(size_t device, const char * qnn_lib_path) {
}
}

static bool is_first_call = true;
if (is_first_call) {
ggml_setup_op_has_task_pass();
is_first_call = false;
}

if (QNN_BACKEND_NPU == device) {
std::string path = qnn_lib_path;
if (0 == setenv("LD_LIBRARY_PATH",
18 changes: 0 additions & 18 deletions core/ggml/llamacpp/ggml.c
@@ -17269,24 +17269,6 @@ void ggml_compute_forward(struct ggml_compute_params * params, struct ggml_tenso
return;
}

//depend on this PR in upstream whisper.cpp https://github.com/ggerganov/whisper.cpp/pull/2073
const struct ggml_tensor * src0 = tensor->src[0];
const struct ggml_tensor * src1 = tensor->src[1];
if (NULL != src0 && NULL != src1) {
if (src0->backend == GGML_BACKEND_TYPE_GPU) {
#ifdef GGML_USE_QNN
//LOGGI("hw acceleration with QNN");
if ((tensor->op == GGML_OP_ADD) || (tensor->op == GGML_OP_MUL) || (tensor->op == GGML_OP_MUL_MAT)) {
ggml_qnn_compute_forward(params, tensor);
return;
}
#endif
} else {
//LOGGI("no hw acceleration");
}
}


switch (tensor->op) {
case GGML_OP_DUP:
{
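
For context, the following is a self-contained sketch (not part of this commit) of how the refined compute path is exercised from application code. It assumes a "ggml-qnn.h" header that declares ggml_backend_qnn_init() and the QNN_BACKEND_* constants; the tensor sizes and QNN library path are placeholders.

#include "ggml.h"
#include "ggml-alloc.h"
#include "ggml-backend.h"
#include "ggml-qnn.h"   // assumed header exposing ggml_backend_qnn_init() and QNN_BACKEND_*

int main(void) {
    // metadata-only context; tensor data lives in the backend buffer below
    struct ggml_init_params ip = {
        /*.mem_size   =*/ 16 * 1024 * 1024,
        /*.mem_buffer =*/ NULL,
        /*.no_alloc   =*/ true,
    };
    struct ggml_context * ctx = ggml_init(ip);

    struct ggml_tensor * a = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 64, 64);
    struct ggml_tensor * b = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 64, 64);
    struct ggml_tensor * c = ggml_mul_mat(ctx, a, b); // supported op -> offloaded to QNN
    struct ggml_tensor * d = ggml_sqr(ctx, c);        // unsupported op -> CPU fallback

    struct ggml_cgraph * gf = ggml_new_graph(ctx);
    ggml_build_forward_expand(gf, d);

    // placeholder device and QNN library path
    ggml_backend_t backend = ggml_backend_qnn_init(QNN_BACKEND_NPU, "/data/local/tmp/");
    ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors(ctx, backend);
    // tensor data left uninitialized for brevity; set it with ggml_backend_tensor_set() in real code

    // ADD/MUL/MUL_MAT nodes run on QNN, everything else falls back to
    // ggml_compute_forward() inside ggml_backend_graph_compute_mixed()
    enum ggml_status status = ggml_backend_graph_compute(backend, gf);

    ggml_backend_buffer_free(buf);
    ggml_backend_free(backend);
    ggml_free(ctx);
    return (status == GGML_STATUS_SUCCESS) ? 0 : 1;
}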