Commit

ggml-qnn: keep in sync with upstream PR ggml-org/llama.cpp#7641
zhouwg committed May 31, 2024
1 parent 70835aa commit e00612d
Showing 2 changed files with 20 additions and 14 deletions.
24 changes: 19 additions & 5 deletions core/ggml/llamacpp/ggml-backend.c
@@ -309,7 +309,7 @@ static void ggml_setup_op_has_task_pass(void) {
}
}


+struct ggml_compute_state;
+extern void ggml_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor, struct ggml_compute_state * state);
static enum ggml_status ggml_backend_graph_compute_mixed(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
enum ggml_status result = GGML_STATUS_SUCCESS;
@@ -371,15 +371,28 @@ static enum ggml_status ggml_backend_graph_compute_mixed(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
    return result;
}

+#ifdef GGML_USE_QNN
+extern bool ggml_backend_is_qnn(ggml_backend_t backend);
+#endif
+
+static bool is_qnn_backend(ggml_backend_t backend) {
+#ifdef GGML_USE_QNN
+    return ggml_backend_is_qnn(backend);
+#else
+    GGML_UNUSED(backend);
+    return false;
+#endif
+}

enum ggml_status ggml_backend_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
    enum ggml_status err = GGML_STATUS_SUCCESS;

    if (NULL == g_cpu_backend) {
        ggml_backend_cpu_init();
    }
    if (backend != g_cpu_backend) {
-        if (ggml_backend_is_qnn(backend)) { // or if (backend->iface.offload_op != NULL) but sycl backend's iface.offload_op is not NULL
+        if (is_qnn_backend(backend)) { // or if (backend->iface.offload_op != NULL) but sycl backend's iface.offload_op is not NULL
            //mixed inference between Qualcomm's CPU/GPU or CPU/NPU
            err = ggml_backend_graph_compute_mixed(backend, cgraph);
        } else { //compatible for sycl backend or other existing backend
            err = backend->iface.graph_compute(backend, cgraph);
@@ -400,7 +413,8 @@ enum ggml_status ggml_backend_graph_compute_async(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
        ggml_backend_cpu_init();
    }
    if (backend != g_cpu_backend) {
-        if (ggml_backend_is_qnn(backend)) { // or if (backend->iface.offload_op != NULL) but sycl backend's iface.offload_op is not NULL
+        if (is_qnn_backend(backend)) { // or if (backend->iface.offload_op != NULL) but sycl backend's iface.offload_op is not NULL
+            //mixed inference between Qualcomm's CPU/GPU or CPU/NPU
            err = ggml_backend_graph_compute_mixed(backend, cgraph);
        } else { //compatible for sycl backend or other existing backend
            err = backend->iface.graph_compute(backend, cgraph);
@@ -409,7 +423,7 @@ enum ggml_status ggml_backend_graph_compute_async(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
        //compatible for existing backend
        err = backend->iface.graph_compute(backend, cgraph);
    }
-    ggml_backend_synchronize(backend);

    return err;
}

@@ -418,7 +432,7 @@ bool ggml_backend_supports_op(ggml_backend_t backend, const struct ggml_tensor * op) {
}

bool ggml_backend_offload_op(ggml_backend_t backend, const struct ggml_tensor * op) {
-    if (ggml_backend_is_qnn(backend)) { //compatible for sycl backend or other existing backend
+    if (is_qnn_backend(backend)) { //compatible for sycl backend or other existing backend
        return false;
    }

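A note on the change above: before this commit, ggml_backend_graph_compute() and friends called ggml_backend_is_qnn() unconditionally, which would fail at link time in builds made without the QNN backend. The new is_qnn_backend() wrapper confines the dependency to a single #ifdef GGML_USE_QNN block and collapses to a constant false otherwise, so the dispatch logic stays unconditional at every call site. Below is a minimal, self-contained sketch of that guarded-wrapper pattern; the names (backend_t, USE_FAKE, backend_is_fake, MY_UNUSED) are illustrative stand-ins, not ggml APIs.

#include <stdbool.h>
#include <stdio.h>

#define MY_UNUSED(x) (void)(x)              /* stand-in for GGML_UNUSED */

typedef struct { const char * name; } backend_t;

#ifdef USE_FAKE
/* Declared (and linked) only when the optional backend is compiled in. */
extern bool backend_is_fake(backend_t * backend);
#endif

/* The wrapper keeps call sites unconditional: with the optional backend
 * disabled it degenerates to `return false` instead of leaving an
 * unresolved symbol at link time. */
static bool is_fake_backend(backend_t * backend) {
#ifdef USE_FAKE
    return backend_is_fake(backend);
#else
    MY_UNUSED(backend);
    return false;
#endif
}

int main(void) {
    backend_t cpu = { "cpu" };
    /* Same call site in both build configurations -- no #ifdef here. */
    printf("fake backend: %s\n", is_fake_backend(&cpu) ? "yes" : "no");
    return 0;
}

Wrapping every call site in #ifdef GGML_USE_QNN would work too, but it would scatter preprocessor noise through ggml_backend_graph_compute(), ggml_backend_graph_compute_async(), and ggml_backend_offload_op().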
10 changes: 1 addition & 9 deletions core/ggml/llamacpp/ggml-qnn.cpp
@@ -1,7 +1,3 @@
-#define NOT_IN_PR 1
-
-
-#if NOT_IN_PR
/*
 * Copyright (c) 2024- KanTV Authors
 *
@@ -42,7 +38,6 @@
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
-#endif
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
Expand Down Expand Up @@ -110,10 +105,6 @@ __attribute__((__format__(printf, 3, 4)));
#endif
static void ggml_qnn_log_internal(ggml_log_level level, const char * file, const char * func, int line, const char * format, ...);

-#if 1// NOT_IN_PR //should be removed before PR because this is a workaround method during development stage
-extern "C" void ggml_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor, struct ggml_compute_state * state);
-#endif
-

// =================================================================================================
//
@@ -133,6 +124,7 @@ extern "C" void ggml_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor, struct ggml_compute_state * state);
#define BUF_CONTROL_BASE 0xEE000000

#define GGML_QNN_DEBUG 1
+#define NOT_IN_PR 1 //for submit/update PR(ggml-qnn.cpp&ggml.h) to upstream more easily and quickly

#define QNN_LOG_ERROR(...) ggml_qnn_log_internal(GGML_LOG_LEVEL_DEBUG, __FILE__, __FUNCTION__, __LINE__, __VA_ARGS__)
#define QNN_LOG_WARN(...) ggml_qnn_log_internal(GGML_LOG_LEVEL_DEBUG , __FILE__, __FUNCTION__, __LINE__, __VA_ARGS__)
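The QNN_LOG_* macros retained at the end of this hunk all forward __FILE__, __FUNCTION__, and __LINE__ into ggml_qnn_log_internal(), whose declaration carries __attribute__((__format__(printf, ...))) so the compiler can type-check the format arguments. Here is a minimal sketch of the same macro-plus-variadic-sink pattern, for GCC/Clang; the names (my_log_internal, MY_LOG_*) are illustrative, not the actual ggml-qnn symbols.

#include <stdarg.h>
#include <stdio.h>

enum log_level { LOG_LEVEL_DEBUG, LOG_LEVEL_ERROR };

/* Format string is parameter 5; the matching varargs start at 6. */
static void my_log_internal(enum log_level level, const char * file,
                            const char * func, int line,
                            const char * format, ...)
        __attribute__((__format__(printf, 5, 6)));

static void my_log_internal(enum log_level level, const char * file,
                            const char * func, int line,
                            const char * format, ...) {
    va_list args;
    va_start(args, format);
    fprintf(stderr, "[%s] %s:%d (%s): ",
            level == LOG_LEVEL_ERROR ? "ERROR" : "DEBUG", file, line, func);
    vfprintf(stderr, format, args);
    fputc('\n', stderr);
    va_end(args);
}

#define MY_LOG_ERROR(...) my_log_internal(LOG_LEVEL_ERROR, __FILE__, __FUNCTION__, __LINE__, __VA_ARGS__)
#define MY_LOG_DEBUG(...) my_log_internal(LOG_LEVEL_DEBUG, __FILE__, __FUNCTION__, __LINE__, __VA_ARGS__)

int main(void) {
    MY_LOG_DEBUG("loaded %d tensors", 42);                /* type-checked */
    MY_LOG_ERROR("backend init failed: %s", "no device");
    return 0;
}

Worth noting: as committed, QNN_LOG_ERROR and QNN_LOG_WARN both pass GGML_LOG_LEVEL_DEBUG to the sink, so every message is tagged at debug level regardless of which macro produced it.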
