From e00612deffff7103212063e5f938f67cfd362387 Mon Sep 17 00:00:00 2001
From: "zhou.weiguo"
Date: Fri, 31 May 2024 09:42:41 +0800
Subject: [PATCH] ggml-qnn: keep sync with PR(https://github.com/ggerganov/llama.cpp/pull/7641) in upstream

---
 core/ggml/llamacpp/ggml-backend.c | 24 +++++++++++++++++++-----
 core/ggml/llamacpp/ggml-qnn.cpp   | 10 +---------
 2 files changed, 20 insertions(+), 14 deletions(-)

diff --git a/core/ggml/llamacpp/ggml-backend.c b/core/ggml/llamacpp/ggml-backend.c
index bb8e22acc..dc5fb16d6 100644
--- a/core/ggml/llamacpp/ggml-backend.c
+++ b/core/ggml/llamacpp/ggml-backend.c
@@ -309,7 +309,7 @@ static void ggml_setup_op_has_task_pass(void) {
     }
 }
 
-
+struct ggml_compute_state;
 extern void ggml_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor, struct ggml_compute_state * state);
 static enum ggml_status ggml_backend_graph_compute_mixed(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
     enum ggml_status result = GGML_STATUS_SUCCESS;
@@ -371,7 +371,19 @@ static enum ggml_status ggml_backend_graph_compute_mixed(ggml_backend_t backend,
     return result;
 }
 
+#ifdef GGML_USE_QNN
 extern bool ggml_backend_is_qnn(ggml_backend_t backend);
+#endif
+
+static bool is_qnn_backend(ggml_backend_t backend) {
+#ifdef GGML_USE_QNN
+    return ggml_backend_is_qnn(backend);
+#else
+    GGML_UNUSED(backend);
+    return false;
+#endif
+}
+
 
 enum ggml_status ggml_backend_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
     enum ggml_status err = GGML_STATUS_SUCCESS;
@@ -379,7 +391,8 @@ enum ggml_status ggml_backend_graph_compute(ggml_backend_t backend, struct ggml_
         ggml_backend_cpu_init();
     }
     if (backend != g_cpu_backend) {
-        if (ggml_backend_is_qnn(backend)) { // or if (backend->iface.offload_op != NULL) but sycl backend's iface.offload_op is not NULL
+        if (is_qnn_backend(backend)) { // or if (backend->iface.offload_op != NULL) but sycl backend's iface.offload_op is not NULL
+            //mixed inference between Qualcomm's CPU/GPU or CPU/NPU
             err = ggml_backend_graph_compute_mixed(backend, cgraph);
         } else { //compatible for sycl backend or other existing backend
             err = backend->iface.graph_compute(backend, cgraph);
@@ -400,7 +413,8 @@ enum ggml_status ggml_backend_graph_compute_async(ggml_backend_t backend, struct
         ggml_backend_cpu_init();
     }
     if (backend != g_cpu_backend) {
-        if (ggml_backend_is_qnn(backend)) { // or if (backend->iface.offload_op != NULL) but sycl backend's iface.offload_op is not NULL
+        if (is_qnn_backend(backend)) { // or if (backend->iface.offload_op != NULL) but sycl backend's iface.offload_op is not NULL
+            //mixed inference between Qualcomm's CPU/GPU or CPU/NPU
             err = ggml_backend_graph_compute_mixed(backend, cgraph);
         } else { //compatible for sycl backend or other existing backend
             err = backend->iface.graph_compute(backend, cgraph);
@@ -409,7 +423,7 @@ enum ggml_status ggml_backend_graph_compute_async(ggml_backend_t backend, struct
         //compatible for existing backend
         err = backend->iface.graph_compute(backend, cgraph);;
     }
-    ggml_backend_synchronize(backend);
+
     return err;
 }
 
@@ -418,7 +432,7 @@ bool ggml_backend_supports_op(ggml_backend_t backend, const struct ggml_tensor *
 }
 
 bool ggml_backend_offload_op(ggml_backend_t backend, const struct ggml_tensor * op) {
-    if (ggml_backend_is_qnn(backend)) { //compatible for sycl backend or other existing backend
+    if (is_qnn_backend(backend)) { //compatible for sycl backend or other existing backend
         return false;
     }
 
diff --git a/core/ggml/llamacpp/ggml-qnn.cpp b/core/ggml/llamacpp/ggml-qnn.cpp
index d3b543556..2bcd414fa 100644
--- a/core/ggml/llamacpp/ggml-qnn.cpp
+++ b/core/ggml/llamacpp/ggml-qnn.cpp
@@ -1,7 +1,3 @@
-#define NOT_IN_PR 1
-
-
-#if NOT_IN_PR
 /*
  * Copyright (c) 2024- KanTV Authors
  *
@@ -42,7 +38,6 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-#endif
 #include 
 #include 
 #include 
@@ -110,10 +105,6 @@ __attribute__((__format__(printf, 3, 4)));
 #endif
 static void ggml_qnn_log_internal(ggml_log_level level, const char * file, const char * func, int line, const char * format, ...);
 
-#if 1// NOT_IN_PR //should be removed before PR because this is a workaround method during development stage
-extern "C" void ggml_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor, struct ggml_compute_state * state);
-#endif
-
 
 // =================================================================================================
 //
@@ -133,6 +124,7 @@ extern "C" void ggml_compute_forward(struct ggml_compute_params * params, struct
 #define BUF_CONTROL_BASE 0xEE000000
 
 #define GGML_QNN_DEBUG 1
+#define NOT_IN_PR 1 //for submit/update PR(ggml-qnn.cpp&ggml.h) to upstream more easily and quickly
 
 #define QNN_LOG_ERROR(...) ggml_qnn_log_internal(GGML_LOG_LEVEL_DEBUG, __FILE__, __FUNCTION__, __LINE__, __VA_ARGS__)
 #define QNN_LOG_WARN(...) ggml_qnn_log_internal(GGML_LOG_LEVEL_DEBUG , __FILE__, __FUNCTION__, __LINE__, __VA_ARGS__)
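
Note (illustration, not part of the patch): the core of the ggml-backend.c change is the is_qnn_backend() wrapper, which guards the call to ggml_backend_is_qnn() behind GGML_USE_QNN so the file still compiles, and falls back to the backend's own graph_compute, when QNN support is disabled. Below is a minimal standalone C sketch of that conditional-compilation pattern; the demo_* names are invented for this sketch and do not exist in ggml.

/* Standalone illustration of the #ifdef-guarded wrapper used in the patch.
 * Build with -DGGML_USE_QNN to take the QNN branch, or without it to get
 * the stub that always returns false. All demo_* identifiers are made up. */
#include <stdbool.h>
#include <stdio.h>

typedef struct demo_backend { const char * name; } demo_backend;

#ifdef GGML_USE_QNN
/* stands in for the extern ggml_backend_is_qnn() declaration */
static bool demo_backend_is_qnn(const demo_backend * backend) {
    return backend->name != NULL && backend->name[0] == 'q';
}
#endif

static bool demo_is_qnn_backend(const demo_backend * backend) {
#ifdef GGML_USE_QNN
    return demo_backend_is_qnn(backend);
#else
    (void) backend;   /* GGML_UNUSED(backend) in the real code */
    return false;     /* without QNN support the mixed path is never taken */
#endif
}

int main(void) {
    demo_backend b = { "qnn-npu" };
    if (demo_is_qnn_backend(&b)) {
        printf("take the mixed CPU/GPU or CPU/NPU inference path\n");
    } else {
        printf("fall back to the backend's own graph_compute\n");
    }
    return 0;
}

Building with -DGGML_USE_QNN takes the QNN branch; without it the stub always returns false, mirroring how non-QNN builds of ggml-backend.c never enter ggml_backend_graph_compute_mixed().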