From e00612deffff7103212063e5f938f67cfd362387 Mon Sep 17 00:00:00 2001
From: "zhou.weiguo"
Date: Fri, 31 May 2024 09:42:41 +0800
Subject: [PATCH] ggml-qnn: keep sync with PR(https://github.com/ggerganov/llama.cpp/pull/7641) in upstream

---
 core/ggml/llamacpp/ggml-backend.c | 24 +++++++++++++++++++-----
 core/ggml/llamacpp/ggml-qnn.cpp   | 10 +---------
 2 files changed, 20 insertions(+), 14 deletions(-)

diff --git a/core/ggml/llamacpp/ggml-backend.c b/core/ggml/llamacpp/ggml-backend.c
index bb8e22acc..dc5fb16d6 100644
--- a/core/ggml/llamacpp/ggml-backend.c
+++ b/core/ggml/llamacpp/ggml-backend.c
@@ -309,7 +309,7 @@ static void ggml_setup_op_has_task_pass(void) {
     }
 }
 
-
+struct ggml_compute_state;
 extern void ggml_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor, struct ggml_compute_state * state);
 static enum ggml_status ggml_backend_graph_compute_mixed(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
     enum ggml_status result = GGML_STATUS_SUCCESS;
@@ -371,7 +371,19 @@ static enum ggml_status ggml_backend_graph_compute_mixed(ggml_backend_t backend,
     return result;
 }
 
+#ifdef GGML_USE_QNN
 extern bool ggml_backend_is_qnn(ggml_backend_t backend);
+#endif
+
+static bool is_qnn_backend(ggml_backend_t backend) {
+#ifdef GGML_USE_QNN
+    return ggml_backend_is_qnn(backend);
+#else
+    GGML_UNUSED(backend);
+    return false;
+#endif
+}
+
 
 enum ggml_status ggml_backend_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
     enum ggml_status err = GGML_STATUS_SUCCESS;
@@ -379,7 +391,8 @@ enum ggml_status ggml_backend_graph_compute(ggml_backend_t backend, struct ggml_
         ggml_backend_cpu_init();
     }
     if (backend != g_cpu_backend) {
-        if (ggml_backend_is_qnn(backend)) { // or if (backend->iface.offload_op != NULL) but sycl backend's iface.offload_op is not NULL
+        if (is_qnn_backend(backend)) { // or if (backend->iface.offload_op != NULL) but sycl backend's iface.offload_op is not NULL
+            //mixed inference between Qualcomm's CPU/GPU or CPU/NPU
             err = ggml_backend_graph_compute_mixed(backend, cgraph);
         } else { //compatible for sycl backend or other existing backend
             err = backend->iface.graph_compute(backend, cgraph);
@@ -400,7 +413,8 @@ enum ggml_status ggml_backend_graph_compute_async(ggml_backend_t backend, struct
         ggml_backend_cpu_init();
     }
     if (backend != g_cpu_backend) {
-        if (ggml_backend_is_qnn(backend)) { // or if (backend->iface.offload_op != NULL) but sycl backend's iface.offload_op is not NULL
+        if (is_qnn_backend(backend)) { // or if (backend->iface.offload_op != NULL) but sycl backend's iface.offload_op is not NULL
+            //mixed inference between Qualcomm's CPU/GPU or CPU/NPU
             err = ggml_backend_graph_compute_mixed(backend, cgraph);
         } else { //compatible for sycl backend or other existing backend
             err = backend->iface.graph_compute(backend, cgraph);
@@ -409,7 +423,7 @@ enum ggml_status ggml_backend_graph_compute_async(ggml_backend_t backend, struct
         //compatible for existing backend
         err = backend->iface.graph_compute(backend, cgraph);;
     }
-    ggml_backend_synchronize(backend);
+
     return err;
 }
 
@@ -418,7 +432,7 @@ bool ggml_backend_supports_op(ggml_backend_t backend, const struct ggml_tensor *
 }
 
 bool ggml_backend_offload_op(ggml_backend_t backend, const struct ggml_tensor * op) {
-    if (ggml_backend_is_qnn(backend)) { //compatible for sycl backend or other existing backend
+    if (is_qnn_backend(backend)) { //compatible for sycl backend or other existing backend
         return false;
     }
 
diff --git a/core/ggml/llamacpp/ggml-qnn.cpp b/core/ggml/llamacpp/ggml-qnn.cpp
index d3b543556..2bcd414fa 100644
--- a/core/ggml/llamacpp/ggml-qnn.cpp
+++ b/core/ggml/llamacpp/ggml-qnn.cpp
@@ -1,7 +1,3 @@
-#define NOT_IN_PR 1
-
-
-#if NOT_IN_PR
 /*
  * Copyright (c) 2024- KanTV Authors
  *
@@ -42,7 +38,6 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-#endif
 #include 
 #include 
 #include 
@@ -110,10 +105,6 @@ __attribute__((__format__(printf, 3, 4)));
 #endif
 static void ggml_qnn_log_internal(ggml_log_level level, const char * file, const char * func, int line, const char * format, ...);
 
-#if 1// NOT_IN_PR //should be removed before PR because this is a workaround method during development stage
-extern "C" void ggml_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor, struct ggml_compute_state * state);
-#endif
-
 
 // =================================================================================================
 //
@@ -133,6 +124,7 @@ extern "C" void ggml_compute_forward(struct ggml_compute_params * params, struct
 #define BUF_CONTROL_BASE 0xEE000000
 
 #define GGML_QNN_DEBUG 1
+#define NOT_IN_PR 1 //for submit/update PR(ggml-qnn.cpp&ggml.h) to upstream more easily and quickly
 
 #define QNN_LOG_ERROR(...) ggml_qnn_log_internal(GGML_LOG_LEVEL_DEBUG, __FILE__, __FUNCTION__, __LINE__, __VA_ARGS__)
 #define QNN_LOG_WARN(...) ggml_qnn_log_internal(GGML_LOG_LEVEL_DEBUG , __FILE__, __FUNCTION__, __LINE__, __VA_ARGS__)
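
Note (illustration, not part of the patch): the core of the ggml-backend.c change is the is_qnn_backend() wrapper, which guards the call to ggml_backend_is_qnn() behind GGML_USE_QNN so the file still compiles, and falls back to the backend's own graph_compute, when QNN support is disabled. Below is a minimal standalone C sketch of that conditional-compilation pattern; the demo_* names are invented for this sketch and do not exist in ggml.

/* Standalone illustration of the #ifdef-guarded wrapper used in the patch.
 * Build with -DGGML_USE_QNN to take the QNN branch, or without it to get
 * the stub that always returns false. All demo_* identifiers are made up. */
#include <stdbool.h>
#include <stdio.h>

typedef struct demo_backend { const char * name; } demo_backend;

#ifdef GGML_USE_QNN
/* stands in for the extern ggml_backend_is_qnn() declaration */
static bool demo_backend_is_qnn(const demo_backend * backend) {
    return backend->name != NULL && backend->name[0] == 'q';
}
#endif

static bool demo_is_qnn_backend(const demo_backend * backend) {
#ifdef GGML_USE_QNN
    return demo_backend_is_qnn(backend);
#else
    (void) backend;   /* GGML_UNUSED(backend) in the real code */
    return false;     /* without QNN support the mixed path is never taken */
#endif
}

int main(void) {
    demo_backend b = { "qnn-npu" };
    if (demo_is_qnn_backend(&b)) {
        printf("take the mixed CPU/GPU or CPU/NPU inference path\n");
    } else {
        printf("fall back to the backend's own graph_compute\n");
    }
    return 0;
}

Building with -DGGML_USE_QNN takes the QNN branch; without it the stub always returns false, mirroring how non-QNN builds of ggml-backend.c never enter ggml_backend_graph_compute_mixed().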