ggml-qnn: refine ggml backend subsystem
zhouwg committed May 30, 2024
1 parent fcf5338 commit 70835aa
Showing 3 changed files with 160 additions and 131 deletions.
133 changes: 131 additions & 2 deletions core/ggml/llamacpp/ggml-backend.c
@@ -280,21 +280,148 @@ enum ggml_status ggml_backend_graph_plan_compute(ggml_backend_t backend, ggml_ba
return backend->iface.graph_plan_compute(backend, plan);
}

static ggml_backend_t g_cpu_backend = NULL;
static bool GGML_OP_HAS_INIT [GGML_OP_COUNT] = { 0 };
static bool GGML_OP_HAS_FINALIZE[GGML_OP_COUNT] = { 0 };
static void ggml_setup_op_has_task_pass(void) {
{ // INIT
bool * p = GGML_OP_HAS_INIT;

p[GGML_OP_ACC ] = true;
p[GGML_OP_MUL_MAT ] = true;
p[GGML_OP_MUL_MAT_ID ] = true;
p[GGML_OP_OUT_PROD ] = true;
p[GGML_OP_SET ] = true;
p[GGML_OP_GET_ROWS_BACK ] = true;
p[GGML_OP_DIAG_MASK_INF ] = true;
p[GGML_OP_DIAG_MASK_ZERO ] = true;
p[GGML_OP_CONV_TRANSPOSE_1D ] = true;
p[GGML_OP_CONV_TRANSPOSE_2D ] = true;
p[GGML_OP_FLASH_ATTN_BACK ] = true;
p[GGML_OP_CROSS_ENTROPY_LOSS ] = true;
p[GGML_OP_ADD_REL_POS ] = true;
}

{ // FINALIZE
bool * p = GGML_OP_HAS_FINALIZE;

p[GGML_OP_CROSS_ENTROPY_LOSS ] = true;
}
}


extern void ggml_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor, struct ggml_compute_state * state);
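// Mixed execution strategy for the function below: walk the graph once; nodes the
// backend reports as supported are pushed to it through iface.offload_op(), and
// every other node is executed on the host with the CPU reference kernel
// ggml_compute_forward(), running its INIT and FINALIZE passes where the op needs them.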
static enum ggml_status ggml_backend_graph_compute_mixed(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
enum ggml_status result = GGML_STATUS_SUCCESS;
int node_n = -1;

static bool is_first_call = true;
if (is_first_call) {
ggml_setup_op_has_task_pass();
is_first_call = false;
}

struct ggml_cplan plan = ggml_graph_plan(cgraph, 1);
if (plan.work_size > 0) {
plan.work_data = (uint8_t *)(malloc(plan.work_size));
if (NULL == plan.work_data) {
return GGML_STATUS_ALLOC_FAILED;
}
}

struct ggml_compute_params params = {
/*.type =*/ GGML_TASK_TYPE_FINALIZE,
/*.ith =*/ 0,
/*.nth =*/ 0,
/*.wsize =*/ plan.work_size,
/*.wdata =*/ plan.work_data
};
while (++node_n < cgraph->n_nodes) {
struct ggml_tensor * node = cgraph->nodes[node_n];
params.nth = 1;

if (ggml_is_empty(node) || node->op == GGML_OP_RESHAPE || node->op == GGML_OP_TRANSPOSE || node->op == GGML_OP_VIEW || node->op == GGML_OP_PERMUTE || node->op == GGML_OP_NONE) {
continue;
}

if (ggml_backend_supports_op(backend, node)) {
//LOGGD("%s: running op %s (%s) with backend %s\n", __func__, node->name, ggml_op_name(node->op), ggml_backend_name(backend));
if (backend->iface.offload_op != NULL) {
backend->iface.offload_op(backend, node);
}
} else {
//LOGGD("%s: error: op not supported %s (%s) with backend %s\n", __func__, node->name, ggml_op_name(node->op), ggml_backend_name(backend));
if (GGML_OP_HAS_INIT[node->op]) {
params.type = GGML_TASK_TYPE_INIT;
ggml_compute_forward(&params, node, NULL);
}
params.type = GGML_TASK_TYPE_COMPUTE;
ggml_compute_forward(&params, node, NULL);
if (GGML_OP_HAS_FINALIZE[node->op]) {
params.type = GGML_TASK_TYPE_FINALIZE;
ggml_compute_forward(&params, node, NULL);
}
}
}

if (NULL != plan.work_data) {
free(plan.work_data);
}

return result;
}

extern bool ggml_backend_is_qnn(ggml_backend_t backend);
enum ggml_status ggml_backend_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
enum ggml_status err = ggml_backend_graph_compute_async(backend, cgraph);
enum ggml_status err = GGML_STATUS_SUCCESS;

if (NULL == g_cpu_backend) {
ggml_backend_cpu_init();
}
if (backend != g_cpu_backend) {
if (ggml_backend_is_qnn(backend)) { // could also check (backend->iface.offload_op != NULL), but the SYCL backend's iface.offload_op is non-NULL as well
err = ggml_backend_graph_compute_mixed(backend, cgraph);
} else { // keep compatibility with the SYCL backend and other existing backends
err = backend->iface.graph_compute(backend, cgraph);
}
} else {
// keep compatibility with existing backends
err = backend->iface.graph_compute(backend, cgraph);
}
ggml_backend_synchronize(backend);
return err;
}


enum ggml_status ggml_backend_graph_compute_async(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
return backend->iface.graph_compute(backend, cgraph);
enum ggml_status err = GGML_STATUS_SUCCESS;

if (NULL == g_cpu_backend) {
ggml_backend_cpu_init();
}
if (backend != g_cpu_backend) {
if (ggml_backend_is_qnn(backend)) { // could also check (backend->iface.offload_op != NULL), but the SYCL backend's iface.offload_op is non-NULL as well
err = ggml_backend_graph_compute_mixed(backend, cgraph);
} else { // keep compatibility with the SYCL backend and other existing backends
err = backend->iface.graph_compute(backend, cgraph);
}
} else {
// keep compatibility with existing backends
err = backend->iface.graph_compute(backend, cgraph);
}
ggml_backend_synchronize(backend);
return err;
}

bool ggml_backend_supports_op(ggml_backend_t backend, const struct ggml_tensor * op) {
return backend->iface.supports_op(backend, op);
}

bool ggml_backend_offload_op(ggml_backend_t backend, const struct ggml_tensor * op) {
if (ggml_backend_is_qnn(backend)) { // QNN offload is handled through the mixed compute path above; keep the existing offload_op behavior for other backends (e.g. SYCL)
return false;
}

if (backend->iface.offload_op != NULL) {
return backend->iface.offload_op(backend, op);
}
@@ -899,6 +1026,8 @@ ggml_backend_t ggml_backend_cpu_init(void) {
/* .interface = */ cpu_backend_i,
/* .context = */ ctx
};
g_cpu_backend = cpu_backend;

return cpu_backend;
}

140 changes: 29 additions & 111 deletions core/ggml/llamacpp/ggml-qnn.cpp
@@ -311,35 +311,7 @@ struct ggml_backend_qnn_context {
// =================================================================================================
static ggml_backend_t g_qnn_backend = nullptr;

static int g_current_device = QNN_BACKEND_GGML; // QNN_BACKEND_GGML is the default ggml backend

static bool GGML_OP_HAS_INIT [GGML_OP_COUNT] = { 0 };
static bool GGML_OP_HAS_FINALIZE[GGML_OP_COUNT] = { 0 };
static void ggml_setup_op_has_task_pass(void) {
{ // INIT
bool * p = GGML_OP_HAS_INIT;

p[GGML_OP_ACC ] = true;
p[GGML_OP_MUL_MAT ] = true;
p[GGML_OP_MUL_MAT_ID ] = true;
p[GGML_OP_OUT_PROD ] = true;
p[GGML_OP_SET ] = true;
p[GGML_OP_GET_ROWS_BACK ] = true;
p[GGML_OP_DIAG_MASK_INF ] = true;
p[GGML_OP_DIAG_MASK_ZERO ] = true;
p[GGML_OP_CONV_TRANSPOSE_1D ] = true;
p[GGML_OP_CONV_TRANSPOSE_2D ] = true;
p[GGML_OP_FLASH_ATTN_BACK ] = true;
p[GGML_OP_CROSS_ENTROPY_LOSS ] = true;
p[GGML_OP_ADD_REL_POS ] = true;
}

{ // FINALIZE
bool * p = GGML_OP_HAS_FINALIZE;

p[GGML_OP_CROSS_ENTROPY_LOSS ] = true;
}
}
static int g_current_device = QNN_BACKEND_GGML;


//the QNN cDSP and HTA backends are not used for now; focus is on the QNN CPU/GPU/NPU (aka HTP/DSP) backends
@@ -1420,11 +1392,9 @@ static void ggml_qnn_log_internal(ggml_log_level level, const char * file, const
#else
__android_log_print(level, "ggml-qnn", "%s\n", s_ggml_qnn_log_internal_buf);
#endif
//for Android command line application
printf("%s\n", s_ggml_qnn_log_internal_buf);
#else
printf("%s\n", s_ggml_qnn_log_internal_buf);
#endif
//for Android command line application or WoA
printf("%s\n", s_ggml_qnn_log_internal_buf);
}
va_end(args);
}
@@ -2483,10 +2453,6 @@ static bool ggml_qnn_can_handle_op(const struct ggml_tensor * tensor, bool b_dum
if (nullptr == tensor)
return false;

if (ggml_is_empty(tensor) || tensor->op == GGML_OP_RESHAPE || tensor->op == GGML_OP_TRANSPOSE || tensor->op == GGML_OP_VIEW || tensor->op == GGML_OP_PERMUTE || tensor->op == GGML_OP_NONE) {
return false;
}

if (b_dump_tensor_info) {
QNN_LOG_DEBUG("op name:%s, tensor type:%s", ggml_op_name(tensor->op),
ggml_type_name(tensor->type));
@@ -2498,6 +2464,10 @@
}
}

if (ggml_is_empty(tensor) || tensor->op == GGML_OP_RESHAPE || tensor->op == GGML_OP_TRANSPOSE || tensor->op == GGML_OP_VIEW || tensor->op == GGML_OP_PERMUTE || tensor->op == GGML_OP_NONE) {
return false;
}

//ensure tensor->src[0] and tensor->src[1] are not nullptr
bool supported_op = ((tensor->op == GGML_OP_ADD) || (tensor->op == GGML_OP_MUL) || (tensor->op == GGML_OP_MUL_MAT));
if (!supported_op) {
@@ -2550,6 +2520,10 @@
tensor->nb[1], tensor->nb[2]);
}

if (tensor->ne[1] < 32) { // GPU/NPU inference will be slower than CPU inference when tensor->ne[1] is below the minimum batch size
return false;
}

}

//TODO: this is limitation
Expand Down Expand Up @@ -3410,20 +3384,6 @@ bool ggml_qnn_compute_forward(struct ggml_compute_params * params, struct ggml_t
ggml_qnn_func_t func = nullptr;
ggml_qnn_func_common_t func_common = nullptr;

#if 1// NOT_IN_PR // not in PR, should be removed before PR because this is a workaround method during development stage
bool use_hwaccel = false;
use_hwaccel = (tensor->src[0]->backend == GGML_BACKEND_TYPE_GPU);
bool supported_op = ((tensor->op == GGML_OP_ADD) || (tensor->op == GGML_OP_MUL) || (tensor->op == GGML_OP_MUL_MAT));
if (!use_hwaccel && !supported_op) {
ggml_compute_forward(params, tensor, nullptr);
return false;
}
if ((!use_hwaccel) && (!ggml_qnn_can_handle_op(tensor, false))) {
ggml_compute_forward(params, tensor, nullptr);
return false;
}
#endif

switch (tensor->op) {
case GGML_OP_ADD:
func = ggml_qnn_add;
@@ -3847,76 +3807,40 @@ static ggml_backend_buffer_type_t ggml_backend_qnn_get_default_buffer_type(ggml_
}


static bool ggml_backend_qnn_supports_op(ggml_backend_t backend, const ggml_tensor * op) {
GGML_UNUSED(backend);

switch (op->op) {
case GGML_OP_MUL_MAT:
case GGML_OP_MUL:
case GGML_OP_ADD:
return true;
default:
return false;
}
}


static ggml_status ggml_backend_qnn_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) {
enum ggml_status result = GGML_STATUS_SUCCESS;
int node_n = -1;
ggml_backend_qnn_context * ctx = (ggml_backend_qnn_context *) backend->context;

struct ggml_cplan plan = ggml_graph_plan(cgraph, 1);
if (plan.work_size > 0) {
#if NOT_IN_PR
plan.work_data = static_cast<uint8_t *>(ctx->buffer_pool->buffer_pool_base);
#else
plan.work_data = static_cast<uint8_t *>(malloc(plan.work_size));
if (nullptr == plan.work_data) {
QNN_LOG_ERROR("malloc failed");
return result;
ggml_compute_params params = {};
params.type = GGML_TASK_TYPE_COMPUTE;
params.ith = 0;
for (int i = 0; i < cgraph->n_nodes; i++) {
ggml_tensor * node = cgraph->nodes[i];
if (ggml_is_empty(node) || node->op == GGML_OP_RESHAPE || node->op == GGML_OP_TRANSPOSE || node->op == GGML_OP_VIEW || node->op == GGML_OP_PERMUTE || node->op == GGML_OP_NONE) {
continue;
}
#endif
}

struct ggml_compute_params params = {
/*.type =*/ GGML_TASK_TYPE_FINALIZE,
/*.ith =*/ 0,
/*.nth =*/ 0,
/*.wsize =*/ plan.work_size,
/*.wdata =*/ plan.work_data,
};
while (++node_n < cgraph->n_nodes) {
struct ggml_tensor * node = cgraph->nodes[node_n];
params.nth = 1;
if (GGML_OP_HAS_INIT[node->op]) {
params.type = GGML_TASK_TYPE_INIT;
ggml_qnn_compute_forward(&params, node);
}
params.type = GGML_TASK_TYPE_COMPUTE;
ggml_qnn_compute_forward(&params, node);
if (GGML_OP_HAS_FINALIZE[node->op]) {
params.type = GGML_TASK_TYPE_FINALIZE;
ggml_qnn_compute_forward(&params, node);
bool ok = ggml_qnn_compute_forward(&params, node);
if (!ok) {
QNN_LOG_DEBUG("%s: error: op not supported %s (%s)\n", __func__, node->name, ggml_op_name(node->op));
}
}

#if NOT_IN_PR
free(plan.work_data);
#endif

return result;
}


static bool ggml_backend_qnn_supports_op(ggml_backend_t backend, const ggml_tensor * op) {
GGML_UNUSED(backend);

return (ggml_qnn_can_handle_op(op, false));
}


//note: this function will be used in the new/refined ggml backend subsystem (to be submitted in a standalone PR)
static bool ggml_backend_qnn_offload_op(ggml_backend_t backend, const ggml_tensor * tensor) {
GGML_UNUSED(backend);

if (ggml_qnn_can_handle_op(tensor, false))
return true;
else
return false;
return ggml_qnn_compute_forward(nullptr, (ggml_tensor*)tensor);
}


@@ -4054,12 +3978,6 @@ ggml_backend_t ggml_backend_qnn_init(size_t device, const char * qnn_lib_path) {
}
}

static bool is_first_call = true;
if (is_first_call) {
ggml_setup_op_has_task_pass();
is_first_call = false;
}

if (QNN_BACKEND_NPU == device) {
std::string path = qnn_lib_path;
if (0 == setenv("LD_LIBRARY_PATH",
18 changes: 0 additions & 18 deletions core/ggml/llamacpp/ggml.c
@@ -17269,24 +17269,6 @@ void ggml_compute_forward(struct ggml_compute_params * params, struct ggml_tenso
return;
}

//depend on this PR in upstream whisper.cpp https://github.com/ggerganov/whisper.cpp/pull/2073
const struct ggml_tensor * src0 = tensor->src[0];
const struct ggml_tensor * src1 = tensor->src[1];
if (NULL != src0 && NULL != src1) {
if (src0->backend == GGML_BACKEND_TYPE_GPU) {
#ifdef GGML_USE_QNN
//LOGGI("hw acceleration with QNN");
if ((tensor->op == GGML_OP_ADD) || (tensor->op == GGML_OP_MUL) || (tensor->op == GGML_OP_MUL_MAT)) {
ggml_qnn_compute_forward(params, tensor);
return;
}
#endif
} else {
//LOGGI("no hw acceleration");
}
}


switch (tensor->op) {
case GGML_OP_DUP:
{
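
For context, the following is a self-contained sketch (not part of this commit) of how the refined compute path is exercised from application code. It assumes a "ggml-qnn.h" header that declares ggml_backend_qnn_init() and the QNN_BACKEND_* constants; the tensor sizes and QNN library path are placeholders.

#include "ggml.h"
#include "ggml-alloc.h"
#include "ggml-backend.h"
#include "ggml-qnn.h"   // assumed header exposing ggml_backend_qnn_init() and QNN_BACKEND_*

int main(void) {
    // metadata-only context; tensor data lives in the backend buffer below
    struct ggml_init_params ip = {
        /*.mem_size   =*/ 16 * 1024 * 1024,
        /*.mem_buffer =*/ NULL,
        /*.no_alloc   =*/ true,
    };
    struct ggml_context * ctx = ggml_init(ip);

    struct ggml_tensor * a = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 64, 64);
    struct ggml_tensor * b = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 64, 64);
    struct ggml_tensor * c = ggml_mul_mat(ctx, a, b); // supported op -> offloaded to QNN
    struct ggml_tensor * d = ggml_sqr(ctx, c);        // unsupported op -> CPU fallback

    struct ggml_cgraph * gf = ggml_new_graph(ctx);
    ggml_build_forward_expand(gf, d);

    // placeholder device and QNN library path
    ggml_backend_t backend = ggml_backend_qnn_init(QNN_BACKEND_NPU, "/data/local/tmp/");
    ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors(ctx, backend);
    // tensor data left uninitialized for brevity; set it with ggml_backend_tensor_set() in real code

    // ADD/MUL/MUL_MAT nodes run on QNN, everything else falls back to
    // ggml_compute_forward() inside ggml_backend_graph_compute_mixed()
    enum ggml_status status = ggml_backend_graph_compute(backend, gf);

    ggml_backend_buffer_free(buf);
    ggml_backend_free(backend);
    ggml_free(ctx);
    return (status == GGML_STATUS_SUCCESS) ? 0 : 1;
}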