From f477d4f7ece7d840f85dd0fe5d7177a7afdb775c Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Mon, 24 Apr 2023 21:51:34 +0300 Subject: [PATCH 01/27] ggml : cgraph export brainstorming --- examples/common-ggml.cpp | 92 +++++++++++++++++++++++++++++++++++ examples/common-ggml.h | 4 ++ examples/mnist/CMakeLists.txt | 2 +- examples/mnist/main.cpp | 3 ++ 4 files changed, 100 insertions(+), 1 deletion(-) diff --git a/examples/common-ggml.cpp b/examples/common-ggml.cpp index 9215dbeab..6100ad4d0 100644 --- a/examples/common-ggml.cpp +++ b/examples/common-ggml.cpp @@ -233,3 +233,95 @@ bool ggml_common_quantize_0( return true; } + +#define GGML_ASSERT(x) \ + do { \ + if (!(x)) { \ + fprintf(stderr, "GGML_ASSERT: %s:%d: %s\n", __FILE__, __LINE__, #x); \ + abort(); \ + } \ + } while (0) + +void ggml_graph_export_leaf(const struct ggml_tensor * tensor, FILE * fout) { + const int64_t * ne = tensor->ne; + const size_t * nb = tensor->nb; + + fprintf(fout, "%-6s %-12s %8d %8lld %8lld %8lld %8lld %16zu %16zu %16zu %16zu %16p\n", + ggml_type_name(tensor->type), + ggml_op_name (tensor->op), + tensor->n_dims, + ne[0], ne[1], ne[2], ne[3], + nb[0], nb[1], nb[2], nb[3], + tensor->data); +} + +void ggml_graph_export_node(const struct ggml_tensor * tensor, const char * arg, FILE * fout) { + const int64_t * ne = tensor->ne; + const size_t * nb = tensor->nb; + + fprintf(fout, "%-6s %-6s %-12s %8d %8lld %8lld %8lld %8lld %16zu %16zu %16zu %16zu %8d %16p\n", + arg, + ggml_type_name(tensor->type), + ggml_op_name (tensor->op), + tensor->n_dims, + ne[0], ne[1], ne[2], ne[3], + nb[0], nb[1], nb[2], nb[3], + tensor->n_tasks, + tensor->data); +} + +void ggml_graph_export(const struct ggml_cgraph * cgraph, const char * fname) { + FILE * fout = stdout; + + fprintf(fout, "\n"); + fprintf(fout, "%-16s %8x\n", "magic", GGML_FILE_MAGIC); + fprintf(fout, "%-16s %8d\n", "version", GGML_FILE_VERSION); + fprintf(fout, "%-16s %8d\n", "leafs", cgraph->n_leafs); + fprintf(fout, "%-16s %8d\n", "nodes", cgraph->n_nodes); + + // header + fprintf(fout, "\n"); + fprintf(fout, "%-6s %-12s %8s %8s %8s %8s %8s %16s %16s %16s %16s %16s\n", + "TYPE", "OP", "NDIMS", "NE0", "NE1", "NE2", "NE3", "NB0", "NB1", "NB2", "NB3", "DATA"); + + for (int i = 0; i < cgraph->n_leafs; ++i) { + const int64_t * ne = cgraph->leafs[i]->ne; + const size_t * nb = cgraph->leafs[i]->nb; + + ggml_graph_export_leaf(cgraph->leafs[i], fout); + + GGML_ASSERT(cgraph->leafs[i]->op == GGML_OP_NONE); + GGML_ASSERT(cgraph->leafs[i]->src0 == NULL); + GGML_ASSERT(cgraph->leafs[i]->src1 == NULL); + } + + // header + fprintf(fout, "\n"); + fprintf(fout, "%-6s %-6s %-12s %8s %8s %8s %8s %8s %16s %16s %16s %16s %8s %16s\n", + "ARG", "TYPE", "OP", "NDIMS", "NE0", "NE1", "NE2", "NE3", "NB0", "NB1", "NB2", "NB3", "NTASKS", "DATA"); + + for (int i = 0; i < cgraph->n_nodes; ++i) { + const int64_t * ne = cgraph->nodes[i]->ne; + const size_t * nb = cgraph->nodes[i]->nb; + + ggml_graph_export_node(cgraph->nodes[i], "DST", fout); + + if (cgraph->nodes[i]->src0) { + ggml_graph_export_node(cgraph->nodes[i]->src0, "SRC0", fout); + } + + if (cgraph->nodes[i]->src1) { + ggml_graph_export_node(cgraph->nodes[i]->src1, "SRC1", fout); + } + + for (int j = 0; j < GGML_MAX_OPT; ++j) { + if (cgraph->nodes[i]->opt[j]) { + ggml_graph_export_node(cgraph->nodes[i]->opt[j], "OPT", fout); + } + } + + fprintf(fout, "\n"); + } + + fprintf(fout, "\n"); +} diff --git a/examples/common-ggml.h b/examples/common-ggml.h index 477de341a..1c6fcd34b 100644 --- a/examples/common-ggml.h +++ b/examples/common-ggml.h @@ 
-16,3 +16,7 @@ bool ggml_common_quantize_0( const ggml_ftype ftype, const std::vector<std::string> & to_quant, const std::vector<std::string> & to_skip); + +struct ggml_cgraph; + +void ggml_graph_export(const struct ggml_cgraph * cgraph, const char * fname);
diff --git a/examples/mnist/CMakeLists.txt b/examples/mnist/CMakeLists.txt index 91b802ae5..1a676d9df 100644 --- a/examples/mnist/CMakeLists.txt +++ b/examples/mnist/CMakeLists.txt @@ -3,5 +3,5 @@ set(TEST_TARGET mnist) add_executable(${TEST_TARGET} main.cpp) -target_link_libraries(${TEST_TARGET} PRIVATE ggml common) +target_link_libraries(${TEST_TARGET} PRIVATE ggml common common-ggml)
diff --git a/examples/mnist/main.cpp b/examples/mnist/main.cpp index c84eedd92..2bfebd0da 100644 --- a/examples/mnist/main.cpp +++ b/examples/mnist/main.cpp @@ -1,6 +1,7 @@ #include "ggml/ggml.h" #include "common.h" +#include "common-ggml.h" #include <cmath> #include <cstdio> @@ -204,6 +205,8 @@ int mnist_eval( const int prediction = std::max_element(probs_data, probs_data + 10) - probs_data; + ggml_graph_export(&gf, "mnist.ggml"); + ggml_free(ctx0); return prediction;
From 2a0342114acd8fad9100ab7cc3a22b5bfb67c430 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sun, 21 May 2023 16:11:50 +0300 Subject: [PATCH 02/27] mnist : code style --- examples/mnist/main.cpp | 1 + 1 file changed, 1 insertion(+)
diff --git a/examples/mnist/main.cpp b/examples/mnist/main.cpp index 2bfebd0da..655ab559d 100644 --- a/examples/mnist/main.cpp +++ b/examples/mnist/main.cpp @@ -205,6 +205,7 @@ int mnist_eval( const int prediction = std::max_element(probs_data, probs_data + 10) - probs_data; + // export the computation graph ggml_graph_export(&gf, "mnist.ggml"); ggml_free(ctx0);
From 85dcc0c3fc3bc8568e573176b55da9fc7f387cab Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sun, 21 May 2023 22:46:03 +0300 Subject: [PATCH 03/27] mnist : minor --- examples/common-ggml.cpp | 106 ++++++++++++++++++++++++--------------- 1 file changed, 65 insertions(+), 41 deletions(-)
diff --git a/examples/common-ggml.cpp b/examples/common-ggml.cpp index 6100ad4d0..b79aab835 100644 --- a/examples/common-ggml.cpp +++ b/examples/common-ggml.cpp @@ -246,20 +246,21 @@ void ggml_graph_export_leaf(const struct ggml_tensor * tensor, FILE * fout) { const int64_t * ne = tensor->ne; const size_t * nb = tensor->nb; - fprintf(fout, "%-6s %-12s %8d %8lld %8lld %8lld %8lld %16zu %16zu %16zu %16zu %16p\n", + fprintf(fout, "%-6s %-12s %8d %8lld %8lld %8lld %8lld %16zu %16zu %16zu %16zu %16p %16s\n", ggml_type_name(tensor->type), ggml_op_name (tensor->op), tensor->n_dims, ne[0], ne[1], ne[2], ne[3], nb[0], nb[1], nb[2], nb[3], - tensor->data); + tensor->data, + tensor->name); } void ggml_graph_export_node(const struct ggml_tensor * tensor, const char * arg, FILE * fout) { const int64_t * ne = tensor->ne; const size_t * nb = tensor->nb; - fprintf(fout, "%-6s %-6s %-12s %8d %8lld %8lld %8lld %8lld %16zu %16zu %16zu %16zu %8d %16p\n", + fprintf(fout, "%-6s %-6s %-12s %8d %8lld %8lld %8lld %8lld %16zu %16zu %16zu %16zu %8d %16p %16s\n", arg, ggml_type_name(tensor->type), ggml_op_name (tensor->op), @@ -267,61 +268,84 @@ void ggml_graph_export_node(const struct ggml_tensor * tensor, const char * arg, ne[0], ne[1], ne[2], ne[3], nb[0], nb[1], nb[2], nb[3], tensor->n_tasks, - tensor->data); + tensor->data, + tensor->name); } void ggml_graph_export(const struct ggml_cgraph * cgraph, const char * fname) { - FILE * fout = stdout; + // print + { + FILE * fout = stdout; - fprintf(fout, "\n"); - fprintf(fout, "%-16s %8x\n", "magic", GGML_FILE_MAGIC); - 
fprintf(fout, "%-16s %8d\n", "version", GGML_FILE_VERSION); - fprintf(fout, "%-16s %8d\n", "leafs", cgraph->n_leafs); - fprintf(fout, "%-16s %8d\n", "nodes", cgraph->n_nodes); + fprintf(fout, "\n"); + fprintf(fout, "%-16s %8x\n", "magic", GGML_FILE_MAGIC); + fprintf(fout, "%-16s %8d\n", "version", GGML_FILE_VERSION); + fprintf(fout, "%-16s %8d\n", "leafs", cgraph->n_leafs); + fprintf(fout, "%-16s %8d\n", "nodes", cgraph->n_nodes); - // header - fprintf(fout, "\n"); - fprintf(fout, "%-6s %-12s %8s %8s %8s %8s %8s %16s %16s %16s %16s %16s\n", - "TYPE", "OP", "NDIMS", "NE0", "NE1", "NE2", "NE3", "NB0", "NB1", "NB2", "NB3", "DATA"); + // header + fprintf(fout, "\n"); + fprintf(fout, "%-6s %-12s %8s %8s %8s %8s %8s %16s %16s %16s %16s %16s %16s\n", + "TYPE", "OP", "NDIMS", "NE0", "NE1", "NE2", "NE3", "NB0", "NB1", "NB2", "NB3", "DATA", "NAME"); - for (int i = 0; i < cgraph->n_leafs; ++i) { - const int64_t * ne = cgraph->leafs[i]->ne; - const size_t * nb = cgraph->leafs[i]->nb; + for (int i = 0; i < cgraph->n_leafs; ++i) { + ggml_graph_export_leaf(cgraph->leafs[i], fout); - ggml_graph_export_leaf(cgraph->leafs[i], fout); + GGML_ASSERT(cgraph->leafs[i]->op == GGML_OP_NONE); + GGML_ASSERT(cgraph->leafs[i]->src0 == NULL); + GGML_ASSERT(cgraph->leafs[i]->src1 == NULL); + } - GGML_ASSERT(cgraph->leafs[i]->op == GGML_OP_NONE); - GGML_ASSERT(cgraph->leafs[i]->src0 == NULL); - GGML_ASSERT(cgraph->leafs[i]->src1 == NULL); - } + // header + fprintf(fout, "\n"); + fprintf(fout, "%-6s %-6s %-12s %8s %8s %8s %8s %8s %16s %16s %16s %16s %8s %16s %16s\n", + "ARG", "TYPE", "OP", "NDIMS", "NE0", "NE1", "NE2", "NE3", "NB0", "NB1", "NB2", "NB3", "NTASKS", "DATA", "NAME"); - // header - fprintf(fout, "\n"); - fprintf(fout, "%-6s %-6s %-12s %8s %8s %8s %8s %8s %16s %16s %16s %16s %8s %16s\n", - "ARG", "TYPE", "OP", "NDIMS", "NE0", "NE1", "NE2", "NE3", "NB0", "NB1", "NB2", "NB3", "NTASKS", "DATA"); + for (int i = 0; i < cgraph->n_nodes; ++i) { + ggml_graph_export_node(cgraph->nodes[i], "DST", fout); - for (int i = 0; i < cgraph->n_nodes; ++i) { - const int64_t * ne = cgraph->nodes[i]->ne; - const size_t * nb = cgraph->nodes[i]->nb; + if (cgraph->nodes[i]->src0) { + ggml_graph_export_node(cgraph->nodes[i]->src0, "SRC0", fout); + } - ggml_graph_export_node(cgraph->nodes[i], "DST", fout); + if (cgraph->nodes[i]->src1) { + ggml_graph_export_node(cgraph->nodes[i]->src1, "SRC1", fout); + } - if (cgraph->nodes[i]->src0) { - ggml_graph_export_node(cgraph->nodes[i]->src0, "SRC0", fout); + for (int j = 0; j < GGML_MAX_OPT; ++j) { + if (cgraph->nodes[i]->opt[j]) { + ggml_graph_export_node(cgraph->nodes[i]->opt[j], "OPT", fout); + } + } + + fprintf(fout, "\n"); } - if (cgraph->nodes[i]->src1) { - ggml_graph_export_node(cgraph->nodes[i]->src1, "SRC1", fout); + fprintf(fout, "\n"); + } + + // write binary data + { + FILE * fout = fopen(fname, "wb"); + + if (!fout) { + fprintf(stderr, "%s: failed to open %s\n", __func__, fname); + return; } - for (int j = 0; j < GGML_MAX_OPT; ++j) { - if (cgraph->nodes[i]->opt[j]) { - ggml_graph_export_node(cgraph->nodes[i]->opt[j], "OPT", fout); - } + // header + { + uint32_t magic = GGML_FILE_MAGIC; + uint32_t version = GGML_FILE_VERSION; + uint32_t leafs = cgraph->n_leafs; + uint32_t nodes = cgraph->n_nodes; + + fwrite(&magic, sizeof(uint32_t), 1, fout); + fwrite(&version, sizeof(uint32_t), 1, fout); + fwrite(&leafs, sizeof(uint32_t), 1, fout); + fwrite(&nodes, sizeof(uint32_t), 1, fout); } - fprintf(fout, "\n"); + fclose(fout); } - - fprintf(fout, "\n"); }
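
(At this point in the series the exported file holds only the four uint32 header fields written above — magic, version, leaf count, node count. A minimal sketch of a consumer reading that header back; read_cgraph_header is a hypothetical helper for illustration, not part of the patches:

    // hypothetical consumer of the header written by ggml_graph_export above
    #include "ggml/ggml.h"   // GGML_FILE_MAGIC, GGML_FILE_VERSION

    #include <cstdint>
    #include <cstdio>

    static bool read_cgraph_header(const char * fname) {
        FILE * fin = fopen(fname, "rb");
        if (!fin) {
            return false;
        }

        uint32_t vals[4]; // magic, version, n_leafs, n_nodes

        const bool ok = fread(vals, sizeof(uint32_t), 4, fin) == 4;
        fclose(fin);

        if (!ok || vals[0] != GGML_FILE_MAGIC || vals[1] != GGML_FILE_VERSION) {
            return false;
        }

        fprintf(stderr, "version %u, %u leafs, %u nodes\n", vals[1], vals[2], vals[3]);
        return true;
    }

PATCH 05 below extends this header with a uint64 eval-size field, appends per-tensor records, and adds the real importer, ggml_graph_import.)

From 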
95c85071b7c62eb9426a2e9bb0f7d54a0695027c Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Wed, 24 May 2023 15:15:28 +0300 Subject: [PATCH 04/27] ggml : initial cgraph export --- examples/common-ggml.cpp | 90 ++++++++++++++++++++++++++++++++++++++-- 1 file changed, 86 insertions(+), 4 deletions(-) diff --git a/examples/common-ggml.cpp b/examples/common-ggml.cpp index b79aab835..84a028204 100644 --- a/examples/common-ggml.cpp +++ b/examples/common-ggml.cpp @@ -335,10 +335,10 @@ void ggml_graph_export(const struct ggml_cgraph * cgraph, const char * fname) { // header { - uint32_t magic = GGML_FILE_MAGIC; - uint32_t version = GGML_FILE_VERSION; - uint32_t leafs = cgraph->n_leafs; - uint32_t nodes = cgraph->n_nodes; + const uint32_t magic = GGML_FILE_MAGIC; + const uint32_t version = GGML_FILE_VERSION; + const uint32_t leafs = cgraph->n_leafs; + const uint32_t nodes = cgraph->n_nodes; fwrite(&magic, sizeof(uint32_t), 1, fout); fwrite(&version, sizeof(uint32_t), 1, fout); @@ -346,6 +346,88 @@ void ggml_graph_export(const struct ggml_cgraph * cgraph, const char * fname) { fwrite(&nodes, sizeof(uint32_t), 1, fout); } + // leafs + { + const uint32_t n_leafs = cgraph->n_leafs; + + fwrite(&n_leafs, sizeof(uint32_t), 1, fout); + + for (int i = 0; i < cgraph->n_leafs; ++i) { + const struct ggml_tensor * tensor = cgraph->leafs[i]; + + const uint32_t type = tensor->type; + const uint32_t op = tensor->op; + const uint32_t n_dims = tensor->n_dims; + + fwrite(&type, sizeof(uint32_t), 1, fout); + fwrite(&op, sizeof(uint32_t), 1, fout); + fwrite(&n_dims, sizeof(uint32_t), 1, fout); + + for (int j = 0; j < GGML_MAX_DIMS; ++j) { + const int64_t ne = tensor->ne[j]; + const size_t nb = tensor->nb[j]; + + fwrite(&ne, sizeof(int64_t), 1, fout); + fwrite(&nb, sizeof(size_t), 1, fout); + } + + // store the pointer address + fwrite(&tensor->data, sizeof(void *), 1, fout); + + { + const size_t len = strlen(tensor->name); + + fwrite(&len, sizeof(size_t), 1, fout); + fwrite(tensor->name, sizeof(char), len, fout); + } + + // dump the data + // TODO: pad this to 32 byte boundary + { + const size_t size = ggml_nbytes(tensor); + + fwrite(tensor->data, sizeof(char), size, fout); + } + } + } + + // nodes + { + const uint32_t n_nodes = cgraph->n_nodes; + + fwrite(&n_nodes, sizeof(uint32_t), 1, fout); + + for (int i = 0; i < cgraph->n_nodes; ++i) { + const struct ggml_tensor * tensor = cgraph->nodes[i]; + + const uint32_t type = tensor->type; + const uint32_t op = tensor->op; + const uint32_t n_dims = tensor->n_dims; + + fwrite(&type, sizeof(uint32_t), 1, fout); + fwrite(&op, sizeof(uint32_t), 1, fout); + fwrite(&n_dims, sizeof(uint32_t), 1, fout); + + for (int j = 0; j < GGML_MAX_DIMS; ++j) { + const int64_t ne = tensor->ne[j]; + const size_t nb = tensor->nb[j]; + + fwrite(&ne, sizeof(int64_t), 1, fout); + fwrite(&nb, sizeof(size_t), 1, fout); + } + + // store the pointer address + fwrite(&tensor->data, sizeof(void *), 1, fout); + + { + const size_t len = strlen(tensor->name); + + fwrite(&len, sizeof(size_t), 1, fout); + fwrite(tensor->name, sizeof(char), len, fout); + } + } + } + fclose(fout); } } From 312018907ac2b6e6dfc6884150822758abb8587c Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Wed, 24 May 2023 21:47:30 +0300 Subject: [PATCH 05/27] ggml : initial graph import (wip) --- examples/common-ggml.cpp | 258 +++++++++++++++++++++++++++++++++------ examples/common-ggml.h | 5 +- examples/mnist/main.cpp | 18 ++- 3 files changed, 240 insertions(+), 41 deletions(-) diff --git a/examples/common-ggml.cpp 
b/examples/common-ggml.cpp index 84a028204..3e355ae24 100644 --- a/examples/common-ggml.cpp +++ b/examples/common-ggml.cpp @@ -273,15 +273,27 @@ void ggml_graph_export_node(const struct ggml_tensor * tensor, const char * arg, } void ggml_graph_export(const struct ggml_cgraph * cgraph, const char * fname) { + assert(cgraph->work == NULL); + assert(cgraph->work_size == 0); + + uint64_t size_eval = 0; + + // compute size of intermediate results + // TODO: does not take into account scratch buffers !!!! + for (int i = 0; i < cgraph->n_nodes; ++i) { + size_eval += ggml_nbytes(cgraph->nodes[i]); + } + // print { FILE * fout = stdout; fprintf(fout, "\n"); - fprintf(fout, "%-16s %8x\n", "magic", GGML_FILE_MAGIC); - fprintf(fout, "%-16s %8d\n", "version", GGML_FILE_VERSION); - fprintf(fout, "%-16s %8d\n", "leafs", cgraph->n_leafs); - fprintf(fout, "%-16s %8d\n", "nodes", cgraph->n_nodes); + fprintf(fout, "%-16s %8x\n", "magic", GGML_FILE_MAGIC); + fprintf(fout, "%-16s %8d\n", "version", GGML_FILE_VERSION); + fprintf(fout, "%-16s %8d\n", "leafs", cgraph->n_leafs); + fprintf(fout, "%-16s %8d\n", "nodes", cgraph->n_nodes); + fprintf(fout, "%-16s %8llu\n", "eval", size_eval); // header fprintf(fout, "\n"); @@ -340,18 +352,15 @@ void ggml_graph_export(const struct ggml_cgraph * cgraph, const char * fname) { const uint32_t leafs = cgraph->n_leafs; const uint32_t nodes = cgraph->n_nodes; - fwrite(&magic, sizeof(uint32_t), 1, fout); - fwrite(&version, sizeof(uint32_t), 1, fout); - fwrite(&leafs, sizeof(uint32_t), 1, fout); - fwrite(&nodes, sizeof(uint32_t), 1, fout); + fwrite(&magic, sizeof(uint32_t), 1, fout); + fwrite(&version, sizeof(uint32_t), 1, fout); + fwrite(&leafs, sizeof(uint32_t), 1, fout); + fwrite(&nodes, sizeof(uint32_t), 1, fout); + fwrite(&size_eval, sizeof(uint64_t), 1, fout); } // leafs { - const uint32_t n_leafs = cgraph->n_leafs; - - fwrite(&n_leafs, sizeof(uint32_t), 1, fout); - for (int i = 0; i < cgraph->n_leafs; ++i) { const struct ggml_tensor * tensor = cgraph->leafs[i]; @@ -364,23 +373,22 @@ void ggml_graph_export(const struct ggml_cgraph * cgraph, const char * fname) { fwrite(&n_dims, sizeof(uint32_t), 1, fout); for (int j = 0; j < GGML_MAX_DIMS; ++j) { - const int64_t ne = tensor->ne[j]; - const size_t nb = tensor->nb[j]; + const uint64_t ne = tensor->ne[j]; + const uint64_t nb = tensor->nb[j]; - fwrite(&ne, sizeof(int64_t), 1, fout); - fwrite(&nb, sizeof(size_t), 1, fout); + fwrite(&ne, sizeof(uint64_t), 1, fout); + fwrite(&nb, sizeof(uint64_t), 1, fout); } // store the pointer address - fwrite(&tensor->data, sizeof(void *), 1, fout); - { - const size_t len = strlen(tensor->name); + const uint64_t ptr = (uint64_t) tensor->data; - fwrite(&len, sizeof(size_t), 1, fout); - fwrite(tensor->name, sizeof(char), len, fout); + fwrite(&ptr, sizeof(uint64_t), 1, fout); } + fwrite(tensor->name, sizeof(char), GGML_MAX_NAME, fout); + // dump the data // TODO: pad this to 32 byte boundary { @@ -393,10 +401,6 @@ void ggml_graph_export(const struct ggml_cgraph * cgraph, const char * fname) { // nodes { - const uint32_t n_nodes = cgraph->n_nodes; - - fwrite(&n_nodes, sizeof(uint32_t), 1, fout); - for (int i = 0; i < cgraph->n_nodes; ++i) { const struct ggml_tensor * tensor = cgraph->nodes[i]; @@ -409,25 +413,211 @@ void ggml_graph_export(const struct ggml_cgraph * cgraph, const char * fname) { fwrite(&n_dims, sizeof(uint32_t), 1, fout); for (int j = 0; j < GGML_MAX_DIMS; ++j) { - const int64_t ne = tensor->ne[j]; - const size_t nb = tensor->nb[j]; + const uint64_t ne = tensor->ne[j]; + const 
uint64_t nb = tensor->nb[j]; - fwrite(&ne, sizeof(int64_t), 1, fout); - fwrite(&nb, sizeof(size_t), 1, fout); + fwrite(&ne, sizeof(uint64_t), 1, fout); + fwrite(&nb, sizeof(uint64_t), 1, fout); } // store the pointer address - fwrite(&tensor->data, sizeof(void *), 1, fout); - { - const size_t len = strlen(tensor->name); + const uint64_t ptr = (uint64_t) tensor->data; - fwrite(&len, sizeof(size_t), 1, fout); - fwrite(tensor->name, sizeof(char), len, fout); + fwrite(&ptr, sizeof(uint64_t), 1, fout); } + + fwrite(tensor->name, sizeof(char), GGML_MAX_NAME, fout); } } fclose(fout); } } + +ggml_cgraph ggml_graph_import(const char * fname, struct ggml_context ** ctx_data, struct ggml_context ** ctx_eval) { + assert(*ctx_data == NULL); + assert(*ctx_eval == NULL); + + ggml_cgraph result; + + struct ggml_tensor * data = NULL; + + // read file into data + { + FILE * fin = fopen(fname, "rb"); + + if (!fin) { + fprintf(stderr, "%s: failed to open %s\n", __func__, fname); + return result; + } + + size_t fsize = 0; + + fseek(fin, 0, SEEK_END); + fsize = ftell(fin); + fseek(fin, 0, SEEK_SET); + + // create the data context + { + const size_t overhead = 1*GGML_TENSOR_OVERHEAD; + + struct ggml_init_params params = { + .mem_size = fsize + overhead, + .mem_buffer = NULL, + .no_alloc = false, + }; + + *ctx_data = ggml_init(params); + + if (!*ctx_data) { + fprintf(stderr, "%s: failed to create ggml context\n", __func__); + return result; + } + } + + data = ggml_new_tensor_1d(*ctx_data, GGML_TYPE_I8, fsize); + + fread(data->data, sizeof(char), fsize, fin); + + fclose(fin); + } + + // populate result + { + const char * ptr = (const char *) data->data; + + const uint32_t magic = *(const uint32_t *) ptr; ptr += sizeof(magic); + + if (magic != GGML_FILE_MAGIC) { + fprintf(stderr, "%s: invalid magic number, got %08x\n", __func__, magic); + return result; + } + + const uint32_t version = *(const uint32_t *) ptr; ptr += sizeof(version); + + if (version != GGML_FILE_VERSION) { + fprintf(stderr, "%s: invalid version number\n", __func__); + return result; + } + + const uint32_t leafs = *(const uint32_t *) ptr; ptr += sizeof(leafs); + const uint32_t nodes = *(const uint32_t *) ptr; ptr += sizeof(nodes); + const uint64_t size_eval = *(const uint64_t *) ptr; ptr += sizeof(size_eval); + + result.n_leafs = leafs; + result.n_nodes = nodes; + + // create the data context + { + const size_t overhead = (leafs + nodes)*GGML_TENSOR_OVERHEAD; + + struct ggml_init_params params = { + .mem_size = size_eval + overhead, + .mem_buffer = NULL, + .no_alloc = true, + }; + + *ctx_eval = ggml_init(params); + + if (!*ctx_eval) { + fprintf(stderr, "%s: failed to create ggml context\n", __func__); + return result; + } + } + + // leafs + { + uint32_t type; + uint32_t op; + uint32_t n_dims; + + for (int i = 0; i < leafs; ++i) { + type = *(const uint32_t *) ptr; ptr += sizeof(type); + op = *(const uint32_t *) ptr; ptr += sizeof(op); + n_dims = *(const uint32_t *) ptr; ptr += sizeof(n_dims); + + int64_t ne[GGML_MAX_DIMS]; + size_t nb[GGML_MAX_DIMS]; + + for (int j = 0; j < GGML_MAX_DIMS; ++j) { + uint64_t ne_cur; + uint64_t nb_cur; + + ne_cur = *(const uint64_t *) ptr; ptr += sizeof(ne_cur); + nb_cur = *(const uint64_t *) ptr; ptr += sizeof(nb_cur); + + ne[j] = ne_cur; + nb[j] = nb_cur; + } + + struct ggml_tensor * tensor = ggml_new_tensor(*ctx_eval, (enum ggml_type) type, n_dims, ne); + + tensor->op = (enum ggml_op) op; + + uint64_t ptr_cur = *(const uint64_t *) ptr; ptr += sizeof(ptr_cur); + + memcpy(tensor->name, ptr, GGML_MAX_NAME); ptr += 
GGML_MAX_NAME; + + tensor->data = (void *) ptr; + + for (int j = 0; j < GGML_MAX_DIMS; ++j) { + tensor->nb[j] = nb[j]; + } + + result.leafs[i] = tensor; + + ptr += ggml_nbytes(tensor); + + fprintf(stderr, "%s: loaded leaf %d: '%16s', %3d dims, %9zu bytes\n", __func__, i, tensor->name, n_dims, ggml_nbytes(tensor)); + } + } + + ggml_set_no_alloc(*ctx_eval, false); + + // nodes + { + uint32_t type; + uint32_t op; + uint32_t n_dims; + + for (int i = 0; i < nodes; ++i) { + type = *(const uint32_t *) ptr; ptr += sizeof(type); + op = *(const uint32_t *) ptr; ptr += sizeof(op); + n_dims = *(const uint32_t *) ptr; ptr += sizeof(n_dims); + + int64_t ne[GGML_MAX_DIMS]; + size_t nb[GGML_MAX_DIMS]; + + for (int j = 0; j < GGML_MAX_DIMS; ++j) { + uint64_t ne_cur; + uint64_t nb_cur; + + ne_cur = *(const uint64_t *) ptr; ptr += sizeof(ne_cur); + nb_cur = *(const uint64_t *) ptr; ptr += sizeof(nb_cur); + + ne[j] = ne_cur; + nb[j] = nb_cur; + } + + struct ggml_tensor * tensor = ggml_new_tensor(*ctx_eval, (enum ggml_type) type, n_dims, ne); + + tensor->op = (enum ggml_op) op; + + uint64_t ptr_cur = *(const uint64_t *) ptr; ptr += sizeof(ptr_cur); + + memcpy(tensor->name, ptr, GGML_MAX_NAME); ptr += GGML_MAX_NAME; + + for (int j = 0; j < GGML_MAX_DIMS; ++j) { + tensor->nb[j] = nb[j]; + } + + result.nodes[i] = tensor; + + fprintf(stderr, "%s: loaded node %d: '%16s', %3d dims, %9zu bytes\n", __func__, i, tensor->name, n_dims, ggml_nbytes(tensor)); + } + } + } + + return result; +} diff --git a/examples/common-ggml.h b/examples/common-ggml.h index 1c6fcd34b..9260a7fd4 100644 --- a/examples/common-ggml.h +++ b/examples/common-ggml.h @@ -17,6 +17,5 @@ bool ggml_common_quantize_0( const std::vector & to_quant, const std::vector & to_skip); -struct ggml_cgraph; - -void ggml_graph_export(const struct ggml_cgraph * cgraph, const char * fname); +void ggml_graph_export(const struct ggml_cgraph * cgraph, const char * fname); +ggml_cgraph ggml_graph_import(const char * fname, struct ggml_context ** ctx_data, struct ggml_context ** ctx_eval); diff --git a/examples/mnist/main.cpp b/examples/mnist/main.cpp index 655ab559d..304ba13b6 100644 --- a/examples/mnist/main.cpp +++ b/examples/mnist/main.cpp @@ -194,8 +194,9 @@ int mnist_eval( // soft max ggml_tensor * probs = ggml_soft_max(ctx0, fc2); - // run the computation + // build / export / run the computation graph ggml_build_forward_expand(&gf, probs); + ggml_graph_export (&gf, "mnist.ggml"); // export before running to avoid work tensor ggml_graph_compute (ctx0, &gf); //ggml_graph_print (&gf); @@ -205,11 +206,20 @@ int mnist_eval( const int prediction = std::max_element(probs_data, probs_data + 10) - probs_data; - // export the computation graph - ggml_graph_export(&gf, "mnist.ggml"); - ggml_free(ctx0); + // TMP + // import the computation graph + { + struct ggml_context * ctx_data = NULL; + struct ggml_context * ctx_eval = NULL; + + struct ggml_cgraph gfi = ggml_graph_import("mnist.ggml", &ctx_data, &ctx_eval); + gfi.n_threads = n_threads; + + ggml_graph_compute(ctx0, &gfi); + } + return prediction; } From d2d1c220da782becf74a806e16bed3d1f129e604 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Thu, 25 May 2023 23:02:20 +0300 Subject: [PATCH 06/27] ggml : import op args correctly --- examples/common-ggml.cpp | 75 ++++++++++++++++++++++++++++++++++++++++ examples/mnist/main.cpp | 3 +- 2 files changed, 77 insertions(+), 1 deletion(-) diff --git a/examples/common-ggml.cpp b/examples/common-ggml.cpp index 3e355ae24..544c0b644 100644 --- a/examples/common-ggml.cpp +++ 
b/examples/common-ggml.cpp @@ -428,6 +428,55 @@ void ggml_graph_export(const struct ggml_cgraph * cgraph, const char * fname) { } fwrite(tensor->name, sizeof(char), GGML_MAX_NAME, fout); + + // output the op arguments + { + struct ggml_tensor * args[2 + GGML_MAX_OPT] = { NULL }; + + args[0] = tensor->src0; + args[1] = tensor->src1; + + for (int j = 0; j < GGML_MAX_OPT; ++j) { + args[2 + j] = tensor->opt[j]; + } + + for (int j = 0; j < 2 + GGML_MAX_OPT; ++j) { + if (args[j]) { + int32_t idx = -1; + + // check if leaf + { + for (int k = 0; k < cgraph->n_leafs; ++k) { + if (args[j] == cgraph->leafs[k]) { + idx = k; + break; + } + } + } + + // check if node + if (idx == -1) { + for (int k = 0; k < cgraph->n_nodes; ++k) { + if (args[j] == cgraph->nodes[k]) { + idx = GGML_MAX_NODES + k; + break; + } + } + } + + if (idx == -1) { + fprintf(stderr, "%s: failed to find tensor, arg = %d, node = %d\n", __func__, j, i); + return; + } + + fwrite(&idx, sizeof(int32_t), 1, fout); + } else { + const int32_t nul = -1; + + fwrite(&nul, sizeof(int32_t), 1, fout); + } + } + } } } @@ -612,6 +661,32 @@ ggml_cgraph ggml_graph_import(const char * fname, struct ggml_context ** ctx_dat tensor->nb[j] = nb[j]; } + // parse args + { + struct ggml_tensor ** args[2 + GGML_MAX_OPT] = { + &tensor->src0, + &tensor->src1, + }; + + for (int j = 0; j < GGML_MAX_OPT; ++j) { + args[2 + j] = &tensor->opt[j]; + } + + for (int j = 0; j < 2 + GGML_MAX_OPT; ++j) { + const uint32_t arg_idx = *(const int32_t *) ptr; ptr += sizeof(arg_idx); + + if (arg_idx == -1) { + continue; + } + + if (arg_idx < GGML_MAX_NODES) { + *args[j] = result.leafs[arg_idx]; + } else { + *args[j] = result.nodes[arg_idx - GGML_MAX_NODES]; + } + } + } + result.nodes[i] = tensor; fprintf(stderr, "%s: loaded node %d: '%16s', %3d dims, %9zu bytes\n", __func__, i, tensor->name, n_dims, ggml_nbytes(tensor)); diff --git a/examples/mnist/main.cpp b/examples/mnist/main.cpp index 304ba13b6..28f187b53 100644 --- a/examples/mnist/main.cpp +++ b/examples/mnist/main.cpp @@ -196,7 +196,6 @@ int mnist_eval( // build / export / run the computation graph ggml_build_forward_expand(&gf, probs); - ggml_graph_export (&gf, "mnist.ggml"); // export before running to avoid work tensor ggml_graph_compute (ctx0, &gf); //ggml_graph_print (&gf); @@ -208,6 +207,8 @@ int mnist_eval( ggml_free(ctx0); + ggml_graph_export(&gf, "mnist.ggml"); + // TMP // import the computation graph { From 4cfd92b82e435a068b95b2fd1261e66fc6eeda31 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sat, 27 May 2023 11:23:52 +0300 Subject: [PATCH 07/27] ggml : add ggml_get_tensor_by_name() --- examples/mnist/main.cpp | 25 ++++++++++++++----------- 1 file changed, 14 insertions(+), 11 deletions(-) diff --git a/examples/mnist/main.cpp b/examples/mnist/main.cpp index 28f187b53..eab4a711c 100644 --- a/examples/mnist/main.cpp +++ b/examples/mnist/main.cpp @@ -193,6 +193,7 @@ int mnist_eval( // soft max ggml_tensor * probs = ggml_soft_max(ctx0, fc2); + ggml_set_name(probs, "probs"); // build / export / run the computation graph ggml_build_forward_expand(&gf, probs); @@ -201,25 +202,27 @@ int mnist_eval( //ggml_graph_print (&gf); ggml_graph_dump_dot(&gf, NULL, "mnist.dot"); + ggml_graph_export(&gf, "mnist.ggml"); + +#if 0 const float * probs_data = ggml_get_data_f32(probs); const int prediction = std::max_element(probs_data, probs_data + 10) - probs_data; +#else + struct ggml_context * ctx_data = NULL; + struct ggml_context * ctx_eval = NULL; - ggml_free(ctx0); + struct ggml_cgraph gfi = ggml_graph_import("mnist.ggml", 
&ctx_data, &ctx_eval); + gfi.n_threads = n_threads; - ggml_graph_export(&gf, "mnist.ggml"); + ggml_graph_compute(ctx0, &gfi); - // TMP - // import the computation graph - { - struct ggml_context * ctx_data = NULL; - struct ggml_context * ctx_eval = NULL; + const float * probs_data = ggml_get_data_f32(ggml_get_tensor_by_name(&gfi, "probs")); - struct ggml_cgraph gfi = ggml_graph_import("mnist.ggml", &ctx_data, &ctx_eval); - gfi.n_threads = n_threads; + const int prediction = std::max_element(probs_data, probs_data + 10) - probs_data; +#endif - ggml_graph_compute(ctx0, &gfi); - } + ggml_free(ctx0); return prediction; } From b0450c273bd077d555def520cd26158427023fe6 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sat, 27 May 2023 16:00:20 +0300 Subject: [PATCH 08/27] mnist : add compute graph evaluation on CPU example --- examples/mnist/CMakeLists.txt | 6 ++ examples/mnist/main-cpu.cpp | 118 ++++++++++++++++++++++++++++++++++ examples/mnist/main.cpp | 40 ++++++------ 3 files changed, 142 insertions(+), 22 deletions(-) create mode 100644 examples/mnist/main-cpu.cpp diff --git a/examples/mnist/CMakeLists.txt b/examples/mnist/CMakeLists.txt index 1a676d9df..c3750954d 100644 --- a/examples/mnist/CMakeLists.txt +++ b/examples/mnist/CMakeLists.txt @@ -5,3 +5,9 @@ set(TEST_TARGET mnist) add_executable(${TEST_TARGET} main.cpp) target_link_libraries(${TEST_TARGET} PRIVATE ggml common common-ggml) +# +# mnist-cpu + +set(TEST_TARGET mnist-cpu) +add_executable(${TEST_TARGET} main-cpu.cpp) +target_link_libraries(${TEST_TARGET} PRIVATE ggml common common-ggml) diff --git a/examples/mnist/main-cpu.cpp b/examples/mnist/main-cpu.cpp new file mode 100644 index 000000000..86ca7259f --- /dev/null +++ b/examples/mnist/main-cpu.cpp @@ -0,0 +1,118 @@ +// Use a pre-generated MNIST compute graph for inference on the CPU +// +// You can generate a compute graph using the "mnist" tool: +// +// $ ./bin/mnist ./models/mnist/ggml-model-f32.bin ../examples/mnist/models/mnist/t10k-images.idx3-ubyte +// +// This command creates the "mnist.ggml" file, which contains the generated compute graph. 
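+// (The exported file also stores the data of the leaf tensors, so this step does not need the original model file.)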
+// Now, you can re-use the compute graph with the "mnist-cpu" tool: +// +// $ ./bin/mnist-cpu ./models/mnist/mnist.ggml ../examples/mnist/models/mnist/t10k-images.idx3-ubyte +// + +#include "ggml/ggml.h" + +#include "common.h" +#include "common-ggml.h" + +#include <cmath> +#include <cstdio> +#include <cstring> +#include <ctime> +#include <fstream> +#include <vector> + +// evaluate the MNIST compute graph +// +// - fname_cgraph: path to the compute graph +// - n_threads: number of threads to use +// - digit: 784 pixel values +// +// returns 0 - 9 prediction +int mnist_eval( + const char * fname_cgraph, + const int n_threads, + std::vector<float> digit + ) { + // load the compute graph + struct ggml_context * ctx_data = NULL; + struct ggml_context * ctx_eval = NULL; + + struct ggml_cgraph gfi = ggml_graph_import(fname_cgraph, &ctx_data, &ctx_eval); + gfi.n_threads = n_threads; + + // allocate eval context + // needed during ggml_graph_compute() to allocate a work tensor + static size_t buf_size = gfi.work_size; // TODO + static void * buf = malloc(buf_size); + + struct ggml_init_params params = { + .mem_size = buf_size, + .mem_buffer = buf, + }; + + struct ggml_context * ctx0 = ggml_init(params); + + struct ggml_tensor * input = ggml_get_tensor_by_name(&gfi, "input"); + memcpy(input->data, digit.data(), ggml_nbytes(input)); + + ggml_graph_compute(ctx0, &gfi); + + const float * probs_data = ggml_get_data_f32(ggml_get_tensor_by_name(&gfi, "probs")); + + const int prediction = std::max_element(probs_data, probs_data + 10) - probs_data; + + ggml_free(ctx0); + ggml_free(ctx_data); + ggml_free(ctx_eval); + + return prediction; +} + +int main(int argc, char ** argv) { + srand(time(NULL)); + ggml_time_init(); + + if (argc != 3) { + fprintf(stderr, "Usage: %s models/mnist/mnist.ggml models/mnist/t10k-images.idx3-ubyte\n", argv[0]); + exit(0); + } + + uint8_t buf[784]; + std::vector<float> digit; + + // read a random digit from the test set + { + std::ifstream fin(argv[2], std::ios::binary); + if (!fin) { + fprintf(stderr, "%s: failed to open '%s'\n", __func__, argv[2]); + return 1; + } + + // seek to a random digit: 16-byte header + 28*28 * (random 0 - 10000) + fin.seekg(16 + 784 * (rand() % 10000)); + fin.read((char *) &buf, sizeof(buf)); + } + + // render the digit in ASCII + { + digit.resize(sizeof(buf)); + + for (int row = 0; row < 28; row++) { + for (int col = 0; col < 28; col++) { + fprintf(stderr, "%c ", (float)buf[row*28 + col] > 230 ? 
'*' : '_'); + digit[row*28 + col] = ((float)buf[row*28 + col]); + } + + fprintf(stderr, "\n"); + } + + fprintf(stderr, "\n"); + } + + const int prediction = mnist_eval(argv[1], 1, digit); + + fprintf(stdout, "%s: predicted digit is %d\n", __func__, prediction); + + return 0; +} diff --git a/examples/mnist/main.cpp b/examples/mnist/main.cpp index eab4a711c..ced167c00 100644 --- a/examples/mnist/main.cpp +++ b/examples/mnist/main.cpp @@ -166,7 +166,8 @@ bool mnist_model_load(const std::string & fname, mnist_model & model) { int mnist_eval( const mnist_model & model, const int n_threads, - std::vector digit + std::vector digit, + const char * fname_cgraph ) { const auto & hparams = model.hparams; @@ -202,25 +203,17 @@ int mnist_eval( //ggml_graph_print (&gf); ggml_graph_dump_dot(&gf, NULL, "mnist.dot"); - ggml_graph_export(&gf, "mnist.ggml"); + if (fname_cgraph) { + // export the compute graph for later use + // see the "mnist-cpu" example + ggml_graph_export(&gf, "mnist.ggml"); -#if 0 - const float * probs_data = ggml_get_data_f32(probs); - - const int prediction = std::max_element(probs_data, probs_data + 10) - probs_data; -#else - struct ggml_context * ctx_data = NULL; - struct ggml_context * ctx_eval = NULL; - - struct ggml_cgraph gfi = ggml_graph_import("mnist.ggml", &ctx_data, &ctx_eval); - gfi.n_threads = n_threads; - - ggml_graph_compute(ctx0, &gfi); + fprintf(stderr, "%s: exported compute graph to '%s'\n", __func__, fname_cgraph); + } - const float * probs_data = ggml_get_data_f32(ggml_get_tensor_by_name(&gfi, "probs")); + const float * probs_data = ggml_get_data_f32(probs); const int prediction = std::max_element(probs_data, probs_data + 10) - probs_data; -#endif ggml_free(ctx0); @@ -231,30 +224,31 @@ int mnist_eval( extern "C" { #endif -int wasm_eval(uint8_t *digitPtr) -{ +int wasm_eval(uint8_t * digitPtr) { mnist_model model; if (!mnist_model_load("models/mnist/ggml-model-f32.bin", model)) { fprintf(stderr, "error loading model\n"); return -1; } std::vector digit(digitPtr, digitPtr + 784); - int result = mnist_eval(model, 1, digit); + int result = mnist_eval(model, 1, digit, nullptr); ggml_free(model.ctx); + return result; } -int wasm_random_digit(char *digitPtr) -{ +int wasm_random_digit(char * digitPtr) { auto fin = std::ifstream("models/mnist/t10k-images.idx3-ubyte", std::ios::binary); if (!fin) { fprintf(stderr, "failed to open digits file\n"); return 0; } srand(time(NULL)); + // Seek to a random digit: 16-byte header + 28*28 * (random 0 - 10000) fin.seekg(16 + 784 * (rand() % 10000)); fin.read(digitPtr, 784); + return 1; } @@ -318,7 +312,9 @@ int main(int argc, char ** argv) { fprintf(stderr, "\n"); } - fprintf(stdout, "%s: predicted digit is %d\n", __func__, mnist_eval(model, 1, digit)); + const int prediction = mnist_eval(model, 1, digit, "mnist.ggml"); + + fprintf(stdout, "%s: predicted digit is %d\n", __func__, prediction); ggml_free(model.ctx); From ddea4885e5e80dec7880b875fbc1cac41ef343be Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sat, 27 May 2023 16:15:10 +0300 Subject: [PATCH 09/27] ggml : add ggml_tensor_overhead() --- examples/common-ggml.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/common-ggml.cpp b/examples/common-ggml.cpp index 544c0b644..0db2c034b 100644 --- a/examples/common-ggml.cpp +++ b/examples/common-ggml.cpp @@ -509,7 +509,7 @@ ggml_cgraph ggml_graph_import(const char * fname, struct ggml_context ** ctx_dat // create the data context { - const size_t overhead = 1*GGML_TENSOR_OVERHEAD; + const size_t overhead 
= 1*ggml_tensor_overhead(); struct ggml_init_params params = { .mem_size = fsize + overhead, @@ -559,7 +559,7 @@ ggml_cgraph ggml_graph_import(const char * fname, struct ggml_context ** ctx_dat // create the data context { - const size_t overhead = (leafs + nodes)*GGML_TENSOR_OVERHEAD; + const size_t overhead = (leafs + nodes)*ggml_tensor_overhead(); struct ggml_init_params params = { .mem_size = size_eval + overhead, From f698dbf059c3da3397dc5f8a6654f21dd26153fd Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sat, 27 May 2023 17:10:18 +0300 Subject: [PATCH 10/27] ggml : rename new functions to ggml_cgraph_... --- examples/common-ggml.cpp | 18 +++++++++--------- examples/common-ggml.h | 5 +++-- examples/mnist/main-cpu.cpp | 2 +- examples/mnist/main.cpp | 2 +- 4 files changed, 14 insertions(+), 13 deletions(-) diff --git a/examples/common-ggml.cpp b/examples/common-ggml.cpp index 0db2c034b..ac16994c0 100644 --- a/examples/common-ggml.cpp +++ b/examples/common-ggml.cpp @@ -242,7 +242,7 @@ bool ggml_common_quantize_0( } \ } while (0) -void ggml_graph_export_leaf(const struct ggml_tensor * tensor, FILE * fout) { +void ggml_cgraph_export_leaf(const struct ggml_tensor * tensor, FILE * fout) { const int64_t * ne = tensor->ne; const size_t * nb = tensor->nb; @@ -256,7 +256,7 @@ void ggml_graph_export_leaf(const struct ggml_tensor * tensor, FILE * fout) { tensor->name); } -void ggml_graph_export_node(const struct ggml_tensor * tensor, const char * arg, FILE * fout) { +void ggml_cgraph_export_node(const struct ggml_tensor * tensor, const char * arg, FILE * fout) { const int64_t * ne = tensor->ne; const size_t * nb = tensor->nb; @@ -272,7 +272,7 @@ void ggml_graph_export_node(const struct ggml_tensor * tensor, const char * arg, tensor->name); } -void ggml_graph_export(const struct ggml_cgraph * cgraph, const char * fname) { +void ggml_cgraph_export(const struct ggml_cgraph * cgraph, const char * fname) { assert(cgraph->work == NULL); assert(cgraph->work_size == 0); @@ -301,7 +301,7 @@ void ggml_graph_export(const struct ggml_cgraph * cgraph, const char * fname) { "TYPE", "OP", "NDIMS", "NE0", "NE1", "NE2", "NE3", "NB0", "NB1", "NB2", "NB3", "DATA", "NAME"); for (int i = 0; i < cgraph->n_leafs; ++i) { - ggml_graph_export_leaf(cgraph->leafs[i], fout); + ggml_cgraph_export_leaf(cgraph->leafs[i], fout); GGML_ASSERT(cgraph->leafs[i]->op == GGML_OP_NONE); GGML_ASSERT(cgraph->leafs[i]->src0 == NULL); @@ -314,19 +314,19 @@ void ggml_graph_export(const struct ggml_cgraph * cgraph, const char * fname) { "ARG", "TYPE", "OP", "NDIMS", "NE0", "NE1", "NE2", "NE3", "NB0", "NB1", "NB2", "NB3", "NTASKS", "DATA", "NAME"); for (int i = 0; i < cgraph->n_nodes; ++i) { - ggml_graph_export_node(cgraph->nodes[i], "DST", fout); + ggml_cgraph_export_node(cgraph->nodes[i], "DST", fout); if (cgraph->nodes[i]->src0) { - ggml_graph_export_node(cgraph->nodes[i]->src0, "SRC0", fout); + ggml_cgraph_export_node(cgraph->nodes[i]->src0, "SRC0", fout); } if (cgraph->nodes[i]->src1) { - ggml_graph_export_node(cgraph->nodes[i]->src1, "SRC1", fout); + ggml_cgraph_export_node(cgraph->nodes[i]->src1, "SRC1", fout); } for (int j = 0; j < GGML_MAX_OPT; ++j) { if (cgraph->nodes[i]->opt[j]) { - ggml_graph_export_node(cgraph->nodes[i]->opt[j], "OPT", fout); + ggml_cgraph_export_node(cgraph->nodes[i]->opt[j], "OPT", fout); } } @@ -484,7 +484,7 @@ void ggml_graph_export(const struct ggml_cgraph * cgraph, const char * fname) { } } -ggml_cgraph ggml_graph_import(const char * fname, struct ggml_context ** ctx_data, struct ggml_context ** 
ctx_eval) { +ggml_cgraph ggml_cgraph_import(const char * fname, struct ggml_context ** ctx_data, struct ggml_context ** ctx_eval) { assert(*ctx_data == NULL); assert(*ctx_eval == NULL); diff --git a/examples/common-ggml.h b/examples/common-ggml.h index 9260a7fd4..1a85abed3 100644 --- a/examples/common-ggml.h +++ b/examples/common-ggml.h @@ -17,5 +17,6 @@ bool ggml_common_quantize_0( const std::vector & to_quant, const std::vector & to_skip); -void ggml_graph_export(const struct ggml_cgraph * cgraph, const char * fname); -ggml_cgraph ggml_graph_import(const char * fname, struct ggml_context ** ctx_data, struct ggml_context ** ctx_eval); +// these will move to ggml when ready +void ggml_cgraph_export(const struct ggml_cgraph * cgraph, const char * fname); +ggml_cgraph ggml_cgraph_import(const char * fname, struct ggml_context ** ctx_data, struct ggml_context ** ctx_eval); diff --git a/examples/mnist/main-cpu.cpp b/examples/mnist/main-cpu.cpp index 86ca7259f..1a07b7453 100644 --- a/examples/mnist/main-cpu.cpp +++ b/examples/mnist/main-cpu.cpp @@ -38,7 +38,7 @@ int mnist_eval( struct ggml_context * ctx_data = NULL; struct ggml_context * ctx_eval = NULL; - struct ggml_cgraph gfi = ggml_graph_import(fname_cgraph, &ctx_data, &ctx_eval); + struct ggml_cgraph gfi = ggml_cgraph_import(fname_cgraph, &ctx_data, &ctx_eval); gfi.n_threads = n_threads; // allocate eval context diff --git a/examples/mnist/main.cpp b/examples/mnist/main.cpp index ced167c00..c6ecec2cd 100644 --- a/examples/mnist/main.cpp +++ b/examples/mnist/main.cpp @@ -206,7 +206,7 @@ int mnist_eval( if (fname_cgraph) { // export the compute graph for later use // see the "mnist-cpu" example - ggml_graph_export(&gf, "mnist.ggml"); + ggml_cgraph_export(&gf, "mnist.ggml"); fprintf(stderr, "%s: exported compute graph to '%s'\n", __func__, fname_cgraph); } From bf93623ef83121241c0458e9e701307445f29c7b Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sat, 27 May 2023 18:19:42 +0300 Subject: [PATCH 11/27] mnist : add Metal inference skeleton (WIP) --- examples/mnist/CMakeLists.txt | 22 +++++++ examples/mnist/main-mtl.cpp | 110 ++++++++++++++++++++++++++++++++++ examples/mnist/main-mtl.h | 18 ++++++ examples/mnist/main-mtl.m | 14 +++++ 4 files changed, 164 insertions(+) create mode 100644 examples/mnist/main-mtl.cpp create mode 100644 examples/mnist/main-mtl.h create mode 100644 examples/mnist/main-mtl.m diff --git a/examples/mnist/CMakeLists.txt b/examples/mnist/CMakeLists.txt index c3750954d..222d7cd2b 100644 --- a/examples/mnist/CMakeLists.txt +++ b/examples/mnist/CMakeLists.txt @@ -11,3 +11,25 @@ target_link_libraries(${TEST_TARGET} PRIVATE ggml common common-ggml) set(TEST_TARGET mnist-cpu) add_executable(${TEST_TARGET} main-cpu.cpp) target_link_libraries(${TEST_TARGET} PRIVATE ggml common common-ggml) + +if (APPLE) + # + # mnist-mtl + + find_library(FOUNDATION_LIBRARY Foundation REQUIRED) + find_library(METAL_FRAMEWORK Metal REQUIRED) + find_library(METALKIT_FRAMEWORK MetalKit REQUIRED) + find_library(METALPERFORMANCE_FRAMEWORK MetalPerformanceShaders REQUIRED) + + set(TEST_TARGET mnist-mtl) + add_executable(${TEST_TARGET} main-mtl.cpp main-mtl.h main-mtl.m) + target_link_libraries(${TEST_TARGET} PRIVATE + ggml + common + common-ggml + ${FOUNDATION_LIBRARY} + ${METAL_FRAMEWORK} + ${METALKIT_FRAMEWORK} + ${METALPERFORMANCE_FRAMEWORK} + ) +endif() diff --git a/examples/mnist/main-mtl.cpp b/examples/mnist/main-mtl.cpp new file mode 100644 index 000000000..9480dbc62 --- /dev/null +++ b/examples/mnist/main-mtl.cpp @@ -0,0 +1,110 @@ +// Use 
a pre-generated MNIST compute graph for inference on the M1 GPU via MPS +// + +#include "ggml/ggml.h" + +#include "main-mtl.h" + +#include "common-ggml.h" + +#include +#include +#include +#include +#include + +// evaluate the MNIST compute graph +// +// - fname_cgraph: path to the compute graph +// - n_threads: number of threads to use +// - digit: 784 pixel values +// +// returns 0 - 9 prediction +int mnist_eval( + const char * fname_cgraph, + const int n_threads, + std::vector digit + ) { + // load the compute graph + struct ggml_context * ctx_data = NULL; + struct ggml_context * ctx_eval = NULL; + + struct ggml_cgraph gf = ggml_cgraph_import(fname_cgraph, &ctx_data, &ctx_eval); + gf.n_threads = n_threads; + + // allocate eval context + // needed during ggml_graph_compute() to allocate a work tensor + static size_t buf_size = gf.work_size; // TODO + static void * buf = malloc(buf_size); + + struct ggml_init_params params = { + .mem_size = buf_size, + .mem_buffer = buf, + }; + + struct ggml_context * ctx_work = ggml_init(params); + + struct ggml_tensor * input = ggml_get_tensor_by_name(&gf, "input"); + memcpy(input->data, digit.data(), ggml_nbytes(input)); + + //ggml_graph_compute(ctx_work, &gf); + mnist_mtl_eval(ctx_data, ctx_eval, ctx_work, &gf); + + const float * probs_data = ggml_get_data_f32(ggml_get_tensor_by_name(&gf, "probs")); + + const int prediction = std::max_element(probs_data, probs_data + 10) - probs_data; + + ggml_free(ctx_work); + ggml_free(ctx_data); + ggml_free(ctx_eval); + + return prediction; +} + +int main(int argc, char ** argv) { + srand(time(NULL)); + ggml_time_init(); + + if (argc != 3) { + fprintf(stderr, "Usage: %s models/mnist/mnist.ggml models/mnist/t10k-images.idx3-ubyte\n", argv[0]); + exit(0); + } + + uint8_t buf[784]; + std::vector digit; + + // read a random digit from the test set + { + std::ifstream fin(argv[2], std::ios::binary); + if (!fin) { + fprintf(stderr, "%s: failed to open '%s'\n", __func__, argv[2]); + return 1; + } + + // seek to a random digit: 16-byte header + 28*28 * (random 0 - 10000) + fin.seekg(16 + 784 * (rand() % 10000)); + fin.read((char *) &buf, sizeof(buf)); + } + + // render the digit in ASCII + { + digit.resize(sizeof(buf)); + + for (int row = 0; row < 28; row++) { + for (int col = 0; col < 28; col++) { + fprintf(stderr, "%c ", (float)buf[row*28 + col] > 230 ? 
'*' : '_'); + digit[row*28 + col] = ((float)buf[row*28 + col]); + } + + fprintf(stderr, "\n"); + } + + fprintf(stderr, "\n"); + } + + const int prediction = mnist_eval(argv[1], 1, digit); + + fprintf(stdout, "%s: predicted digit is %d\n", __func__, prediction); + + return 0; +}
diff --git a/examples/mnist/main-mtl.h b/examples/mnist/main-mtl.h new file mode 100644 index 000000000..8dee89701 --- /dev/null +++ b/examples/mnist/main-mtl.h @@ -0,0 +1,18 @@ +#pragma once + +struct ggml_context; +struct ggml_cgraph; + +#ifdef __cplusplus +extern "C" { +#endif + +int mnist_mtl_eval( + struct ggml_context * ctx_data, + struct ggml_context * ctx_eval, + struct ggml_context * ctx_work, + struct ggml_cgraph * gf); + +#ifdef __cplusplus +} +#endif
diff --git a/examples/mnist/main-mtl.m b/examples/mnist/main-mtl.m new file mode 100644 index 000000000..8e82089dc --- /dev/null +++ b/examples/mnist/main-mtl.m @@ -0,0 +1,14 @@ +#import "main-mtl.h" + +#import <Foundation/Foundation.h> +#import <Metal/Metal.h> +#import <MetalPerformanceShaders/MetalPerformanceShaders.h> + +int mnist_mtl_eval( + struct ggml_context * ctx_data, + struct ggml_context * ctx_eval, + struct ggml_context * ctx_work, + struct ggml_cgraph * gf) { + printf("mnist_mtl_eval\n"); + return 0; +}
From bb126f9f7d6d04fa23b8ad0cbf588e38c27bf085 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sun, 28 May 2023 14:07:19 +0300 Subject: [PATCH 12/27] mnist : working on the Metal pipeline (WIP) --- examples/mnist/main-mtl.cpp | 8 ++-- examples/mnist/main-mtl.h | 8 +++- examples/mnist/main-mtl.m | 88 ++++++++++++++++++++++++++++++++++++- include/ggml/ggml.h | 5 ++- src/ggml.c | 8 ++++ 5 files changed, 108 insertions(+), 9 deletions(-)
diff --git a/examples/mnist/main-mtl.cpp b/examples/mnist/main-mtl.cpp index 9480dbc62..c5c1ed821 100644 --- a/examples/mnist/main-mtl.cpp +++ b/examples/mnist/main-mtl.cpp @@ -48,11 +48,9 @@ int mnist_eval( memcpy(input->data, digit.data(), ggml_nbytes(input)); //ggml_graph_compute(ctx_work, &gf); - mnist_mtl_eval(ctx_data, ctx_eval, ctx_work, &gf); - - const float * probs_data = ggml_get_data_f32(ggml_get_tensor_by_name(&gf, "probs")); - - const int prediction = std::max_element(probs_data, probs_data + 10) - probs_data; + auto ctx_mtl = mnist_mtl_init(ctx_data, ctx_eval, ctx_work, &gf); + const int prediction = mnist_mtl_eval(ctx_mtl, &gf); + mnist_mtl_free(ctx_mtl); ggml_free(ctx_work); ggml_free(ctx_data);
diff --git a/examples/mnist/main-mtl.h b/examples/mnist/main-mtl.h index 8dee89701..0fa6f7ba5 100644 --- a/examples/mnist/main-mtl.h +++ b/examples/mnist/main-mtl.h @@ -7,12 +7,18 @@ struct ggml_cgraph; extern "C" { #endif -int mnist_mtl_eval( +struct ggml_mtl_context; + +struct ggml_mtl_context * mnist_mtl_init( struct ggml_context * ctx_data, struct ggml_context * ctx_eval, struct ggml_context * ctx_work, struct ggml_cgraph * gf); +void mnist_mtl_free(struct ggml_mtl_context * ctx); + +int mnist_mtl_eval(struct ggml_mtl_context * ctx, struct ggml_cgraph * gf); + #ifdef __cplusplus } #endif
diff --git a/examples/mnist/main-mtl.m b/examples/mnist/main-mtl.m index 8e82089dc..f48bbb177 100644 --- a/examples/mnist/main-mtl.m +++ b/examples/mnist/main-mtl.m @@ -1,14 +1,98 @@ #import "main-mtl.h" +#import "ggml/ggml.h" + #import <Foundation/Foundation.h> #import <Metal/Metal.h> #import <MetalPerformanceShaders/MetalPerformanceShaders.h> +struct ggml_mtl_context { + id<MTLDevice> device; + id<MTLCommandQueue> queue; + id<MTLCommandBuffer> command_buffer; + + id<MTLHeap> heap_data; + id<MTLHeap> heap_eval; +}; + +struct ggml_mtl_context * mnist_mtl_init( struct ggml_context * ctx_data, struct ggml_context * ctx_eval, struct ggml_context * ctx_work, struct ggml_cgraph * gf) { fprintf(stderr, "%s: 
allocating\n", __func__); + + struct ggml_mtl_context * ctx = malloc(sizeof(struct ggml_mtl_context)); + + ctx->device = MTLCreateSystemDefaultDevice(); + ctx->queue = [ctx->device newCommandQueue]; + ctx->command_buffer = [ctx->queue commandBuffer]; + + // pin ctx_data memory to GPU + // use MTLStorageModeShared to allow us to initialize the weights from the CPU + // TODO: how to use MTLStorageModeManaged? + // TODO: see if we can avoid this copy somehow + { + const void * mem_buffer = ggml_get_mem_buffer(ctx_data); + const size_t mem_size = ggml_get_mem_size(ctx_data); + + MTLHeapDescriptor * heap_desc = [MTLHeapDescriptor new]; + heap_desc.storageMode = MTLStorageModeShared; + heap_desc.size = mem_size; + + ctx->heap_data = [ctx->device newHeapWithDescriptor:heap_desc]; + [ctx->heap_data setPurgeableState:MTLPurgeableStateNonVolatile]; + + id buffer = [ctx->heap_data newBufferWithLength:mem_size options:MTLResourceStorageModeShared]; + + // copy data from CPU to GPU + memcpy([buffer contents], mem_buffer, mem_size); + } + + // pin ctx_eval memory to GPU + // this heap will be used for the intermediate results of the evaluation + { + const size_t mem_size = ggml_get_mem_size(ctx_eval); + + MTLHeapDescriptor * heap_desc = [MTLHeapDescriptor new]; + heap_desc.storageMode = MTLStorageModePrivate; // GPU only + heap_desc.size = mem_size; + + ctx->heap_eval = [ctx->device newHeapWithDescriptor:heap_desc]; + [ctx->heap_eval setPurgeableState:MTLPurgeableStateNonVolatile]; + } + + return ctx; +} + +void mnist_mtl_free(struct ggml_mtl_context * ctx) { + fprintf(stderr, "%s: deallocating\n", __func__); + + free(ctx); +} + +int mnist_mtl_eval(struct ggml_mtl_context * ctx, struct ggml_cgraph * gf) { + fprintf(stderr, "%s: evaluating\n", __func__); + + // create a new encoder for the command buffer + id encoder = [ctx->command_buffer computeCommandEncoder]; + + for (int i = 0; i < gf->n_nodes; ++i) { + fprintf(stderr, "%s: encoding node %3d, op = %8s\n", __func__, i, ggml_op_name(gf->nodes[i]->op)); + + // TODO ... 
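+        // (planned: look up the node's src/dst buffers on the heaps and encode a matching compute kernel dispatch)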
+ } + + // finish encoding + [encoder endEncoding]; + + [ctx->command_buffer commit]; + [ctx->command_buffer waitUntilCompleted]; + + { + const double time_elapsed = [ctx->command_buffer GPUEndTime] - [ctx->command_buffer GPUStartTime]; + fprintf(stderr, "%s: time elapsed = %f\n", __func__, time_elapsed); + } + return 0; } diff --git a/include/ggml/ggml.h b/include/ggml/ggml.h index 558138280..ad08adeb1 100644 --- a/include/ggml/ggml.h +++ b/include/ggml/ggml.h @@ -451,9 +451,12 @@ extern "C" { GGML_API size_t ggml_used_mem(const struct ggml_context * ctx); - GGML_API size_t ggml_set_scratch(struct ggml_context * ctx, struct ggml_scratch scratch); + GGML_API size_t ggml_set_scratch (struct ggml_context * ctx, struct ggml_scratch scratch); GGML_API void ggml_set_no_alloc(struct ggml_context * ctx, bool no_alloc); + GGML_API void * ggml_get_mem_buffer(struct ggml_context * ctx); + GGML_API size_t ggml_get_mem_size (struct ggml_context * ctx); + GGML_API struct ggml_tensor * ggml_new_tensor( struct ggml_context * ctx, enum ggml_type type, diff --git a/src/ggml.c b/src/ggml.c index 07ea92a58..f3831a4fb 100644 --- a/src/ggml.c +++ b/src/ggml.c @@ -4030,6 +4030,14 @@ void ggml_set_no_alloc(struct ggml_context * ctx, bool no_alloc) { ctx->no_alloc = no_alloc; } +void * ggml_get_mem_buffer(struct ggml_context * ctx) { + return ctx->mem_buffer; +} + +size_t ggml_get_mem_size(struct ggml_context * ctx) { + return ctx->mem_size; +} + // IMPORTANT: // when creating "opt" tensors, always save and load the scratch buffer // this is an error prone process, but it is necessary to support inplace From 24ea9dddf5907aabd684f2e6fb87e123cc88d5f7 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sun, 28 May 2023 14:15:43 +0300 Subject: [PATCH 13/27] mnist : prepare the Metal encoder (WIP) --- examples/mnist/main-mtl.m | 32 +++++++++++++++++++++++--------- 1 file changed, 23 insertions(+), 9 deletions(-) diff --git a/examples/mnist/main-mtl.m b/examples/mnist/main-mtl.m index f48bbb177..add85a77d 100644 --- a/examples/mnist/main-mtl.m +++ b/examples/mnist/main-mtl.m @@ -9,7 +9,6 @@ struct ggml_mtl_context { id device; id queue; - id command_buffer; id heap_data; id heap_eval; @@ -26,7 +25,6 @@ ctx->device = MTLCreateSystemDefaultDevice(); ctx->queue = [ctx->device newCommandQueue]; - ctx->command_buffer = [ctx->queue commandBuffer]; // pin ctx_data memory to GPU // use MTLStorageModeShared to allow us to initialize the weights from the CPU @@ -74,23 +72,39 @@ void mnist_mtl_free(struct ggml_mtl_context * ctx) { int mnist_mtl_eval(struct ggml_mtl_context * ctx, struct ggml_cgraph * gf) { fprintf(stderr, "%s: evaluating\n", __func__); - // create a new encoder for the command buffer - id encoder = [ctx->command_buffer computeCommandEncoder]; + id command_buffer = [ctx->queue commandBuffer]; + id encoder = [command_buffer computeCommandEncoder]; for (int i = 0; i < gf->n_nodes; ++i) { fprintf(stderr, "%s: encoding node %3d, op = %8s\n", __func__, i, ggml_op_name(gf->nodes[i]->op)); - // TODO ... 
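+        // pick an encoding path per op type; anything unsupported hits the assert in the default case below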
+ switch (gf->nodes[i]->op) { + case GGML_OP_ADD: + { + } break; + case GGML_OP_RELU: + { + } break; + case GGML_OP_SOFT_MAX: + { + } break; + case GGML_OP_MUL_MAT: + { + } break; + default: + fprintf(stderr, "%s: node %3d, op = %8s not implemented\n", __func__, i, ggml_op_name(gf->nodes[i]->op)); + GGML_ASSERT(false); + return -1; + } } - // finish encoding [encoder endEncoding]; - [ctx->command_buffer commit]; - [ctx->command_buffer waitUntilCompleted]; + [command_buffer commit]; + [command_buffer waitUntilCompleted]; { - const double time_elapsed = [ctx->command_buffer GPUEndTime] - [ctx->command_buffer GPUStartTime]; + const double time_elapsed = [command_buffer GPUEndTime] - [command_buffer GPUStartTime]; fprintf(stderr, "%s: time elapsed = %f\n", __func__, time_elapsed); }
From 2ec1dff65449e021cfffef58c0a7649db9fdcd9a Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sun, 28 May 2023 15:28:01 +0300 Subject: [PATCH 14/27] mnist : first Metal kernel for F32 ADD --- examples/mnist/main-mtl.h | 12 +++--- examples/mnist/main-mtl.m | 79 +++++++++++++++++++++++++++++++++++++-- 2 files changed, 82 insertions(+), 9 deletions(-)
diff --git a/examples/mnist/main-mtl.h b/examples/mnist/main-mtl.h index 0fa6f7ba5..4e661a4d3 100644 --- a/examples/mnist/main-mtl.h +++ b/examples/mnist/main-mtl.h @@ -10,14 +10,16 @@ extern "C" { struct ggml_mtl_context; struct ggml_mtl_context * mnist_mtl_init( - struct ggml_context * ctx_data, - struct ggml_context * ctx_eval, - struct ggml_context * ctx_work, - struct ggml_cgraph * gf); + struct ggml_context * ctx_data, + struct ggml_context * ctx_eval, + struct ggml_context * ctx_work, + struct ggml_cgraph * gf); void mnist_mtl_free(struct ggml_mtl_context * ctx); -int mnist_mtl_eval(struct ggml_mtl_context * ctx, struct ggml_cgraph * gf); +int mnist_mtl_eval( + struct ggml_mtl_context * ctx, + struct ggml_cgraph * gf); #ifdef __cplusplus }
diff --git a/examples/mnist/main-mtl.m b/examples/mnist/main-mtl.m index add85a77d..3021f17a4 100644 --- a/examples/mnist/main-mtl.m +++ b/examples/mnist/main-mtl.m @@ -7,13 +7,36 @@ #import struct ggml_mtl_context { - id<MTLDevice> device; + struct ggml_context * ctx_data; + struct ggml_context * ctx_eval; + struct ggml_context * ctx_work; + + id<MTLDevice> device; id<MTLCommandQueue> queue; + id<MTLLibrary> library; id<MTLHeap> heap_data; id<MTLHeap> heap_eval; + + // custom kernels + id<MTLFunction> function_add; + id<MTLComputePipelineState> pipeline_add; }; +// MSL code +NSString * const msl_library_mnist = @"\ +#include <metal_stdlib> \n\ +using namespace metal; \n\ + \n\ +kernel void kernel_add( \n\ + device const float * src0, \n\ + device const float * src1, \n\ + device float * dst, \n\ + uint gid[[thread_position_in_grid]]) { \n\ + dst[gid] = src0[gid] + src1[gid]; \n\ +} \n\ +"; + struct ggml_mtl_context * mnist_mtl_init( struct ggml_context * ctx_data, struct ggml_context * ctx_eval, struct ggml_context * ctx_work, struct ggml_cgraph * gf) { fprintf(stderr, "%s: allocating\n", __func__); struct ggml_mtl_context * ctx = malloc(sizeof(struct ggml_mtl_context)); - ctx->device = MTLCreateSystemDefaultDevice(); - ctx->queue = [ctx->device newCommandQueue]; + ctx->ctx_data = ctx_data; + ctx->ctx_eval = ctx_eval; + ctx->ctx_work = ctx_work; + + ctx->device = MTLCreateSystemDefaultDevice(); + ctx->queue = [ctx->device newCommandQueue]; + + // compile from source string and show compile log + NSError * error = nil; + ctx->library = [ctx->device newLibraryWithSource:msl_library_mnist options:nil error:&error]; + if (error) { + fprintf(stderr, "%s: error: %s\n", __func__, [[error description] UTF8String]); + exit(1); + } + + // load kernels + ctx->function_add = [ctx->library newFunctionWithName:@"kernel_add"]; + 
ctx->pipeline_add = [ctx->device newComputePipelineStateWithFunction:ctx->function_add error:nil]; // pin ctx_data memory to GPU // use MTLStorageModeShared to allow us to initialize the weights from the CPU @@ -69,7 +108,28 @@ void mnist_mtl_free(struct ggml_mtl_context * ctx) { free(ctx); } -int mnist_mtl_eval(struct ggml_mtl_context * ctx, struct ggml_cgraph * gf) { +// make a view of the respective MTL heap +id mnist_mtl_get_buffer(struct ggml_mtl_context * ctx, struct ggml_tensor * t) { + const int64_t offs_data = (int64_t) t->data - (int64_t) ggml_get_mem_buffer(ctx->ctx_data); + const int64_t offs_eval = (int64_t) t->data - (int64_t) ggml_get_mem_buffer(ctx->ctx_eval); + + const bool is_data = (offs_eval < 0) || (offs_data >= 0 && offs_data < offs_eval); + + const size_t t_size = ggml_nbytes(t); + const size_t t_offs = is_data ? offs_data : offs_eval; + + if (is_data) { + fprintf(stderr, "%s: data tensor '%s'\n", __func__, t->name); + return [ctx->heap_data newBufferWithLength:t_size options:MTLResourceStorageModeShared offset:t_offs]; + } else { + fprintf(stderr, "%s: eval tensor '%s'\n", __func__, t->name); + return [ctx->heap_eval newBufferWithLength:t_size options:MTLResourceStorageModePrivate offset:t_offs]; + } +} + +int mnist_mtl_eval( + struct ggml_mtl_context * ctx, + struct ggml_cgraph * gf) { fprintf(stderr, "%s: evaluating\n", __func__); id command_buffer = [ctx->queue commandBuffer]; @@ -81,6 +141,17 @@ int mnist_mtl_eval(struct ggml_mtl_context * ctx, struct ggml_cgraph * gf) { switch (gf->nodes[i]->op) { case GGML_OP_ADD: { + id id_src0 = mnist_mtl_get_buffer(ctx, gf->nodes[i]->src0); + id id_src1 = mnist_mtl_get_buffer(ctx, gf->nodes[i]->src1); + id id_dst = mnist_mtl_get_buffer(ctx, gf->nodes[i]); + + [encoder setComputePipelineState:ctx->pipeline_add]; + [encoder setBuffer:id_src0 offset:0 atIndex:0]; + [encoder setBuffer:id_src1 offset:0 atIndex:1]; + [encoder setBuffer:id_dst offset:0 atIndex:2]; + + const int64_t n = ggml_nelements(gf->nodes[i]); + [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)]; } break; case GGML_OP_RELU: { From 966f9e6bdf0d7548470d83869622002b558e4b29 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sun, 28 May 2023 16:06:55 +0300 Subject: [PATCH 15/27] mnist : looks like MTLHeap does not work --- examples/mnist/main-mtl.m | 79 ++++++++++++++++++++++++++++++++++++--- 1 file changed, 73 insertions(+), 6 deletions(-) diff --git a/examples/mnist/main-mtl.m b/examples/mnist/main-mtl.m index 3021f17a4..fbad028c7 100644 --- a/examples/mnist/main-mtl.m +++ b/examples/mnist/main-mtl.m @@ -21,6 +21,12 @@ // custom kernels id function_add; id pipeline_add; + + id function_relu; + id pipeline_relu; + + id function_softmax; + id pipeline_softmax; }; // MSL code @@ -34,6 +40,13 @@ kernel void kernel_add( device float * dst, \n\ uint gid[[thread_position_in_grid]]) { \n\ dst[gid] = src0[gid] + src1[gid]; \n\ +} \n\ + \n\ +kernel void kernel_relu( \n\ + device const float * src, \n\ + device float * dst, \n\ + uint gid[[thread_position_in_grid]]) { \n\ + dst[gid] = max(0.0f, src[gid]); \n\ } \n\ "; @@ -53,6 +66,14 @@ kernel void kernel_add( ctx->device = MTLCreateSystemDefaultDevice(); ctx->queue = [ctx->device newCommandQueue]; + // determine if we can use MPS + if (MPSSupportsMTLDevice(ctx->device)) { + fprintf(stderr, "%s: using MPS\n", __func__); + } else { + fprintf(stderr, "%s: not using MPS\n", __func__); + GGML_ASSERT(false && "MPS not supported"); + } + // compile from source string and show 
compile log NSError * error = nil; ctx->library = [ctx->device newLibraryWithSource:msl_library_mnist options:nil error:&error]; @@ -65,6 +86,9 @@ kernel void kernel_add( ctx->function_add = [ctx->library newFunctionWithName:@"kernel_add"]; ctx->pipeline_add = [ctx->device newComputePipelineStateWithFunction:ctx->function_add error:nil]; + ctx->function_relu = [ctx->library newFunctionWithName:@"kernel_relu"]; + ctx->pipeline_relu = [ctx->device newComputePipelineStateWithFunction:ctx->function_relu error:nil]; + // pin ctx_data memory to GPU // use MTLStorageModeShared to allow us to initialize the weights from the CPU // TODO: how to use MTLStorageModeManaged? @@ -77,13 +101,24 @@ kernel void kernel_add( heap_desc.storageMode = MTLStorageModeShared; heap_desc.size = mem_size; + printf("heap_desc.size = %zu\n", mem_size); + ctx->heap_data = [ctx->device newHeapWithDescriptor:heap_desc]; - [ctx->heap_data setPurgeableState:MTLPurgeableStateNonVolatile]; + [ctx->heap_data setPurgeableState:MTLPurgeableStateNonVolatile]; // TODO: is this needed? + ctx->heap_data.label = @"heap_data"; + + printf("ctx->heap_data.size = %zu\n", [ctx->heap_data size]); id buffer = [ctx->heap_data newBufferWithLength:mem_size options:MTLResourceStorageModeShared]; + if (!buffer) { + fprintf(stderr, "%s: error: failed to allocate buffer\n", __func__); + exit(1); + } // copy data from CPU to GPU memcpy([buffer contents], mem_buffer, mem_size); + + fprintf(stderr, "%s: allocated data heap, size = %zu\n", __func__, mem_size); } // pin ctx_eval memory to GPU @@ -96,7 +131,9 @@ kernel void kernel_add( heap_desc.size = mem_size; ctx->heap_eval = [ctx->device newHeapWithDescriptor:heap_desc]; - [ctx->heap_eval setPurgeableState:MTLPurgeableStateNonVolatile]; + [ctx->heap_eval setPurgeableState:MTLPurgeableStateNonVolatile]; // TODO: is this needed? + + fprintf(stderr, "%s: allocated eval heap, size = %zu\n", __func__, mem_size); } return ctx; @@ -118,13 +155,22 @@ void mnist_mtl_free(struct ggml_mtl_context * ctx) { const size_t t_size = ggml_nbytes(t); const size_t t_offs = is_data ? 
offs_data : offs_eval; + id result; + if (is_data) { - fprintf(stderr, "%s: data tensor '%s'\n", __func__, t->name); - return [ctx->heap_data newBufferWithLength:t_size options:MTLResourceStorageModeShared offset:t_offs]; + fprintf(stderr, "%s: data tensor '%8s', offs = %8ld, size = %8ld\n", __func__, t->name, t_offs, t_size); + result = [ctx->heap_data newBufferWithLength:t_size options:MTLResourceStorageModeShared offset:t_offs]; } else { - fprintf(stderr, "%s: eval tensor '%s'\n", __func__, t->name); - return [ctx->heap_eval newBufferWithLength:t_size options:MTLResourceStorageModePrivate offset:t_offs]; + fprintf(stderr, "%s: eval tensor '%8s', offs = %8ld, size = %8ld\n", __func__, t->name, t_offs, t_size); + result = [ctx->heap_eval newBufferWithLength:t_size options:MTLResourceStorageModePrivate offset:t_offs]; + } + + NSLog(@"%s: buffer = %p\n", __func__, result); + if (result == nil) { + fprintf(stderr, "%s: error: buffer is nil\n", __func__); } + + return result; } int mnist_mtl_eval( @@ -155,9 +201,30 @@ int mnist_mtl_eval( } break; case GGML_OP_RELU: { + id id_src = mnist_mtl_get_buffer(ctx, gf->nodes[i]->src0); + id id_dst = mnist_mtl_get_buffer(ctx, gf->nodes[i]); + + [encoder setComputePipelineState:ctx->pipeline_relu]; + [encoder setBuffer:id_src offset:0 atIndex:0]; + [encoder setBuffer:id_dst offset:0 atIndex:1]; + + const int64_t n = ggml_nelements(gf->nodes[i]); + [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)]; } break; case GGML_OP_SOFT_MAX: { + // use MPSMatrixSoftMax + //id id_src = mnist_mtl_get_buffer(ctx, gf->nodes[i]->src0); + //id id_dst = mnist_mtl_get_buffer(ctx, gf->nodes[i]); + + //MPSMatrixDescriptor * desc = [MPSMatrixDescriptor + // matrixDescriptorWithRows:1 columns:gf->nodes[i]->ne[0] rowBytes:gf->nodes[i]->nb[1] dataType:MPSDataTypeFloat32]; + + //MPSMatrix * mat_src = [[MPSMatrix alloc] initWithBuffer:id_src descriptor:desc]; + //MPSMatrix * mat_dst = [[MPSMatrix alloc] initWithBuffer:id_dst descriptor:desc]; + + //MPSMatrixSoftMax * softmax = [[MPSMatrixSoftMax alloc] initWithDevice:ctx->device]; + //[softmax encodeToCommandBuffer:command_buffer inputMatrix:mat_src resultMatrix:mat_dst]; } break; case GGML_OP_MUL_MAT: { From 1bc91815ce29815b1309916010322eced68356d5 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sun, 28 May 2023 16:34:47 +0300 Subject: [PATCH 16/27] mnist : initial full pass of MNIST on the GPU (not verified) --- examples/mnist/main-mtl.m | 178 +++++++++++++++++++++++++++++++++----- 1 file changed, 156 insertions(+), 22 deletions(-) diff --git a/examples/mnist/main-mtl.m b/examples/mnist/main-mtl.m index fbad028c7..bb0cf924a 100644 --- a/examples/mnist/main-mtl.m +++ b/examples/mnist/main-mtl.m @@ -6,6 +6,9 @@ #import #import +// TODO: couldn't get this to work +//#define GGML_MTL_HEAP + struct ggml_mtl_context { struct ggml_context * ctx_data; struct ggml_context * ctx_eval; @@ -15,8 +18,13 @@ id queue; id library; +#ifdef GGML_MTL_HEAP id heap_data; id heap_eval; +#else + id buffer_data; + id buffer_eval; +#endif // custom kernels id function_add; @@ -89,6 +97,9 @@ kernel void kernel_relu( ctx->function_relu = [ctx->library newFunctionWithName:@"kernel_relu"]; ctx->pipeline_relu = [ctx->device newComputePipelineStateWithFunction:ctx->function_relu error:nil]; +#ifdef GGML_MTL_HEAP + // MTLHeap approach + // pin ctx_data memory to GPU // use MTLStorageModeShared to allow us to initialize the weights from the CPU // TODO: how to use MTLStorageModeManaged? 
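For readability, here is the embedded Metal library at this stage of the series, unescaped into standalone MSL (the header include, which the escaped string obscures, is metal_stdlib):

    #include <metal_stdlib>
    using namespace metal;

    kernel void kernel_add(
            device const float * src0,
            device const float * src1,
            device       float * dst,
            uint gid [[thread_position_in_grid]]) {
        dst[gid] = src0[gid] + src1[gid];
    }

    kernel void kernel_relu(
            device const float * src,
            device       float * dst,
            uint gid [[thread_position_in_grid]]) {
        dst[gid] = max(0.0f, src[gid]);
    }

Both kernels use the simplest possible mapping of one thread per tensor element, indexed by thread_position_in_grid.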
@@ -135,6 +146,32 @@ kernel void kernel_relu( fprintf(stderr, "%s: allocated eval heap, size = %zu\n", __func__, mem_size); } +#else + // MTLBuffer approach + + // pin ctx_data memory to GPU + // use MTLStorageModeShared to allow us to initialize the weights from the CPU + // TODO: how to use MTLStorageModeManaged? + // TODO: see if we can avoid this copy somehow + { + const void * mem_buffer = ggml_get_mem_buffer(ctx_data); + const size_t mem_size = ggml_get_mem_size(ctx_data); + + ctx->buffer_data = [ctx->device newBufferWithBytes:mem_buffer length:mem_size options:MTLResourceStorageModeShared]; + + fprintf(stderr, "%s: allocated data buffer, size = %zu\n", __func__, mem_size); + } + + // pin ctx_eval memory to GPU + // this heap will be used for the intermediate results of the evaluation + { + const size_t mem_size = ggml_get_mem_size(ctx_eval); + + ctx->buffer_eval = [ctx->device newBufferWithLength:mem_size options:MTLResourceStorageModePrivate]; + + fprintf(stderr, "%s: allocated eval buffer, size = %zu\n", __func__, mem_size); + } +#endif return ctx; } @@ -145,8 +182,10 @@ void mnist_mtl_free(struct ggml_mtl_context * ctx) { free(ctx); } +#ifdef GGML_MTL_HEAP + // make a view of the respective MTL heap -id mnist_mtl_get_buffer(struct ggml_mtl_context * ctx, struct ggml_tensor * t) { +id mnist_mtl_get_buffer_on_heap(struct ggml_mtl_context * ctx, struct ggml_tensor * t) { const int64_t offs_data = (int64_t) t->data - (int64_t) ggml_get_mem_buffer(ctx->ctx_data); const int64_t offs_eval = (int64_t) t->data - (int64_t) ggml_get_mem_buffer(ctx->ctx_eval); @@ -165,21 +204,62 @@ void mnist_mtl_free(struct ggml_mtl_context * ctx) { result = [ctx->heap_eval newBufferWithLength:t_size options:MTLResourceStorageModePrivate offset:t_offs]; } - NSLog(@"%s: buffer = %p\n", __func__, result); if (result == nil) { fprintf(stderr, "%s: error: buffer is nil\n", __func__); + GGML_ASSERT(false); + } + + return result; +} + +#else + +// get data / eval buffer + offset +id mnist_mtl_get_buffer(struct ggml_mtl_context * ctx, struct ggml_tensor * t, size_t * offs) { + const int64_t offs_data = (int64_t) t->data - (int64_t) ggml_get_mem_buffer(ctx->ctx_data); + const int64_t offs_eval = (int64_t) t->data - (int64_t) ggml_get_mem_buffer(ctx->ctx_eval); + + const bool is_data = (offs_eval < 0) || (offs_data >= 0 && offs_data < offs_eval); + + const size_t t_size = ggml_nbytes(t); + const size_t t_offs = is_data ? 
offs_data : offs_eval; + + id result; + + if (is_data) { + fprintf(stderr, "%s: data tensor '%8s', offs = %8ld, size = %8ld\n", __func__, t->name, t_offs, t_size); + result = ctx->buffer_data; + } else { + fprintf(stderr, "%s: eval tensor '%8s', offs = %8ld, size = %8ld\n", __func__, t->name, t_offs, t_size); + result = ctx->buffer_eval; + } + + if (result == nil) { + fprintf(stderr, "%s: error: buffer is nil\n", __func__); + GGML_ASSERT(false); + } + + if (offs != nil) { + *offs = t_offs; } return result; } +#endif + int mnist_mtl_eval( struct ggml_mtl_context * ctx, struct ggml_cgraph * gf) { fprintf(stderr, "%s: evaluating\n", __func__); id command_buffer = [ctx->queue commandBuffer]; - id encoder = [command_buffer computeCommandEncoder]; + //id encoder = [command_buffer computeCommandEncoder]; + id encoder = nil; + + size_t offs_src0; + size_t offs_src1; + size_t offs_dst; for (int i = 0; i < gf->n_nodes; ++i) { fprintf(stderr, "%s: encoding node %3d, op = %8s\n", __func__, i, ggml_op_name(gf->nodes[i]->op)); @@ -187,47 +267,97 @@ int mnist_mtl_eval( switch (gf->nodes[i]->op) { case GGML_OP_ADD: { - id id_src0 = mnist_mtl_get_buffer(ctx, gf->nodes[i]->src0); - id id_src1 = mnist_mtl_get_buffer(ctx, gf->nodes[i]->src1); - id id_dst = mnist_mtl_get_buffer(ctx, gf->nodes[i]); + if (encoder == nil) { + encoder = [command_buffer computeCommandEncoder]; + } + + id id_src0 = mnist_mtl_get_buffer(ctx, gf->nodes[i]->src0, &offs_src0); + id id_src1 = mnist_mtl_get_buffer(ctx, gf->nodes[i]->src1, &offs_src1); + id id_dst = mnist_mtl_get_buffer(ctx, gf->nodes[i], &offs_dst); [encoder setComputePipelineState:ctx->pipeline_add]; - [encoder setBuffer:id_src0 offset:0 atIndex:0]; - [encoder setBuffer:id_src1 offset:0 atIndex:1]; - [encoder setBuffer:id_dst offset:0 atIndex:2]; + [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; + [encoder setBuffer:id_src1 offset:offs_src1 atIndex:1]; + [encoder setBuffer:id_dst offset:offs_dst atIndex:2]; const int64_t n = ggml_nelements(gf->nodes[i]); [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)]; } break; case GGML_OP_RELU: { - id id_src = mnist_mtl_get_buffer(ctx, gf->nodes[i]->src0); - id id_dst = mnist_mtl_get_buffer(ctx, gf->nodes[i]); + if (encoder == nil) { + encoder = [command_buffer computeCommandEncoder]; + } + + id id_src = mnist_mtl_get_buffer(ctx, gf->nodes[i]->src0, &offs_src0); + id id_dst = mnist_mtl_get_buffer(ctx, gf->nodes[i], &offs_dst); [encoder setComputePipelineState:ctx->pipeline_relu]; - [encoder setBuffer:id_src offset:0 atIndex:0]; - [encoder setBuffer:id_dst offset:0 atIndex:1]; + [encoder setBuffer:id_src offset:offs_src0 atIndex:0]; + [encoder setBuffer:id_dst offset:offs_dst atIndex:1]; const int64_t n = ggml_nelements(gf->nodes[i]); [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)]; } break; case GGML_OP_SOFT_MAX: { + if (encoder != nil) { + [encoder endEncoding]; + encoder = nil; + } + // use MPSMatrixSoftMax - //id id_src = mnist_mtl_get_buffer(ctx, gf->nodes[i]->src0); - //id id_dst = mnist_mtl_get_buffer(ctx, gf->nodes[i]); + id id_src = mnist_mtl_get_buffer(ctx, gf->nodes[i]->src0, &offs_src0); + id id_dst = mnist_mtl_get_buffer(ctx, gf->nodes[i], &offs_dst); + + MPSMatrixDescriptor * desc = [MPSMatrixDescriptor + matrixDescriptorWithRows:1 columns:gf->nodes[i]->ne[0] rowBytes:gf->nodes[i]->nb[1] dataType:MPSDataTypeFloat32]; - //MPSMatrixDescriptor * desc = [MPSMatrixDescriptor - // matrixDescriptorWithRows:1 
columns:gf->nodes[i]->ne[0] rowBytes:gf->nodes[i]->nb[1] dataType:MPSDataTypeFloat32]; + MPSMatrix * mat_src = [[MPSMatrix alloc] initWithBuffer:id_src offset:offs_src0 descriptor:desc]; + MPSMatrix * mat_dst = [[MPSMatrix alloc] initWithBuffer:id_dst offset:offs_dst descriptor:desc]; - //MPSMatrix * mat_src = [[MPSMatrix alloc] initWithBuffer:id_src descriptor:desc]; - //MPSMatrix * mat_dst = [[MPSMatrix alloc] initWithBuffer:id_dst descriptor:desc]; + MPSMatrixSoftMax * softmax = [[MPSMatrixSoftMax alloc] initWithDevice:ctx->device]; - //MPSMatrixSoftMax * softmax = [[MPSMatrixSoftMax alloc] initWithDevice:ctx->device]; - //[softmax encodeToCommandBuffer:command_buffer inputMatrix:mat_src resultMatrix:mat_dst]; + [softmax encodeToCommandBuffer:command_buffer inputMatrix:mat_src resultMatrix:mat_dst]; } break; case GGML_OP_MUL_MAT: { + if (encoder != nil) { + [encoder endEncoding]; + encoder = nil; + } + + // use MPSMatrixMultiplication + id id_src0 = mnist_mtl_get_buffer(ctx, gf->nodes[i]->src0, &offs_src0); + id id_src1 = mnist_mtl_get_buffer(ctx, gf->nodes[i]->src1, &offs_src1); + id id_dst = mnist_mtl_get_buffer(ctx, gf->nodes[i], &offs_dst); + + const int64_t ncols0 = gf->nodes[i]->src0->ne[0]; + const int64_t nrows0 = gf->nodes[i]->src0->ne[1]; + + const int64_t ncols1 = gf->nodes[i]->src1->ne[0]; + const int64_t nrows1 = gf->nodes[i]->src1->ne[1]; + + const int64_t ncols2 = gf->nodes[i]->ne[0]; + const int64_t nrows2 = gf->nodes[i]->ne[1]; + + GGML_ASSERT(ncols0 == ncols1); + + MPSMatrixDescriptor * desc0 = [MPSMatrixDescriptor + matrixDescriptorWithRows:nrows0 columns:ncols0 rowBytes:gf->nodes[i]->src0->nb[1] dataType:MPSDataTypeFloat32]; + MPSMatrixDescriptor * desc1 = [MPSMatrixDescriptor + matrixDescriptorWithRows:nrows1 columns:ncols1 rowBytes:gf->nodes[i]->src1->nb[1] dataType:MPSDataTypeFloat32]; + MPSMatrixDescriptor * desc2 = [MPSMatrixDescriptor + matrixDescriptorWithRows:nrows2 columns:ncols2 rowBytes:gf->nodes[i]->nb[1] dataType:MPSDataTypeFloat32]; + + MPSMatrix * mat_src0 = [[MPSMatrix alloc] initWithBuffer:id_src0 offset:offs_src0 descriptor:desc0]; + MPSMatrix * mat_src1 = [[MPSMatrix alloc] initWithBuffer:id_src1 offset:offs_src1 descriptor:desc1]; + MPSMatrix * mat_dst = [[MPSMatrix alloc] initWithBuffer:id_dst offset:offs_dst descriptor:desc2]; + + MPSMatrixMultiplication * mul = [[MPSMatrixMultiplication alloc] initWithDevice:ctx->device + transposeLeft:false transposeRight:true resultRows:nrows1 resultColumns:nrows0 interiorColumns:ncols0 alpha:1.0 beta:0.0]; + + [mul encodeToCommandBuffer:command_buffer leftMatrix:mat_src1 rightMatrix:mat_src0 resultMatrix:mat_dst]; } break; default: fprintf(stderr, "%s: node %3d, op = %8s not implemented\n", __func__, i, ggml_op_name(gf->nodes[i]->op)); @@ -236,7 +366,11 @@ int mnist_mtl_eval( } } - [encoder endEncoding]; + //[encoder endEncoding]; + if (encoder != nil) { + [encoder endEncoding]; + encoder = nil; + } [command_buffer commit]; [command_buffer waitUntilCompleted]; From 4134bac4e522ab5b39f90c05f011369de1b93131 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sun, 28 May 2023 16:42:19 +0300 Subject: [PATCH 17/27] mnist : minor cleanup --- examples/mnist/main-mtl.m | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/examples/mnist/main-mtl.m b/examples/mnist/main-mtl.m index bb0cf924a..e248e0d77 100644 --- a/examples/mnist/main-mtl.m +++ b/examples/mnist/main-mtl.m @@ -197,10 +197,10 @@ void mnist_mtl_free(struct ggml_mtl_context * ctx) { id result; if (is_data) { - fprintf(stderr, "%s: 
data tensor '%8s', offs = %8ld, size = %8ld\n", __func__, t->name, t_offs, t_size); + fprintf(stderr, "%s: data tensor '%16s', offs = %8ld, size = %8ld\n", __func__, t->name, t_offs, t_size); result = [ctx->heap_data newBufferWithLength:t_size options:MTLResourceStorageModeShared offset:t_offs]; } else { - fprintf(stderr, "%s: eval tensor '%8s', offs = %8ld, size = %8ld\n", __func__, t->name, t_offs, t_size); + fprintf(stderr, "%s: eval tensor '%16s', offs = %8ld, size = %8ld\n", __func__, t->name, t_offs, t_size); result = [ctx->heap_eval newBufferWithLength:t_size options:MTLResourceStorageModePrivate offset:t_offs]; } @@ -227,10 +227,10 @@ void mnist_mtl_free(struct ggml_mtl_context * ctx) { id result; if (is_data) { - fprintf(stderr, "%s: data tensor '%8s', offs = %8ld, size = %8ld\n", __func__, t->name, t_offs, t_size); + fprintf(stderr, "%s: data tensor '%16s', offs = %8ld, size = %8ld\n", __func__, t->name, t_offs, t_size); result = ctx->buffer_data; } else { - fprintf(stderr, "%s: eval tensor '%8s', offs = %8ld, size = %8ld\n", __func__, t->name, t_offs, t_size); + fprintf(stderr, "%s: eval tensor '%16s', offs = %8ld, size = %8ld\n", __func__, t->name, t_offs, t_size); result = ctx->buffer_eval; } @@ -254,7 +254,6 @@ int mnist_mtl_eval( fprintf(stderr, "%s: evaluating\n", __func__); id command_buffer = [ctx->queue commandBuffer]; - //id encoder = [command_buffer computeCommandEncoder]; id encoder = nil; size_t offs_src0; @@ -366,7 +365,6 @@ int mnist_mtl_eval( } } - //[encoder endEncoding]; if (encoder != nil) { [encoder endEncoding]; encoder = nil; From a556b576c504d4ec83c7f51ade7e1c8193a26641 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sun, 28 May 2023 17:16:53 +0300 Subject: [PATCH 18/27] mnist : full GPU inference works --- examples/mnist/main-mtl.cpp | 1 - examples/mnist/main-mtl.m | 52 ++++++++++++++++++++++++++++++++----- 2 files changed, 45 insertions(+), 8 deletions(-) diff --git a/examples/mnist/main-mtl.cpp b/examples/mnist/main-mtl.cpp index c5c1ed821..86467119d 100644 --- a/examples/mnist/main-mtl.cpp +++ b/examples/mnist/main-mtl.cpp @@ -47,7 +47,6 @@ int mnist_eval( struct ggml_tensor * input = ggml_get_tensor_by_name(&gf, "input"); memcpy(input->data, digit.data(), ggml_nbytes(input)); - //ggml_graph_compute(ctx_work, &gf); auto ctx_mtl = mnist_mtl_init(ctx_data, ctx_eval, ctx_work, &gf); const int prediction = mnist_mtl_eval(ctx_mtl, &gf); mnist_mtl_free(ctx_mtl); diff --git a/examples/mnist/main-mtl.m b/examples/mnist/main-mtl.m index e248e0d77..bd41d8dc7 100644 --- a/examples/mnist/main-mtl.m +++ b/examples/mnist/main-mtl.m @@ -26,15 +26,14 @@ id buffer_eval; #endif + id results; + // custom kernels id function_add; id pipeline_add; id function_relu; id pipeline_relu; - - id function_softmax; - id pipeline_softmax; }; // MSL code @@ -173,6 +172,15 @@ kernel void kernel_relu( } #endif + // allocate buffer for result extraction + { + const size_t mem_size = ggml_nbytes(gf->nodes[gf->n_nodes - 1]); + + ctx->results = [ctx->device newBufferWithLength:mem_size options:MTLResourceStorageModeShared]; + + fprintf(stderr, "%s: allocated results buffer, size = %zu\n", __func__, mem_size); + } + return ctx; } @@ -280,6 +288,7 @@ int mnist_mtl_eval( [encoder setBuffer:id_dst offset:offs_dst atIndex:2]; const int64_t n = ggml_nelements(gf->nodes[i]); + [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)]; } break; case GGML_OP_RELU: @@ -296,6 +305,7 @@ int mnist_mtl_eval( [encoder setBuffer:id_dst offset:offs_dst atIndex:1]; 
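                // (dispatch granularity: n threadgroups of one thread each, i.e.
                //  one thread per tensor element; simple, though a tuned version
                //  would use wider threadgroups to keep the GPU occupied)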
const int64_t n = ggml_nelements(gf->nodes[i]); + [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)]; } break; case GGML_OP_SOFT_MAX: @@ -365,9 +375,21 @@ int mnist_mtl_eval( } } - if (encoder != nil) { - [encoder endEncoding]; - encoder = nil; + // extract results from the GPU + { + if (encoder != nil) { + [encoder endEncoding]; + encoder = nil; + } + + struct ggml_tensor * output = gf->nodes[gf->n_nodes - 1]; + + id id_src = mnist_mtl_get_buffer(ctx, output, &offs_src0); + id id_dst = ctx->results; + + id encoder_blit = [command_buffer blitCommandEncoder]; + [encoder_blit copyFromBuffer:id_src sourceOffset:offs_src0 toBuffer:id_dst destinationOffset:0 size:ggml_nbytes(output)]; + [encoder_blit endEncoding]; } [command_buffer commit]; @@ -378,5 +400,21 @@ int mnist_mtl_eval( fprintf(stderr, "%s: time elapsed = %f\n", __func__, time_elapsed); } - return 0; + // select the most probable digit + + const float * probs = ctx->results.contents; + + int pred = 0; + float prob = probs[0]; + + for (int i = 0; i < 10; ++i) { + fprintf(stderr, "%s: probs[%2d] = %f\n", __func__, i, probs[i]); + + if (probs[i] > prob) { + pred = i; + prob = probs[i]; + } + } + + return pred; } From 8f8653b47209591d435e84c75a8cbef4340f343b Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sun, 28 May 2023 17:21:21 +0300 Subject: [PATCH 19/27] mnist : use custom soft_max kernel since MPSMatrixSoftMax is bugged --- examples/mnist/main-mtl.m | 45 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 45 insertions(+) diff --git a/examples/mnist/main-mtl.m b/examples/mnist/main-mtl.m index bd41d8dc7..6fe006226 100644 --- a/examples/mnist/main-mtl.m +++ b/examples/mnist/main-mtl.m @@ -34,6 +34,9 @@ id function_relu; id pipeline_relu; + + id function_soft_max; + id pipeline_soft_max; }; // MSL code @@ -41,6 +44,8 @@ #include \n\ using namespace metal; \n\ \n\ +#define MAX(x, y) ((x) > (y) ? 
(x) : (y)) \n\ + \n\ kernel void kernel_add( \n\ device const float * src0, \n\ device const float * src1, \n\ @@ -54,6 +59,24 @@ kernel void kernel_relu( device float * dst, \n\ uint gid[[thread_position_in_grid]]) { \n\ dst[gid] = max(0.0f, src[gid]); \n\ +} \n\ + \n\ +kernel void kernel_soft_max( \n\ + device const float * src, \n\ + device float * dst, \n\ + uint gid[[thread_position_in_grid]]) { \n\ + float max = 0.0f; \n\ + for (int i = 0; i < 10; i++) { \n\ + max = MAX(max, src[i]); \n\ + } \n\ + float sum = 0.0f; \n\ + for (int i = 0; i < 10; i++) { \n\ + dst[i] = exp(src[i] - max); \n\ + sum += dst[i]; \n\ + } \n\ + for (int i = 0; i < 10; i++) { \n\ + dst[i] /= sum; \n\ + } \n\ } \n\ "; @@ -96,6 +119,9 @@ kernel void kernel_relu( ctx->function_relu = [ctx->library newFunctionWithName:@"kernel_relu"]; ctx->pipeline_relu = [ctx->device newComputePipelineStateWithFunction:ctx->function_relu error:nil]; + ctx->function_soft_max = [ctx->library newFunctionWithName:@"kernel_soft_max"]; + ctx->pipeline_soft_max = [ctx->device newComputePipelineStateWithFunction:ctx->function_soft_max error:nil]; + #ifdef GGML_MTL_HEAP // MTLHeap approach @@ -310,6 +336,9 @@ int mnist_mtl_eval( } break; case GGML_OP_SOFT_MAX: { +#if 0 + // NOTE: MPSMatrixSoftMax is not working properly, probably there is a bug + if (encoder != nil) { [encoder endEncoding]; encoder = nil; @@ -328,6 +357,22 @@ int mnist_mtl_eval( MPSMatrixSoftMax * softmax = [[MPSMatrixSoftMax alloc] initWithDevice:ctx->device]; [softmax encodeToCommandBuffer:command_buffer inputMatrix:mat_src resultMatrix:mat_dst]; +#else + if (encoder == nil) { + encoder = [command_buffer computeCommandEncoder]; + } + + id id_src = mnist_mtl_get_buffer(ctx, gf->nodes[i]->src0, &offs_src0); + id id_dst = mnist_mtl_get_buffer(ctx, gf->nodes[i], &offs_dst); + + [encoder setComputePipelineState:ctx->pipeline_soft_max]; + [encoder setBuffer:id_src offset:offs_src0 atIndex:0]; + [encoder setBuffer:id_dst offset:offs_dst atIndex:1]; + + const int64_t n = ggml_nelements(gf->nodes[i]); + + [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)]; +#endif } break; case GGML_OP_MUL_MAT: { From 3b97377edd3160e4454f2e6f415af64bffdff693 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sun, 28 May 2023 21:02:33 +0300 Subject: [PATCH 20/27] mnist : use constant for soft_max instead of hardcoded 10 --- examples/mnist/main-cpu.cpp | 2 -- examples/mnist/main-mtl.cpp | 9 +++++++ examples/mnist/main-mtl.m | 49 +++++++++++++++++++++++-------------- 3 files changed, 40 insertions(+), 20 deletions(-) diff --git a/examples/mnist/main-cpu.cpp b/examples/mnist/main-cpu.cpp index 1a07b7453..9b1a425cf 100644 --- a/examples/mnist/main-cpu.cpp +++ b/examples/mnist/main-cpu.cpp @@ -12,7 +12,6 @@ #include "ggml/ggml.h" -#include "common.h" #include "common-ggml.h" #include @@ -20,7 +19,6 @@ #include #include #include -#include // evaluate the MNIST compute graph // diff --git a/examples/mnist/main-mtl.cpp b/examples/mnist/main-mtl.cpp index 86467119d..6f5160faf 100644 --- a/examples/mnist/main-mtl.cpp +++ b/examples/mnist/main-mtl.cpp @@ -1,5 +1,14 @@ // Use a pre-generated MNIST compute graph for inference on the M1 GPU via MPS // +// You can generate a compute graph using the "mnist" tool: +// +// $ ./bin/mnist ./models/mnist/ggml-model-f32.bin ../examples/mnist/models/mnist/t10k-images.idx3-ubyte +// +// This command creates the "mnist.ggml" file, which contains the generated compute graph. 
+// Now, you can re-use the compute graph on the GPU with the "mnist-mtl" tool: +// +// $ ./bin/mnist-mtl ./models/mnist/mnist.ggml ../examples/mnist/models/mnist/t10k-images.idx3-ubyte +// #include "ggml/ggml.h" diff --git a/examples/mnist/main-mtl.m b/examples/mnist/main-mtl.m index 6fe006226..e3a7739e7 100644 --- a/examples/mnist/main-mtl.m +++ b/examples/mnist/main-mtl.m @@ -46,6 +46,8 @@ \n\ #define MAX(x, y) ((x) > (y) ? (x) : (y)) \n\ \n\ +constant int k_digits [[function_constant(0)]]; \n\ + \n\ kernel void kernel_add( \n\ device const float * src0, \n\ device const float * src1, \n\ @@ -66,15 +68,15 @@ kernel void kernel_soft_max( device float * dst, \n\ uint gid[[thread_position_in_grid]]) { \n\ float max = 0.0f; \n\ - for (int i = 0; i < 10; i++) { \n\ + for (int i = 0; i < k_digits; i++) { \n\ max = MAX(max, src[i]); \n\ } \n\ float sum = 0.0f; \n\ - for (int i = 0; i < 10; i++) { \n\ + for (int i = 0; i < k_digits; i++) { \n\ dst[i] = exp(src[i] - max); \n\ sum += dst[i]; \n\ } \n\ - for (int i = 0; i < 10; i++) { \n\ + for (int i = 0; i < k_digits; i++) { \n\ dst[i] /= sum; \n\ } \n\ } \n\ @@ -113,14 +115,24 @@ kernel void kernel_soft_max( } // load kernels - ctx->function_add = [ctx->library newFunctionWithName:@"kernel_add"]; - ctx->pipeline_add = [ctx->device newComputePipelineStateWithFunction:ctx->function_add error:nil]; + { + const int k_digits = ggml_get_tensor_by_name(gf, "probs")->ne[0]; - ctx->function_relu = [ctx->library newFunctionWithName:@"kernel_relu"]; - ctx->pipeline_relu = [ctx->device newComputePipelineStateWithFunction:ctx->function_relu error:nil]; + MTLFunctionConstantValues * constants = [MTLFunctionConstantValues new]; + [constants setConstantValue:&k_digits type:MTLDataTypeInt withName:@"k_digits"]; - ctx->function_soft_max = [ctx->library newFunctionWithName:@"kernel_soft_max"]; - ctx->pipeline_soft_max = [ctx->device newComputePipelineStateWithFunction:ctx->function_soft_max error:nil]; + ctx->function_add = [ctx->library newFunctionWithName:@"kernel_add"]; + ctx->pipeline_add = [ctx->device newComputePipelineStateWithFunction:ctx->function_add error:nil]; + fprintf(stderr, "%s: loaded kernel_add: %p\n", __func__, ctx->pipeline_add); + + ctx->function_relu = [ctx->library newFunctionWithName:@"kernel_relu"]; + ctx->pipeline_relu = [ctx->device newComputePipelineStateWithFunction:ctx->function_relu error:nil]; + fprintf(stderr, "%s: loaded kernel_relu: %p\n", __func__, ctx->pipeline_relu); + + ctx->function_soft_max = [ctx->library newFunctionWithName:@"kernel_soft_max" constantValues:constants error:nil]; + ctx->pipeline_soft_max = [ctx->device newComputePipelineStateWithFunction:ctx->function_soft_max error:nil]; + fprintf(stderr, "%s: loaded kernel_soft_max: %p\n", __func__, ctx->pipeline_soft_max); + } #ifdef GGML_MTL_HEAP // MTLHeap approach @@ -446,18 +458,19 @@ int mnist_mtl_eval( } // select the most probable digit + int pred = -1; + { + const float * probs = ctx->results.contents; - const float * probs = ctx->results.contents; - - int pred = 0; - float prob = probs[0]; + float prob = probs[0]; - for (int i = 0; i < 10; ++i) { - fprintf(stderr, "%s: probs[%2d] = %f\n", __func__, i, probs[i]); + for (int i = 0; i < 10; ++i) { + fprintf(stderr, "%s: probs[%2d] = %f\n", __func__, i, probs[i]); - if (probs[i] > prob) { - pred = i; - prob = probs[i]; + if (probs[i] > prob) { + pred = i; + prob = probs[i]; + } } } From e350f138d4a2c2360870fbfe952bedf2d154c07b Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sun, 28 May 2023 21:16:56 +0300 
Subject: [PATCH 21/27] mnist : check multiple predictions (Metal) --- examples/mnist/main-mtl.cpp | 19 +++++++++++++++---- examples/mnist/main-mtl.m | 25 +++++++++++++++++-------- 2 files changed, 32 insertions(+), 12 deletions(-) diff --git a/examples/mnist/main-mtl.cpp b/examples/mnist/main-mtl.cpp index 6f5160faf..b6305bee9 100644 --- a/examples/mnist/main-mtl.cpp +++ b/examples/mnist/main-mtl.cpp @@ -53,11 +53,22 @@ int mnist_eval( struct ggml_context * ctx_work = ggml_init(params); - struct ggml_tensor * input = ggml_get_tensor_by_name(&gf, "input"); - memcpy(input->data, digit.data(), ggml_nbytes(input)); - auto ctx_mtl = mnist_mtl_init(ctx_data, ctx_eval, ctx_work, &gf); - const int prediction = mnist_mtl_eval(ctx_mtl, &gf); + + int prediction = -1; + + for (int i = 0; i < 1; ++i) { + struct ggml_tensor * input = ggml_get_tensor_by_name(&gf, "input"); + + if (i % 2 == 0) { + memcpy(input->data, digit.data(), ggml_nbytes(input)); + } else { + memset(input->data, 0, ggml_nbytes(input)); + } + + prediction = mnist_mtl_eval(ctx_mtl, &gf); + } + mnist_mtl_free(ctx_mtl); ggml_free(ctx_work); diff --git a/examples/mnist/main-mtl.m b/examples/mnist/main-mtl.m index e3a7739e7..41152694f 100644 --- a/examples/mnist/main-mtl.m +++ b/examples/mnist/main-mtl.m @@ -26,7 +26,7 @@ id buffer_eval; #endif - id results; + id out; // custom kernels id function_add; @@ -214,9 +214,9 @@ kernel void kernel_soft_max( { const size_t mem_size = ggml_nbytes(gf->nodes[gf->n_nodes - 1]); - ctx->results = [ctx->device newBufferWithLength:mem_size options:MTLResourceStorageModeShared]; + ctx->out = [ctx->device newBufferWithLength:mem_size options:MTLResourceStorageModeShared]; - fprintf(stderr, "%s: allocated results buffer, size = %zu\n", __func__, mem_size); + fprintf(stderr, "%s: allocated out buffer, size = %zu\n", __func__, mem_size); } return ctx; @@ -306,6 +306,15 @@ int mnist_mtl_eval( size_t offs_src1; size_t offs_dst; + // copy the input data to the GPU + { + struct ggml_tensor * inp = ggml_get_tensor_by_name(gf, "input"); + + id id_dst = mnist_mtl_get_buffer(ctx, inp, &offs_src0); + + memcpy(id_dst.contents + offs_src0, inp->data, ggml_nbytes(inp)); + } + for (int i = 0; i < gf->n_nodes; ++i) { fprintf(stderr, "%s: encoding node %3d, op = %8s\n", __func__, i, ggml_op_name(gf->nodes[i]->op)); @@ -439,13 +448,13 @@ int mnist_mtl_eval( encoder = nil; } - struct ggml_tensor * output = gf->nodes[gf->n_nodes - 1]; + struct ggml_tensor * out = gf->nodes[gf->n_nodes - 1]; - id id_src = mnist_mtl_get_buffer(ctx, output, &offs_src0); - id id_dst = ctx->results; + id id_src = mnist_mtl_get_buffer(ctx, out, &offs_src0); + id id_dst = ctx->out; id encoder_blit = [command_buffer blitCommandEncoder]; - [encoder_blit copyFromBuffer:id_src sourceOffset:offs_src0 toBuffer:id_dst destinationOffset:0 size:ggml_nbytes(output)]; + [encoder_blit copyFromBuffer:id_src sourceOffset:offs_src0 toBuffer:id_dst destinationOffset:0 size:ggml_nbytes(out)]; [encoder_blit endEncoding]; } @@ -460,7 +469,7 @@ int mnist_mtl_eval( // select the most probable digit int pred = -1; { - const float * probs = ctx->results.contents; + const float * probs = ctx->out.contents; float prob = probs[0]; From 4fa01f0385c3104796f94f73e2b2e39950b3be9a Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sun, 28 May 2023 21:23:28 +0300 Subject: [PATCH 22/27] mnist : minor --- examples/mnist/main-mtl.cpp | 2 ++ examples/mnist/main-mtl.m | 24 +++++++++++++----------- 2 files changed, 15 insertions(+), 11 deletions(-) diff --git 
a/examples/mnist/main-mtl.cpp b/examples/mnist/main-mtl.cpp index b6305bee9..1885232fc 100644 --- a/examples/mnist/main-mtl.cpp +++ b/examples/mnist/main-mtl.cpp @@ -53,6 +53,7 @@ int mnist_eval( struct ggml_context * ctx_work = ggml_init(params); + // this allocates all Metal resources and memory buffers auto ctx_mtl = mnist_mtl_init(ctx_data, ctx_eval, ctx_work, &gf); int prediction = -1; @@ -66,6 +67,7 @@ int mnist_eval( memset(input->data, 0, ggml_nbytes(input)); } + // the actual inference happens here prediction = mnist_mtl_eval(ctx_mtl, &gf); } diff --git a/examples/mnist/main-mtl.m b/examples/mnist/main-mtl.m index 41152694f..e70c0aa1d 100644 --- a/examples/mnist/main-mtl.m +++ b/examples/mnist/main-mtl.m @@ -95,8 +95,8 @@ kernel void kernel_soft_max( ctx->ctx_eval = ctx_eval; ctx->ctx_work = ctx_work; - ctx->device = MTLCreateSystemDefaultDevice(); - ctx->queue = [ctx->device newCommandQueue]; + ctx->device = MTLCreateSystemDefaultDevice(); + ctx->queue = [ctx->device newCommandQueue]; // determine if we can use MPS if (MPSSupportsMTLDevice(ctx->device)) { @@ -107,11 +107,13 @@ kernel void kernel_soft_max( } // compile from source string and show compile log - NSError * error = nil; - ctx->library = [ctx->device newLibraryWithSource:msl_library_mnist options:nil error:&error]; - if (error) { - fprintf(stderr, "%s: error: %s\n", __func__, [[error description] UTF8String]); - exit(1); + { + NSError * error = nil; + ctx->library = [ctx->device newLibraryWithSource:msl_library_mnist options:nil error:&error]; + if (error) { + fprintf(stderr, "%s: error: %s\n", __func__, [[error description] UTF8String]); + exit(1); + } } // load kernels @@ -200,7 +202,7 @@ kernel void kernel_soft_max( } // pin ctx_eval memory to GPU - // this heap will be used for the intermediate results of the evaluation + // this buffer will be used for the intermediate results of the evaluation { const size_t mem_size = ggml_get_mem_size(ctx_eval); @@ -467,7 +469,7 @@ int mnist_mtl_eval( } // select the most probable digit - int pred = -1; + int result = -1; { const float * probs = ctx->out.contents; @@ -477,11 +479,11 @@ int mnist_mtl_eval( fprintf(stderr, "%s: probs[%2d] = %f\n", __func__, i, probs[i]); if (probs[i] > prob) { - pred = i; + result = i; prob = probs[i]; } } } - return pred; + return result; } From 79dcbfdf08171fb083b997275ad307701b612d62 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sun, 28 May 2023 21:33:53 +0300 Subject: [PATCH 23/27] ggml : move cgraph import / export to ggml --- examples/common-ggml.cpp | 463 ------------------------------------ examples/common-ggml.h | 4 - examples/mnist/main-cpu.cpp | 2 +- examples/mnist/main-mtl.cpp | 2 +- examples/mnist/main.cpp | 3 +- include/ggml/ggml.h | 3 + src/ggml.c | 455 +++++++++++++++++++++++++++++++++++ 7 files changed, 461 insertions(+), 471 deletions(-) diff --git a/examples/common-ggml.cpp b/examples/common-ggml.cpp index ac16994c0..9215dbeab 100644 --- a/examples/common-ggml.cpp +++ b/examples/common-ggml.cpp @@ -233,466 +233,3 @@ bool ggml_common_quantize_0( return true; } - -#define GGML_ASSERT(x) \ - do { \ - if (!(x)) { \ - fprintf(stderr, "GGML_ASSERT: %s:%d: %s\n", __FILE__, __LINE__, #x); \ - abort(); \ - } \ - } while (0) - -void ggml_cgraph_export_leaf(const struct ggml_tensor * tensor, FILE * fout) { - const int64_t * ne = tensor->ne; - const size_t * nb = tensor->nb; - - fprintf(fout, "%-6s %-12s %8d %8lld %8lld %8lld %8lld %16zu %16zu %16zu %16zu %16p %16s\n", - ggml_type_name(tensor->type), - ggml_op_name (tensor->op), - 
tensor->n_dims, - ne[0], ne[1], ne[2], ne[3], - nb[0], nb[1], nb[2], nb[3], - tensor->data, - tensor->name); -} - -void ggml_cgraph_export_node(const struct ggml_tensor * tensor, const char * arg, FILE * fout) { - const int64_t * ne = tensor->ne; - const size_t * nb = tensor->nb; - - fprintf(fout, "%-6s %-6s %-12s %8d %8lld %8lld %8lld %8lld %16zu %16zu %16zu %16zu %8d %16p %16s\n", - arg, - ggml_type_name(tensor->type), - ggml_op_name (tensor->op), - tensor->n_dims, - ne[0], ne[1], ne[2], ne[3], - nb[0], nb[1], nb[2], nb[3], - tensor->n_tasks, - tensor->data, - tensor->name); -} - -void ggml_cgraph_export(const struct ggml_cgraph * cgraph, const char * fname) { - assert(cgraph->work == NULL); - assert(cgraph->work_size == 0); - - uint64_t size_eval = 0; - - // compute size of intermediate results - // TODO: does not take into account scratch buffers !!!! - for (int i = 0; i < cgraph->n_nodes; ++i) { - size_eval += ggml_nbytes(cgraph->nodes[i]); - } - - // print - { - FILE * fout = stdout; - - fprintf(fout, "\n"); - fprintf(fout, "%-16s %8x\n", "magic", GGML_FILE_MAGIC); - fprintf(fout, "%-16s %8d\n", "version", GGML_FILE_VERSION); - fprintf(fout, "%-16s %8d\n", "leafs", cgraph->n_leafs); - fprintf(fout, "%-16s %8d\n", "nodes", cgraph->n_nodes); - fprintf(fout, "%-16s %8llu\n", "eval", size_eval); - - // header - fprintf(fout, "\n"); - fprintf(fout, "%-6s %-12s %8s %8s %8s %8s %8s %16s %16s %16s %16s %16s %16s\n", - "TYPE", "OP", "NDIMS", "NE0", "NE1", "NE2", "NE3", "NB0", "NB1", "NB2", "NB3", "DATA", "NAME"); - - for (int i = 0; i < cgraph->n_leafs; ++i) { - ggml_cgraph_export_leaf(cgraph->leafs[i], fout); - - GGML_ASSERT(cgraph->leafs[i]->op == GGML_OP_NONE); - GGML_ASSERT(cgraph->leafs[i]->src0 == NULL); - GGML_ASSERT(cgraph->leafs[i]->src1 == NULL); - } - - // header - fprintf(fout, "\n"); - fprintf(fout, "%-6s %-6s %-12s %8s %8s %8s %8s %8s %16s %16s %16s %16s %8s %16s %16s\n", - "ARG", "TYPE", "OP", "NDIMS", "NE0", "NE1", "NE2", "NE3", "NB0", "NB1", "NB2", "NB3", "NTASKS", "DATA", "NAME"); - - for (int i = 0; i < cgraph->n_nodes; ++i) { - ggml_cgraph_export_node(cgraph->nodes[i], "DST", fout); - - if (cgraph->nodes[i]->src0) { - ggml_cgraph_export_node(cgraph->nodes[i]->src0, "SRC0", fout); - } - - if (cgraph->nodes[i]->src1) { - ggml_cgraph_export_node(cgraph->nodes[i]->src1, "SRC1", fout); - } - - for (int j = 0; j < GGML_MAX_OPT; ++j) { - if (cgraph->nodes[i]->opt[j]) { - ggml_cgraph_export_node(cgraph->nodes[i]->opt[j], "OPT", fout); - } - } - - fprintf(fout, "\n"); - } - - fprintf(fout, "\n"); - } - - // write binary data - { - FILE * fout = fopen(fname, "wb"); - - if (!fout) { - fprintf(stderr, "%s: failed to open %s\n", __func__, fname); - return; - } - - // header - { - const uint32_t magic = GGML_FILE_MAGIC; - const uint32_t version = GGML_FILE_VERSION; - const uint32_t leafs = cgraph->n_leafs; - const uint32_t nodes = cgraph->n_nodes; - - fwrite(&magic, sizeof(uint32_t), 1, fout); - fwrite(&version, sizeof(uint32_t), 1, fout); - fwrite(&leafs, sizeof(uint32_t), 1, fout); - fwrite(&nodes, sizeof(uint32_t), 1, fout); - fwrite(&size_eval, sizeof(uint64_t), 1, fout); - } - - // leafs - { - for (int i = 0; i < cgraph->n_leafs; ++i) { - const struct ggml_tensor * tensor = cgraph->leafs[i]; - - const uint32_t type = tensor->type; - const uint32_t op = tensor->op; - const uint32_t n_dims = tensor->n_dims; - - fwrite(&type, sizeof(uint32_t), 1, fout); - fwrite(&op, sizeof(uint32_t), 1, fout); - fwrite(&n_dims, sizeof(uint32_t), 1, fout); - - for (int j = 0; j < GGML_MAX_DIMS; ++j) { 
- const uint64_t ne = tensor->ne[j]; - const uint64_t nb = tensor->nb[j]; - - fwrite(&ne, sizeof(uint64_t), 1, fout); - fwrite(&nb, sizeof(uint64_t), 1, fout); - } - - // store the pointer address - { - const uint64_t ptr = (uint64_t) tensor->data; - - fwrite(&ptr, sizeof(uint64_t), 1, fout); - } - - fwrite(tensor->name, sizeof(char), GGML_MAX_NAME, fout); - - // dump the data - // TODO: pad this to 32 byte boundary - { - const size_t size = ggml_nbytes(tensor); - - fwrite(tensor->data, sizeof(char), size, fout); - } - } - } - - // nodes - { - for (int i = 0; i < cgraph->n_nodes; ++i) { - const struct ggml_tensor * tensor = cgraph->nodes[i]; - - const uint32_t type = tensor->type; - const uint32_t op = tensor->op; - const uint32_t n_dims = tensor->n_dims; - - fwrite(&type, sizeof(uint32_t), 1, fout); - fwrite(&op, sizeof(uint32_t), 1, fout); - fwrite(&n_dims, sizeof(uint32_t), 1, fout); - - for (int j = 0; j < GGML_MAX_DIMS; ++j) { - const uint64_t ne = tensor->ne[j]; - const uint64_t nb = tensor->nb[j]; - - fwrite(&ne, sizeof(uint64_t), 1, fout); - fwrite(&nb, sizeof(uint64_t), 1, fout); - } - - // store the pointer address - { - const uint64_t ptr = (uint64_t) tensor->data; - - fwrite(&ptr, sizeof(uint64_t), 1, fout); - } - - fwrite(tensor->name, sizeof(char), GGML_MAX_NAME, fout); - - // output the op arguments - { - struct ggml_tensor * args[2 + GGML_MAX_OPT] = { NULL }; - - args[0] = tensor->src0; - args[1] = tensor->src1; - - for (int j = 0; j < GGML_MAX_OPT; ++j) { - args[2 + j] = tensor->opt[j]; - } - - for (int j = 0; j < 2 + GGML_MAX_OPT; ++j) { - if (args[j]) { - int32_t idx = -1; - - // check if leaf - { - for (int k = 0; k < cgraph->n_leafs; ++k) { - if (args[j] == cgraph->leafs[k]) { - idx = k; - break; - } - } - } - - // check if node - if (idx == -1) { - for (int k = 0; k < cgraph->n_nodes; ++k) { - if (args[j] == cgraph->nodes[k]) { - idx = GGML_MAX_NODES + k; - break; - } - } - } - - if (idx == -1) { - fprintf(stderr, "%s: failed to find tensor, arg = %d, node = %d\n", __func__, j, i); - return; - } - - fwrite(&idx, sizeof(int32_t), 1, fout); - } else { - const int32_t nul = -1; - - fwrite(&nul, sizeof(int32_t), 1, fout); - } - } - } - } - } - - fclose(fout); - } -} - -ggml_cgraph ggml_cgraph_import(const char * fname, struct ggml_context ** ctx_data, struct ggml_context ** ctx_eval) { - assert(*ctx_data == NULL); - assert(*ctx_eval == NULL); - - ggml_cgraph result; - - struct ggml_tensor * data = NULL; - - // read file into data - { - FILE * fin = fopen(fname, "rb"); - - if (!fin) { - fprintf(stderr, "%s: failed to open %s\n", __func__, fname); - return result; - } - - size_t fsize = 0; - - fseek(fin, 0, SEEK_END); - fsize = ftell(fin); - fseek(fin, 0, SEEK_SET); - - // create the data context - { - const size_t overhead = 1*ggml_tensor_overhead(); - - struct ggml_init_params params = { - .mem_size = fsize + overhead, - .mem_buffer = NULL, - .no_alloc = false, - }; - - *ctx_data = ggml_init(params); - - if (!*ctx_data) { - fprintf(stderr, "%s: failed to create ggml context\n", __func__); - return result; - } - } - - data = ggml_new_tensor_1d(*ctx_data, GGML_TYPE_I8, fsize); - - fread(data->data, sizeof(char), fsize, fin); - - fclose(fin); - } - - // populate result - { - const char * ptr = (const char *) data->data; - - const uint32_t magic = *(const uint32_t *) ptr; ptr += sizeof(magic); - - if (magic != GGML_FILE_MAGIC) { - fprintf(stderr, "%s: invalid magic number, got %08x\n", __func__, magic); - return result; - } - - const uint32_t version = *(const uint32_t *) 
ptr; ptr += sizeof(version); - - if (version != GGML_FILE_VERSION) { - fprintf(stderr, "%s: invalid version number\n", __func__); - return result; - } - - const uint32_t leafs = *(const uint32_t *) ptr; ptr += sizeof(leafs); - const uint32_t nodes = *(const uint32_t *) ptr; ptr += sizeof(nodes); - const uint64_t size_eval = *(const uint64_t *) ptr; ptr += sizeof(size_eval); - - result.n_leafs = leafs; - result.n_nodes = nodes; - - // create the data context - { - const size_t overhead = (leafs + nodes)*ggml_tensor_overhead(); - - struct ggml_init_params params = { - .mem_size = size_eval + overhead, - .mem_buffer = NULL, - .no_alloc = true, - }; - - *ctx_eval = ggml_init(params); - - if (!*ctx_eval) { - fprintf(stderr, "%s: failed to create ggml context\n", __func__); - return result; - } - } - - // leafs - { - uint32_t type; - uint32_t op; - uint32_t n_dims; - - for (int i = 0; i < leafs; ++i) { - type = *(const uint32_t *) ptr; ptr += sizeof(type); - op = *(const uint32_t *) ptr; ptr += sizeof(op); - n_dims = *(const uint32_t *) ptr; ptr += sizeof(n_dims); - - int64_t ne[GGML_MAX_DIMS]; - size_t nb[GGML_MAX_DIMS]; - - for (int j = 0; j < GGML_MAX_DIMS; ++j) { - uint64_t ne_cur; - uint64_t nb_cur; - - ne_cur = *(const uint64_t *) ptr; ptr += sizeof(ne_cur); - nb_cur = *(const uint64_t *) ptr; ptr += sizeof(nb_cur); - - ne[j] = ne_cur; - nb[j] = nb_cur; - } - - struct ggml_tensor * tensor = ggml_new_tensor(*ctx_eval, (enum ggml_type) type, n_dims, ne); - - tensor->op = (enum ggml_op) op; - - uint64_t ptr_cur = *(const uint64_t *) ptr; ptr += sizeof(ptr_cur); - - memcpy(tensor->name, ptr, GGML_MAX_NAME); ptr += GGML_MAX_NAME; - - tensor->data = (void *) ptr; - - for (int j = 0; j < GGML_MAX_DIMS; ++j) { - tensor->nb[j] = nb[j]; - } - - result.leafs[i] = tensor; - - ptr += ggml_nbytes(tensor); - - fprintf(stderr, "%s: loaded leaf %d: '%16s', %3d dims, %9zu bytes\n", __func__, i, tensor->name, n_dims, ggml_nbytes(tensor)); - } - } - - ggml_set_no_alloc(*ctx_eval, false); - - // nodes - { - uint32_t type; - uint32_t op; - uint32_t n_dims; - - for (int i = 0; i < nodes; ++i) { - type = *(const uint32_t *) ptr; ptr += sizeof(type); - op = *(const uint32_t *) ptr; ptr += sizeof(op); - n_dims = *(const uint32_t *) ptr; ptr += sizeof(n_dims); - - int64_t ne[GGML_MAX_DIMS]; - size_t nb[GGML_MAX_DIMS]; - - for (int j = 0; j < GGML_MAX_DIMS; ++j) { - uint64_t ne_cur; - uint64_t nb_cur; - - ne_cur = *(const uint64_t *) ptr; ptr += sizeof(ne_cur); - nb_cur = *(const uint64_t *) ptr; ptr += sizeof(nb_cur); - - ne[j] = ne_cur; - nb[j] = nb_cur; - } - - struct ggml_tensor * tensor = ggml_new_tensor(*ctx_eval, (enum ggml_type) type, n_dims, ne); - - tensor->op = (enum ggml_op) op; - - uint64_t ptr_cur = *(const uint64_t *) ptr; ptr += sizeof(ptr_cur); - - memcpy(tensor->name, ptr, GGML_MAX_NAME); ptr += GGML_MAX_NAME; - - for (int j = 0; j < GGML_MAX_DIMS; ++j) { - tensor->nb[j] = nb[j]; - } - - // parse args - { - struct ggml_tensor ** args[2 + GGML_MAX_OPT] = { - &tensor->src0, - &tensor->src1, - }; - - for (int j = 0; j < GGML_MAX_OPT; ++j) { - args[2 + j] = &tensor->opt[j]; - } - - for (int j = 0; j < 2 + GGML_MAX_OPT; ++j) { - const uint32_t arg_idx = *(const int32_t *) ptr; ptr += sizeof(arg_idx); - - if (arg_idx == -1) { - continue; - } - - if (arg_idx < GGML_MAX_NODES) { - *args[j] = result.leafs[arg_idx]; - } else { - *args[j] = result.nodes[arg_idx - GGML_MAX_NODES]; - } - } - } - - result.nodes[i] = tensor; - - fprintf(stderr, "%s: loaded node %d: '%16s', %3d dims, %9zu bytes\n", __func__, i, 
tensor->name, n_dims, ggml_nbytes(tensor)); - } - } - } - - return result; -} diff --git a/examples/common-ggml.h b/examples/common-ggml.h index 1a85abed3..477de341a 100644 --- a/examples/common-ggml.h +++ b/examples/common-ggml.h @@ -16,7 +16,3 @@ bool ggml_common_quantize_0( const ggml_ftype ftype, const std::vector & to_quant, const std::vector & to_skip); - -// these will move to ggml when ready -void ggml_cgraph_export(const struct ggml_cgraph * cgraph, const char * fname); -ggml_cgraph ggml_cgraph_import(const char * fname, struct ggml_context ** ctx_data, struct ggml_context ** ctx_eval); diff --git a/examples/mnist/main-cpu.cpp b/examples/mnist/main-cpu.cpp index 9b1a425cf..4e86ef825 100644 --- a/examples/mnist/main-cpu.cpp +++ b/examples/mnist/main-cpu.cpp @@ -36,7 +36,7 @@ int mnist_eval( struct ggml_context * ctx_data = NULL; struct ggml_context * ctx_eval = NULL; - struct ggml_cgraph gfi = ggml_cgraph_import(fname_cgraph, &ctx_data, &ctx_eval); + struct ggml_cgraph gfi = ggml_graph_import(fname_cgraph, &ctx_data, &ctx_eval); gfi.n_threads = n_threads; // allocate eval context diff --git a/examples/mnist/main-mtl.cpp b/examples/mnist/main-mtl.cpp index 1885232fc..19122fcca 100644 --- a/examples/mnist/main-mtl.cpp +++ b/examples/mnist/main-mtl.cpp @@ -38,7 +38,7 @@ int mnist_eval( struct ggml_context * ctx_data = NULL; struct ggml_context * ctx_eval = NULL; - struct ggml_cgraph gf = ggml_cgraph_import(fname_cgraph, &ctx_data, &ctx_eval); + struct ggml_cgraph gf = ggml_graph_import(fname_cgraph, &ctx_data, &ctx_eval); gf.n_threads = n_threads; // allocate eval context diff --git a/examples/mnist/main.cpp b/examples/mnist/main.cpp index c6ecec2cd..315ff987e 100644 --- a/examples/mnist/main.cpp +++ b/examples/mnist/main.cpp @@ -1,7 +1,6 @@ #include "ggml/ggml.h" #include "common.h" -#include "common-ggml.h" #include #include @@ -206,7 +205,7 @@ int mnist_eval( if (fname_cgraph) { // export the compute graph for later use // see the "mnist-cpu" example - ggml_cgraph_export(&gf, "mnist.ggml"); + ggml_graph_export(&gf, "mnist.ggml"); fprintf(stderr, "%s: exported compute graph to '%s'\n", __func__, fname_cgraph); } diff --git a/include/ggml/ggml.h b/include/ggml/ggml.h index ad08adeb1..3320157fe 100644 --- a/include/ggml/ggml.h +++ b/include/ggml/ggml.h @@ -983,6 +983,9 @@ extern "C" { GGML_API struct ggml_tensor * ggml_get_tensor_by_name(struct ggml_cgraph * cgraph, const char * name); + GGML_API void ggml_graph_export(const struct ggml_cgraph * cgraph, const char * fname); + GGML_API struct ggml_cgraph ggml_graph_import(const char * fname, struct ggml_context ** ctx_data, struct ggml_context ** ctx_eval); + // print info and performance information for the graph GGML_API void ggml_graph_print(const struct ggml_cgraph * cgraph); diff --git a/src/ggml.c b/src/ggml.c index f3831a4fb..5f6426db7 100644 --- a/src/ggml.c +++ b/src/ggml.c @@ -14560,6 +14560,461 @@ struct ggml_tensor * ggml_get_tensor_by_name(struct ggml_cgraph * cgraph, const return NULL; } +static void ggml_graph_export_leaf(const struct ggml_tensor * tensor, FILE * fout) { + const int64_t * ne = tensor->ne; + const size_t * nb = tensor->nb; + + fprintf(fout, "%-6s %-12s %8d %8lld %8lld %8lld %8lld %16zu %16zu %16zu %16zu %16p %16s\n", + ggml_type_name(tensor->type), + ggml_op_name (tensor->op), + tensor->n_dims, + ne[0], ne[1], ne[2], ne[3], + nb[0], nb[1], nb[2], nb[3], + tensor->data, + tensor->name); +} + +static void ggml_graph_export_node(const struct ggml_tensor * tensor, const char * arg, FILE * fout) { + const 
int64_t * ne = tensor->ne; + const size_t * nb = tensor->nb; + + fprintf(fout, "%-6s %-6s %-12s %8d %8lld %8lld %8lld %8lld %16zu %16zu %16zu %16zu %8d %16p %16s\n", + arg, + ggml_type_name(tensor->type), + ggml_op_name (tensor->op), + tensor->n_dims, + ne[0], ne[1], ne[2], ne[3], + nb[0], nb[1], nb[2], nb[3], + tensor->n_tasks, + tensor->data, + tensor->name); +} + +void ggml_graph_export(const struct ggml_cgraph * cgraph, const char * fname) { + assert(cgraph->work == NULL); + assert(cgraph->work_size == 0); + + uint64_t size_eval = 0; + + // compute size of intermediate results + // TODO: does not take into account scratch buffers !!!! + for (int i = 0; i < cgraph->n_nodes; ++i) { + size_eval += ggml_nbytes(cgraph->nodes[i]); + } + + // print + { + FILE * fout = stdout; + + fprintf(fout, "\n"); + fprintf(fout, "%-16s %8x\n", "magic", GGML_FILE_MAGIC); + fprintf(fout, "%-16s %8d\n", "version", GGML_FILE_VERSION); + fprintf(fout, "%-16s %8d\n", "leafs", cgraph->n_leafs); + fprintf(fout, "%-16s %8d\n", "nodes", cgraph->n_nodes); + fprintf(fout, "%-16s %8llu\n", "eval", size_eval); + + // header + fprintf(fout, "\n"); + fprintf(fout, "%-6s %-12s %8s %8s %8s %8s %8s %16s %16s %16s %16s %16s %16s\n", + "TYPE", "OP", "NDIMS", "NE0", "NE1", "NE2", "NE3", "NB0", "NB1", "NB2", "NB3", "DATA", "NAME"); + + for (int i = 0; i < cgraph->n_leafs; ++i) { + ggml_graph_export_leaf(cgraph->leafs[i], fout); + + GGML_ASSERT(cgraph->leafs[i]->op == GGML_OP_NONE); + GGML_ASSERT(cgraph->leafs[i]->src0 == NULL); + GGML_ASSERT(cgraph->leafs[i]->src1 == NULL); + } + + // header + fprintf(fout, "\n"); + fprintf(fout, "%-6s %-6s %-12s %8s %8s %8s %8s %8s %16s %16s %16s %16s %8s %16s %16s\n", + "ARG", "TYPE", "OP", "NDIMS", "NE0", "NE1", "NE2", "NE3", "NB0", "NB1", "NB2", "NB3", "NTASKS", "DATA", "NAME"); + + for (int i = 0; i < cgraph->n_nodes; ++i) { + ggml_graph_export_node(cgraph->nodes[i], "DST", fout); + + if (cgraph->nodes[i]->src0) { + ggml_graph_export_node(cgraph->nodes[i]->src0, "SRC0", fout); + } + + if (cgraph->nodes[i]->src1) { + ggml_graph_export_node(cgraph->nodes[i]->src1, "SRC1", fout); + } + + for (int j = 0; j < GGML_MAX_OPT; ++j) { + if (cgraph->nodes[i]->opt[j]) { + ggml_graph_export_node(cgraph->nodes[i]->opt[j], "OPT", fout); + } + } + + fprintf(fout, "\n"); + } + + fprintf(fout, "\n"); + } + + // write binary data + { + FILE * fout = fopen(fname, "wb"); + + if (!fout) { + fprintf(stderr, "%s: failed to open %s\n", __func__, fname); + return; + } + + // header + { + const uint32_t magic = GGML_FILE_MAGIC; + const uint32_t version = GGML_FILE_VERSION; + const uint32_t n_leafs = cgraph->n_leafs; + const uint32_t nodes = cgraph->n_nodes; + + fwrite(&magic, sizeof(uint32_t), 1, fout); + fwrite(&version, sizeof(uint32_t), 1, fout); + fwrite(&n_leafs, sizeof(uint32_t), 1, fout); + fwrite(&nodes, sizeof(uint32_t), 1, fout); + fwrite(&size_eval, sizeof(uint64_t), 1, fout); + } + + // leafs + { + for (int i = 0; i < cgraph->n_leafs; ++i) { + const struct ggml_tensor * tensor = cgraph->leafs[i]; + + const uint32_t type = tensor->type; + const uint32_t op = tensor->op; + const uint32_t n_dims = tensor->n_dims; + + fwrite(&type, sizeof(uint32_t), 1, fout); + fwrite(&op, sizeof(uint32_t), 1, fout); + fwrite(&n_dims, sizeof(uint32_t), 1, fout); + + for (int j = 0; j < GGML_MAX_DIMS; ++j) { + const uint64_t ne = tensor->ne[j]; + const uint64_t nb = tensor->nb[j]; + + fwrite(&ne, sizeof(uint64_t), 1, fout); + fwrite(&nb, sizeof(uint64_t), 1, fout); + } + + // store the pointer address + { + const uint64_t ptr 
= (uint64_t) tensor->data; + + fwrite(&ptr, sizeof(uint64_t), 1, fout); + } + + fwrite(tensor->name, sizeof(char), GGML_MAX_NAME, fout); + + // dump the data + // TODO: pad this to 32 byte boundary + { + const size_t size = ggml_nbytes(tensor); + + fwrite(tensor->data, sizeof(char), size, fout); + } + } + } + + // nodes + { + for (int i = 0; i < cgraph->n_nodes; ++i) { + const struct ggml_tensor * tensor = cgraph->nodes[i]; + + const uint32_t type = tensor->type; + const uint32_t op = tensor->op; + const uint32_t n_dims = tensor->n_dims; + + fwrite(&type, sizeof(uint32_t), 1, fout); + fwrite(&op, sizeof(uint32_t), 1, fout); + fwrite(&n_dims, sizeof(uint32_t), 1, fout); + + for (int j = 0; j < GGML_MAX_DIMS; ++j) { + const uint64_t ne = tensor->ne[j]; + const uint64_t nb = tensor->nb[j]; + + fwrite(&ne, sizeof(uint64_t), 1, fout); + fwrite(&nb, sizeof(uint64_t), 1, fout); + } + + // store the pointer address + { + const uint64_t ptr = (uint64_t) tensor->data; + + fwrite(&ptr, sizeof(uint64_t), 1, fout); + } + + fwrite(tensor->name, sizeof(char), GGML_MAX_NAME, fout); + + // output the op arguments + { + struct ggml_tensor * args[2 + GGML_MAX_OPT] = { NULL }; + + args[0] = tensor->src0; + args[1] = tensor->src1; + + for (int j = 0; j < GGML_MAX_OPT; ++j) { + args[2 + j] = tensor->opt[j]; + } + + for (int j = 0; j < 2 + GGML_MAX_OPT; ++j) { + if (args[j]) { + int32_t idx = -1; + + // check if leaf + { + for (int k = 0; k < cgraph->n_leafs; ++k) { + if (args[j] == cgraph->leafs[k]) { + idx = k; + break; + } + } + } + + // check if node + if (idx == -1) { + for (int k = 0; k < cgraph->n_nodes; ++k) { + if (args[j] == cgraph->nodes[k]) { + idx = GGML_MAX_NODES + k; + break; + } + } + } + + if (idx == -1) { + fprintf(stderr, "%s: failed to find tensor, arg = %d, node = %d\n", __func__, j, i); + return; + } + + fwrite(&idx, sizeof(int32_t), 1, fout); + } else { + const int32_t nul = -1; + + fwrite(&nul, sizeof(int32_t), 1, fout); + } + } + } + } + } + + fclose(fout); + } +} + +struct ggml_cgraph ggml_graph_import(const char * fname, struct ggml_context ** ctx_data, struct ggml_context ** ctx_eval) { + assert(*ctx_data == NULL); + assert(*ctx_eval == NULL); + + struct ggml_cgraph result = { 0 }; + + struct ggml_tensor * data = NULL; + + // read file into data + { + FILE * fin = fopen(fname, "rb"); + + if (!fin) { + fprintf(stderr, "%s: failed to open %s\n", __func__, fname); + return result; + } + + size_t fsize = 0; + + fseek(fin, 0, SEEK_END); + fsize = ftell(fin); + fseek(fin, 0, SEEK_SET); + + // create the data context + { + const size_t overhead = 1*ggml_tensor_overhead(); + + struct ggml_init_params params = { + .mem_size = fsize + overhead, + .mem_buffer = NULL, + .no_alloc = false, + }; + + *ctx_data = ggml_init(params); + + if (!*ctx_data) { + fprintf(stderr, "%s: failed to create ggml context\n", __func__); + return result; + } + } + + data = ggml_new_tensor_1d(*ctx_data, GGML_TYPE_I8, fsize); + + fread(data->data, sizeof(char), fsize, fin); + + fclose(fin); + } + + // populate result + { + char * ptr = (char *) data->data; + + const uint32_t magic = *(const uint32_t *) ptr; ptr += sizeof(magic); + + if (magic != GGML_FILE_MAGIC) { + fprintf(stderr, "%s: invalid magic number, got %08x\n", __func__, magic); + return result; + } + + const uint32_t version = *(const uint32_t *) ptr; ptr += sizeof(version); + + if (version != GGML_FILE_VERSION) { + fprintf(stderr, "%s: invalid version number\n", __func__); + return result; + } + + const uint32_t n_leafs = *(const uint32_t *) ptr; ptr += 
sizeof(n_leafs); + const uint32_t n_nodes = *(const uint32_t *) ptr; ptr += sizeof(n_nodes); + const uint64_t size_eval = *(const uint64_t *) ptr; ptr += sizeof(size_eval); + + result.n_leafs = n_leafs; + result.n_nodes = n_nodes; + + // create the data context + { + const size_t overhead = (n_leafs + n_nodes)*ggml_tensor_overhead(); + + struct ggml_init_params params = { + .mem_size = size_eval + overhead, + .mem_buffer = NULL, + .no_alloc = true, + }; + + *ctx_eval = ggml_init(params); + + if (!*ctx_eval) { + fprintf(stderr, "%s: failed to create ggml context\n", __func__); + return result; + } + } + + // leafs + { + uint32_t type; + uint32_t op; + uint32_t n_dims; + + for (uint32_t i = 0; i < n_leafs; ++i) { + type = *(const uint32_t *) ptr; ptr += sizeof(type); + op = *(const uint32_t *) ptr; ptr += sizeof(op); + n_dims = *(const uint32_t *) ptr; ptr += sizeof(n_dims); + + int64_t ne[GGML_MAX_DIMS]; + size_t nb[GGML_MAX_DIMS]; + + for (int j = 0; j < GGML_MAX_DIMS; ++j) { + uint64_t ne_cur; + uint64_t nb_cur; + + ne_cur = *(const uint64_t *) ptr; ptr += sizeof(ne_cur); + nb_cur = *(const uint64_t *) ptr; ptr += sizeof(nb_cur); + + ne[j] = ne_cur; + nb[j] = nb_cur; + } + + struct ggml_tensor * tensor = ggml_new_tensor(*ctx_eval, (enum ggml_type) type, n_dims, ne); + + tensor->op = (enum ggml_op) op; + + uint64_t ptr_cur = *(const uint64_t *) ptr; ptr += sizeof(ptr_cur); + + memcpy(tensor->name, ptr, GGML_MAX_NAME); ptr += GGML_MAX_NAME; + + tensor->data = (void *) ptr; + + for (int j = 0; j < GGML_MAX_DIMS; ++j) { + tensor->nb[j] = nb[j]; + } + + result.leafs[i] = tensor; + + ptr += ggml_nbytes(tensor); + + fprintf(stderr, "%s: loaded leaf %d: '%16s', %3d dims, %9zu bytes\n", __func__, i, tensor->name, n_dims, ggml_nbytes(tensor)); + } + } + + ggml_set_no_alloc(*ctx_eval, false); + + // nodes + { + uint32_t type; + uint32_t op; + uint32_t n_dims; + + for (uint32_t i = 0; i < n_nodes; ++i) { + type = *(const uint32_t *) ptr; ptr += sizeof(type); + op = *(const uint32_t *) ptr; ptr += sizeof(op); + n_dims = *(const uint32_t *) ptr; ptr += sizeof(n_dims); + + int64_t ne[GGML_MAX_DIMS]; + size_t nb[GGML_MAX_DIMS]; + + for (int j = 0; j < GGML_MAX_DIMS; ++j) { + uint64_t ne_cur; + uint64_t nb_cur; + + ne_cur = *(const uint64_t *) ptr; ptr += sizeof(ne_cur); + nb_cur = *(const uint64_t *) ptr; ptr += sizeof(nb_cur); + + ne[j] = ne_cur; + nb[j] = nb_cur; + } + + struct ggml_tensor * tensor = ggml_new_tensor(*ctx_eval, (enum ggml_type) type, n_dims, ne); + + tensor->op = (enum ggml_op) op; + + uint64_t ptr_cur = *(const uint64_t *) ptr; ptr += sizeof(ptr_cur); + + memcpy(tensor->name, ptr, GGML_MAX_NAME); ptr += GGML_MAX_NAME; + + for (int j = 0; j < GGML_MAX_DIMS; ++j) { + tensor->nb[j] = nb[j]; + } + + // parse args + { + struct ggml_tensor ** args[2 + GGML_MAX_OPT] = { + &tensor->src0, + &tensor->src1, + }; + + for (int j = 0; j < GGML_MAX_OPT; ++j) { + args[2 + j] = &tensor->opt[j]; + } + + for (int j = 0; j < 2 + GGML_MAX_OPT; ++j) { + const int32_t arg_idx = *(const int32_t *) ptr; ptr += sizeof(arg_idx); + + if (arg_idx == -1) { + continue; + } + + if (arg_idx < GGML_MAX_NODES) { + *args[j] = result.leafs[arg_idx]; + } else { + *args[j] = result.nodes[arg_idx - GGML_MAX_NODES]; + } + } + } + + result.nodes[i] = tensor; + + fprintf(stderr, "%s: loaded node %d: '%16s', %3d dims, %9zu bytes\n", __func__, i, tensor->name, n_dims, ggml_nbytes(tensor)); + } + } + } + + return result; +} + void ggml_graph_print(const struct ggml_cgraph * cgraph) { int64_t perf_total_per_op_us[GGML_OP_COUNT] = 
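// a minimal export/import round trip, as a sketch only: "graph.ggml" is a
// placeholder file name, gf is a previously built graph and ctx0 a work
// context set up as in the mnist examples; error handling is omitted
//
//   ggml_graph_export(&gf, "graph.ggml");
//
//   struct ggml_context * ctx_data = NULL;
//   struct ggml_context * ctx_eval = NULL;
//
//   struct ggml_cgraph gfi = ggml_graph_import("graph.ggml", &ctx_data, &ctx_eval);
//
//   struct ggml_tensor * input = ggml_get_tensor_by_name(&gfi, "input");
//   // ... fill input->data with ggml_nbytes(input) bytes ...
//
//   ggml_graph_compute(ctx0, &gfi);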
{0}; From 25adade79da83bb76ffddbac6755383d8e819f03 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sun, 28 May 2023 23:06:49 +0300 Subject: [PATCH 24/27] mnist : remove common dependencies --- examples/mnist/CMakeLists.txt | 6 ++---- examples/mnist/main-cpu.cpp | 3 +-- examples/mnist/main-mtl.cpp | 3 +-- 3 files changed, 4 insertions(+), 8 deletions(-) diff --git a/examples/mnist/CMakeLists.txt b/examples/mnist/CMakeLists.txt index 222d7cd2b..3ce092490 100644 --- a/examples/mnist/CMakeLists.txt +++ b/examples/mnist/CMakeLists.txt @@ -3,14 +3,14 @@ set(TEST_TARGET mnist) add_executable(${TEST_TARGET} main.cpp) -target_link_libraries(${TEST_TARGET} PRIVATE ggml common common-ggml) +target_link_libraries(${TEST_TARGET} PRIVATE ggml common) # # mnist-cpu set(TEST_TARGET mnist-cpu) add_executable(${TEST_TARGET} main-cpu.cpp) -target_link_libraries(${TEST_TARGET} PRIVATE ggml common common-ggml) +target_link_libraries(${TEST_TARGET} PRIVATE ggml) if (APPLE) # @@ -25,8 +25,6 @@ if (APPLE) add_executable(${TEST_TARGET} main-mtl.cpp main-mtl.h main-mtl.m) target_link_libraries(${TEST_TARGET} PRIVATE ggml - common - common-ggml ${FOUNDATION_LIBRARY} ${METAL_FRAMEWORK} ${METALKIT_FRAMEWORK} diff --git a/examples/mnist/main-cpu.cpp b/examples/mnist/main-cpu.cpp index 4e86ef825..46b6bee1c 100644 --- a/examples/mnist/main-cpu.cpp +++ b/examples/mnist/main-cpu.cpp @@ -12,13 +12,12 @@ #include "ggml/ggml.h" -#include "common-ggml.h" - #include #include #include #include #include +#include // evaluate the MNIST compute graph // diff --git a/examples/mnist/main-mtl.cpp b/examples/mnist/main-mtl.cpp index 19122fcca..018c5b87a 100644 --- a/examples/mnist/main-mtl.cpp +++ b/examples/mnist/main-mtl.cpp @@ -14,13 +14,12 @@ #include "main-mtl.h" -#include "common-ggml.h" - #include #include #include #include #include +#include // evaluate the MNIST compute graph // From e6dc506d77b0f11bca9ae2250ff1d08ee6c4a7cb Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Mon, 29 May 2023 19:14:35 +0300 Subject: [PATCH 25/27] mnist : fix soft_max threadgroup size --- examples/mnist/main-mtl.m | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/examples/mnist/main-mtl.m b/examples/mnist/main-mtl.m index e70c0aa1d..2d4fb9b1e 100644 --- a/examples/mnist/main-mtl.m +++ b/examples/mnist/main-mtl.m @@ -392,9 +392,7 @@ int mnist_mtl_eval( [encoder setBuffer:id_src offset:offs_src0 atIndex:0]; [encoder setBuffer:id_dst offset:offs_dst atIndex:1]; - const int64_t n = ggml_nelements(gf->nodes[i]); - - [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)]; + [encoder dispatchThreadgroups:MTLSizeMake(1, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)]; #endif } break; case GGML_OP_MUL_MAT: From f9b04dff5766d910b94e6f6ef27f98258c8ad820 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Mon, 29 May 2023 19:16:36 +0300 Subject: [PATCH 26/27] mnist : init no_alloc member --- examples/mnist/main-cpu.cpp | 1 + examples/mnist/main-mtl.cpp | 1 + examples/mnist/main.cpp | 1 + 3 files changed, 3 insertions(+) diff --git a/examples/mnist/main-cpu.cpp b/examples/mnist/main-cpu.cpp index 46b6bee1c..c8e30b0c2 100644 --- a/examples/mnist/main-cpu.cpp +++ b/examples/mnist/main-cpu.cpp @@ -46,6 +46,7 @@ int mnist_eval( struct ggml_init_params params = { .mem_size = buf_size, .mem_buffer = buf, + .no_alloc = false, }; struct ggml_context * ctx0 = ggml_init(params); diff --git a/examples/mnist/main-mtl.cpp b/examples/mnist/main-mtl.cpp index 018c5b87a..09866a3c1 100644 --- 
a/examples/mnist/main-mtl.cpp
+++ b/examples/mnist/main-mtl.cpp
@@ -48,6 +48,7 @@ int mnist_eval(
     struct ggml_init_params params = {
         .mem_size   = buf_size,
         .mem_buffer = buf,
+        .no_alloc   = false,
     };
 
     struct ggml_context * ctx_work = ggml_init(params);
diff --git a/examples/mnist/main.cpp b/examples/mnist/main.cpp
index 315ff987e..512748003 100644
--- a/examples/mnist/main.cpp
+++ b/examples/mnist/main.cpp
@@ -177,6 +177,7 @@ int mnist_eval(
     struct ggml_init_params params = {
         .mem_size   = buf_size,
         .mem_buffer = buf,
+        .no_alloc   = false,
     };
 
     struct ggml_context * ctx0 = ggml_init(params);

From c8013c52020b72abbc1d1c47bff01eefe8d19f4d Mon Sep 17 00:00:00 2001
From: Georgi Gerganov
Date: Mon, 29 May 2023 19:24:50 +0300
Subject: [PATCH 27/27] ggml : improve "get tensor" API

---
 examples/mnist/main-cpu.cpp |  4 ++--
 examples/mnist/main-mtl.cpp |  2 +-
 examples/mnist/main-mtl.m   |  4 ++--
 include/ggml/ggml.h         |  4 +++-
 src/ggml.c                  | 19 ++++++++++++++++++-
 5 files changed, 26 insertions(+), 7 deletions(-)

diff --git a/examples/mnist/main-cpu.cpp b/examples/mnist/main-cpu.cpp
index c8e30b0c2..48e0ae60e 100644
--- a/examples/mnist/main-cpu.cpp
+++ b/examples/mnist/main-cpu.cpp
@@ -51,12 +51,12 @@ int mnist_eval(
 
     struct ggml_context * ctx0 = ggml_init(params);
 
-    struct ggml_tensor * input = ggml_get_tensor_by_name(&gfi, "input");
+    struct ggml_tensor * input = ggml_graph_get_tensor(&gfi, "input");
     memcpy(input->data, digit.data(), ggml_nbytes(input));
 
     ggml_graph_compute(ctx0, &gfi);
 
-    const float * probs_data = ggml_get_data_f32(ggml_get_tensor_by_name(&gfi, "probs"));
+    const float * probs_data = ggml_get_data_f32(ggml_graph_get_tensor(&gfi, "probs"));
 
     const int prediction = std::max_element(probs_data, probs_data + 10) - probs_data;
 
diff --git a/examples/mnist/main-mtl.cpp b/examples/mnist/main-mtl.cpp
index 09866a3c1..fafe8e610 100644
--- a/examples/mnist/main-mtl.cpp
+++ b/examples/mnist/main-mtl.cpp
@@ -59,7 +59,7 @@ int mnist_eval(
     int prediction = -1;
 
     for (int i = 0; i < 1; ++i) {
-        struct ggml_tensor * input = ggml_get_tensor_by_name(&gf, "input");
+        struct ggml_tensor * input = ggml_graph_get_tensor(&gf, "input");
 
         if (i % 2 == 0) {
             memcpy(input->data, digit.data(), ggml_nbytes(input));
diff --git a/examples/mnist/main-mtl.m b/examples/mnist/main-mtl.m
index 2d4fb9b1e..21bfe7ad8 100644
--- a/examples/mnist/main-mtl.m
+++ b/examples/mnist/main-mtl.m
@@ -118,7 +118,7 @@ kernel void kernel_soft_max(
     // load kernels
     {
-        const int k_digits = ggml_get_tensor_by_name(gf, "probs")->ne[0];
+        const int k_digits = ggml_graph_get_tensor(gf, "probs")->ne[0];
 
         MTLFunctionConstantValues * constants = [MTLFunctionConstantValues new];
         [constants setConstantValue:&k_digits type:MTLDataTypeInt withName:@"k_digits"];
@@ -310,7 +310,7 @@ int mnist_mtl_eval(
     // copy the input data to the GPU
     {
-        struct ggml_tensor * inp = ggml_get_tensor_by_name(gf, "input");
+        struct ggml_tensor * inp = ggml_graph_get_tensor(gf, "input");
 
         id<MTLBuffer> id_dst = mnist_mtl_get_buffer(ctx, inp, &offs_src0);
 
diff --git a/include/ggml/ggml.h b/include/ggml/ggml.h
index 3320157fe..60c0ad8bf 100644
--- a/include/ggml/ggml.h
+++ b/include/ggml/ggml.h
@@ -495,6 +495,8 @@ extern "C" {
     GGML_API struct ggml_tensor * ggml_dup_tensor (struct ggml_context * ctx, const struct ggml_tensor * src);
     GGML_API struct ggml_tensor * ggml_view_tensor(struct ggml_context * ctx, const struct ggml_tensor * src);
 
+    GGML_API struct ggml_tensor * ggml_get_tensor(struct ggml_context * ctx, const char * name);
+
    GGML_API struct ggml_tensor * ggml_set_zero(struct ggml_tensor * 
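// note: ggml_get_tensor (added above) does a linear scan over every object in
// a ggml_context, while ggml_graph_get_tensor (the graph-level lookup renamed
// below from ggml_get_tensor_by_name) searches only a graph's leafs and
// nodes; both return NULL when no tensor with the given name is found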
tensor); GGML_API struct ggml_tensor * ggml_set_i32 (struct ggml_tensor * tensor, int32_t value); GGML_API struct ggml_tensor * ggml_set_f32 (struct ggml_tensor * tensor, float value); @@ -981,7 +983,7 @@ extern "C" { GGML_API void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph); GGML_API void ggml_graph_reset (struct ggml_cgraph * cgraph); - GGML_API struct ggml_tensor * ggml_get_tensor_by_name(struct ggml_cgraph * cgraph, const char * name); + GGML_API struct ggml_tensor * ggml_graph_get_tensor(struct ggml_cgraph * cgraph, const char * name); GGML_API void ggml_graph_export(const struct ggml_cgraph * cgraph, const char * fname); GGML_API struct ggml_cgraph ggml_graph_import(const char * fname, struct ggml_context ** ctx_data, struct ggml_context ** ctx_eval); diff --git a/src/ggml.c b/src/ggml.c index 5f6426db7..7785fa705 100644 --- a/src/ggml.c +++ b/src/ggml.c @@ -4531,6 +4531,23 @@ struct ggml_tensor * ggml_view_tensor( return result; } +struct ggml_tensor * ggml_get_tensor(struct ggml_context * ctx, const char * name) { + struct ggml_object * obj = ctx->objects_begin; + + char * const mem_buffer = ctx->mem_buffer; + + while (obj != NULL) { + struct ggml_tensor * cur = (struct ggml_tensor *)(mem_buffer + obj->offs); + if (strcmp(cur->name, name) == 0) { + return cur; + } + + obj = obj->next; + } + + return NULL; +} + //////////////////////////////////////////////////////////////////////////////// // ggml_dup @@ -14540,7 +14557,7 @@ void ggml_graph_reset(struct ggml_cgraph * cgraph) { } } -struct ggml_tensor * ggml_get_tensor_by_name(struct ggml_cgraph * cgraph, const char * name) { +struct ggml_tensor * ggml_graph_get_tensor(struct ggml_cgraph * cgraph, const char * name) { for (int i = 0; i < cgraph->n_leafs; i++) { struct ggml_tensor * leaf = cgraph->leafs[i];
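// the lookup is a linear scan by tensor name: the graph's leafs are checked
// first, then its nodes, returning NULL when no tensor with that name exists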