From f477d4f7ece7d840f85dd0fe5d7177a7afdb775c Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Mon, 24 Apr 2023 21:51:34 +0300 Subject: [PATCH 01/27] ggml : cgraph export brainstorming --- examples/common-ggml.cpp | 92 +++++++++++++++++++++++++++++++++++ examples/common-ggml.h | 4 ++ examples/mnist/CMakeLists.txt | 2 +- examples/mnist/main.cpp | 3 ++ 4 files changed, 100 insertions(+), 1 deletion(-) diff --git a/examples/common-ggml.cpp b/examples/common-ggml.cpp index 9215dbeab..6100ad4d0 100644 --- a/examples/common-ggml.cpp +++ b/examples/common-ggml.cpp @@ -233,3 +233,95 @@ bool ggml_common_quantize_0( return true; } + +#define GGML_ASSERT(x) \ + do { \ + if (!(x)) { \ + fprintf(stderr, "GGML_ASSERT: %s:%d: %s\n", __FILE__, __LINE__, #x); \ + abort(); \ + } \ + } while (0) + +void ggml_graph_export_leaf(const struct ggml_tensor * tensor, FILE * fout) { + const int64_t * ne = tensor->ne; + const size_t * nb = tensor->nb; + + fprintf(fout, "%-6s %-12s %8d %8lld %8lld %8lld %8lld %16zu %16zu %16zu %16zu %16p\n", + ggml_type_name(tensor->type), + ggml_op_name (tensor->op), + tensor->n_dims, + ne[0], ne[1], ne[2], ne[3], + nb[0], nb[1], nb[2], nb[3], + tensor->data); +} + +void ggml_graph_export_node(const struct ggml_tensor * tensor, const char * arg, FILE * fout) { + const int64_t * ne = tensor->ne; + const size_t * nb = tensor->nb; + + fprintf(fout, "%-6s %-6s %-12s %8d %8lld %8lld %8lld %8lld %16zu %16zu %16zu %16zu %8d %16p\n", + arg, + ggml_type_name(tensor->type), + ggml_op_name (tensor->op), + tensor->n_dims, + ne[0], ne[1], ne[2], ne[3], + nb[0], nb[1], nb[2], nb[3], + tensor->n_tasks, + tensor->data); +} + +void ggml_graph_export(const struct ggml_cgraph * cgraph, const char * fname) { + FILE * fout = stdout; + + fprintf(fout, "\n"); + fprintf(fout, "%-16s %8x\n", "magic", GGML_FILE_MAGIC); + fprintf(fout, "%-16s %8d\n", "version", GGML_FILE_VERSION); + fprintf(fout, "%-16s %8d\n", "leafs", cgraph->n_leafs); + fprintf(fout, "%-16s %8d\n", "nodes", cgraph->n_nodes); + + // header + fprintf(fout, "\n"); + fprintf(fout, "%-6s %-12s %8s %8s %8s %8s %8s %16s %16s %16s %16s %16s\n", + "TYPE", "OP", "NDIMS", "NE0", "NE1", "NE2", "NE3", "NB0", "NB1", "NB2", "NB3", "DATA"); + + for (int i = 0; i < cgraph->n_leafs; ++i) { + const int64_t * ne = cgraph->leafs[i]->ne; + const size_t * nb = cgraph->leafs[i]->nb; + + ggml_graph_export_leaf(cgraph->leafs[i], fout); + + GGML_ASSERT(cgraph->leafs[i]->op == GGML_OP_NONE); + GGML_ASSERT(cgraph->leafs[i]->src0 == NULL); + GGML_ASSERT(cgraph->leafs[i]->src1 == NULL); + } + + // header + fprintf(fout, "\n"); + fprintf(fout, "%-6s %-6s %-12s %8s %8s %8s %8s %8s %16s %16s %16s %16s %8s %16s\n", + "ARG", "TYPE", "OP", "NDIMS", "NE0", "NE1", "NE2", "NE3", "NB0", "NB1", "NB2", "NB3", "NTASKS", "DATA"); + + for (int i = 0; i < cgraph->n_nodes; ++i) { + const int64_t * ne = cgraph->nodes[i]->ne; + const size_t * nb = cgraph->nodes[i]->nb; + + ggml_graph_export_node(cgraph->nodes[i], "DST", fout); + + if (cgraph->nodes[i]->src0) { + ggml_graph_export_node(cgraph->nodes[i]->src0, "SRC0", fout); + } + + if (cgraph->nodes[i]->src1) { + ggml_graph_export_node(cgraph->nodes[i]->src1, "SRC1", fout); + } + + for (int j = 0; j < GGML_MAX_OPT; ++j) { + if (cgraph->nodes[i]->opt[j]) { + ggml_graph_export_node(cgraph->nodes[i]->opt[j], "OPT", fout); + } + } + + fprintf(fout, "\n"); + } + + fprintf(fout, "\n"); +} diff --git a/examples/common-ggml.h b/examples/common-ggml.h index 477de341a..1c6fcd34b 100644 --- a/examples/common-ggml.h +++ b/examples/common-ggml.h @@ 
-16,3 +16,7 @@ bool ggml_common_quantize_0( const ggml_ftype ftype, const std::vector<std::string> & to_quant, const std::vector<std::string> & to_skip); + +struct ggml_cgraph; + +void ggml_graph_export(const struct ggml_cgraph * cgraph, const char * fname);
diff --git a/examples/mnist/CMakeLists.txt b/examples/mnist/CMakeLists.txt index 91b802ae5..1a676d9df 100644 --- a/examples/mnist/CMakeLists.txt +++ b/examples/mnist/CMakeLists.txt @@ -3,5 +3,5 @@ set(TEST_TARGET mnist) add_executable(${TEST_TARGET} main.cpp) -target_link_libraries(${TEST_TARGET} PRIVATE ggml common) +target_link_libraries(${TEST_TARGET} PRIVATE ggml common common-ggml)
diff --git a/examples/mnist/main.cpp b/examples/mnist/main.cpp index c84eedd92..2bfebd0da 100644 --- a/examples/mnist/main.cpp +++ b/examples/mnist/main.cpp @@ -1,6 +1,7 @@ #include "ggml/ggml.h" #include "common.h" +#include "common-ggml.h" #include <cmath> #include <cstdio> @@ -204,6 +205,8 @@ int mnist_eval( const int prediction = std::max_element(probs_data, probs_data + 10) - probs_data; + ggml_graph_export(&gf, "mnist.ggml"); + ggml_free(ctx0); return prediction;
From 2a0342114acd8fad9100ab7cc3a22b5bfb67c430 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sun, 21 May 2023 16:11:50 +0300 Subject: [PATCH 02/27] mnist : code style --- examples/mnist/main.cpp | 1 + 1 file changed, 1 insertion(+)
diff --git a/examples/mnist/main.cpp b/examples/mnist/main.cpp index 2bfebd0da..655ab559d 100644 --- a/examples/mnist/main.cpp +++ b/examples/mnist/main.cpp @@ -205,6 +205,7 @@ int mnist_eval( const int prediction = std::max_element(probs_data, probs_data + 10) - probs_data; + // export the computation graph ggml_graph_export(&gf, "mnist.ggml"); ggml_free(ctx0);
From 85dcc0c3fc3bc8568e573176b55da9fc7f387cab Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sun, 21 May 2023 22:46:03 +0300 Subject: [PATCH 03/27] mnist : minor --- examples/common-ggml.cpp | 106 ++++++++++++++++++++++++--------------- 1 file changed, 65 insertions(+), 41 deletions(-)
diff --git a/examples/common-ggml.cpp b/examples/common-ggml.cpp index 6100ad4d0..b79aab835 100644 --- a/examples/common-ggml.cpp +++ b/examples/common-ggml.cpp @@ -246,20 +246,21 @@ void ggml_graph_export_leaf(const struct ggml_tensor * tensor, FILE * fout) { const int64_t * ne = tensor->ne; const size_t * nb = tensor->nb; - fprintf(fout, "%-6s %-12s %8d %8lld %8lld %8lld %8lld %16zu %16zu %16zu %16zu %16p\n", + fprintf(fout, "%-6s %-12s %8d %8lld %8lld %8lld %8lld %16zu %16zu %16zu %16zu %16p %16s\n", ggml_type_name(tensor->type), ggml_op_name (tensor->op), tensor->n_dims, ne[0], ne[1], ne[2], ne[3], nb[0], nb[1], nb[2], nb[3], - tensor->data); + tensor->data, + tensor->name); } void ggml_graph_export_node(const struct ggml_tensor * tensor, const char * arg, FILE * fout) { const int64_t * ne = tensor->ne; const size_t * nb = tensor->nb; - fprintf(fout, "%-6s %-6s %-12s %8d %8lld %8lld %8lld %8lld %16zu %16zu %16zu %16zu %8d %16p\n", + fprintf(fout, "%-6s %-6s %-12s %8d %8lld %8lld %8lld %8lld %16zu %16zu %16zu %16zu %8d %16p %16s\n", arg, ggml_type_name(tensor->type), ggml_op_name (tensor->op), @@ -267,61 +268,84 @@ void ggml_graph_export_node(const struct ggml_tensor * tensor, const char * arg, ne[0], ne[1], ne[2], ne[3], nb[0], nb[1], nb[2], nb[3], tensor->n_tasks, - tensor->data); + tensor->data, + tensor->name); } void ggml_graph_export(const struct ggml_cgraph * cgraph, const char * fname) { - FILE * fout = stdout; + // print + { + FILE * fout = stdout; - fprintf(fout, "\n"); - fprintf(fout, "%-16s %8x\n", "magic", GGML_FILE_MAGIC); - 
fprintf(fout, "%-16s %8d\n", "version", GGML_FILE_VERSION); - fprintf(fout, "%-16s %8d\n", "leafs", cgraph->n_leafs); - fprintf(fout, "%-16s %8d\n", "nodes", cgraph->n_nodes); + fprintf(fout, "\n"); + fprintf(fout, "%-16s %8x\n", "magic", GGML_FILE_MAGIC); + fprintf(fout, "%-16s %8d\n", "version", GGML_FILE_VERSION); + fprintf(fout, "%-16s %8d\n", "leafs", cgraph->n_leafs); + fprintf(fout, "%-16s %8d\n", "nodes", cgraph->n_nodes); - // header - fprintf(fout, "\n"); - fprintf(fout, "%-6s %-12s %8s %8s %8s %8s %8s %16s %16s %16s %16s %16s\n", - "TYPE", "OP", "NDIMS", "NE0", "NE1", "NE2", "NE3", "NB0", "NB1", "NB2", "NB3", "DATA"); + // header + fprintf(fout, "\n"); + fprintf(fout, "%-6s %-12s %8s %8s %8s %8s %8s %16s %16s %16s %16s %16s %16s\n", + "TYPE", "OP", "NDIMS", "NE0", "NE1", "NE2", "NE3", "NB0", "NB1", "NB2", "NB3", "DATA", "NAME"); - for (int i = 0; i < cgraph->n_leafs; ++i) { - const int64_t * ne = cgraph->leafs[i]->ne; - const size_t * nb = cgraph->leafs[i]->nb; + for (int i = 0; i < cgraph->n_leafs; ++i) { + ggml_graph_export_leaf(cgraph->leafs[i], fout); - ggml_graph_export_leaf(cgraph->leafs[i], fout); + GGML_ASSERT(cgraph->leafs[i]->op == GGML_OP_NONE); + GGML_ASSERT(cgraph->leafs[i]->src0 == NULL); + GGML_ASSERT(cgraph->leafs[i]->src1 == NULL); + } - GGML_ASSERT(cgraph->leafs[i]->op == GGML_OP_NONE); - GGML_ASSERT(cgraph->leafs[i]->src0 == NULL); - GGML_ASSERT(cgraph->leafs[i]->src1 == NULL); - } + // header + fprintf(fout, "\n"); + fprintf(fout, "%-6s %-6s %-12s %8s %8s %8s %8s %8s %16s %16s %16s %16s %8s %16s %16s\n", + "ARG", "TYPE", "OP", "NDIMS", "NE0", "NE1", "NE2", "NE3", "NB0", "NB1", "NB2", "NB3", "NTASKS", "DATA", "NAME"); - // header - fprintf(fout, "\n"); - fprintf(fout, "%-6s %-6s %-12s %8s %8s %8s %8s %8s %16s %16s %16s %16s %8s %16s\n", - "ARG", "TYPE", "OP", "NDIMS", "NE0", "NE1", "NE2", "NE3", "NB0", "NB1", "NB2", "NB3", "NTASKS", "DATA"); + for (int i = 0; i < cgraph->n_nodes; ++i) { + ggml_graph_export_node(cgraph->nodes[i], "DST", fout); - for (int i = 0; i < cgraph->n_nodes; ++i) { - const int64_t * ne = cgraph->nodes[i]->ne; - const size_t * nb = cgraph->nodes[i]->nb; + if (cgraph->nodes[i]->src0) { + ggml_graph_export_node(cgraph->nodes[i]->src0, "SRC0", fout); + } - ggml_graph_export_node(cgraph->nodes[i], "DST", fout); + if (cgraph->nodes[i]->src1) { + ggml_graph_export_node(cgraph->nodes[i]->src1, "SRC1", fout); + } - if (cgraph->nodes[i]->src0) { - ggml_graph_export_node(cgraph->nodes[i]->src0, "SRC0", fout); + for (int j = 0; j < GGML_MAX_OPT; ++j) { + if (cgraph->nodes[i]->opt[j]) { + ggml_graph_export_node(cgraph->nodes[i]->opt[j], "OPT", fout); + } + } + + fprintf(fout, "\n"); } - if (cgraph->nodes[i]->src1) { - ggml_graph_export_node(cgraph->nodes[i]->src1, "SRC1", fout); + fprintf(fout, "\n"); + } + + // write binary data + { + FILE * fout = fopen(fname, "wb"); + + if (!fout) { + fprintf(stderr, "%s: failed to open %s\n", __func__, fname); + return; } - for (int j = 0; j < GGML_MAX_OPT; ++j) { - if (cgraph->nodes[i]->opt[j]) { - ggml_graph_export_node(cgraph->nodes[i]->opt[j], "OPT", fout); - } + // header + { + uint32_t magic = GGML_FILE_MAGIC; + uint32_t version = GGML_FILE_VERSION; + uint32_t leafs = cgraph->n_leafs; + uint32_t nodes = cgraph->n_nodes; + + fwrite(&magic, sizeof(uint32_t), 1, fout); + fwrite(&version, sizeof(uint32_t), 1, fout); + fwrite(&leafs, sizeof(uint32_t), 1, fout); + fwrite(&nodes, sizeof(uint32_t), 1, fout); } - fprintf(fout, "\n"); + fclose(fout); } - - fprintf(fout, "\n"); }
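
(At this point in the series the exported file holds only the four uint32 header fields written above — magic, version, leaf count, node count. A minimal sketch of a consumer reading that header back; read_cgraph_header is a hypothetical helper for illustration, not part of the patches:

    // hypothetical consumer of the header written by ggml_graph_export above
    #include "ggml/ggml.h"   // GGML_FILE_MAGIC, GGML_FILE_VERSION

    #include <cstdint>
    #include <cstdio>

    static bool read_cgraph_header(const char * fname) {
        FILE * fin = fopen(fname, "rb");
        if (!fin) {
            return false;
        }

        uint32_t vals[4]; // magic, version, n_leafs, n_nodes

        const bool ok = fread(vals, sizeof(uint32_t), 4, fin) == 4;
        fclose(fin);

        if (!ok || vals[0] != GGML_FILE_MAGIC || vals[1] != GGML_FILE_VERSION) {
            return false;
        }

        fprintf(stderr, "version %u, %u leafs, %u nodes\n", vals[1], vals[2], vals[3]);
        return true;
    }

PATCH 05 below extends this header with a uint64 eval-size field, appends per-tensor records, and adds the real importer, ggml_graph_import.)

From 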
95c85071b7c62eb9426a2e9bb0f7d54a0695027c Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Wed, 24 May 2023 15:15:28 +0300 Subject: [PATCH 04/27] ggml : initial cgraph export --- examples/common-ggml.cpp | 90 ++++++++++++++++++++++++++++++++++++++-- 1 file changed, 86 insertions(+), 4 deletions(-) diff --git a/examples/common-ggml.cpp b/examples/common-ggml.cpp index b79aab835..84a028204 100644 --- a/examples/common-ggml.cpp +++ b/examples/common-ggml.cpp @@ -335,10 +335,10 @@ void ggml_graph_export(const struct ggml_cgraph * cgraph, const char * fname) { // header { - uint32_t magic = GGML_FILE_MAGIC; - uint32_t version = GGML_FILE_VERSION; - uint32_t leafs = cgraph->n_leafs; - uint32_t nodes = cgraph->n_nodes; + const uint32_t magic = GGML_FILE_MAGIC; + const uint32_t version = GGML_FILE_VERSION; + const uint32_t leafs = cgraph->n_leafs; + const uint32_t nodes = cgraph->n_nodes; fwrite(&magic, sizeof(uint32_t), 1, fout); fwrite(&version, sizeof(uint32_t), 1, fout); @@ -346,6 +346,88 @@ void ggml_graph_export(const struct ggml_cgraph * cgraph, const char * fname) { fwrite(&nodes, sizeof(uint32_t), 1, fout); } + // leafs + { + const uint32_t n_leafs = cgraph->n_leafs; + + fwrite(&n_leafs, sizeof(uint32_t), 1, fout); + + for (int i = 0; i < cgraph->n_leafs; ++i) { + const struct ggml_tensor * tensor = cgraph->leafs[i]; + + const uint32_t type = tensor->type; + const uint32_t op = tensor->op; + const uint32_t n_dims = tensor->n_dims; + + fwrite(&type, sizeof(uint32_t), 1, fout); + fwrite(&op, sizeof(uint32_t), 1, fout); + fwrite(&n_dims, sizeof(uint32_t), 1, fout); + + for (int j = 0; j < GGML_MAX_DIMS; ++j) { + const int64_t ne = tensor->ne[j]; + const size_t nb = tensor->nb[j]; + + fwrite(&ne, sizeof(int64_t), 1, fout); + fwrite(&nb, sizeof(size_t), 1, fout); + } + + // store the pointer address + fwrite(&tensor->data, sizeof(void *), 1, fout); + + { + const size_t len = strlen(tensor->name); + + fwrite(&len, sizeof(size_t), 1, fout); + fwrite(tensor->name, sizeof(char), len, fout); + } + + // dump the data + // TODO: pad this to 32 byte boundary + { + const size_t size = ggml_nbytes(tensor); + + fwrite(tensor->data, sizeof(char), size, fout); + } + } + } + + // nodes + { + const uint32_t n_nodes = cgraph->n_nodes; + + fwrite(&n_nodes, sizeof(uint32_t), 1, fout); + + for (int i = 0; i < cgraph->n_nodes; ++i) { + const struct ggml_tensor * tensor = cgraph->nodes[i]; + + const uint32_t type = tensor->type; + const uint32_t op = tensor->op; + const uint32_t n_dims = tensor->n_dims; + + fwrite(&type, sizeof(uint32_t), 1, fout); + fwrite(&op, sizeof(uint32_t), 1, fout); + fwrite(&n_dims, sizeof(uint32_t), 1, fout); + + for (int j = 0; j < GGML_MAX_DIMS; ++j) { + const int64_t ne = tensor->ne[j]; + const size_t nb = tensor->nb[j]; + + fwrite(&ne, sizeof(int64_t), 1, fout); + fwrite(&nb, sizeof(size_t), 1, fout); + } + + // store the pointer address + fwrite(&tensor->data, sizeof(void *), 1, fout); + + { + const size_t len = strlen(tensor->name); + + fwrite(&len, sizeof(size_t), 1, fout); + fwrite(tensor->name, sizeof(char), len, fout); + } + } + } + fclose(fout); } } From 312018907ac2b6e6dfc6884150822758abb8587c Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Wed, 24 May 2023 21:47:30 +0300 Subject: [PATCH 05/27] ggml : initial graph import (wip) --- examples/common-ggml.cpp | 258 +++++++++++++++++++++++++++++++++------ examples/common-ggml.h | 5 +- examples/mnist/main.cpp | 18 ++- 3 files changed, 240 insertions(+), 41 deletions(-) diff --git a/examples/common-ggml.cpp 
b/examples/common-ggml.cpp index 84a028204..3e355ae24 100644 --- a/examples/common-ggml.cpp +++ b/examples/common-ggml.cpp @@ -273,15 +273,27 @@ void ggml_graph_export_node(const struct ggml_tensor * tensor, const char * arg, } void ggml_graph_export(const struct ggml_cgraph * cgraph, const char * fname) { + assert(cgraph->work == NULL); + assert(cgraph->work_size == 0); + + uint64_t size_eval = 0; + + // compute size of intermediate results + // TODO: does not take into account scratch buffers !!!! + for (int i = 0; i < cgraph->n_nodes; ++i) { + size_eval += ggml_nbytes(cgraph->nodes[i]); + } + // print { FILE * fout = stdout; fprintf(fout, "\n"); - fprintf(fout, "%-16s %8x\n", "magic", GGML_FILE_MAGIC); - fprintf(fout, "%-16s %8d\n", "version", GGML_FILE_VERSION); - fprintf(fout, "%-16s %8d\n", "leafs", cgraph->n_leafs); - fprintf(fout, "%-16s %8d\n", "nodes", cgraph->n_nodes); + fprintf(fout, "%-16s %8x\n", "magic", GGML_FILE_MAGIC); + fprintf(fout, "%-16s %8d\n", "version", GGML_FILE_VERSION); + fprintf(fout, "%-16s %8d\n", "leafs", cgraph->n_leafs); + fprintf(fout, "%-16s %8d\n", "nodes", cgraph->n_nodes); + fprintf(fout, "%-16s %8llu\n", "eval", size_eval); // header fprintf(fout, "\n"); @@ -340,18 +352,15 @@ void ggml_graph_export(const struct ggml_cgraph * cgraph, const char * fname) { const uint32_t leafs = cgraph->n_leafs; const uint32_t nodes = cgraph->n_nodes; - fwrite(&magic, sizeof(uint32_t), 1, fout); - fwrite(&version, sizeof(uint32_t), 1, fout); - fwrite(&leafs, sizeof(uint32_t), 1, fout); - fwrite(&nodes, sizeof(uint32_t), 1, fout); + fwrite(&magic, sizeof(uint32_t), 1, fout); + fwrite(&version, sizeof(uint32_t), 1, fout); + fwrite(&leafs, sizeof(uint32_t), 1, fout); + fwrite(&nodes, sizeof(uint32_t), 1, fout); + fwrite(&size_eval, sizeof(uint64_t), 1, fout); } // leafs { - const uint32_t n_leafs = cgraph->n_leafs; - - fwrite(&n_leafs, sizeof(uint32_t), 1, fout); - for (int i = 0; i < cgraph->n_leafs; ++i) { const struct ggml_tensor * tensor = cgraph->leafs[i]; @@ -364,23 +373,22 @@ void ggml_graph_export(const struct ggml_cgraph * cgraph, const char * fname) { fwrite(&n_dims, sizeof(uint32_t), 1, fout); for (int j = 0; j < GGML_MAX_DIMS; ++j) { - const int64_t ne = tensor->ne[j]; - const size_t nb = tensor->nb[j]; + const uint64_t ne = tensor->ne[j]; + const uint64_t nb = tensor->nb[j]; - fwrite(&ne, sizeof(int64_t), 1, fout); - fwrite(&nb, sizeof(size_t), 1, fout); + fwrite(&ne, sizeof(uint64_t), 1, fout); + fwrite(&nb, sizeof(uint64_t), 1, fout); } // store the pointer address - fwrite(&tensor->data, sizeof(void *), 1, fout); - { - const size_t len = strlen(tensor->name); + const uint64_t ptr = (uint64_t) tensor->data; - fwrite(&len, sizeof(size_t), 1, fout); - fwrite(tensor->name, sizeof(char), len, fout); + fwrite(&ptr, sizeof(uint64_t), 1, fout); } + fwrite(tensor->name, sizeof(char), GGML_MAX_NAME, fout); + // dump the data // TODO: pad this to 32 byte boundary { @@ -393,10 +401,6 @@ void ggml_graph_export(const struct ggml_cgraph * cgraph, const char * fname) { // nodes { - const uint32_t n_nodes = cgraph->n_nodes; - - fwrite(&n_nodes, sizeof(uint32_t), 1, fout); - for (int i = 0; i < cgraph->n_nodes; ++i) { const struct ggml_tensor * tensor = cgraph->nodes[i]; @@ -409,25 +413,211 @@ void ggml_graph_export(const struct ggml_cgraph * cgraph, const char * fname) { fwrite(&n_dims, sizeof(uint32_t), 1, fout); for (int j = 0; j < GGML_MAX_DIMS; ++j) { - const int64_t ne = tensor->ne[j]; - const size_t nb = tensor->nb[j]; + const uint64_t ne = tensor->ne[j]; + const 
uint64_t nb = tensor->nb[j]; - fwrite(&ne, sizeof(int64_t), 1, fout); - fwrite(&nb, sizeof(size_t), 1, fout); + fwrite(&ne, sizeof(uint64_t), 1, fout); + fwrite(&nb, sizeof(uint64_t), 1, fout); } // store the pointer address - fwrite(&tensor->data, sizeof(void *), 1, fout); - { - const size_t len = strlen(tensor->name); + const uint64_t ptr = (uint64_t) tensor->data; - fwrite(&len, sizeof(size_t), 1, fout); - fwrite(tensor->name, sizeof(char), len, fout); + fwrite(&ptr, sizeof(uint64_t), 1, fout); } + + fwrite(tensor->name, sizeof(char), GGML_MAX_NAME, fout); } } fclose(fout); } } + +ggml_cgraph ggml_graph_import(const char * fname, struct ggml_context ** ctx_data, struct ggml_context ** ctx_eval) { + assert(*ctx_data == NULL); + assert(*ctx_eval == NULL); + + ggml_cgraph result; + + struct ggml_tensor * data = NULL; + + // read file into data + { + FILE * fin = fopen(fname, "rb"); + + if (!fin) { + fprintf(stderr, "%s: failed to open %s\n", __func__, fname); + return result; + } + + size_t fsize = 0; + + fseek(fin, 0, SEEK_END); + fsize = ftell(fin); + fseek(fin, 0, SEEK_SET); + + // create the data context + { + const size_t overhead = 1*GGML_TENSOR_OVERHEAD; + + struct ggml_init_params params = { + .mem_size = fsize + overhead, + .mem_buffer = NULL, + .no_alloc = false, + }; + + *ctx_data = ggml_init(params); + + if (!*ctx_data) { + fprintf(stderr, "%s: failed to create ggml context\n", __func__); + return result; + } + } + + data = ggml_new_tensor_1d(*ctx_data, GGML_TYPE_I8, fsize); + + fread(data->data, sizeof(char), fsize, fin); + + fclose(fin); + } + + // populate result + { + const char * ptr = (const char *) data->data; + + const uint32_t magic = *(const uint32_t *) ptr; ptr += sizeof(magic); + + if (magic != GGML_FILE_MAGIC) { + fprintf(stderr, "%s: invalid magic number, got %08x\n", __func__, magic); + return result; + } + + const uint32_t version = *(const uint32_t *) ptr; ptr += sizeof(version); + + if (version != GGML_FILE_VERSION) { + fprintf(stderr, "%s: invalid version number\n", __func__); + return result; + } + + const uint32_t leafs = *(const uint32_t *) ptr; ptr += sizeof(leafs); + const uint32_t nodes = *(const uint32_t *) ptr; ptr += sizeof(nodes); + const uint64_t size_eval = *(const uint64_t *) ptr; ptr += sizeof(size_eval); + + result.n_leafs = leafs; + result.n_nodes = nodes; + + // create the data context + { + const size_t overhead = (leafs + nodes)*GGML_TENSOR_OVERHEAD; + + struct ggml_init_params params = { + .mem_size = size_eval + overhead, + .mem_buffer = NULL, + .no_alloc = true, + }; + + *ctx_eval = ggml_init(params); + + if (!*ctx_eval) { + fprintf(stderr, "%s: failed to create ggml context\n", __func__); + return result; + } + } + + // leafs + { + uint32_t type; + uint32_t op; + uint32_t n_dims; + + for (int i = 0; i < leafs; ++i) { + type = *(const uint32_t *) ptr; ptr += sizeof(type); + op = *(const uint32_t *) ptr; ptr += sizeof(op); + n_dims = *(const uint32_t *) ptr; ptr += sizeof(n_dims); + + int64_t ne[GGML_MAX_DIMS]; + size_t nb[GGML_MAX_DIMS]; + + for (int j = 0; j < GGML_MAX_DIMS; ++j) { + uint64_t ne_cur; + uint64_t nb_cur; + + ne_cur = *(const uint64_t *) ptr; ptr += sizeof(ne_cur); + nb_cur = *(const uint64_t *) ptr; ptr += sizeof(nb_cur); + + ne[j] = ne_cur; + nb[j] = nb_cur; + } + + struct ggml_tensor * tensor = ggml_new_tensor(*ctx_eval, (enum ggml_type) type, n_dims, ne); + + tensor->op = (enum ggml_op) op; + + uint64_t ptr_cur = *(const uint64_t *) ptr; ptr += sizeof(ptr_cur); + + memcpy(tensor->name, ptr, GGML_MAX_NAME); ptr += 
GGML_MAX_NAME; + + tensor->data = (void *) ptr; + + for (int j = 0; j < GGML_MAX_DIMS; ++j) { + tensor->nb[j] = nb[j]; + } + + result.leafs[i] = tensor; + + ptr += ggml_nbytes(tensor); + + fprintf(stderr, "%s: loaded leaf %d: '%16s', %3d dims, %9zu bytes\n", __func__, i, tensor->name, n_dims, ggml_nbytes(tensor)); + } + } + + ggml_set_no_alloc(*ctx_eval, false); + + // nodes + { + uint32_t type; + uint32_t op; + uint32_t n_dims; + + for (int i = 0; i < nodes; ++i) { + type = *(const uint32_t *) ptr; ptr += sizeof(type); + op = *(const uint32_t *) ptr; ptr += sizeof(op); + n_dims = *(const uint32_t *) ptr; ptr += sizeof(n_dims); + + int64_t ne[GGML_MAX_DIMS]; + size_t nb[GGML_MAX_DIMS]; + + for (int j = 0; j < GGML_MAX_DIMS; ++j) { + uint64_t ne_cur; + uint64_t nb_cur; + + ne_cur = *(const uint64_t *) ptr; ptr += sizeof(ne_cur); + nb_cur = *(const uint64_t *) ptr; ptr += sizeof(nb_cur); + + ne[j] = ne_cur; + nb[j] = nb_cur; + } + + struct ggml_tensor * tensor = ggml_new_tensor(*ctx_eval, (enum ggml_type) type, n_dims, ne); + + tensor->op = (enum ggml_op) op; + + uint64_t ptr_cur = *(const uint64_t *) ptr; ptr += sizeof(ptr_cur); + + memcpy(tensor->name, ptr, GGML_MAX_NAME); ptr += GGML_MAX_NAME; + + for (int j = 0; j < GGML_MAX_DIMS; ++j) { + tensor->nb[j] = nb[j]; + } + + result.nodes[i] = tensor; + + fprintf(stderr, "%s: loaded node %d: '%16s', %3d dims, %9zu bytes\n", __func__, i, tensor->name, n_dims, ggml_nbytes(tensor)); + } + } + } + + return result; +} diff --git a/examples/common-ggml.h b/examples/common-ggml.h index 1c6fcd34b..9260a7fd4 100644 --- a/examples/common-ggml.h +++ b/examples/common-ggml.h @@ -17,6 +17,5 @@ bool ggml_common_quantize_0( const std::vector & to_quant, const std::vector & to_skip); -struct ggml_cgraph; - -void ggml_graph_export(const struct ggml_cgraph * cgraph, const char * fname); +void ggml_graph_export(const struct ggml_cgraph * cgraph, const char * fname); +ggml_cgraph ggml_graph_import(const char * fname, struct ggml_context ** ctx_data, struct ggml_context ** ctx_eval); diff --git a/examples/mnist/main.cpp b/examples/mnist/main.cpp index 655ab559d..304ba13b6 100644 --- a/examples/mnist/main.cpp +++ b/examples/mnist/main.cpp @@ -194,8 +194,9 @@ int mnist_eval( // soft max ggml_tensor * probs = ggml_soft_max(ctx0, fc2); - // run the computation + // build / export / run the computation graph ggml_build_forward_expand(&gf, probs); + ggml_graph_export (&gf, "mnist.ggml"); // export before running to avoid work tensor ggml_graph_compute (ctx0, &gf); //ggml_graph_print (&gf); @@ -205,11 +206,20 @@ int mnist_eval( const int prediction = std::max_element(probs_data, probs_data + 10) - probs_data; - // export the computation graph - ggml_graph_export(&gf, "mnist.ggml"); - ggml_free(ctx0); + // TMP + // import the computation graph + { + struct ggml_context * ctx_data = NULL; + struct ggml_context * ctx_eval = NULL; + + struct ggml_cgraph gfi = ggml_graph_import("mnist.ggml", &ctx_data, &ctx_eval); + gfi.n_threads = n_threads; + + ggml_graph_compute(ctx0, &gfi); + } + return prediction; } From d2d1c220da782becf74a806e16bed3d1f129e604 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Thu, 25 May 2023 23:02:20 +0300 Subject: [PATCH 06/27] ggml : import op args correctly --- examples/common-ggml.cpp | 75 ++++++++++++++++++++++++++++++++++++++++ examples/mnist/main.cpp | 3 +- 2 files changed, 77 insertions(+), 1 deletion(-) diff --git a/examples/common-ggml.cpp b/examples/common-ggml.cpp index 3e355ae24..544c0b644 100644 --- a/examples/common-ggml.cpp +++ 
b/examples/common-ggml.cpp @@ -428,6 +428,55 @@ void ggml_graph_export(const struct ggml_cgraph * cgraph, const char * fname) { } fwrite(tensor->name, sizeof(char), GGML_MAX_NAME, fout); + + // output the op arguments + { + struct ggml_tensor * args[2 + GGML_MAX_OPT] = { NULL }; + + args[0] = tensor->src0; + args[1] = tensor->src1; + + for (int j = 0; j < GGML_MAX_OPT; ++j) { + args[2 + j] = tensor->opt[j]; + } + + for (int j = 0; j < 2 + GGML_MAX_OPT; ++j) { + if (args[j]) { + int32_t idx = -1; + + // check if leaf + { + for (int k = 0; k < cgraph->n_leafs; ++k) { + if (args[j] == cgraph->leafs[k]) { + idx = k; + break; + } + } + } + + // check if node + if (idx == -1) { + for (int k = 0; k < cgraph->n_nodes; ++k) { + if (args[j] == cgraph->nodes[k]) { + idx = GGML_MAX_NODES + k; + break; + } + } + } + + if (idx == -1) { + fprintf(stderr, "%s: failed to find tensor, arg = %d, node = %d\n", __func__, j, i); + return; + } + + fwrite(&idx, sizeof(int32_t), 1, fout); + } else { + const int32_t nul = -1; + + fwrite(&nul, sizeof(int32_t), 1, fout); + } + } + } } } @@ -612,6 +661,32 @@ ggml_cgraph ggml_graph_import(const char * fname, struct ggml_context ** ctx_dat tensor->nb[j] = nb[j]; } + // parse args + { + struct ggml_tensor ** args[2 + GGML_MAX_OPT] = { + &tensor->src0, + &tensor->src1, + }; + + for (int j = 0; j < GGML_MAX_OPT; ++j) { + args[2 + j] = &tensor->opt[j]; + } + + for (int j = 0; j < 2 + GGML_MAX_OPT; ++j) { + const uint32_t arg_idx = *(const int32_t *) ptr; ptr += sizeof(arg_idx); + + if (arg_idx == -1) { + continue; + } + + if (arg_idx < GGML_MAX_NODES) { + *args[j] = result.leafs[arg_idx]; + } else { + *args[j] = result.nodes[arg_idx - GGML_MAX_NODES]; + } + } + } + result.nodes[i] = tensor; fprintf(stderr, "%s: loaded node %d: '%16s', %3d dims, %9zu bytes\n", __func__, i, tensor->name, n_dims, ggml_nbytes(tensor)); diff --git a/examples/mnist/main.cpp b/examples/mnist/main.cpp index 304ba13b6..28f187b53 100644 --- a/examples/mnist/main.cpp +++ b/examples/mnist/main.cpp @@ -196,7 +196,6 @@ int mnist_eval( // build / export / run the computation graph ggml_build_forward_expand(&gf, probs); - ggml_graph_export (&gf, "mnist.ggml"); // export before running to avoid work tensor ggml_graph_compute (ctx0, &gf); //ggml_graph_print (&gf); @@ -208,6 +207,8 @@ int mnist_eval( ggml_free(ctx0); + ggml_graph_export(&gf, "mnist.ggml"); + // TMP // import the computation graph { From 4cfd92b82e435a068b95b2fd1261e66fc6eeda31 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sat, 27 May 2023 11:23:52 +0300 Subject: [PATCH 07/27] ggml : add ggml_get_tensor_by_name() --- examples/mnist/main.cpp | 25 ++++++++++++++----------- 1 file changed, 14 insertions(+), 11 deletions(-) diff --git a/examples/mnist/main.cpp b/examples/mnist/main.cpp index 28f187b53..eab4a711c 100644 --- a/examples/mnist/main.cpp +++ b/examples/mnist/main.cpp @@ -193,6 +193,7 @@ int mnist_eval( // soft max ggml_tensor * probs = ggml_soft_max(ctx0, fc2); + ggml_set_name(probs, "probs"); // build / export / run the computation graph ggml_build_forward_expand(&gf, probs); @@ -201,25 +202,27 @@ int mnist_eval( //ggml_graph_print (&gf); ggml_graph_dump_dot(&gf, NULL, "mnist.dot"); + ggml_graph_export(&gf, "mnist.ggml"); + +#if 0 const float * probs_data = ggml_get_data_f32(probs); const int prediction = std::max_element(probs_data, probs_data + 10) - probs_data; +#else + struct ggml_context * ctx_data = NULL; + struct ggml_context * ctx_eval = NULL; - ggml_free(ctx0); + struct ggml_cgraph gfi = ggml_graph_import("mnist.ggml", 
&ctx_data, &ctx_eval); + gfi.n_threads = n_threads; - ggml_graph_export(&gf, "mnist.ggml"); + ggml_graph_compute(ctx0, &gfi); - // TMP - // import the computation graph - { - struct ggml_context * ctx_data = NULL; - struct ggml_context * ctx_eval = NULL; + const float * probs_data = ggml_get_data_f32(ggml_get_tensor_by_name(&gfi, "probs")); - struct ggml_cgraph gfi = ggml_graph_import("mnist.ggml", &ctx_data, &ctx_eval); - gfi.n_threads = n_threads; + const int prediction = std::max_element(probs_data, probs_data + 10) - probs_data; +#endif - ggml_graph_compute(ctx0, &gfi); - } + ggml_free(ctx0); return prediction; } From b0450c273bd077d555def520cd26158427023fe6 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sat, 27 May 2023 16:00:20 +0300 Subject: [PATCH 08/27] mnist : add compute graph evaluation on CPU example --- examples/mnist/CMakeLists.txt | 6 ++ examples/mnist/main-cpu.cpp | 118 ++++++++++++++++++++++++++++++++++ examples/mnist/main.cpp | 40 ++++++------ 3 files changed, 142 insertions(+), 22 deletions(-) create mode 100644 examples/mnist/main-cpu.cpp diff --git a/examples/mnist/CMakeLists.txt b/examples/mnist/CMakeLists.txt index 1a676d9df..c3750954d 100644 --- a/examples/mnist/CMakeLists.txt +++ b/examples/mnist/CMakeLists.txt @@ -5,3 +5,9 @@ set(TEST_TARGET mnist) add_executable(${TEST_TARGET} main.cpp) target_link_libraries(${TEST_TARGET} PRIVATE ggml common common-ggml) +# +# mnist-cpu + +set(TEST_TARGET mnist-cpu) +add_executable(${TEST_TARGET} main-cpu.cpp) +target_link_libraries(${TEST_TARGET} PRIVATE ggml common common-ggml) diff --git a/examples/mnist/main-cpu.cpp b/examples/mnist/main-cpu.cpp new file mode 100644 index 000000000..86ca7259f --- /dev/null +++ b/examples/mnist/main-cpu.cpp @@ -0,0 +1,118 @@ +// Use a pre-generated MNIST compute graph for inference on the CPU +// +// You can generate a compute graph using the "mnist" tool: +// +// $ ./bin/mnist ./models/mnist/ggml-model-f32.bin ../examples/mnist/models/mnist/t10k-images.idx3-ubyte +// +// This command creates the "mnist.ggml" file, which contains the generated compute graph. 
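+// (The exported file also stores the data of the leaf tensors, so this step does not need the original model file.)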
+// Now, you can re-use the compute graph with the "mnist-cpu" tool: +// +// $ ./bin/mnist-cpu ./models/mnist/mnist.ggml ../examples/mnist/models/mnist/t10k-images.idx3-ubyte +// + +#include "ggml/ggml.h" + +#include "common.h" +#include "common-ggml.h" + +#include <cmath> +#include <cstdio> +#include <cstring> +#include <ctime> +#include <fstream> +#include <vector> + +// evaluate the MNIST compute graph +// +// - fname_cgraph: path to the compute graph +// - n_threads: number of threads to use +// - digit: 784 pixel values +// +// returns 0 - 9 prediction +int mnist_eval( + const char * fname_cgraph, + const int n_threads, + std::vector<float> digit + ) { + // load the compute graph + struct ggml_context * ctx_data = NULL; + struct ggml_context * ctx_eval = NULL; + + struct ggml_cgraph gfi = ggml_graph_import(fname_cgraph, &ctx_data, &ctx_eval); + gfi.n_threads = n_threads; + + // allocate eval context + // needed during ggml_graph_compute() to allocate a work tensor + static size_t buf_size = gfi.work_size; // TODO + static void * buf = malloc(buf_size); + + struct ggml_init_params params = { + .mem_size = buf_size, + .mem_buffer = buf, + }; + + struct ggml_context * ctx0 = ggml_init(params); + + struct ggml_tensor * input = ggml_get_tensor_by_name(&gfi, "input"); + memcpy(input->data, digit.data(), ggml_nbytes(input)); + + ggml_graph_compute(ctx0, &gfi); + + const float * probs_data = ggml_get_data_f32(ggml_get_tensor_by_name(&gfi, "probs")); + + const int prediction = std::max_element(probs_data, probs_data + 10) - probs_data; + + ggml_free(ctx0); + ggml_free(ctx_data); + ggml_free(ctx_eval); + + return prediction; +} + +int main(int argc, char ** argv) { + srand(time(NULL)); + ggml_time_init(); + + if (argc != 3) { + fprintf(stderr, "Usage: %s models/mnist/mnist.ggml models/mnist/t10k-images.idx3-ubyte\n", argv[0]); + exit(0); + } + + uint8_t buf[784]; + std::vector<float> digit; + + // read a random digit from the test set + { + std::ifstream fin(argv[2], std::ios::binary); + if (!fin) { + fprintf(stderr, "%s: failed to open '%s'\n", __func__, argv[2]); + return 1; + } + + // seek to a random digit: 16-byte header + 28*28 * (random 0 - 10000) + fin.seekg(16 + 784 * (rand() % 10000)); + fin.read((char *) &buf, sizeof(buf)); + } + + // render the digit in ASCII + { + digit.resize(sizeof(buf)); + + for (int row = 0; row < 28; row++) { + for (int col = 0; col < 28; col++) { + fprintf(stderr, "%c ", (float)buf[row*28 + col] > 230 ? 
'*' : '_'); + digit[row*28 + col] = ((float)buf[row*28 + col]); + } + + fprintf(stderr, "\n"); + } + + fprintf(stderr, "\n"); + } + + const int prediction = mnist_eval(argv[1], 1, digit); + + fprintf(stdout, "%s: predicted digit is %d\n", __func__, prediction); + + return 0; +} diff --git a/examples/mnist/main.cpp b/examples/mnist/main.cpp index eab4a711c..ced167c00 100644 --- a/examples/mnist/main.cpp +++ b/examples/mnist/main.cpp @@ -166,7 +166,8 @@ bool mnist_model_load(const std::string & fname, mnist_model & model) { int mnist_eval( const mnist_model & model, const int n_threads, - std::vector digit + std::vector digit, + const char * fname_cgraph ) { const auto & hparams = model.hparams; @@ -202,25 +203,17 @@ int mnist_eval( //ggml_graph_print (&gf); ggml_graph_dump_dot(&gf, NULL, "mnist.dot"); - ggml_graph_export(&gf, "mnist.ggml"); + if (fname_cgraph) { + // export the compute graph for later use + // see the "mnist-cpu" example + ggml_graph_export(&gf, "mnist.ggml"); -#if 0 - const float * probs_data = ggml_get_data_f32(probs); - - const int prediction = std::max_element(probs_data, probs_data + 10) - probs_data; -#else - struct ggml_context * ctx_data = NULL; - struct ggml_context * ctx_eval = NULL; - - struct ggml_cgraph gfi = ggml_graph_import("mnist.ggml", &ctx_data, &ctx_eval); - gfi.n_threads = n_threads; - - ggml_graph_compute(ctx0, &gfi); + fprintf(stderr, "%s: exported compute graph to '%s'\n", __func__, fname_cgraph); + } - const float * probs_data = ggml_get_data_f32(ggml_get_tensor_by_name(&gfi, "probs")); + const float * probs_data = ggml_get_data_f32(probs); const int prediction = std::max_element(probs_data, probs_data + 10) - probs_data; -#endif ggml_free(ctx0); @@ -231,30 +224,31 @@ int mnist_eval( extern "C" { #endif -int wasm_eval(uint8_t *digitPtr) -{ +int wasm_eval(uint8_t * digitPtr) { mnist_model model; if (!mnist_model_load("models/mnist/ggml-model-f32.bin", model)) { fprintf(stderr, "error loading model\n"); return -1; } std::vector digit(digitPtr, digitPtr + 784); - int result = mnist_eval(model, 1, digit); + int result = mnist_eval(model, 1, digit, nullptr); ggml_free(model.ctx); + return result; } -int wasm_random_digit(char *digitPtr) -{ +int wasm_random_digit(char * digitPtr) { auto fin = std::ifstream("models/mnist/t10k-images.idx3-ubyte", std::ios::binary); if (!fin) { fprintf(stderr, "failed to open digits file\n"); return 0; } srand(time(NULL)); + // Seek to a random digit: 16-byte header + 28*28 * (random 0 - 10000) fin.seekg(16 + 784 * (rand() % 10000)); fin.read(digitPtr, 784); + return 1; } @@ -318,7 +312,9 @@ int main(int argc, char ** argv) { fprintf(stderr, "\n"); } - fprintf(stdout, "%s: predicted digit is %d\n", __func__, mnist_eval(model, 1, digit)); + const int prediction = mnist_eval(model, 1, digit, "mnist.ggml"); + + fprintf(stdout, "%s: predicted digit is %d\n", __func__, prediction); ggml_free(model.ctx); From ddea4885e5e80dec7880b875fbc1cac41ef343be Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sat, 27 May 2023 16:15:10 +0300 Subject: [PATCH 09/27] ggml : add ggml_tensor_overhead() --- examples/common-ggml.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/common-ggml.cpp b/examples/common-ggml.cpp index 544c0b644..0db2c034b 100644 --- a/examples/common-ggml.cpp +++ b/examples/common-ggml.cpp @@ -509,7 +509,7 @@ ggml_cgraph ggml_graph_import(const char * fname, struct ggml_context ** ctx_dat // create the data context { - const size_t overhead = 1*GGML_TENSOR_OVERHEAD; + const size_t overhead 
= 1*ggml_tensor_overhead(); struct ggml_init_params params = { .mem_size = fsize + overhead, @@ -559,7 +559,7 @@ ggml_cgraph ggml_graph_import(const char * fname, struct ggml_context ** ctx_dat // create the data context { - const size_t overhead = (leafs + nodes)*GGML_TENSOR_OVERHEAD; + const size_t overhead = (leafs + nodes)*ggml_tensor_overhead(); struct ggml_init_params params = { .mem_size = size_eval + overhead, From f698dbf059c3da3397dc5f8a6654f21dd26153fd Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sat, 27 May 2023 17:10:18 +0300 Subject: [PATCH 10/27] ggml : rename new functions to ggml_cgraph_... --- examples/common-ggml.cpp | 18 +++++++++--------- examples/common-ggml.h | 5 +++-- examples/mnist/main-cpu.cpp | 2 +- examples/mnist/main.cpp | 2 +- 4 files changed, 14 insertions(+), 13 deletions(-) diff --git a/examples/common-ggml.cpp b/examples/common-ggml.cpp index 0db2c034b..ac16994c0 100644 --- a/examples/common-ggml.cpp +++ b/examples/common-ggml.cpp @@ -242,7 +242,7 @@ bool ggml_common_quantize_0( } \ } while (0) -void ggml_graph_export_leaf(const struct ggml_tensor * tensor, FILE * fout) { +void ggml_cgraph_export_leaf(const struct ggml_tensor * tensor, FILE * fout) { const int64_t * ne = tensor->ne; const size_t * nb = tensor->nb; @@ -256,7 +256,7 @@ void ggml_graph_export_leaf(const struct ggml_tensor * tensor, FILE * fout) { tensor->name); } -void ggml_graph_export_node(const struct ggml_tensor * tensor, const char * arg, FILE * fout) { +void ggml_cgraph_export_node(const struct ggml_tensor * tensor, const char * arg, FILE * fout) { const int64_t * ne = tensor->ne; const size_t * nb = tensor->nb; @@ -272,7 +272,7 @@ void ggml_graph_export_node(const struct ggml_tensor * tensor, const char * arg, tensor->name); } -void ggml_graph_export(const struct ggml_cgraph * cgraph, const char * fname) { +void ggml_cgraph_export(const struct ggml_cgraph * cgraph, const char * fname) { assert(cgraph->work == NULL); assert(cgraph->work_size == 0); @@ -301,7 +301,7 @@ void ggml_graph_export(const struct ggml_cgraph * cgraph, const char * fname) { "TYPE", "OP", "NDIMS", "NE0", "NE1", "NE2", "NE3", "NB0", "NB1", "NB2", "NB3", "DATA", "NAME"); for (int i = 0; i < cgraph->n_leafs; ++i) { - ggml_graph_export_leaf(cgraph->leafs[i], fout); + ggml_cgraph_export_leaf(cgraph->leafs[i], fout); GGML_ASSERT(cgraph->leafs[i]->op == GGML_OP_NONE); GGML_ASSERT(cgraph->leafs[i]->src0 == NULL); @@ -314,19 +314,19 @@ void ggml_graph_export(const struct ggml_cgraph * cgraph, const char * fname) { "ARG", "TYPE", "OP", "NDIMS", "NE0", "NE1", "NE2", "NE3", "NB0", "NB1", "NB2", "NB3", "NTASKS", "DATA", "NAME"); for (int i = 0; i < cgraph->n_nodes; ++i) { - ggml_graph_export_node(cgraph->nodes[i], "DST", fout); + ggml_cgraph_export_node(cgraph->nodes[i], "DST", fout); if (cgraph->nodes[i]->src0) { - ggml_graph_export_node(cgraph->nodes[i]->src0, "SRC0", fout); + ggml_cgraph_export_node(cgraph->nodes[i]->src0, "SRC0", fout); } if (cgraph->nodes[i]->src1) { - ggml_graph_export_node(cgraph->nodes[i]->src1, "SRC1", fout); + ggml_cgraph_export_node(cgraph->nodes[i]->src1, "SRC1", fout); } for (int j = 0; j < GGML_MAX_OPT; ++j) { if (cgraph->nodes[i]->opt[j]) { - ggml_graph_export_node(cgraph->nodes[i]->opt[j], "OPT", fout); + ggml_cgraph_export_node(cgraph->nodes[i]->opt[j], "OPT", fout); } } @@ -484,7 +484,7 @@ void ggml_graph_export(const struct ggml_cgraph * cgraph, const char * fname) { } } -ggml_cgraph ggml_graph_import(const char * fname, struct ggml_context ** ctx_data, struct ggml_context ** 
ctx_eval) { +ggml_cgraph ggml_cgraph_import(const char * fname, struct ggml_context ** ctx_data, struct ggml_context ** ctx_eval) { assert(*ctx_data == NULL); assert(*ctx_eval == NULL); diff --git a/examples/common-ggml.h b/examples/common-ggml.h index 9260a7fd4..1a85abed3 100644 --- a/examples/common-ggml.h +++ b/examples/common-ggml.h @@ -17,5 +17,6 @@ bool ggml_common_quantize_0( const std::vector & to_quant, const std::vector & to_skip); -void ggml_graph_export(const struct ggml_cgraph * cgraph, const char * fname); -ggml_cgraph ggml_graph_import(const char * fname, struct ggml_context ** ctx_data, struct ggml_context ** ctx_eval); +// these will move to ggml when ready +void ggml_cgraph_export(const struct ggml_cgraph * cgraph, const char * fname); +ggml_cgraph ggml_cgraph_import(const char * fname, struct ggml_context ** ctx_data, struct ggml_context ** ctx_eval); diff --git a/examples/mnist/main-cpu.cpp b/examples/mnist/main-cpu.cpp index 86ca7259f..1a07b7453 100644 --- a/examples/mnist/main-cpu.cpp +++ b/examples/mnist/main-cpu.cpp @@ -38,7 +38,7 @@ int mnist_eval( struct ggml_context * ctx_data = NULL; struct ggml_context * ctx_eval = NULL; - struct ggml_cgraph gfi = ggml_graph_import(fname_cgraph, &ctx_data, &ctx_eval); + struct ggml_cgraph gfi = ggml_cgraph_import(fname_cgraph, &ctx_data, &ctx_eval); gfi.n_threads = n_threads; // allocate eval context diff --git a/examples/mnist/main.cpp b/examples/mnist/main.cpp index ced167c00..c6ecec2cd 100644 --- a/examples/mnist/main.cpp +++ b/examples/mnist/main.cpp @@ -206,7 +206,7 @@ int mnist_eval( if (fname_cgraph) { // export the compute graph for later use // see the "mnist-cpu" example - ggml_graph_export(&gf, "mnist.ggml"); + ggml_cgraph_export(&gf, "mnist.ggml"); fprintf(stderr, "%s: exported compute graph to '%s'\n", __func__, fname_cgraph); } From bf93623ef83121241c0458e9e701307445f29c7b Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sat, 27 May 2023 18:19:42 +0300 Subject: [PATCH 11/27] mnist : add Metal inference skeleton (WIP) --- examples/mnist/CMakeLists.txt | 22 +++++++ examples/mnist/main-mtl.cpp | 110 ++++++++++++++++++++++++++++++++++ examples/mnist/main-mtl.h | 18 ++++++ examples/mnist/main-mtl.m | 14 +++++ 4 files changed, 164 insertions(+) create mode 100644 examples/mnist/main-mtl.cpp create mode 100644 examples/mnist/main-mtl.h create mode 100644 examples/mnist/main-mtl.m diff --git a/examples/mnist/CMakeLists.txt b/examples/mnist/CMakeLists.txt index c3750954d..222d7cd2b 100644 --- a/examples/mnist/CMakeLists.txt +++ b/examples/mnist/CMakeLists.txt @@ -11,3 +11,25 @@ target_link_libraries(${TEST_TARGET} PRIVATE ggml common common-ggml) set(TEST_TARGET mnist-cpu) add_executable(${TEST_TARGET} main-cpu.cpp) target_link_libraries(${TEST_TARGET} PRIVATE ggml common common-ggml) + +if (APPLE) + # + # mnist-mtl + + find_library(FOUNDATION_LIBRARY Foundation REQUIRED) + find_library(METAL_FRAMEWORK Metal REQUIRED) + find_library(METALKIT_FRAMEWORK MetalKit REQUIRED) + find_library(METALPERFORMANCE_FRAMEWORK MetalPerformanceShaders REQUIRED) + + set(TEST_TARGET mnist-mtl) + add_executable(${TEST_TARGET} main-mtl.cpp main-mtl.h main-mtl.m) + target_link_libraries(${TEST_TARGET} PRIVATE + ggml + common + common-ggml + ${FOUNDATION_LIBRARY} + ${METAL_FRAMEWORK} + ${METALKIT_FRAMEWORK} + ${METALPERFORMANCE_FRAMEWORK} + ) +endif() diff --git a/examples/mnist/main-mtl.cpp b/examples/mnist/main-mtl.cpp new file mode 100644 index 000000000..9480dbc62 --- /dev/null +++ b/examples/mnist/main-mtl.cpp @@ -0,0 +1,110 @@ +// Use 
a pre-generated MNIST compute graph for inference on the M1 GPU via MPS +// + +#include "ggml/ggml.h" + +#include "main-mtl.h" + +#include "common-ggml.h" + +#include +#include +#include +#include +#include + +// evaluate the MNIST compute graph +// +// - fname_cgraph: path to the compute graph +// - n_threads: number of threads to use +// - digit: 784 pixel values +// +// returns 0 - 9 prediction +int mnist_eval( + const char * fname_cgraph, + const int n_threads, + std::vector digit + ) { + // load the compute graph + struct ggml_context * ctx_data = NULL; + struct ggml_context * ctx_eval = NULL; + + struct ggml_cgraph gf = ggml_cgraph_import(fname_cgraph, &ctx_data, &ctx_eval); + gf.n_threads = n_threads; + + // allocate eval context + // needed during ggml_graph_compute() to allocate a work tensor + static size_t buf_size = gf.work_size; // TODO + static void * buf = malloc(buf_size); + + struct ggml_init_params params = { + .mem_size = buf_size, + .mem_buffer = buf, + }; + + struct ggml_context * ctx_work = ggml_init(params); + + struct ggml_tensor * input = ggml_get_tensor_by_name(&gf, "input"); + memcpy(input->data, digit.data(), ggml_nbytes(input)); + + //ggml_graph_compute(ctx_work, &gf); + mnist_mtl_eval(ctx_data, ctx_eval, ctx_work, &gf); + + const float * probs_data = ggml_get_data_f32(ggml_get_tensor_by_name(&gf, "probs")); + + const int prediction = std::max_element(probs_data, probs_data + 10) - probs_data; + + ggml_free(ctx_work); + ggml_free(ctx_data); + ggml_free(ctx_eval); + + return prediction; +} + +int main(int argc, char ** argv) { + srand(time(NULL)); + ggml_time_init(); + + if (argc != 3) { + fprintf(stderr, "Usage: %s models/mnist/mnist.ggml models/mnist/t10k-images.idx3-ubyte\n", argv[0]); + exit(0); + } + + uint8_t buf[784]; + std::vector digit; + + // read a random digit from the test set + { + std::ifstream fin(argv[2], std::ios::binary); + if (!fin) { + fprintf(stderr, "%s: failed to open '%s'\n", __func__, argv[2]); + return 1; + } + + // seek to a random digit: 16-byte header + 28*28 * (random 0 - 10000) + fin.seekg(16 + 784 * (rand() % 10000)); + fin.read((char *) &buf, sizeof(buf)); + } + + // render the digit in ASCII + { + digit.resize(sizeof(buf)); + + for (int row = 0; row < 28; row++) { + for (int col = 0; col < 28; col++) { + fprintf(stderr, "%c ", (float)buf[row*28 + col] > 230 ? 
'*' : '_'); + digit[row*28 + col] = ((float)buf[row*28 + col]); + } + + fprintf(stderr, "\n"); + } + + fprintf(stderr, "\n"); + } + + const int prediction = mnist_eval(argv[1], 1, digit); + + fprintf(stdout, "%s: predicted digit is %d\n", __func__, prediction); + + return 0; +}
diff --git a/examples/mnist/main-mtl.h b/examples/mnist/main-mtl.h new file mode 100644 index 000000000..8dee89701 --- /dev/null +++ b/examples/mnist/main-mtl.h @@ -0,0 +1,18 @@ +#pragma once + +struct ggml_context; +struct ggml_cgraph; + +#ifdef __cplusplus +extern "C" { +#endif + +int mnist_mtl_eval( + struct ggml_context * ctx_data, + struct ggml_context * ctx_eval, + struct ggml_context * ctx_work, + struct ggml_cgraph * gf); + +#ifdef __cplusplus +} +#endif
diff --git a/examples/mnist/main-mtl.m b/examples/mnist/main-mtl.m new file mode 100644 index 000000000..8e82089dc --- /dev/null +++ b/examples/mnist/main-mtl.m @@ -0,0 +1,14 @@ +#import "main-mtl.h" + +#import <Foundation/Foundation.h> +#import <Metal/Metal.h> +#import <MetalPerformanceShaders/MetalPerformanceShaders.h> + +int mnist_mtl_eval( + struct ggml_context * ctx_data, + struct ggml_context * ctx_eval, + struct ggml_context * ctx_work, + struct ggml_cgraph * gf) { + printf("mnist_mtl_eval\n"); + return 0; +}
From bb126f9f7d6d04fa23b8ad0cbf588e38c27bf085 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sun, 28 May 2023 14:07:19 +0300 Subject: [PATCH 12/27] mnist : working on the Metal pipeline (WIP) --- examples/mnist/main-mtl.cpp | 8 ++-- examples/mnist/main-mtl.h | 8 +++- examples/mnist/main-mtl.m | 88 ++++++++++++++++++++++++++++++++++++- include/ggml/ggml.h | 5 ++- src/ggml.c | 8 ++++ 5 files changed, 108 insertions(+), 9 deletions(-)
diff --git a/examples/mnist/main-mtl.cpp b/examples/mnist/main-mtl.cpp index 9480dbc62..c5c1ed821 100644 --- a/examples/mnist/main-mtl.cpp +++ b/examples/mnist/main-mtl.cpp @@ -48,11 +48,9 @@ int mnist_eval( memcpy(input->data, digit.data(), ggml_nbytes(input)); //ggml_graph_compute(ctx_work, &gf); - mnist_mtl_eval(ctx_data, ctx_eval, ctx_work, &gf); - - const float * probs_data = ggml_get_data_f32(ggml_get_tensor_by_name(&gf, "probs")); - - const int prediction = std::max_element(probs_data, probs_data + 10) - probs_data; + auto ctx_mtl = mnist_mtl_init(ctx_data, ctx_eval, ctx_work, &gf); + const int prediction = mnist_mtl_eval(ctx_mtl, &gf); + mnist_mtl_free(ctx_mtl); ggml_free(ctx_work); ggml_free(ctx_data);
diff --git a/examples/mnist/main-mtl.h b/examples/mnist/main-mtl.h index 8dee89701..0fa6f7ba5 100644 --- a/examples/mnist/main-mtl.h +++ b/examples/mnist/main-mtl.h @@ -7,12 +7,18 @@ struct ggml_cgraph; extern "C" { #endif -int mnist_mtl_eval( +struct ggml_mtl_context; + +struct ggml_mtl_context * mnist_mtl_init( struct ggml_context * ctx_data, struct ggml_context * ctx_eval, struct ggml_context * ctx_work, struct ggml_cgraph * gf); +void mnist_mtl_free(struct ggml_mtl_context * ctx); + +int mnist_mtl_eval(struct ggml_mtl_context * ctx, struct ggml_cgraph * gf); + #ifdef __cplusplus } #endif
diff --git a/examples/mnist/main-mtl.m b/examples/mnist/main-mtl.m index 8e82089dc..f48bbb177 100644 --- a/examples/mnist/main-mtl.m +++ b/examples/mnist/main-mtl.m @@ -1,14 +1,98 @@ #import "main-mtl.h" +#import "ggml/ggml.h" + #import <Foundation/Foundation.h> #import <Metal/Metal.h> #import <MetalPerformanceShaders/MetalPerformanceShaders.h> +struct ggml_mtl_context { + id<MTLDevice> device; + id<MTLCommandQueue> queue; + id<MTLCommandBuffer> command_buffer; + + id<MTLHeap> heap_data; + id<MTLHeap> heap_eval; +}; + +struct ggml_mtl_context * mnist_mtl_init( struct ggml_context * ctx_data, struct ggml_context * ctx_eval, struct ggml_context * ctx_work, struct ggml_cgraph * gf) { fprintf(stderr, "%s: 
allocating\n", __func__); + + struct ggml_mtl_context * ctx = malloc(sizeof(struct ggml_mtl_context)); + + ctx->device = MTLCreateSystemDefaultDevice(); + ctx->queue = [ctx->device newCommandQueue]; + ctx->command_buffer = [ctx->queue commandBuffer]; + + // pin ctx_data memory to GPU + // use MTLStorageModeShared to allow us to initialize the weights from the CPU + // TODO: how to use MTLStorageModeManaged? + // TODO: see if we can avoid this copy somehow + { + const void * mem_buffer = ggml_get_mem_buffer(ctx_data); + const size_t mem_size = ggml_get_mem_size(ctx_data); + + MTLHeapDescriptor * heap_desc = [MTLHeapDescriptor new]; + heap_desc.storageMode = MTLStorageModeShared; + heap_desc.size = mem_size; + + ctx->heap_data = [ctx->device newHeapWithDescriptor:heap_desc]; + [ctx->heap_data setPurgeableState:MTLPurgeableStateNonVolatile]; + + id buffer = [ctx->heap_data newBufferWithLength:mem_size options:MTLResourceStorageModeShared]; + + // copy data from CPU to GPU + memcpy([buffer contents], mem_buffer, mem_size); + } + + // pin ctx_eval memory to GPU + // this heap will be used for the intermediate results of the evaluation + { + const size_t mem_size = ggml_get_mem_size(ctx_eval); + + MTLHeapDescriptor * heap_desc = [MTLHeapDescriptor new]; + heap_desc.storageMode = MTLStorageModePrivate; // GPU only + heap_desc.size = mem_size; + + ctx->heap_eval = [ctx->device newHeapWithDescriptor:heap_desc]; + [ctx->heap_eval setPurgeableState:MTLPurgeableStateNonVolatile]; + } + + return ctx; +} + +void mnist_mtl_free(struct ggml_mtl_context * ctx) { + fprintf(stderr, "%s: deallocating\n", __func__); + + free(ctx); +} + +int mnist_mtl_eval(struct ggml_mtl_context * ctx, struct ggml_cgraph * gf) { + fprintf(stderr, "%s: evaluating\n", __func__); + + // create a new encoder for the command buffer + id encoder = [ctx->command_buffer computeCommandEncoder]; + + for (int i = 0; i < gf->n_nodes; ++i) { + fprintf(stderr, "%s: encoding node %3d, op = %8s\n", __func__, i, ggml_op_name(gf->nodes[i]->op)); + + // TODO ... 
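+        // (planned: look up the node's src/dst buffers on the heaps and encode a matching compute kernel dispatch)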
+ } + + // finish encoding + [encoder endEncoding]; + + [ctx->command_buffer commit]; + [ctx->command_buffer waitUntilCompleted]; + + { + const double time_elapsed = [ctx->command_buffer GPUEndTime] - [ctx->command_buffer GPUStartTime]; + fprintf(stderr, "%s: time elapsed = %f\n", __func__, time_elapsed); + } + return 0; } diff --git a/include/ggml/ggml.h b/include/ggml/ggml.h index 558138280..ad08adeb1 100644 --- a/include/ggml/ggml.h +++ b/include/ggml/ggml.h @@ -451,9 +451,12 @@ extern "C" { GGML_API size_t ggml_used_mem(const struct ggml_context * ctx); - GGML_API size_t ggml_set_scratch(struct ggml_context * ctx, struct ggml_scratch scratch); + GGML_API size_t ggml_set_scratch (struct ggml_context * ctx, struct ggml_scratch scratch); GGML_API void ggml_set_no_alloc(struct ggml_context * ctx, bool no_alloc); + GGML_API void * ggml_get_mem_buffer(struct ggml_context * ctx); + GGML_API size_t ggml_get_mem_size (struct ggml_context * ctx); + GGML_API struct ggml_tensor * ggml_new_tensor( struct ggml_context * ctx, enum ggml_type type, diff --git a/src/ggml.c b/src/ggml.c index 07ea92a58..f3831a4fb 100644 --- a/src/ggml.c +++ b/src/ggml.c @@ -4030,6 +4030,14 @@ void ggml_set_no_alloc(struct ggml_context * ctx, bool no_alloc) { ctx->no_alloc = no_alloc; } +void * ggml_get_mem_buffer(struct ggml_context * ctx) { + return ctx->mem_buffer; +} + +size_t ggml_get_mem_size(struct ggml_context * ctx) { + return ctx->mem_size; +} + // IMPORTANT: // when creating "opt" tensors, always save and load the scratch buffer // this is an error prone process, but it is necessary to support inplace From 24ea9dddf5907aabd684f2e6fb87e123cc88d5f7 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sun, 28 May 2023 14:15:43 +0300 Subject: [PATCH 13/27] mnist : prepare the Metal encoder (WIP) --- examples/mnist/main-mtl.m | 32 +++++++++++++++++++++++--------- 1 file changed, 23 insertions(+), 9 deletions(-) diff --git a/examples/mnist/main-mtl.m b/examples/mnist/main-mtl.m index f48bbb177..add85a77d 100644 --- a/examples/mnist/main-mtl.m +++ b/examples/mnist/main-mtl.m @@ -9,7 +9,6 @@ struct ggml_mtl_context { id device; id queue; - id command_buffer; id heap_data; id heap_eval; @@ -26,7 +25,6 @@ ctx->device = MTLCreateSystemDefaultDevice(); ctx->queue = [ctx->device newCommandQueue]; - ctx->command_buffer = [ctx->queue commandBuffer]; // pin ctx_data memory to GPU // use MTLStorageModeShared to allow us to initialize the weights from the CPU @@ -74,23 +72,39 @@ void mnist_mtl_free(struct ggml_mtl_context * ctx) { int mnist_mtl_eval(struct ggml_mtl_context * ctx, struct ggml_cgraph * gf) { fprintf(stderr, "%s: evaluating\n", __func__); - // create a new encoder for the command buffer - id encoder = [ctx->command_buffer computeCommandEncoder]; + id command_buffer = [ctx->queue commandBuffer]; + id encoder = [command_buffer computeCommandEncoder]; for (int i = 0; i < gf->n_nodes; ++i) { fprintf(stderr, "%s: encoding node %3d, op = %8s\n", __func__, i, ggml_op_name(gf->nodes[i]->op)); - // TODO ... 
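+        // pick an encoding path per op type; anything unsupported hits the assert in the default case below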
+ switch (gf->nodes[i]->op) { + case GGML_OP_ADD: + { + } break; + case GGML_OP_RELU: + { + } break; + case GGML_OP_SOFT_MAX: + { + } break; + case GGML_OP_MUL_MAT: + { + } break; + default: + fprintf(stderr, "%s: node %3d, op = %8s not implemented\n", __func__, i, ggml_op_name(gf->nodes[i]->op)); + GGML_ASSERT(false); + return -1; + } } - // finish encoding [encoder endEncoding]; - [ctx->command_buffer commit]; - [ctx->command_buffer waitUntilCompleted]; + [command_buffer commit]; + [command_buffer waitUntilCompleted]; { - const double time_elapsed = [ctx->command_buffer GPUEndTime] - [ctx->command_buffer GPUStartTime]; + const double time_elapsed = [command_buffer GPUEndTime] - [command_buffer GPUStartTime]; fprintf(stderr, "%s: time elapsed = %f\n", __func__, time_elapsed); }
From 2ec1dff65449e021cfffef58c0a7649db9fdcd9a Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sun, 28 May 2023 15:28:01 +0300 Subject: [PATCH 14/27] mnist : first Metal kernel for F32 ADD --- examples/mnist/main-mtl.h | 12 +++--- examples/mnist/main-mtl.m | 79 +++++++++++++++++++++++++++++++++++++-- 2 files changed, 82 insertions(+), 9 deletions(-)
diff --git a/examples/mnist/main-mtl.h b/examples/mnist/main-mtl.h index 0fa6f7ba5..4e661a4d3 100644 --- a/examples/mnist/main-mtl.h +++ b/examples/mnist/main-mtl.h @@ -10,14 +10,16 @@ extern "C" { struct ggml_mtl_context; struct ggml_mtl_context * mnist_mtl_init( - struct ggml_context * ctx_data, - struct ggml_context * ctx_eval, - struct ggml_context * ctx_work, - struct ggml_cgraph * gf); + struct ggml_context * ctx_data, + struct ggml_context * ctx_eval, + struct ggml_context * ctx_work, + struct ggml_cgraph * gf); void mnist_mtl_free(struct ggml_mtl_context * ctx); -int mnist_mtl_eval(struct ggml_mtl_context * ctx, struct ggml_cgraph * gf); +int mnist_mtl_eval( + struct ggml_mtl_context * ctx, + struct ggml_cgraph * gf); #ifdef __cplusplus }
diff --git a/examples/mnist/main-mtl.m b/examples/mnist/main-mtl.m index add85a77d..3021f17a4 100644 --- a/examples/mnist/main-mtl.m +++ b/examples/mnist/main-mtl.m @@ -7,13 +7,36 @@ #import struct ggml_mtl_context { - id<MTLDevice> device; + struct ggml_context * ctx_data; + struct ggml_context * ctx_eval; + struct ggml_context * ctx_work; + + id<MTLDevice> device; id<MTLCommandQueue> queue; + id<MTLLibrary> library; id<MTLHeap> heap_data; id<MTLHeap> heap_eval; + + // custom kernels + id<MTLFunction> function_add; + id<MTLComputePipelineState> pipeline_add; }; +// MSL code +NSString * const msl_library_mnist = @"\ +#include <metal_stdlib> \n\ +using namespace metal; \n\ + \n\ +kernel void kernel_add( \n\ + device const float * src0, \n\ + device const float * src1, \n\ + device float * dst, \n\ + uint gid[[thread_position_in_grid]]) { \n\ + dst[gid] = src0[gid] + src1[gid]; \n\ +} \n\ +"; + struct ggml_mtl_context * mnist_mtl_init( struct ggml_context * ctx_data, struct ggml_context * ctx_eval, struct ggml_context * ctx_work, struct ggml_cgraph * gf) { fprintf(stderr, "%s: allocating\n", __func__); struct ggml_mtl_context * ctx = malloc(sizeof(struct ggml_mtl_context)); - ctx->device = MTLCreateSystemDefaultDevice(); - ctx->queue = [ctx->device newCommandQueue]; + ctx->ctx_data = ctx_data; + ctx->ctx_eval = ctx_eval; + ctx->ctx_work = ctx_work; + + ctx->device = MTLCreateSystemDefaultDevice(); + ctx->queue = [ctx->device newCommandQueue]; + + // compile from source string and show compile log + NSError * error = nil; + ctx->library = [ctx->device newLibraryWithSource:msl_library_mnist options:nil error:&error]; + if (error) { + fprintf(stderr, "%s: error: %s\n", __func__, [[error description] UTF8String]); + exit(1); + } + + // load kernels + ctx->function_add = [ctx->library newFunctionWithName:@"kernel_add"]; + 
ctx->pipeline_add = [ctx->device newComputePipelineStateWithFunction:ctx->function_add error:nil]; // pin ctx_data memory to GPU // use MTLStorageModeShared to allow us to initialize the weights from the CPU @@ -69,7 +108,28 @@ void mnist_mtl_free(struct ggml_mtl_context * ctx) { free(ctx); } -int mnist_mtl_eval(struct ggml_mtl_context * ctx, struct ggml_cgraph * gf) { +// make a view of the respective MTL heap +id mnist_mtl_get_buffer(struct ggml_mtl_context * ctx, struct ggml_tensor * t) { + const int64_t offs_data = (int64_t) t->data - (int64_t) ggml_get_mem_buffer(ctx->ctx_data); + const int64_t offs_eval = (int64_t) t->data - (int64_t) ggml_get_mem_buffer(ctx->ctx_eval); + + const bool is_data = (offs_eval < 0) || (offs_data >= 0 && offs_data < offs_eval); + + const size_t t_size = ggml_nbytes(t); + const size_t t_offs = is_data ? offs_data : offs_eval; + + if (is_data) { + fprintf(stderr, "%s: data tensor '%s'\n", __func__, t->name); + return [ctx->heap_data newBufferWithLength:t_size options:MTLResourceStorageModeShared offset:t_offs]; + } else { + fprintf(stderr, "%s: eval tensor '%s'\n", __func__, t->name); + return [ctx->heap_eval newBufferWithLength:t_size options:MTLResourceStorageModePrivate offset:t_offs]; + } +} + +int mnist_mtl_eval( + struct ggml_mtl_context * ctx, + struct ggml_cgraph * gf) { fprintf(stderr, "%s: evaluating\n", __func__); id command_buffer = [ctx->queue commandBuffer]; @@ -81,6 +141,17 @@ int mnist_mtl_eval(struct ggml_mtl_context * ctx, struct ggml_cgraph * gf) { switch (gf->nodes[i]->op) { case GGML_OP_ADD: { + id id_src0 = mnist_mtl_get_buffer(ctx, gf->nodes[i]->src0); + id id_src1 = mnist_mtl_get_buffer(ctx, gf->nodes[i]->src1); + id id_dst = mnist_mtl_get_buffer(ctx, gf->nodes[i]); + + [encoder setComputePipelineState:ctx->pipeline_add]; + [encoder setBuffer:id_src0 offset:0 atIndex:0]; + [encoder setBuffer:id_src1 offset:0 atIndex:1]; + [encoder setBuffer:id_dst offset:0 atIndex:2]; + + const int64_t n = ggml_nelements(gf->nodes[i]); + [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)]; } break; case GGML_OP_RELU: { From 966f9e6bdf0d7548470d83869622002b558e4b29 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sun, 28 May 2023 16:06:55 +0300 Subject: [PATCH 15/27] mnist : looks like MTLHeap does not work --- examples/mnist/main-mtl.m | 79 ++++++++++++++++++++++++++++++++++++--- 1 file changed, 73 insertions(+), 6 deletions(-) diff --git a/examples/mnist/main-mtl.m b/examples/mnist/main-mtl.m index 3021f17a4..fbad028c7 100644 --- a/examples/mnist/main-mtl.m +++ b/examples/mnist/main-mtl.m @@ -21,6 +21,12 @@ // custom kernels id function_add; id pipeline_add; + + id function_relu; + id pipeline_relu; + + id function_softmax; + id pipeline_softmax; }; // MSL code @@ -34,6 +40,13 @@ kernel void kernel_add( device float * dst, \n\ uint gid[[thread_position_in_grid]]) { \n\ dst[gid] = src0[gid] + src1[gid]; \n\ +} \n\ + \n\ +kernel void kernel_relu( \n\ + device const float * src, \n\ + device float * dst, \n\ + uint gid[[thread_position_in_grid]]) { \n\ + dst[gid] = max(0.0f, src[gid]); \n\ } \n\ "; @@ -53,6 +66,14 @@ kernel void kernel_add( ctx->device = MTLCreateSystemDefaultDevice(); ctx->queue = [ctx->device newCommandQueue]; + // determine if we can use MPS + if (MPSSupportsMTLDevice(ctx->device)) { + fprintf(stderr, "%s: using MPS\n", __func__); + } else { + fprintf(stderr, "%s: not using MPS\n", __func__); + GGML_ASSERT(false && "MPS not supported"); + } + // compile from source string and show 
compile log NSError * error = nil; ctx->library = [ctx->device newLibraryWithSource:msl_library_mnist options:nil error:&error]; @@ -65,6 +86,9 @@ kernel void kernel_add( ctx->function_add = [ctx->library newFunctionWithName:@"kernel_add"]; ctx->pipeline_add = [ctx->device newComputePipelineStateWithFunction:ctx->function_add error:nil]; + ctx->function_relu = [ctx->library newFunctionWithName:@"kernel_relu"]; + ctx->pipeline_relu = [ctx->device newComputePipelineStateWithFunction:ctx->function_relu error:nil]; + // pin ctx_data memory to GPU // use MTLStorageModeShared to allow us to initialize the weights from the CPU // TODO: how to use MTLStorageModeManaged? @@ -77,13 +101,24 @@ kernel void kernel_add( heap_desc.storageMode = MTLStorageModeShared; heap_desc.size = mem_size; + printf("heap_desc.size = %zu\n", mem_size); + ctx->heap_data = [ctx->device newHeapWithDescriptor:heap_desc]; - [ctx->heap_data setPurgeableState:MTLPurgeableStateNonVolatile]; + [ctx->heap_data setPurgeableState:MTLPurgeableStateNonVolatile]; // TODO: is this needed? + ctx->heap_data.label = @"heap_data"; + + printf("ctx->heap_data.size = %zu\n", [ctx->heap_data size]); id buffer = [ctx->heap_data newBufferWithLength:mem_size options:MTLResourceStorageModeShared]; + if (!buffer) { + fprintf(stderr, "%s: error: failed to allocate buffer\n", __func__); + exit(1); + } // copy data from CPU to GPU memcpy([buffer contents], mem_buffer, mem_size); + + fprintf(stderr, "%s: allocated data heap, size = %zu\n", __func__, mem_size); } // pin ctx_eval memory to GPU @@ -96,7 +131,9 @@ kernel void kernel_add( heap_desc.size = mem_size; ctx->heap_eval = [ctx->device newHeapWithDescriptor:heap_desc]; - [ctx->heap_eval setPurgeableState:MTLPurgeableStateNonVolatile]; + [ctx->heap_eval setPurgeableState:MTLPurgeableStateNonVolatile]; // TODO: is this needed? + + fprintf(stderr, "%s: allocated eval heap, size = %zu\n", __func__, mem_size); } return ctx; @@ -118,13 +155,22 @@ void mnist_mtl_free(struct ggml_mtl_context * ctx) { const size_t t_size = ggml_nbytes(t); const size_t t_offs = is_data ? 
offs_data : offs_eval; + id result; + if (is_data) { - fprintf(stderr, "%s: data tensor '%s'\n", __func__, t->name); - return [ctx->heap_data newBufferWithLength:t_size options:MTLResourceStorageModeShared offset:t_offs]; + fprintf(stderr, "%s: data tensor '%8s', offs = %8ld, size = %8ld\n", __func__, t->name, t_offs, t_size); + result = [ctx->heap_data newBufferWithLength:t_size options:MTLResourceStorageModeShared offset:t_offs]; } else { - fprintf(stderr, "%s: eval tensor '%s'\n", __func__, t->name); - return [ctx->heap_eval newBufferWithLength:t_size options:MTLResourceStorageModePrivate offset:t_offs]; + fprintf(stderr, "%s: eval tensor '%8s', offs = %8ld, size = %8ld\n", __func__, t->name, t_offs, t_size); + result = [ctx->heap_eval newBufferWithLength:t_size options:MTLResourceStorageModePrivate offset:t_offs]; + } + + NSLog(@"%s: buffer = %p\n", __func__, result); + if (result == nil) { + fprintf(stderr, "%s: error: buffer is nil\n", __func__); } + + return result; } int mnist_mtl_eval( @@ -155,9 +201,30 @@ int mnist_mtl_eval( } break; case GGML_OP_RELU: { + id id_src = mnist_mtl_get_buffer(ctx, gf->nodes[i]->src0); + id id_dst = mnist_mtl_get_buffer(ctx, gf->nodes[i]); + + [encoder setComputePipelineState:ctx->pipeline_relu]; + [encoder setBuffer:id_src offset:0 atIndex:0]; + [encoder setBuffer:id_dst offset:0 atIndex:1]; + + const int64_t n = ggml_nelements(gf->nodes[i]); + [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)]; } break; case GGML_OP_SOFT_MAX: { + // use MPSMatrixSoftMax + //id id_src = mnist_mtl_get_buffer(ctx, gf->nodes[i]->src0); + //id id_dst = mnist_mtl_get_buffer(ctx, gf->nodes[i]); + + //MPSMatrixDescriptor * desc = [MPSMatrixDescriptor + // matrixDescriptorWithRows:1 columns:gf->nodes[i]->ne[0] rowBytes:gf->nodes[i]->nb[1] dataType:MPSDataTypeFloat32]; + + //MPSMatrix * mat_src = [[MPSMatrix alloc] initWithBuffer:id_src descriptor:desc]; + //MPSMatrix * mat_dst = [[MPSMatrix alloc] initWithBuffer:id_dst descriptor:desc]; + + //MPSMatrixSoftMax * softmax = [[MPSMatrixSoftMax alloc] initWithDevice:ctx->device]; + //[softmax encodeToCommandBuffer:command_buffer inputMatrix:mat_src resultMatrix:mat_dst]; } break; case GGML_OP_MUL_MAT: { From 1bc91815ce29815b1309916010322eced68356d5 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sun, 28 May 2023 16:34:47 +0300 Subject: [PATCH 16/27] mnist : initial full pass of MNIST on the GPU (not verified) --- examples/mnist/main-mtl.m | 178 +++++++++++++++++++++++++++++++++----- 1 file changed, 156 insertions(+), 22 deletions(-) diff --git a/examples/mnist/main-mtl.m b/examples/mnist/main-mtl.m index fbad028c7..bb0cf924a 100644 --- a/examples/mnist/main-mtl.m +++ b/examples/mnist/main-mtl.m @@ -6,6 +6,9 @@ #import #import +// TODO: couldn't get this to work +//#define GGML_MTL_HEAP + struct ggml_mtl_context { struct ggml_context * ctx_data; struct ggml_context * ctx_eval; @@ -15,8 +18,13 @@ id queue; id library; +#ifdef GGML_MTL_HEAP id heap_data; id heap_eval; +#else + id buffer_data; + id buffer_eval; +#endif // custom kernels id function_add; @@ -89,6 +97,9 @@ kernel void kernel_relu( ctx->function_relu = [ctx->library newFunctionWithName:@"kernel_relu"]; ctx->pipeline_relu = [ctx->device newComputePipelineStateWithFunction:ctx->function_relu error:nil]; +#ifdef GGML_MTL_HEAP + // MTLHeap approach + // pin ctx_data memory to GPU // use MTLStorageModeShared to allow us to initialize the weights from the CPU // TODO: how to use MTLStorageModeManaged? 
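For readability, here is the embedded Metal library at this stage of the series, unescaped into standalone MSL (the header include, which the escaped string obscures, is metal_stdlib):

    #include <metal_stdlib>
    using namespace metal;

    kernel void kernel_add(
            device const float * src0,
            device const float * src1,
            device       float * dst,
            uint gid [[thread_position_in_grid]]) {
        dst[gid] = src0[gid] + src1[gid];
    }

    kernel void kernel_relu(
            device const float * src,
            device       float * dst,
            uint gid [[thread_position_in_grid]]) {
        dst[gid] = max(0.0f, src[gid]);
    }

Both kernels use the simplest possible mapping of one thread per tensor element, indexed by thread_position_in_grid.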
@@ -135,6 +146,32 @@ kernel void kernel_relu( fprintf(stderr, "%s: allocated eval heap, size = %zu\n", __func__, mem_size); } +#else + // MTLBuffer approach + + // pin ctx_data memory to GPU + // use MTLStorageModeShared to allow us to initialize the weights from the CPU + // TODO: how to use MTLStorageModeManaged? + // TODO: see if we can avoid this copy somehow + { + const void * mem_buffer = ggml_get_mem_buffer(ctx_data); + const size_t mem_size = ggml_get_mem_size(ctx_data); + + ctx->buffer_data = [ctx->device newBufferWithBytes:mem_buffer length:mem_size options:MTLResourceStorageModeShared]; + + fprintf(stderr, "%s: allocated data buffer, size = %zu\n", __func__, mem_size); + } + + // pin ctx_eval memory to GPU + // this heap will be used for the intermediate results of the evaluation + { + const size_t mem_size = ggml_get_mem_size(ctx_eval); + + ctx->buffer_eval = [ctx->device newBufferWithLength:mem_size options:MTLResourceStorageModePrivate]; + + fprintf(stderr, "%s: allocated eval buffer, size = %zu\n", __func__, mem_size); + } +#endif return ctx; } @@ -145,8 +182,10 @@ void mnist_mtl_free(struct ggml_mtl_context * ctx) { free(ctx); } +#ifdef GGML_MTL_HEAP + // make a view of the respective MTL heap -id mnist_mtl_get_buffer(struct ggml_mtl_context * ctx, struct ggml_tensor * t) { +id mnist_mtl_get_buffer_on_heap(struct ggml_mtl_context * ctx, struct ggml_tensor * t) { const int64_t offs_data = (int64_t) t->data - (int64_t) ggml_get_mem_buffer(ctx->ctx_data); const int64_t offs_eval = (int64_t) t->data - (int64_t) ggml_get_mem_buffer(ctx->ctx_eval); @@ -165,21 +204,62 @@ void mnist_mtl_free(struct ggml_mtl_context * ctx) { result = [ctx->heap_eval newBufferWithLength:t_size options:MTLResourceStorageModePrivate offset:t_offs]; } - NSLog(@"%s: buffer = %p\n", __func__, result); if (result == nil) { fprintf(stderr, "%s: error: buffer is nil\n", __func__); + GGML_ASSERT(false); + } + + return result; +} + +#else + +// get data / eval buffer + offset +id mnist_mtl_get_buffer(struct ggml_mtl_context * ctx, struct ggml_tensor * t, size_t * offs) { + const int64_t offs_data = (int64_t) t->data - (int64_t) ggml_get_mem_buffer(ctx->ctx_data); + const int64_t offs_eval = (int64_t) t->data - (int64_t) ggml_get_mem_buffer(ctx->ctx_eval); + + const bool is_data = (offs_eval < 0) || (offs_data >= 0 && offs_data < offs_eval); + + const size_t t_size = ggml_nbytes(t); + const size_t t_offs = is_data ? 
offs_data : offs_eval; + + id result; + + if (is_data) { + fprintf(stderr, "%s: data tensor '%8s', offs = %8ld, size = %8ld\n", __func__, t->name, t_offs, t_size); + result = ctx->buffer_data; + } else { + fprintf(stderr, "%s: eval tensor '%8s', offs = %8ld, size = %8ld\n", __func__, t->name, t_offs, t_size); + result = ctx->buffer_eval; + } + + if (result == nil) { + fprintf(stderr, "%s: error: buffer is nil\n", __func__); + GGML_ASSERT(false); + } + + if (offs != nil) { + *offs = t_offs; } return result; } +#endif + int mnist_mtl_eval( struct ggml_mtl_context * ctx, struct ggml_cgraph * gf) { fprintf(stderr, "%s: evaluating\n", __func__); id command_buffer = [ctx->queue commandBuffer]; - id encoder = [command_buffer computeCommandEncoder]; + //id encoder = [command_buffer computeCommandEncoder]; + id encoder = nil; + + size_t offs_src0; + size_t offs_src1; + size_t offs_dst; for (int i = 0; i < gf->n_nodes; ++i) { fprintf(stderr, "%s: encoding node %3d, op = %8s\n", __func__, i, ggml_op_name(gf->nodes[i]->op)); @@ -187,47 +267,97 @@ int mnist_mtl_eval( switch (gf->nodes[i]->op) { case GGML_OP_ADD: { - id id_src0 = mnist_mtl_get_buffer(ctx, gf->nodes[i]->src0); - id id_src1 = mnist_mtl_get_buffer(ctx, gf->nodes[i]->src1); - id id_dst = mnist_mtl_get_buffer(ctx, gf->nodes[i]); + if (encoder == nil) { + encoder = [command_buffer computeCommandEncoder]; + } + + id id_src0 = mnist_mtl_get_buffer(ctx, gf->nodes[i]->src0, &offs_src0); + id id_src1 = mnist_mtl_get_buffer(ctx, gf->nodes[i]->src1, &offs_src1); + id id_dst = mnist_mtl_get_buffer(ctx, gf->nodes[i], &offs_dst); [encoder setComputePipelineState:ctx->pipeline_add]; - [encoder setBuffer:id_src0 offset:0 atIndex:0]; - [encoder setBuffer:id_src1 offset:0 atIndex:1]; - [encoder setBuffer:id_dst offset:0 atIndex:2]; + [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; + [encoder setBuffer:id_src1 offset:offs_src1 atIndex:1]; + [encoder setBuffer:id_dst offset:offs_dst atIndex:2]; const int64_t n = ggml_nelements(gf->nodes[i]); [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)]; } break; case GGML_OP_RELU: { - id id_src = mnist_mtl_get_buffer(ctx, gf->nodes[i]->src0); - id id_dst = mnist_mtl_get_buffer(ctx, gf->nodes[i]); + if (encoder == nil) { + encoder = [command_buffer computeCommandEncoder]; + } + + id id_src = mnist_mtl_get_buffer(ctx, gf->nodes[i]->src0, &offs_src0); + id id_dst = mnist_mtl_get_buffer(ctx, gf->nodes[i], &offs_dst); [encoder setComputePipelineState:ctx->pipeline_relu]; - [encoder setBuffer:id_src offset:0 atIndex:0]; - [encoder setBuffer:id_dst offset:0 atIndex:1]; + [encoder setBuffer:id_src offset:offs_src0 atIndex:0]; + [encoder setBuffer:id_dst offset:offs_dst atIndex:1]; const int64_t n = ggml_nelements(gf->nodes[i]); [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)]; } break; case GGML_OP_SOFT_MAX: { + if (encoder != nil) { + [encoder endEncoding]; + encoder = nil; + } + // use MPSMatrixSoftMax - //id id_src = mnist_mtl_get_buffer(ctx, gf->nodes[i]->src0); - //id id_dst = mnist_mtl_get_buffer(ctx, gf->nodes[i]); + id id_src = mnist_mtl_get_buffer(ctx, gf->nodes[i]->src0, &offs_src0); + id id_dst = mnist_mtl_get_buffer(ctx, gf->nodes[i], &offs_dst); + + MPSMatrixDescriptor * desc = [MPSMatrixDescriptor + matrixDescriptorWithRows:1 columns:gf->nodes[i]->ne[0] rowBytes:gf->nodes[i]->nb[1] dataType:MPSDataTypeFloat32]; - //MPSMatrixDescriptor * desc = [MPSMatrixDescriptor - // matrixDescriptorWithRows:1 
columns:gf->nodes[i]->ne[0] rowBytes:gf->nodes[i]->nb[1] dataType:MPSDataTypeFloat32]; + MPSMatrix * mat_src = [[MPSMatrix alloc] initWithBuffer:id_src offset:offs_src0 descriptor:desc]; + MPSMatrix * mat_dst = [[MPSMatrix alloc] initWithBuffer:id_dst offset:offs_dst descriptor:desc]; - //MPSMatrix * mat_src = [[MPSMatrix alloc] initWithBuffer:id_src descriptor:desc]; - //MPSMatrix * mat_dst = [[MPSMatrix alloc] initWithBuffer:id_dst descriptor:desc]; + MPSMatrixSoftMax * softmax = [[MPSMatrixSoftMax alloc] initWithDevice:ctx->device]; - //MPSMatrixSoftMax * softmax = [[MPSMatrixSoftMax alloc] initWithDevice:ctx->device]; - //[softmax encodeToCommandBuffer:command_buffer inputMatrix:mat_src resultMatrix:mat_dst]; + [softmax encodeToCommandBuffer:command_buffer inputMatrix:mat_src resultMatrix:mat_dst]; } break; case GGML_OP_MUL_MAT: { + if (encoder != nil) { + [encoder endEncoding]; + encoder = nil; + } + + // use MPSMatrixMultiplication + id id_src0 = mnist_mtl_get_buffer(ctx, gf->nodes[i]->src0, &offs_src0); + id id_src1 = mnist_mtl_get_buffer(ctx, gf->nodes[i]->src1, &offs_src1); + id id_dst = mnist_mtl_get_buffer(ctx, gf->nodes[i], &offs_dst); + + const int64_t ncols0 = gf->nodes[i]->src0->ne[0]; + const int64_t nrows0 = gf->nodes[i]->src0->ne[1]; + + const int64_t ncols1 = gf->nodes[i]->src1->ne[0]; + const int64_t nrows1 = gf->nodes[i]->src1->ne[1]; + + const int64_t ncols2 = gf->nodes[i]->ne[0]; + const int64_t nrows2 = gf->nodes[i]->ne[1]; + + GGML_ASSERT(ncols0 == ncols1); + + MPSMatrixDescriptor * desc0 = [MPSMatrixDescriptor + matrixDescriptorWithRows:nrows0 columns:ncols0 rowBytes:gf->nodes[i]->src0->nb[1] dataType:MPSDataTypeFloat32]; + MPSMatrixDescriptor * desc1 = [MPSMatrixDescriptor + matrixDescriptorWithRows:nrows1 columns:ncols1 rowBytes:gf->nodes[i]->src1->nb[1] dataType:MPSDataTypeFloat32]; + MPSMatrixDescriptor * desc2 = [MPSMatrixDescriptor + matrixDescriptorWithRows:nrows2 columns:ncols2 rowBytes:gf->nodes[i]->nb[1] dataType:MPSDataTypeFloat32]; + + MPSMatrix * mat_src0 = [[MPSMatrix alloc] initWithBuffer:id_src0 offset:offs_src0 descriptor:desc0]; + MPSMatrix * mat_src1 = [[MPSMatrix alloc] initWithBuffer:id_src1 offset:offs_src1 descriptor:desc1]; + MPSMatrix * mat_dst = [[MPSMatrix alloc] initWithBuffer:id_dst offset:offs_dst descriptor:desc2]; + + MPSMatrixMultiplication * mul = [[MPSMatrixMultiplication alloc] initWithDevice:ctx->device + transposeLeft:false transposeRight:true resultRows:nrows1 resultColumns:nrows0 interiorColumns:ncols0 alpha:1.0 beta:0.0]; + + [mul encodeToCommandBuffer:command_buffer leftMatrix:mat_src1 rightMatrix:mat_src0 resultMatrix:mat_dst]; } break; default: fprintf(stderr, "%s: node %3d, op = %8s not implemented\n", __func__, i, ggml_op_name(gf->nodes[i]->op)); @@ -236,7 +366,11 @@ int mnist_mtl_eval( } } - [encoder endEncoding]; + //[encoder endEncoding]; + if (encoder != nil) { + [encoder endEncoding]; + encoder = nil; + } [command_buffer commit]; [command_buffer waitUntilCompleted]; From 4134bac4e522ab5b39f90c05f011369de1b93131 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sun, 28 May 2023 16:42:19 +0300 Subject: [PATCH 17/27] mnist : minor cleanup --- examples/mnist/main-mtl.m | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/examples/mnist/main-mtl.m b/examples/mnist/main-mtl.m index bb0cf924a..e248e0d77 100644 --- a/examples/mnist/main-mtl.m +++ b/examples/mnist/main-mtl.m @@ -197,10 +197,10 @@ void mnist_mtl_free(struct ggml_mtl_context * ctx) { id result; if (is_data) { - fprintf(stderr, "%s: 
data tensor '%8s', offs = %8ld, size = %8ld\n", __func__, t->name, t_offs, t_size); + fprintf(stderr, "%s: data tensor '%16s', offs = %8ld, size = %8ld\n", __func__, t->name, t_offs, t_size); result = [ctx->heap_data newBufferWithLength:t_size options:MTLResourceStorageModeShared offset:t_offs]; } else { - fprintf(stderr, "%s: eval tensor '%8s', offs = %8ld, size = %8ld\n", __func__, t->name, t_offs, t_size); + fprintf(stderr, "%s: eval tensor '%16s', offs = %8ld, size = %8ld\n", __func__, t->name, t_offs, t_size); result = [ctx->heap_eval newBufferWithLength:t_size options:MTLResourceStorageModePrivate offset:t_offs]; } @@ -227,10 +227,10 @@ void mnist_mtl_free(struct ggml_mtl_context * ctx) { id result; if (is_data) { - fprintf(stderr, "%s: data tensor '%8s', offs = %8ld, size = %8ld\n", __func__, t->name, t_offs, t_size); + fprintf(stderr, "%s: data tensor '%16s', offs = %8ld, size = %8ld\n", __func__, t->name, t_offs, t_size); result = ctx->buffer_data; } else { - fprintf(stderr, "%s: eval tensor '%8s', offs = %8ld, size = %8ld\n", __func__, t->name, t_offs, t_size); + fprintf(stderr, "%s: eval tensor '%16s', offs = %8ld, size = %8ld\n", __func__, t->name, t_offs, t_size); result = ctx->buffer_eval; } @@ -254,7 +254,6 @@ int mnist_mtl_eval( fprintf(stderr, "%s: evaluating\n", __func__); id command_buffer = [ctx->queue commandBuffer]; - //id encoder = [command_buffer computeCommandEncoder]; id encoder = nil; size_t offs_src0; @@ -366,7 +365,6 @@ int mnist_mtl_eval( } } - //[encoder endEncoding]; if (encoder != nil) { [encoder endEncoding]; encoder = nil; From a556b576c504d4ec83c7f51ade7e1c8193a26641 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sun, 28 May 2023 17:16:53 +0300 Subject: [PATCH 18/27] mnist : full GPU inference works --- examples/mnist/main-mtl.cpp | 1 - examples/mnist/main-mtl.m | 52 ++++++++++++++++++++++++++++++++----- 2 files changed, 45 insertions(+), 8 deletions(-) diff --git a/examples/mnist/main-mtl.cpp b/examples/mnist/main-mtl.cpp index c5c1ed821..86467119d 100644 --- a/examples/mnist/main-mtl.cpp +++ b/examples/mnist/main-mtl.cpp @@ -47,7 +47,6 @@ int mnist_eval( struct ggml_tensor * input = ggml_get_tensor_by_name(&gf, "input"); memcpy(input->data, digit.data(), ggml_nbytes(input)); - //ggml_graph_compute(ctx_work, &gf); auto ctx_mtl = mnist_mtl_init(ctx_data, ctx_eval, ctx_work, &gf); const int prediction = mnist_mtl_eval(ctx_mtl, &gf); mnist_mtl_free(ctx_mtl); diff --git a/examples/mnist/main-mtl.m b/examples/mnist/main-mtl.m index e248e0d77..bd41d8dc7 100644 --- a/examples/mnist/main-mtl.m +++ b/examples/mnist/main-mtl.m @@ -26,15 +26,14 @@ id buffer_eval; #endif + id results; + // custom kernels id function_add; id pipeline_add; id function_relu; id pipeline_relu; - - id function_softmax; - id pipeline_softmax; }; // MSL code @@ -173,6 +172,15 @@ kernel void kernel_relu( } #endif + // allocate buffer for result extraction + { + const size_t mem_size = ggml_nbytes(gf->nodes[gf->n_nodes - 1]); + + ctx->results = [ctx->device newBufferWithLength:mem_size options:MTLResourceStorageModeShared]; + + fprintf(stderr, "%s: allocated results buffer, size = %zu\n", __func__, mem_size); + } + return ctx; } @@ -280,6 +288,7 @@ int mnist_mtl_eval( [encoder setBuffer:id_dst offset:offs_dst atIndex:2]; const int64_t n = ggml_nelements(gf->nodes[i]); + [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)]; } break; case GGML_OP_RELU: @@ -296,6 +305,7 @@ int mnist_mtl_eval( [encoder setBuffer:id_dst offset:offs_dst atIndex:1]; 
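                // (dispatch granularity: n threadgroups of one thread each, i.e.
                //  one thread per tensor element; simple, though a tuned version
                //  would use wider threadgroups to keep the GPU occupied)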
const int64_t n = ggml_nelements(gf->nodes[i]); + [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)]; } break; case GGML_OP_SOFT_MAX: @@ -365,9 +375,21 @@ int mnist_mtl_eval( } } - if (encoder != nil) { - [encoder endEncoding]; - encoder = nil; + // extract results from the GPU + { + if (encoder != nil) { + [encoder endEncoding]; + encoder = nil; + } + + struct ggml_tensor * output = gf->nodes[gf->n_nodes - 1]; + + id id_src = mnist_mtl_get_buffer(ctx, output, &offs_src0); + id id_dst = ctx->results; + + id encoder_blit = [command_buffer blitCommandEncoder]; + [encoder_blit copyFromBuffer:id_src sourceOffset:offs_src0 toBuffer:id_dst destinationOffset:0 size:ggml_nbytes(output)]; + [encoder_blit endEncoding]; } [command_buffer commit]; @@ -378,5 +400,21 @@ int mnist_mtl_eval( fprintf(stderr, "%s: time elapsed = %f\n", __func__, time_elapsed); } - return 0; + // select the most probable digit + + const float * probs = ctx->results.contents; + + int pred = 0; + float prob = probs[0]; + + for (int i = 0; i < 10; ++i) { + fprintf(stderr, "%s: probs[%2d] = %f\n", __func__, i, probs[i]); + + if (probs[i] > prob) { + pred = i; + prob = probs[i]; + } + } + + return pred; } From 8f8653b47209591d435e84c75a8cbef4340f343b Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sun, 28 May 2023 17:21:21 +0300 Subject: [PATCH 19/27] mnist : use custom soft_max kernel since MPSMatrixSoftMax is bugged --- examples/mnist/main-mtl.m | 45 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 45 insertions(+) diff --git a/examples/mnist/main-mtl.m b/examples/mnist/main-mtl.m index bd41d8dc7..6fe006226 100644 --- a/examples/mnist/main-mtl.m +++ b/examples/mnist/main-mtl.m @@ -34,6 +34,9 @@ id function_relu; id pipeline_relu; + + id function_soft_max; + id pipeline_soft_max; }; // MSL code @@ -41,6 +44,8 @@ #include \n\ using namespace metal; \n\ \n\ +#define MAX(x, y) ((x) > (y) ? 
(x) : (y)) \n\ + \n\ kernel void kernel_add( \n\ device const float * src0, \n\ device const float * src1, \n\ @@ -54,6 +59,24 @@ kernel void kernel_relu( device float * dst, \n\ uint gid[[thread_position_in_grid]]) { \n\ dst[gid] = max(0.0f, src[gid]); \n\ +} \n\ + \n\ +kernel void kernel_soft_max( \n\ + device const float * src, \n\ + device float * dst, \n\ + uint gid[[thread_position_in_grid]]) { \n\ + float max = 0.0f; \n\ + for (int i = 0; i < 10; i++) { \n\ + max = MAX(max, src[i]); \n\ + } \n\ + float sum = 0.0f; \n\ + for (int i = 0; i < 10; i++) { \n\ + dst[i] = exp(src[i] - max); \n\ + sum += dst[i]; \n\ + } \n\ + for (int i = 0; i < 10; i++) { \n\ + dst[i] /= sum; \n\ + } \n\ } \n\ "; @@ -96,6 +119,9 @@ kernel void kernel_relu( ctx->function_relu = [ctx->library newFunctionWithName:@"kernel_relu"]; ctx->pipeline_relu = [ctx->device newComputePipelineStateWithFunction:ctx->function_relu error:nil]; + ctx->function_soft_max = [ctx->library newFunctionWithName:@"kernel_soft_max"]; + ctx->pipeline_soft_max = [ctx->device newComputePipelineStateWithFunction:ctx->function_soft_max error:nil]; + #ifdef GGML_MTL_HEAP // MTLHeap approach @@ -310,6 +336,9 @@ int mnist_mtl_eval( } break; case GGML_OP_SOFT_MAX: { +#if 0 + // NOTE: MPSMatrixSoftMax is not working properly, probably there is a bug + if (encoder != nil) { [encoder endEncoding]; encoder = nil; @@ -328,6 +357,22 @@ int mnist_mtl_eval( MPSMatrixSoftMax * softmax = [[MPSMatrixSoftMax alloc] initWithDevice:ctx->device]; [softmax encodeToCommandBuffer:command_buffer inputMatrix:mat_src resultMatrix:mat_dst]; +#else + if (encoder == nil) { + encoder = [command_buffer computeCommandEncoder]; + } + + id id_src = mnist_mtl_get_buffer(ctx, gf->nodes[i]->src0, &offs_src0); + id id_dst = mnist_mtl_get_buffer(ctx, gf->nodes[i], &offs_dst); + + [encoder setComputePipelineState:ctx->pipeline_soft_max]; + [encoder setBuffer:id_src offset:offs_src0 atIndex:0]; + [encoder setBuffer:id_dst offset:offs_dst atIndex:1]; + + const int64_t n = ggml_nelements(gf->nodes[i]); + + [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)]; +#endif } break; case GGML_OP_MUL_MAT: { From 3b97377edd3160e4454f2e6f415af64bffdff693 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sun, 28 May 2023 21:02:33 +0300 Subject: [PATCH 20/27] mnist : use constant for soft_max instead of hardcoded 10 --- examples/mnist/main-cpu.cpp | 2 -- examples/mnist/main-mtl.cpp | 9 +++++++ examples/mnist/main-mtl.m | 49 +++++++++++++++++++++++-------------- 3 files changed, 40 insertions(+), 20 deletions(-) diff --git a/examples/mnist/main-cpu.cpp b/examples/mnist/main-cpu.cpp index 1a07b7453..9b1a425cf 100644 --- a/examples/mnist/main-cpu.cpp +++ b/examples/mnist/main-cpu.cpp @@ -12,7 +12,6 @@ #include "ggml/ggml.h" -#include "common.h" #include "common-ggml.h" #include @@ -20,7 +19,6 @@ #include #include #include -#include // evaluate the MNIST compute graph // diff --git a/examples/mnist/main-mtl.cpp b/examples/mnist/main-mtl.cpp index 86467119d..6f5160faf 100644 --- a/examples/mnist/main-mtl.cpp +++ b/examples/mnist/main-mtl.cpp @@ -1,5 +1,14 @@ // Use a pre-generated MNIST compute graph for inference on the M1 GPU via MPS // +// You can generate a compute graph using the "mnist" tool: +// +// $ ./bin/mnist ./models/mnist/ggml-model-f32.bin ../examples/mnist/models/mnist/t10k-images.idx3-ubyte +// +// This command creates the "mnist.ggml" file, which contains the generated compute graph. 
+// Now, you can re-use the compute graph on the GPU with the "mnist-mtl" tool: +// +// $ ./bin/mnist-mtl ./models/mnist/mnist.ggml ../examples/mnist/models/mnist/t10k-images.idx3-ubyte +// #include "ggml/ggml.h" diff --git a/examples/mnist/main-mtl.m b/examples/mnist/main-mtl.m index 6fe006226..e3a7739e7 100644 --- a/examples/mnist/main-mtl.m +++ b/examples/mnist/main-mtl.m @@ -46,6 +46,8 @@ \n\ #define MAX(x, y) ((x) > (y) ? (x) : (y)) \n\ \n\ +constant int k_digits [[function_constant(0)]]; \n\ + \n\ kernel void kernel_add( \n\ device const float * src0, \n\ device const float * src1, \n\ @@ -66,15 +68,15 @@ kernel void kernel_soft_max( device float * dst, \n\ uint gid[[thread_position_in_grid]]) { \n\ float max = 0.0f; \n\ - for (int i = 0; i < 10; i++) { \n\ + for (int i = 0; i < k_digits; i++) { \n\ max = MAX(max, src[i]); \n\ } \n\ float sum = 0.0f; \n\ - for (int i = 0; i < 10; i++) { \n\ + for (int i = 0; i < k_digits; i++) { \n\ dst[i] = exp(src[i] - max); \n\ sum += dst[i]; \n\ } \n\ - for (int i = 0; i < 10; i++) { \n\ + for (int i = 0; i < k_digits; i++) { \n\ dst[i] /= sum; \n\ } \n\ } \n\ @@ -113,14 +115,24 @@ kernel void kernel_soft_max( } // load kernels - ctx->function_add = [ctx->library newFunctionWithName:@"kernel_add"]; - ctx->pipeline_add = [ctx->device newComputePipelineStateWithFunction:ctx->function_add error:nil]; + { + const int k_digits = ggml_get_tensor_by_name(gf, "probs")->ne[0]; - ctx->function_relu = [ctx->library newFunctionWithName:@"kernel_relu"]; - ctx->pipeline_relu = [ctx->device newComputePipelineStateWithFunction:ctx->function_relu error:nil]; + MTLFunctionConstantValues * constants = [MTLFunctionConstantValues new]; + [constants setConstantValue:&k_digits type:MTLDataTypeInt withName:@"k_digits"]; - ctx->function_soft_max = [ctx->library newFunctionWithName:@"kernel_soft_max"]; - ctx->pipeline_soft_max = [ctx->device newComputePipelineStateWithFunction:ctx->function_soft_max error:nil]; + ctx->function_add = [ctx->library newFunctionWithName:@"kernel_add"]; + ctx->pipeline_add = [ctx->device newComputePipelineStateWithFunction:ctx->function_add error:nil]; + fprintf(stderr, "%s: loaded kernel_add: %p\n", __func__, ctx->pipeline_add); + + ctx->function_relu = [ctx->library newFunctionWithName:@"kernel_relu"]; + ctx->pipeline_relu = [ctx->device newComputePipelineStateWithFunction:ctx->function_relu error:nil]; + fprintf(stderr, "%s: loaded kernel_relu: %p\n", __func__, ctx->pipeline_relu); + + ctx->function_soft_max = [ctx->library newFunctionWithName:@"kernel_soft_max" constantValues:constants error:nil]; + ctx->pipeline_soft_max = [ctx->device newComputePipelineStateWithFunction:ctx->function_soft_max error:nil]; + fprintf(stderr, "%s: loaded kernel_soft_max: %p\n", __func__, ctx->pipeline_soft_max); + } #ifdef GGML_MTL_HEAP // MTLHeap approach @@ -446,18 +458,19 @@ int mnist_mtl_eval( } // select the most probable digit + int pred = -1; + { + const float * probs = ctx->results.contents; - const float * probs = ctx->results.contents; - - int pred = 0; - float prob = probs[0]; + float prob = probs[0]; - for (int i = 0; i < 10; ++i) { - fprintf(stderr, "%s: probs[%2d] = %f\n", __func__, i, probs[i]); + for (int i = 0; i < 10; ++i) { + fprintf(stderr, "%s: probs[%2d] = %f\n", __func__, i, probs[i]); - if (probs[i] > prob) { - pred = i; - prob = probs[i]; + if (probs[i] > prob) { + pred = i; + prob = probs[i]; + } } } From e350f138d4a2c2360870fbfe952bedf2d154c07b Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sun, 28 May 2023 21:16:56 +0300 
Subject: [PATCH 21/27] mnist : check multiple predictions (Metal) --- examples/mnist/main-mtl.cpp | 19 +++++++++++++++---- examples/mnist/main-mtl.m | 25 +++++++++++++++++-------- 2 files changed, 32 insertions(+), 12 deletions(-) diff --git a/examples/mnist/main-mtl.cpp b/examples/mnist/main-mtl.cpp index 6f5160faf..b6305bee9 100644 --- a/examples/mnist/main-mtl.cpp +++ b/examples/mnist/main-mtl.cpp @@ -53,11 +53,22 @@ int mnist_eval( struct ggml_context * ctx_work = ggml_init(params); - struct ggml_tensor * input = ggml_get_tensor_by_name(&gf, "input"); - memcpy(input->data, digit.data(), ggml_nbytes(input)); - auto ctx_mtl = mnist_mtl_init(ctx_data, ctx_eval, ctx_work, &gf); - const int prediction = mnist_mtl_eval(ctx_mtl, &gf); + + int prediction = -1; + + for (int i = 0; i < 1; ++i) { + struct ggml_tensor * input = ggml_get_tensor_by_name(&gf, "input"); + + if (i % 2 == 0) { + memcpy(input->data, digit.data(), ggml_nbytes(input)); + } else { + memset(input->data, 0, ggml_nbytes(input)); + } + + prediction = mnist_mtl_eval(ctx_mtl, &gf); + } + mnist_mtl_free(ctx_mtl); ggml_free(ctx_work); diff --git a/examples/mnist/main-mtl.m b/examples/mnist/main-mtl.m index e3a7739e7..41152694f 100644 --- a/examples/mnist/main-mtl.m +++ b/examples/mnist/main-mtl.m @@ -26,7 +26,7 @@ id buffer_eval; #endif - id results; + id out; // custom kernels id function_add; @@ -214,9 +214,9 @@ kernel void kernel_soft_max( { const size_t mem_size = ggml_nbytes(gf->nodes[gf->n_nodes - 1]); - ctx->results = [ctx->device newBufferWithLength:mem_size options:MTLResourceStorageModeShared]; + ctx->out = [ctx->device newBufferWithLength:mem_size options:MTLResourceStorageModeShared]; - fprintf(stderr, "%s: allocated results buffer, size = %zu\n", __func__, mem_size); + fprintf(stderr, "%s: allocated out buffer, size = %zu\n", __func__, mem_size); } return ctx; @@ -306,6 +306,15 @@ int mnist_mtl_eval( size_t offs_src1; size_t offs_dst; + // copy the input data to the GPU + { + struct ggml_tensor * inp = ggml_get_tensor_by_name(gf, "input"); + + id id_dst = mnist_mtl_get_buffer(ctx, inp, &offs_src0); + + memcpy(id_dst.contents + offs_src0, inp->data, ggml_nbytes(inp)); + } + for (int i = 0; i < gf->n_nodes; ++i) { fprintf(stderr, "%s: encoding node %3d, op = %8s\n", __func__, i, ggml_op_name(gf->nodes[i]->op)); @@ -439,13 +448,13 @@ int mnist_mtl_eval( encoder = nil; } - struct ggml_tensor * output = gf->nodes[gf->n_nodes - 1]; + struct ggml_tensor * out = gf->nodes[gf->n_nodes - 1]; - id id_src = mnist_mtl_get_buffer(ctx, output, &offs_src0); - id id_dst = ctx->results; + id id_src = mnist_mtl_get_buffer(ctx, out, &offs_src0); + id id_dst = ctx->out; id encoder_blit = [command_buffer blitCommandEncoder]; - [encoder_blit copyFromBuffer:id_src sourceOffset:offs_src0 toBuffer:id_dst destinationOffset:0 size:ggml_nbytes(output)]; + [encoder_blit copyFromBuffer:id_src sourceOffset:offs_src0 toBuffer:id_dst destinationOffset:0 size:ggml_nbytes(out)]; [encoder_blit endEncoding]; } @@ -460,7 +469,7 @@ int mnist_mtl_eval( // select the most probable digit int pred = -1; { - const float * probs = ctx->results.contents; + const float * probs = ctx->out.contents; float prob = probs[0]; From 4fa01f0385c3104796f94f73e2b2e39950b3be9a Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sun, 28 May 2023 21:23:28 +0300 Subject: [PATCH 22/27] mnist : minor --- examples/mnist/main-mtl.cpp | 2 ++ examples/mnist/main-mtl.m | 24 +++++++++++++----------- 2 files changed, 15 insertions(+), 11 deletions(-) diff --git 
a/examples/mnist/main-mtl.cpp b/examples/mnist/main-mtl.cpp index b6305bee9..1885232fc 100644 --- a/examples/mnist/main-mtl.cpp +++ b/examples/mnist/main-mtl.cpp @@ -53,6 +53,7 @@ int mnist_eval( struct ggml_context * ctx_work = ggml_init(params); + // this allocates all Metal resources and memory buffers auto ctx_mtl = mnist_mtl_init(ctx_data, ctx_eval, ctx_work, &gf); int prediction = -1; @@ -66,6 +67,7 @@ int mnist_eval( memset(input->data, 0, ggml_nbytes(input)); } + // the actual inference happens here prediction = mnist_mtl_eval(ctx_mtl, &gf); } diff --git a/examples/mnist/main-mtl.m b/examples/mnist/main-mtl.m index 41152694f..e70c0aa1d 100644 --- a/examples/mnist/main-mtl.m +++ b/examples/mnist/main-mtl.m @@ -95,8 +95,8 @@ kernel void kernel_soft_max( ctx->ctx_eval = ctx_eval; ctx->ctx_work = ctx_work; - ctx->device = MTLCreateSystemDefaultDevice(); - ctx->queue = [ctx->device newCommandQueue]; + ctx->device = MTLCreateSystemDefaultDevice(); + ctx->queue = [ctx->device newCommandQueue]; // determine if we can use MPS if (MPSSupportsMTLDevice(ctx->device)) { @@ -107,11 +107,13 @@ kernel void kernel_soft_max( } // compile from source string and show compile log - NSError * error = nil; - ctx->library = [ctx->device newLibraryWithSource:msl_library_mnist options:nil error:&error]; - if (error) { - fprintf(stderr, "%s: error: %s\n", __func__, [[error description] UTF8String]); - exit(1); + { + NSError * error = nil; + ctx->library = [ctx->device newLibraryWithSource:msl_library_mnist options:nil error:&error]; + if (error) { + fprintf(stderr, "%s: error: %s\n", __func__, [[error description] UTF8String]); + exit(1); + } } // load kernels @@ -200,7 +202,7 @@ kernel void kernel_soft_max( } // pin ctx_eval memory to GPU - // this heap will be used for the intermediate results of the evaluation + // this buffer will be used for the intermediate results of the evaluation { const size_t mem_size = ggml_get_mem_size(ctx_eval); @@ -467,7 +469,7 @@ int mnist_mtl_eval( } // select the most probable digit - int pred = -1; + int result = -1; { const float * probs = ctx->out.contents; @@ -477,11 +479,11 @@ int mnist_mtl_eval( fprintf(stderr, "%s: probs[%2d] = %f\n", __func__, i, probs[i]); if (probs[i] > prob) { - pred = i; + result = i; prob = probs[i]; } } } - return pred; + return result; } From 79dcbfdf08171fb083b997275ad307701b612d62 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sun, 28 May 2023 21:33:53 +0300 Subject: [PATCH 23/27] ggml : move cgraph import / export to ggml --- examples/common-ggml.cpp | 463 ------------------------------------ examples/common-ggml.h | 4 - examples/mnist/main-cpu.cpp | 2 +- examples/mnist/main-mtl.cpp | 2 +- examples/mnist/main.cpp | 3 +- include/ggml/ggml.h | 3 + src/ggml.c | 455 +++++++++++++++++++++++++++++++++++ 7 files changed, 461 insertions(+), 471 deletions(-) diff --git a/examples/common-ggml.cpp b/examples/common-ggml.cpp index ac16994c0..9215dbeab 100644 --- a/examples/common-ggml.cpp +++ b/examples/common-ggml.cpp @@ -233,466 +233,3 @@ bool ggml_common_quantize_0( return true; } - -#define GGML_ASSERT(x) \ - do { \ - if (!(x)) { \ - fprintf(stderr, "GGML_ASSERT: %s:%d: %s\n", __FILE__, __LINE__, #x); \ - abort(); \ - } \ - } while (0) - -void ggml_cgraph_export_leaf(const struct ggml_tensor * tensor, FILE * fout) { - const int64_t * ne = tensor->ne; - const size_t * nb = tensor->nb; - - fprintf(fout, "%-6s %-12s %8d %8lld %8lld %8lld %8lld %16zu %16zu %16zu %16zu %16p %16s\n", - ggml_type_name(tensor->type), - ggml_op_name (tensor->op), - 
tensor->n_dims, - ne[0], ne[1], ne[2], ne[3], - nb[0], nb[1], nb[2], nb[3], - tensor->data, - tensor->name); -} - -void ggml_cgraph_export_node(const struct ggml_tensor * tensor, const char * arg, FILE * fout) { - const int64_t * ne = tensor->ne; - const size_t * nb = tensor->nb; - - fprintf(fout, "%-6s %-6s %-12s %8d %8lld %8lld %8lld %8lld %16zu %16zu %16zu %16zu %8d %16p %16s\n", - arg, - ggml_type_name(tensor->type), - ggml_op_name (tensor->op), - tensor->n_dims, - ne[0], ne[1], ne[2], ne[3], - nb[0], nb[1], nb[2], nb[3], - tensor->n_tasks, - tensor->data, - tensor->name); -} - -void ggml_cgraph_export(const struct ggml_cgraph * cgraph, const char * fname) { - assert(cgraph->work == NULL); - assert(cgraph->work_size == 0); - - uint64_t size_eval = 0; - - // compute size of intermediate results - // TODO: does not take into account scratch buffers !!!! - for (int i = 0; i < cgraph->n_nodes; ++i) { - size_eval += ggml_nbytes(cgraph->nodes[i]); - } - - // print - { - FILE * fout = stdout; - - fprintf(fout, "\n"); - fprintf(fout, "%-16s %8x\n", "magic", GGML_FILE_MAGIC); - fprintf(fout, "%-16s %8d\n", "version", GGML_FILE_VERSION); - fprintf(fout, "%-16s %8d\n", "leafs", cgraph->n_leafs); - fprintf(fout, "%-16s %8d\n", "nodes", cgraph->n_nodes); - fprintf(fout, "%-16s %8llu\n", "eval", size_eval); - - // header - fprintf(fout, "\n"); - fprintf(fout, "%-6s %-12s %8s %8s %8s %8s %8s %16s %16s %16s %16s %16s %16s\n", - "TYPE", "OP", "NDIMS", "NE0", "NE1", "NE2", "NE3", "NB0", "NB1", "NB2", "NB3", "DATA", "NAME"); - - for (int i = 0; i < cgraph->n_leafs; ++i) { - ggml_cgraph_export_leaf(cgraph->leafs[i], fout); - - GGML_ASSERT(cgraph->leafs[i]->op == GGML_OP_NONE); - GGML_ASSERT(cgraph->leafs[i]->src0 == NULL); - GGML_ASSERT(cgraph->leafs[i]->src1 == NULL); - } - - // header - fprintf(fout, "\n"); - fprintf(fout, "%-6s %-6s %-12s %8s %8s %8s %8s %8s %16s %16s %16s %16s %8s %16s %16s\n", - "ARG", "TYPE", "OP", "NDIMS", "NE0", "NE1", "NE2", "NE3", "NB0", "NB1", "NB2", "NB3", "NTASKS", "DATA", "NAME"); - - for (int i = 0; i < cgraph->n_nodes; ++i) { - ggml_cgraph_export_node(cgraph->nodes[i], "DST", fout); - - if (cgraph->nodes[i]->src0) { - ggml_cgraph_export_node(cgraph->nodes[i]->src0, "SRC0", fout); - } - - if (cgraph->nodes[i]->src1) { - ggml_cgraph_export_node(cgraph->nodes[i]->src1, "SRC1", fout); - } - - for (int j = 0; j < GGML_MAX_OPT; ++j) { - if (cgraph->nodes[i]->opt[j]) { - ggml_cgraph_export_node(cgraph->nodes[i]->opt[j], "OPT", fout); - } - } - - fprintf(fout, "\n"); - } - - fprintf(fout, "\n"); - } - - // write binary data - { - FILE * fout = fopen(fname, "wb"); - - if (!fout) { - fprintf(stderr, "%s: failed to open %s\n", __func__, fname); - return; - } - - // header - { - const uint32_t magic = GGML_FILE_MAGIC; - const uint32_t version = GGML_FILE_VERSION; - const uint32_t leafs = cgraph->n_leafs; - const uint32_t nodes = cgraph->n_nodes; - - fwrite(&magic, sizeof(uint32_t), 1, fout); - fwrite(&version, sizeof(uint32_t), 1, fout); - fwrite(&leafs, sizeof(uint32_t), 1, fout); - fwrite(&nodes, sizeof(uint32_t), 1, fout); - fwrite(&size_eval, sizeof(uint64_t), 1, fout); - } - - // leafs - { - for (int i = 0; i < cgraph->n_leafs; ++i) { - const struct ggml_tensor * tensor = cgraph->leafs[i]; - - const uint32_t type = tensor->type; - const uint32_t op = tensor->op; - const uint32_t n_dims = tensor->n_dims; - - fwrite(&type, sizeof(uint32_t), 1, fout); - fwrite(&op, sizeof(uint32_t), 1, fout); - fwrite(&n_dims, sizeof(uint32_t), 1, fout); - - for (int j = 0; j < GGML_MAX_DIMS; ++j) { 
- const uint64_t ne = tensor->ne[j]; - const uint64_t nb = tensor->nb[j]; - - fwrite(&ne, sizeof(uint64_t), 1, fout); - fwrite(&nb, sizeof(uint64_t), 1, fout); - } - - // store the pointer address - { - const uint64_t ptr = (uint64_t) tensor->data; - - fwrite(&ptr, sizeof(uint64_t), 1, fout); - } - - fwrite(tensor->name, sizeof(char), GGML_MAX_NAME, fout); - - // dump the data - // TODO: pad this to 32 byte boundary - { - const size_t size = ggml_nbytes(tensor); - - fwrite(tensor->data, sizeof(char), size, fout); - } - } - } - - // nodes - { - for (int i = 0; i < cgraph->n_nodes; ++i) { - const struct ggml_tensor * tensor = cgraph->nodes[i]; - - const uint32_t type = tensor->type; - const uint32_t op = tensor->op; - const uint32_t n_dims = tensor->n_dims; - - fwrite(&type, sizeof(uint32_t), 1, fout); - fwrite(&op, sizeof(uint32_t), 1, fout); - fwrite(&n_dims, sizeof(uint32_t), 1, fout); - - for (int j = 0; j < GGML_MAX_DIMS; ++j) { - const uint64_t ne = tensor->ne[j]; - const uint64_t nb = tensor->nb[j]; - - fwrite(&ne, sizeof(uint64_t), 1, fout); - fwrite(&nb, sizeof(uint64_t), 1, fout); - } - - // store the pointer address - { - const uint64_t ptr = (uint64_t) tensor->data; - - fwrite(&ptr, sizeof(uint64_t), 1, fout); - } - - fwrite(tensor->name, sizeof(char), GGML_MAX_NAME, fout); - - // output the op arguments - { - struct ggml_tensor * args[2 + GGML_MAX_OPT] = { NULL }; - - args[0] = tensor->src0; - args[1] = tensor->src1; - - for (int j = 0; j < GGML_MAX_OPT; ++j) { - args[2 + j] = tensor->opt[j]; - } - - for (int j = 0; j < 2 + GGML_MAX_OPT; ++j) { - if (args[j]) { - int32_t idx = -1; - - // check if leaf - { - for (int k = 0; k < cgraph->n_leafs; ++k) { - if (args[j] == cgraph->leafs[k]) { - idx = k; - break; - } - } - } - - // check if node - if (idx == -1) { - for (int k = 0; k < cgraph->n_nodes; ++k) { - if (args[j] == cgraph->nodes[k]) { - idx = GGML_MAX_NODES + k; - break; - } - } - } - - if (idx == -1) { - fprintf(stderr, "%s: failed to find tensor, arg = %d, node = %d\n", __func__, j, i); - return; - } - - fwrite(&idx, sizeof(int32_t), 1, fout); - } else { - const int32_t nul = -1; - - fwrite(&nul, sizeof(int32_t), 1, fout); - } - } - } - } - } - - fclose(fout); - } -} - -ggml_cgraph ggml_cgraph_import(const char * fname, struct ggml_context ** ctx_data, struct ggml_context ** ctx_eval) { - assert(*ctx_data == NULL); - assert(*ctx_eval == NULL); - - ggml_cgraph result; - - struct ggml_tensor * data = NULL; - - // read file into data - { - FILE * fin = fopen(fname, "rb"); - - if (!fin) { - fprintf(stderr, "%s: failed to open %s\n", __func__, fname); - return result; - } - - size_t fsize = 0; - - fseek(fin, 0, SEEK_END); - fsize = ftell(fin); - fseek(fin, 0, SEEK_SET); - - // create the data context - { - const size_t overhead = 1*ggml_tensor_overhead(); - - struct ggml_init_params params = { - .mem_size = fsize + overhead, - .mem_buffer = NULL, - .no_alloc = false, - }; - - *ctx_data = ggml_init(params); - - if (!*ctx_data) { - fprintf(stderr, "%s: failed to create ggml context\n", __func__); - return result; - } - } - - data = ggml_new_tensor_1d(*ctx_data, GGML_TYPE_I8, fsize); - - fread(data->data, sizeof(char), fsize, fin); - - fclose(fin); - } - - // populate result - { - const char * ptr = (const char *) data->data; - - const uint32_t magic = *(const uint32_t *) ptr; ptr += sizeof(magic); - - if (magic != GGML_FILE_MAGIC) { - fprintf(stderr, "%s: invalid magic number, got %08x\n", __func__, magic); - return result; - } - - const uint32_t version = *(const uint32_t *) 
ptr; ptr += sizeof(version); - - if (version != GGML_FILE_VERSION) { - fprintf(stderr, "%s: invalid version number\n", __func__); - return result; - } - - const uint32_t leafs = *(const uint32_t *) ptr; ptr += sizeof(leafs); - const uint32_t nodes = *(const uint32_t *) ptr; ptr += sizeof(nodes); - const uint64_t size_eval = *(const uint64_t *) ptr; ptr += sizeof(size_eval); - - result.n_leafs = leafs; - result.n_nodes = nodes; - - // create the data context - { - const size_t overhead = (leafs + nodes)*ggml_tensor_overhead(); - - struct ggml_init_params params = { - .mem_size = size_eval + overhead, - .mem_buffer = NULL, - .no_alloc = true, - }; - - *ctx_eval = ggml_init(params); - - if (!*ctx_eval) { - fprintf(stderr, "%s: failed to create ggml context\n", __func__); - return result; - } - } - - // leafs - { - uint32_t type; - uint32_t op; - uint32_t n_dims; - - for (int i = 0; i < leafs; ++i) { - type = *(const uint32_t *) ptr; ptr += sizeof(type); - op = *(const uint32_t *) ptr; ptr += sizeof(op); - n_dims = *(const uint32_t *) ptr; ptr += sizeof(n_dims); - - int64_t ne[GGML_MAX_DIMS]; - size_t nb[GGML_MAX_DIMS]; - - for (int j = 0; j < GGML_MAX_DIMS; ++j) { - uint64_t ne_cur; - uint64_t nb_cur; - - ne_cur = *(const uint64_t *) ptr; ptr += sizeof(ne_cur); - nb_cur = *(const uint64_t *) ptr; ptr += sizeof(nb_cur); - - ne[j] = ne_cur; - nb[j] = nb_cur; - } - - struct ggml_tensor * tensor = ggml_new_tensor(*ctx_eval, (enum ggml_type) type, n_dims, ne); - - tensor->op = (enum ggml_op) op; - - uint64_t ptr_cur = *(const uint64_t *) ptr; ptr += sizeof(ptr_cur); - - memcpy(tensor->name, ptr, GGML_MAX_NAME); ptr += GGML_MAX_NAME; - - tensor->data = (void *) ptr; - - for (int j = 0; j < GGML_MAX_DIMS; ++j) { - tensor->nb[j] = nb[j]; - } - - result.leafs[i] = tensor; - - ptr += ggml_nbytes(tensor); - - fprintf(stderr, "%s: loaded leaf %d: '%16s', %3d dims, %9zu bytes\n", __func__, i, tensor->name, n_dims, ggml_nbytes(tensor)); - } - } - - ggml_set_no_alloc(*ctx_eval, false); - - // nodes - { - uint32_t type; - uint32_t op; - uint32_t n_dims; - - for (int i = 0; i < nodes; ++i) { - type = *(const uint32_t *) ptr; ptr += sizeof(type); - op = *(const uint32_t *) ptr; ptr += sizeof(op); - n_dims = *(const uint32_t *) ptr; ptr += sizeof(n_dims); - - int64_t ne[GGML_MAX_DIMS]; - size_t nb[GGML_MAX_DIMS]; - - for (int j = 0; j < GGML_MAX_DIMS; ++j) { - uint64_t ne_cur; - uint64_t nb_cur; - - ne_cur = *(const uint64_t *) ptr; ptr += sizeof(ne_cur); - nb_cur = *(const uint64_t *) ptr; ptr += sizeof(nb_cur); - - ne[j] = ne_cur; - nb[j] = nb_cur; - } - - struct ggml_tensor * tensor = ggml_new_tensor(*ctx_eval, (enum ggml_type) type, n_dims, ne); - - tensor->op = (enum ggml_op) op; - - uint64_t ptr_cur = *(const uint64_t *) ptr; ptr += sizeof(ptr_cur); - - memcpy(tensor->name, ptr, GGML_MAX_NAME); ptr += GGML_MAX_NAME; - - for (int j = 0; j < GGML_MAX_DIMS; ++j) { - tensor->nb[j] = nb[j]; - } - - // parse args - { - struct ggml_tensor ** args[2 + GGML_MAX_OPT] = { - &tensor->src0, - &tensor->src1, - }; - - for (int j = 0; j < GGML_MAX_OPT; ++j) { - args[2 + j] = &tensor->opt[j]; - } - - for (int j = 0; j < 2 + GGML_MAX_OPT; ++j) { - const uint32_t arg_idx = *(const int32_t *) ptr; ptr += sizeof(arg_idx); - - if (arg_idx == -1) { - continue; - } - - if (arg_idx < GGML_MAX_NODES) { - *args[j] = result.leafs[arg_idx]; - } else { - *args[j] = result.nodes[arg_idx - GGML_MAX_NODES]; - } - } - } - - result.nodes[i] = tensor; - - fprintf(stderr, "%s: loaded node %d: '%16s', %3d dims, %9zu bytes\n", __func__, i, 
tensor->name, n_dims, ggml_nbytes(tensor)); - } - } - } - - return result; -} diff --git a/examples/common-ggml.h b/examples/common-ggml.h index 1a85abed3..477de341a 100644 --- a/examples/common-ggml.h +++ b/examples/common-ggml.h @@ -16,7 +16,3 @@ bool ggml_common_quantize_0( const ggml_ftype ftype, const std::vector & to_quant, const std::vector & to_skip); - -// these will move to ggml when ready -void ggml_cgraph_export(const struct ggml_cgraph * cgraph, const char * fname); -ggml_cgraph ggml_cgraph_import(const char * fname, struct ggml_context ** ctx_data, struct ggml_context ** ctx_eval); diff --git a/examples/mnist/main-cpu.cpp b/examples/mnist/main-cpu.cpp index 9b1a425cf..4e86ef825 100644 --- a/examples/mnist/main-cpu.cpp +++ b/examples/mnist/main-cpu.cpp @@ -36,7 +36,7 @@ int mnist_eval( struct ggml_context * ctx_data = NULL; struct ggml_context * ctx_eval = NULL; - struct ggml_cgraph gfi = ggml_cgraph_import(fname_cgraph, &ctx_data, &ctx_eval); + struct ggml_cgraph gfi = ggml_graph_import(fname_cgraph, &ctx_data, &ctx_eval); gfi.n_threads = n_threads; // allocate eval context diff --git a/examples/mnist/main-mtl.cpp b/examples/mnist/main-mtl.cpp index 1885232fc..19122fcca 100644 --- a/examples/mnist/main-mtl.cpp +++ b/examples/mnist/main-mtl.cpp @@ -38,7 +38,7 @@ int mnist_eval( struct ggml_context * ctx_data = NULL; struct ggml_context * ctx_eval = NULL; - struct ggml_cgraph gf = ggml_cgraph_import(fname_cgraph, &ctx_data, &ctx_eval); + struct ggml_cgraph gf = ggml_graph_import(fname_cgraph, &ctx_data, &ctx_eval); gf.n_threads = n_threads; // allocate eval context diff --git a/examples/mnist/main.cpp b/examples/mnist/main.cpp index c6ecec2cd..315ff987e 100644 --- a/examples/mnist/main.cpp +++ b/examples/mnist/main.cpp @@ -1,7 +1,6 @@ #include "ggml/ggml.h" #include "common.h" -#include "common-ggml.h" #include #include @@ -206,7 +205,7 @@ int mnist_eval( if (fname_cgraph) { // export the compute graph for later use // see the "mnist-cpu" example - ggml_cgraph_export(&gf, "mnist.ggml"); + ggml_graph_export(&gf, "mnist.ggml"); fprintf(stderr, "%s: exported compute graph to '%s'\n", __func__, fname_cgraph); } diff --git a/include/ggml/ggml.h b/include/ggml/ggml.h index ad08adeb1..3320157fe 100644 --- a/include/ggml/ggml.h +++ b/include/ggml/ggml.h @@ -983,6 +983,9 @@ extern "C" { GGML_API struct ggml_tensor * ggml_get_tensor_by_name(struct ggml_cgraph * cgraph, const char * name); + GGML_API void ggml_graph_export(const struct ggml_cgraph * cgraph, const char * fname); + GGML_API struct ggml_cgraph ggml_graph_import(const char * fname, struct ggml_context ** ctx_data, struct ggml_context ** ctx_eval); + // print info and performance information for the graph GGML_API void ggml_graph_print(const struct ggml_cgraph * cgraph); diff --git a/src/ggml.c b/src/ggml.c index f3831a4fb..5f6426db7 100644 --- a/src/ggml.c +++ b/src/ggml.c @@ -14560,6 +14560,461 @@ struct ggml_tensor * ggml_get_tensor_by_name(struct ggml_cgraph * cgraph, const return NULL; } +static void ggml_graph_export_leaf(const struct ggml_tensor * tensor, FILE * fout) { + const int64_t * ne = tensor->ne; + const size_t * nb = tensor->nb; + + fprintf(fout, "%-6s %-12s %8d %8lld %8lld %8lld %8lld %16zu %16zu %16zu %16zu %16p %16s\n", + ggml_type_name(tensor->type), + ggml_op_name (tensor->op), + tensor->n_dims, + ne[0], ne[1], ne[2], ne[3], + nb[0], nb[1], nb[2], nb[3], + tensor->data, + tensor->name); +} + +static void ggml_graph_export_node(const struct ggml_tensor * tensor, const char * arg, FILE * fout) { + const 
int64_t * ne = tensor->ne; + const size_t * nb = tensor->nb; + + fprintf(fout, "%-6s %-6s %-12s %8d %8lld %8lld %8lld %8lld %16zu %16zu %16zu %16zu %8d %16p %16s\n", + arg, + ggml_type_name(tensor->type), + ggml_op_name (tensor->op), + tensor->n_dims, + ne[0], ne[1], ne[2], ne[3], + nb[0], nb[1], nb[2], nb[3], + tensor->n_tasks, + tensor->data, + tensor->name); +} + +void ggml_graph_export(const struct ggml_cgraph * cgraph, const char * fname) { + assert(cgraph->work == NULL); + assert(cgraph->work_size == 0); + + uint64_t size_eval = 0; + + // compute size of intermediate results + // TODO: does not take into account scratch buffers !!!! + for (int i = 0; i < cgraph->n_nodes; ++i) { + size_eval += ggml_nbytes(cgraph->nodes[i]); + } + + // print + { + FILE * fout = stdout; + + fprintf(fout, "\n"); + fprintf(fout, "%-16s %8x\n", "magic", GGML_FILE_MAGIC); + fprintf(fout, "%-16s %8d\n", "version", GGML_FILE_VERSION); + fprintf(fout, "%-16s %8d\n", "leafs", cgraph->n_leafs); + fprintf(fout, "%-16s %8d\n", "nodes", cgraph->n_nodes); + fprintf(fout, "%-16s %8llu\n", "eval", size_eval); + + // header + fprintf(fout, "\n"); + fprintf(fout, "%-6s %-12s %8s %8s %8s %8s %8s %16s %16s %16s %16s %16s %16s\n", + "TYPE", "OP", "NDIMS", "NE0", "NE1", "NE2", "NE3", "NB0", "NB1", "NB2", "NB3", "DATA", "NAME"); + + for (int i = 0; i < cgraph->n_leafs; ++i) { + ggml_graph_export_leaf(cgraph->leafs[i], fout); + + GGML_ASSERT(cgraph->leafs[i]->op == GGML_OP_NONE); + GGML_ASSERT(cgraph->leafs[i]->src0 == NULL); + GGML_ASSERT(cgraph->leafs[i]->src1 == NULL); + } + + // header + fprintf(fout, "\n"); + fprintf(fout, "%-6s %-6s %-12s %8s %8s %8s %8s %8s %16s %16s %16s %16s %8s %16s %16s\n", + "ARG", "TYPE", "OP", "NDIMS", "NE0", "NE1", "NE2", "NE3", "NB0", "NB1", "NB2", "NB3", "NTASKS", "DATA", "NAME"); + + for (int i = 0; i < cgraph->n_nodes; ++i) { + ggml_graph_export_node(cgraph->nodes[i], "DST", fout); + + if (cgraph->nodes[i]->src0) { + ggml_graph_export_node(cgraph->nodes[i]->src0, "SRC0", fout); + } + + if (cgraph->nodes[i]->src1) { + ggml_graph_export_node(cgraph->nodes[i]->src1, "SRC1", fout); + } + + for (int j = 0; j < GGML_MAX_OPT; ++j) { + if (cgraph->nodes[i]->opt[j]) { + ggml_graph_export_node(cgraph->nodes[i]->opt[j], "OPT", fout); + } + } + + fprintf(fout, "\n"); + } + + fprintf(fout, "\n"); + } + + // write binary data + { + FILE * fout = fopen(fname, "wb"); + + if (!fout) { + fprintf(stderr, "%s: failed to open %s\n", __func__, fname); + return; + } + + // header + { + const uint32_t magic = GGML_FILE_MAGIC; + const uint32_t version = GGML_FILE_VERSION; + const uint32_t n_leafs = cgraph->n_leafs; + const uint32_t nodes = cgraph->n_nodes; + + fwrite(&magic, sizeof(uint32_t), 1, fout); + fwrite(&version, sizeof(uint32_t), 1, fout); + fwrite(&n_leafs, sizeof(uint32_t), 1, fout); + fwrite(&nodes, sizeof(uint32_t), 1, fout); + fwrite(&size_eval, sizeof(uint64_t), 1, fout); + } + + // leafs + { + for (int i = 0; i < cgraph->n_leafs; ++i) { + const struct ggml_tensor * tensor = cgraph->leafs[i]; + + const uint32_t type = tensor->type; + const uint32_t op = tensor->op; + const uint32_t n_dims = tensor->n_dims; + + fwrite(&type, sizeof(uint32_t), 1, fout); + fwrite(&op, sizeof(uint32_t), 1, fout); + fwrite(&n_dims, sizeof(uint32_t), 1, fout); + + for (int j = 0; j < GGML_MAX_DIMS; ++j) { + const uint64_t ne = tensor->ne[j]; + const uint64_t nb = tensor->nb[j]; + + fwrite(&ne, sizeof(uint64_t), 1, fout); + fwrite(&nb, sizeof(uint64_t), 1, fout); + } + + // store the pointer address + { + const uint64_t ptr 
= (uint64_t) tensor->data; + + fwrite(&ptr, sizeof(uint64_t), 1, fout); + } + + fwrite(tensor->name, sizeof(char), GGML_MAX_NAME, fout); + + // dump the data + // TODO: pad this to 32 byte boundary + { + const size_t size = ggml_nbytes(tensor); + + fwrite(tensor->data, sizeof(char), size, fout); + } + } + } + + // nodes + { + for (int i = 0; i < cgraph->n_nodes; ++i) { + const struct ggml_tensor * tensor = cgraph->nodes[i]; + + const uint32_t type = tensor->type; + const uint32_t op = tensor->op; + const uint32_t n_dims = tensor->n_dims; + + fwrite(&type, sizeof(uint32_t), 1, fout); + fwrite(&op, sizeof(uint32_t), 1, fout); + fwrite(&n_dims, sizeof(uint32_t), 1, fout); + + for (int j = 0; j < GGML_MAX_DIMS; ++j) { + const uint64_t ne = tensor->ne[j]; + const uint64_t nb = tensor->nb[j]; + + fwrite(&ne, sizeof(uint64_t), 1, fout); + fwrite(&nb, sizeof(uint64_t), 1, fout); + } + + // store the pointer address + { + const uint64_t ptr = (uint64_t) tensor->data; + + fwrite(&ptr, sizeof(uint64_t), 1, fout); + } + + fwrite(tensor->name, sizeof(char), GGML_MAX_NAME, fout); + + // output the op arguments + { + struct ggml_tensor * args[2 + GGML_MAX_OPT] = { NULL }; + + args[0] = tensor->src0; + args[1] = tensor->src1; + + for (int j = 0; j < GGML_MAX_OPT; ++j) { + args[2 + j] = tensor->opt[j]; + } + + for (int j = 0; j < 2 + GGML_MAX_OPT; ++j) { + if (args[j]) { + int32_t idx = -1; + + // check if leaf + { + for (int k = 0; k < cgraph->n_leafs; ++k) { + if (args[j] == cgraph->leafs[k]) { + idx = k; + break; + } + } + } + + // check if node + if (idx == -1) { + for (int k = 0; k < cgraph->n_nodes; ++k) { + if (args[j] == cgraph->nodes[k]) { + idx = GGML_MAX_NODES + k; + break; + } + } + } + + if (idx == -1) { + fprintf(stderr, "%s: failed to find tensor, arg = %d, node = %d\n", __func__, j, i); + return; + } + + fwrite(&idx, sizeof(int32_t), 1, fout); + } else { + const int32_t nul = -1; + + fwrite(&nul, sizeof(int32_t), 1, fout); + } + } + } + } + } + + fclose(fout); + } +} + +struct ggml_cgraph ggml_graph_import(const char * fname, struct ggml_context ** ctx_data, struct ggml_context ** ctx_eval) { + assert(*ctx_data == NULL); + assert(*ctx_eval == NULL); + + struct ggml_cgraph result = { 0 }; + + struct ggml_tensor * data = NULL; + + // read file into data + { + FILE * fin = fopen(fname, "rb"); + + if (!fin) { + fprintf(stderr, "%s: failed to open %s\n", __func__, fname); + return result; + } + + size_t fsize = 0; + + fseek(fin, 0, SEEK_END); + fsize = ftell(fin); + fseek(fin, 0, SEEK_SET); + + // create the data context + { + const size_t overhead = 1*ggml_tensor_overhead(); + + struct ggml_init_params params = { + .mem_size = fsize + overhead, + .mem_buffer = NULL, + .no_alloc = false, + }; + + *ctx_data = ggml_init(params); + + if (!*ctx_data) { + fprintf(stderr, "%s: failed to create ggml context\n", __func__); + return result; + } + } + + data = ggml_new_tensor_1d(*ctx_data, GGML_TYPE_I8, fsize); + + fread(data->data, sizeof(char), fsize, fin); + + fclose(fin); + } + + // populate result + { + char * ptr = (char *) data->data; + + const uint32_t magic = *(const uint32_t *) ptr; ptr += sizeof(magic); + + if (magic != GGML_FILE_MAGIC) { + fprintf(stderr, "%s: invalid magic number, got %08x\n", __func__, magic); + return result; + } + + const uint32_t version = *(const uint32_t *) ptr; ptr += sizeof(version); + + if (version != GGML_FILE_VERSION) { + fprintf(stderr, "%s: invalid version number\n", __func__); + return result; + } + + const uint32_t n_leafs = *(const uint32_t *) ptr; ptr += 
sizeof(n_leafs); + const uint32_t n_nodes = *(const uint32_t *) ptr; ptr += sizeof(n_nodes); + const uint64_t size_eval = *(const uint64_t *) ptr; ptr += sizeof(size_eval); + + result.n_leafs = n_leafs; + result.n_nodes = n_nodes; + + // create the data context + { + const size_t overhead = (n_leafs + n_nodes)*ggml_tensor_overhead(); + + struct ggml_init_params params = { + .mem_size = size_eval + overhead, + .mem_buffer = NULL, + .no_alloc = true, + }; + + *ctx_eval = ggml_init(params); + + if (!*ctx_eval) { + fprintf(stderr, "%s: failed to create ggml context\n", __func__); + return result; + } + } + + // leafs + { + uint32_t type; + uint32_t op; + uint32_t n_dims; + + for (uint32_t i = 0; i < n_leafs; ++i) { + type = *(const uint32_t *) ptr; ptr += sizeof(type); + op = *(const uint32_t *) ptr; ptr += sizeof(op); + n_dims = *(const uint32_t *) ptr; ptr += sizeof(n_dims); + + int64_t ne[GGML_MAX_DIMS]; + size_t nb[GGML_MAX_DIMS]; + + for (int j = 0; j < GGML_MAX_DIMS; ++j) { + uint64_t ne_cur; + uint64_t nb_cur; + + ne_cur = *(const uint64_t *) ptr; ptr += sizeof(ne_cur); + nb_cur = *(const uint64_t *) ptr; ptr += sizeof(nb_cur); + + ne[j] = ne_cur; + nb[j] = nb_cur; + } + + struct ggml_tensor * tensor = ggml_new_tensor(*ctx_eval, (enum ggml_type) type, n_dims, ne); + + tensor->op = (enum ggml_op) op; + + uint64_t ptr_cur = *(const uint64_t *) ptr; ptr += sizeof(ptr_cur); + + memcpy(tensor->name, ptr, GGML_MAX_NAME); ptr += GGML_MAX_NAME; + + tensor->data = (void *) ptr; + + for (int j = 0; j < GGML_MAX_DIMS; ++j) { + tensor->nb[j] = nb[j]; + } + + result.leafs[i] = tensor; + + ptr += ggml_nbytes(tensor); + + fprintf(stderr, "%s: loaded leaf %d: '%16s', %3d dims, %9zu bytes\n", __func__, i, tensor->name, n_dims, ggml_nbytes(tensor)); + } + } + + ggml_set_no_alloc(*ctx_eval, false); + + // nodes + { + uint32_t type; + uint32_t op; + uint32_t n_dims; + + for (uint32_t i = 0; i < n_nodes; ++i) { + type = *(const uint32_t *) ptr; ptr += sizeof(type); + op = *(const uint32_t *) ptr; ptr += sizeof(op); + n_dims = *(const uint32_t *) ptr; ptr += sizeof(n_dims); + + int64_t ne[GGML_MAX_DIMS]; + size_t nb[GGML_MAX_DIMS]; + + for (int j = 0; j < GGML_MAX_DIMS; ++j) { + uint64_t ne_cur; + uint64_t nb_cur; + + ne_cur = *(const uint64_t *) ptr; ptr += sizeof(ne_cur); + nb_cur = *(const uint64_t *) ptr; ptr += sizeof(nb_cur); + + ne[j] = ne_cur; + nb[j] = nb_cur; + } + + struct ggml_tensor * tensor = ggml_new_tensor(*ctx_eval, (enum ggml_type) type, n_dims, ne); + + tensor->op = (enum ggml_op) op; + + uint64_t ptr_cur = *(const uint64_t *) ptr; ptr += sizeof(ptr_cur); + + memcpy(tensor->name, ptr, GGML_MAX_NAME); ptr += GGML_MAX_NAME; + + for (int j = 0; j < GGML_MAX_DIMS; ++j) { + tensor->nb[j] = nb[j]; + } + + // parse args + { + struct ggml_tensor ** args[2 + GGML_MAX_OPT] = { + &tensor->src0, + &tensor->src1, + }; + + for (int j = 0; j < GGML_MAX_OPT; ++j) { + args[2 + j] = &tensor->opt[j]; + } + + for (int j = 0; j < 2 + GGML_MAX_OPT; ++j) { + const int32_t arg_idx = *(const int32_t *) ptr; ptr += sizeof(arg_idx); + + if (arg_idx == -1) { + continue; + } + + if (arg_idx < GGML_MAX_NODES) { + *args[j] = result.leafs[arg_idx]; + } else { + *args[j] = result.nodes[arg_idx - GGML_MAX_NODES]; + } + } + } + + result.nodes[i] = tensor; + + fprintf(stderr, "%s: loaded node %d: '%16s', %3d dims, %9zu bytes\n", __func__, i, tensor->name, n_dims, ggml_nbytes(tensor)); + } + } + } + + return result; +} + void ggml_graph_print(const struct ggml_cgraph * cgraph) { int64_t perf_total_per_op_us[GGML_OP_COUNT] = 
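// a minimal export/import round trip, as a sketch only: "graph.ggml" is a
// placeholder file name, gf is a previously built graph and ctx0 a work
// context set up as in the mnist examples; error handling is omitted
//
//   ggml_graph_export(&gf, "graph.ggml");
//
//   struct ggml_context * ctx_data = NULL;
//   struct ggml_context * ctx_eval = NULL;
//
//   struct ggml_cgraph gfi = ggml_graph_import("graph.ggml", &ctx_data, &ctx_eval);
//
//   struct ggml_tensor * input = ggml_get_tensor_by_name(&gfi, "input");
//   // ... fill input->data with ggml_nbytes(input) bytes ...
//
//   ggml_graph_compute(ctx0, &gfi);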
{0}; From 25adade79da83bb76ffddbac6755383d8e819f03 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sun, 28 May 2023 23:06:49 +0300 Subject: [PATCH 24/27] mnist : remove common dependencies --- examples/mnist/CMakeLists.txt | 6 ++---- examples/mnist/main-cpu.cpp | 3 +-- examples/mnist/main-mtl.cpp | 3 +-- 3 files changed, 4 insertions(+), 8 deletions(-) diff --git a/examples/mnist/CMakeLists.txt b/examples/mnist/CMakeLists.txt index 222d7cd2b..3ce092490 100644 --- a/examples/mnist/CMakeLists.txt +++ b/examples/mnist/CMakeLists.txt @@ -3,14 +3,14 @@ set(TEST_TARGET mnist) add_executable(${TEST_TARGET} main.cpp) -target_link_libraries(${TEST_TARGET} PRIVATE ggml common common-ggml) +target_link_libraries(${TEST_TARGET} PRIVATE ggml common) # # mnist-cpu set(TEST_TARGET mnist-cpu) add_executable(${TEST_TARGET} main-cpu.cpp) -target_link_libraries(${TEST_TARGET} PRIVATE ggml common common-ggml) +target_link_libraries(${TEST_TARGET} PRIVATE ggml) if (APPLE) # @@ -25,8 +25,6 @@ if (APPLE) add_executable(${TEST_TARGET} main-mtl.cpp main-mtl.h main-mtl.m) target_link_libraries(${TEST_TARGET} PRIVATE ggml - common - common-ggml ${FOUNDATION_LIBRARY} ${METAL_FRAMEWORK} ${METALKIT_FRAMEWORK} diff --git a/examples/mnist/main-cpu.cpp b/examples/mnist/main-cpu.cpp index 4e86ef825..46b6bee1c 100644 --- a/examples/mnist/main-cpu.cpp +++ b/examples/mnist/main-cpu.cpp @@ -12,13 +12,12 @@ #include "ggml/ggml.h" -#include "common-ggml.h" - #include #include #include #include #include +#include // evaluate the MNIST compute graph // diff --git a/examples/mnist/main-mtl.cpp b/examples/mnist/main-mtl.cpp index 19122fcca..018c5b87a 100644 --- a/examples/mnist/main-mtl.cpp +++ b/examples/mnist/main-mtl.cpp @@ -14,13 +14,12 @@ #include "main-mtl.h" -#include "common-ggml.h" - #include #include #include #include #include +#include // evaluate the MNIST compute graph // From e6dc506d77b0f11bca9ae2250ff1d08ee6c4a7cb Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Mon, 29 May 2023 19:14:35 +0300 Subject: [PATCH 25/27] mnist : fix soft_max threadgroup size --- examples/mnist/main-mtl.m | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/examples/mnist/main-mtl.m b/examples/mnist/main-mtl.m index e70c0aa1d..2d4fb9b1e 100644 --- a/examples/mnist/main-mtl.m +++ b/examples/mnist/main-mtl.m @@ -392,9 +392,7 @@ int mnist_mtl_eval( [encoder setBuffer:id_src offset:offs_src0 atIndex:0]; [encoder setBuffer:id_dst offset:offs_dst atIndex:1]; - const int64_t n = ggml_nelements(gf->nodes[i]); - - [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)]; + [encoder dispatchThreadgroups:MTLSizeMake(1, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)]; #endif } break; case GGML_OP_MUL_MAT: From f9b04dff5766d910b94e6f6ef27f98258c8ad820 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Mon, 29 May 2023 19:16:36 +0300 Subject: [PATCH 26/27] mnist : init no_alloc member --- examples/mnist/main-cpu.cpp | 1 + examples/mnist/main-mtl.cpp | 1 + examples/mnist/main.cpp | 1 + 3 files changed, 3 insertions(+) diff --git a/examples/mnist/main-cpu.cpp b/examples/mnist/main-cpu.cpp index 46b6bee1c..c8e30b0c2 100644 --- a/examples/mnist/main-cpu.cpp +++ b/examples/mnist/main-cpu.cpp @@ -46,6 +46,7 @@ int mnist_eval( struct ggml_init_params params = { .mem_size = buf_size, .mem_buffer = buf, + .no_alloc = false, }; struct ggml_context * ctx0 = ggml_init(params); diff --git a/examples/mnist/main-mtl.cpp b/examples/mnist/main-mtl.cpp index 018c5b87a..09866a3c1 100644 --- 
a/examples/mnist/main-mtl.cpp
+++ b/examples/mnist/main-mtl.cpp
@@ -48,6 +48,7 @@ int mnist_eval(
     struct ggml_init_params params = {
         .mem_size   = buf_size,
         .mem_buffer = buf,
+        .no_alloc   = false,
     };
 
     struct ggml_context * ctx_work = ggml_init(params);
diff --git a/examples/mnist/main.cpp b/examples/mnist/main.cpp
index 315ff987e..512748003 100644
--- a/examples/mnist/main.cpp
+++ b/examples/mnist/main.cpp
@@ -177,6 +177,7 @@ int mnist_eval(
     struct ggml_init_params params = {
         .mem_size   = buf_size,
         .mem_buffer = buf,
+        .no_alloc   = false,
     };
 
     struct ggml_context * ctx0 = ggml_init(params);

From c8013c52020b72abbc1d1c47bff01eefe8d19f4d Mon Sep 17 00:00:00 2001
From: Georgi Gerganov
Date: Mon, 29 May 2023 19:24:50 +0300
Subject: [PATCH 27/27] ggml : improve "get tensor" API

---
 examples/mnist/main-cpu.cpp |  4 ++--
 examples/mnist/main-mtl.cpp |  2 +-
 examples/mnist/main-mtl.m   |  4 ++--
 include/ggml/ggml.h         |  4 +++-
 src/ggml.c                  | 19 ++++++++++++++++++-
 5 files changed, 26 insertions(+), 7 deletions(-)

diff --git a/examples/mnist/main-cpu.cpp b/examples/mnist/main-cpu.cpp
index c8e30b0c2..48e0ae60e 100644
--- a/examples/mnist/main-cpu.cpp
+++ b/examples/mnist/main-cpu.cpp
@@ -51,12 +51,12 @@ int mnist_eval(
 
     struct ggml_context * ctx0 = ggml_init(params);
 
-    struct ggml_tensor * input = ggml_get_tensor_by_name(&gfi, "input");
+    struct ggml_tensor * input = ggml_graph_get_tensor(&gfi, "input");
     memcpy(input->data, digit.data(), ggml_nbytes(input));
 
     ggml_graph_compute(ctx0, &gfi);
 
-    const float * probs_data = ggml_get_data_f32(ggml_get_tensor_by_name(&gfi, "probs"));
+    const float * probs_data = ggml_get_data_f32(ggml_graph_get_tensor(&gfi, "probs"));
 
     const int prediction = std::max_element(probs_data, probs_data + 10) - probs_data;
 
diff --git a/examples/mnist/main-mtl.cpp b/examples/mnist/main-mtl.cpp
index 09866a3c1..fafe8e610 100644
--- a/examples/mnist/main-mtl.cpp
+++ b/examples/mnist/main-mtl.cpp
@@ -59,7 +59,7 @@ int mnist_eval(
     int prediction = -1;
 
     for (int i = 0; i < 1; ++i) {
-        struct ggml_tensor * input = ggml_get_tensor_by_name(&gf, "input");
+        struct ggml_tensor * input = ggml_graph_get_tensor(&gf, "input");
 
         if (i % 2 == 0) {
             memcpy(input->data, digit.data(), ggml_nbytes(input));
diff --git a/examples/mnist/main-mtl.m b/examples/mnist/main-mtl.m
index 2d4fb9b1e..21bfe7ad8 100644
--- a/examples/mnist/main-mtl.m
+++ b/examples/mnist/main-mtl.m
@@ -118,7 +118,7 @@ kernel void kernel_soft_max(
     // load kernels
     {
-        const int k_digits = ggml_get_tensor_by_name(gf, "probs")->ne[0];
+        const int k_digits = ggml_graph_get_tensor(gf, "probs")->ne[0];
 
         MTLFunctionConstantValues * constants = [MTLFunctionConstantValues new];
         [constants setConstantValue:&k_digits type:MTLDataTypeInt withName:@"k_digits"];
@@ -310,7 +310,7 @@ int mnist_mtl_eval(
     // copy the input data to the GPU
     {
-        struct ggml_tensor * inp = ggml_get_tensor_by_name(gf, "input");
+        struct ggml_tensor * inp = ggml_graph_get_tensor(gf, "input");
 
         id<MTLBuffer> id_dst = mnist_mtl_get_buffer(ctx, inp, &offs_src0);
 
diff --git a/include/ggml/ggml.h b/include/ggml/ggml.h
index 3320157fe..60c0ad8bf 100644
--- a/include/ggml/ggml.h
+++ b/include/ggml/ggml.h
@@ -495,6 +495,8 @@ extern "C" {
     GGML_API struct ggml_tensor * ggml_dup_tensor (struct ggml_context * ctx, const struct ggml_tensor * src);
     GGML_API struct ggml_tensor * ggml_view_tensor(struct ggml_context * ctx, const struct ggml_tensor * src);
 
+    GGML_API struct ggml_tensor * ggml_get_tensor(struct ggml_context * ctx, const char * name);
+
    GGML_API struct ggml_tensor * ggml_set_zero(struct ggml_tensor * 
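// note: ggml_get_tensor (added above) does a linear scan over every object in
// a ggml_context, while ggml_graph_get_tensor (the graph-level lookup renamed
// below from ggml_get_tensor_by_name) searches only a graph's leafs and
// nodes; both return NULL when no tensor with the given name is found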
tensor); GGML_API struct ggml_tensor * ggml_set_i32 (struct ggml_tensor * tensor, int32_t value); GGML_API struct ggml_tensor * ggml_set_f32 (struct ggml_tensor * tensor, float value); @@ -981,7 +983,7 @@ extern "C" { GGML_API void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph); GGML_API void ggml_graph_reset (struct ggml_cgraph * cgraph); - GGML_API struct ggml_tensor * ggml_get_tensor_by_name(struct ggml_cgraph * cgraph, const char * name); + GGML_API struct ggml_tensor * ggml_graph_get_tensor(struct ggml_cgraph * cgraph, const char * name); GGML_API void ggml_graph_export(const struct ggml_cgraph * cgraph, const char * fname); GGML_API struct ggml_cgraph ggml_graph_import(const char * fname, struct ggml_context ** ctx_data, struct ggml_context ** ctx_eval); diff --git a/src/ggml.c b/src/ggml.c index 5f6426db7..7785fa705 100644 --- a/src/ggml.c +++ b/src/ggml.c @@ -4531,6 +4531,23 @@ struct ggml_tensor * ggml_view_tensor( return result; } +struct ggml_tensor * ggml_get_tensor(struct ggml_context * ctx, const char * name) { + struct ggml_object * obj = ctx->objects_begin; + + char * const mem_buffer = ctx->mem_buffer; + + while (obj != NULL) { + struct ggml_tensor * cur = (struct ggml_tensor *)(mem_buffer + obj->offs); + if (strcmp(cur->name, name) == 0) { + return cur; + } + + obj = obj->next; + } + + return NULL; +} + //////////////////////////////////////////////////////////////////////////////// // ggml_dup @@ -14540,7 +14557,7 @@ void ggml_graph_reset(struct ggml_cgraph * cgraph) { } } -struct ggml_tensor * ggml_get_tensor_by_name(struct ggml_cgraph * cgraph, const char * name) { +struct ggml_tensor * ggml_graph_get_tensor(struct ggml_cgraph * cgraph, const char * name) { for (int i = 0; i < cgraph->n_leafs; i++) { struct ggml_tensor * leaf = cgraph->leafs[i];
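// the lookup is a linear scan by tensor name: the graph's leafs are checked
// first, then its nodes, returning NULL when no tensor with that name exists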