ggml : cgraph export/import/eval example + GPU support (#108)

* ggml : cgraph export brainstorming * mnist : code style * mnist : minor * ggml : initial cgraph export * ggml : initial graph import (wip) * ggml : import op args correctly * ggml : add ggml_get_tensor_by_name() * mnist : add compute graph evaluation on CPU example * ggml : add ggml_tensor_overhead() * ggml : rename new functions to ggml_cgraph_... * mnist : add Metal inference skeleton (WIP) * mnist : working on the Metal pipeline (WIP) * mnist : prepare the Metal encoder (WIP) * mnist : first Metal kernel for F32 ADD * mnist : looks like MTLHeap does not work * mnist : initial full pass of MNIST on the GPU (not verified) * mnist : minor cleanup * mnist : full GPU inference works * mnist : use custom soft_max kernel since MPSMatrixSoftMax is bugged * mnist : use constant for soft_max instead of hardcoded 10 * mnist : check multiple predictions (Metal) * mnist : minor * ggml : move cgraph import / export to ggml * mnist : remove common dependencies * mnist : fix soft_max threadgroup size * mnist : init no_alloc member * ggml : improve "get tensor" API
ggml-org · May 29, 2023 · 3b697a2 · 3b697a2
1 parent db5eef1
commit 3b697a2
Show file tree

Hide file tree

Showing 8 changed files with 1,297 additions and 11 deletions.
diff --git a/examples/mnist/CMakeLists.txt b/examples/mnist/CMakeLists.txt
@@ -5,3 +5,29 @@ set(TEST_TARGET mnist)
 add_executable(${TEST_TARGET} main.cpp)
 target_link_libraries(${TEST_TARGET} PRIVATE ggml common)
 
+#
+# mnist-cpu - builds main-cpu.cpp, which evaluates a pre-generated MNIST compute graph on the CPU
+
+set(TEST_TARGET mnist-cpu)
+add_executable(${TEST_TARGET} main-cpu.cpp)
+target_link_libraries(${TEST_TARGET} PRIVATE ggml) # ggml only; the "common" library is not linked here
+
+if (APPLE) # the Metal example is only configured on Apple platforms
+    #
+    # mnist-mtl - builds the example that evaluates a pre-generated MNIST compute graph on the GPU
+
+    find_library(FOUNDATION_LIBRARY Foundation REQUIRED) # REQUIRED: configuration stops if the library is not found
+    find_library(METAL_FRAMEWORK    Metal      REQUIRED)
+    find_library(METALKIT_FRAMEWORK MetalKit   REQUIRED)
+    find_library(METALPERFORMANCE_FRAMEWORK MetalPerformanceShaders REQUIRED)
+
+    set(TEST_TARGET mnist-mtl)
+    add_executable(${TEST_TARGET} main-mtl.cpp main-mtl.h main-mtl.m) # main-mtl.m presumably defines the mnist_mtl_* functions declared in main-mtl.h
+    target_link_libraries(${TEST_TARGET} PRIVATE
+        ggml
+        ${FOUNDATION_LIBRARY}
+        ${METAL_FRAMEWORK}
+        ${METALKIT_FRAMEWORK}
+        ${METALPERFORMANCE_FRAMEWORK}
+    )
+endif()
diff --git a/examples/mnist/main-cpu.cpp b/examples/mnist/main-cpu.cpp
@@ -0,0 +1,116 @@
+// Use a pre-generated MNIST compute graph for inference on the CPU
+//
+// You can generate a compute graph using the "mnist" tool:
+//
+// $ ./bin/mnist ./models/mnist/ggml-model-f32.bin ../examples/mnist/models/mnist/t10k-images.idx3-ubyte
+//
+// This command creates the "mnist.ggml" file, which contains the generated compute graph.
+// Now, you can re-use the compute graph with the "mnist-cpu" tool:
+//
+// $ ./bin/mnist-cpu ./models/mnist/mnist.ggml ../examples/mnist/models/mnist/t10k-images.idx3-ubyte
+//
+
+#include "ggml/ggml.h"
+
+#include <algorithm>
+#include <cmath>
+#include <cstdint>
+#include <cstdio>
+#include <cstdlib>
+#include <cstring>
+#include <ctime>
+#include <fstream>
+#include <vector>
+
+// evaluate the MNIST compute graph on the CPU
+//
+//   - fname_cgraph: path to the exported compute graph
+//   - n_threads:    number of threads to use
+//   - digit:        784 pixel values
+//
+// returns 0 - 9 prediction, or -1 if the graph could not be evaluated
+// (eval context not created, "input"/"probs" tensor missing, or too few pixel values)
+int mnist_eval(
+        const char * fname_cgraph,
+        const int n_threads,
+        std::vector<float> digit
+        ) {
+    // load the compute graph
+    struct ggml_context * ctx_data = NULL;
+    struct ggml_context * ctx_eval = NULL;
+
+    struct ggml_cgraph gfi = ggml_graph_import(fname_cgraph, &ctx_data, &ctx_eval);
+    gfi.n_threads = n_threads;
+
+    // allocate eval context
+    // needed during ggml_graph_compute() to allocate a work tensor
+    //
+    // the buffer is local to this call (not static): its size always matches the graph that
+    // was just imported, and it is released before returning instead of being leaked
+    const size_t buf_size = gfi.work_size; // TODO
+    void * buf = malloc(buf_size);
+
+    struct ggml_init_params params = {
+        .mem_size   = buf_size,
+        .mem_buffer = buf,
+        .no_alloc   = false,
+    };
+
+    struct ggml_context * ctx0 = ggml_init(params);
+
+    int prediction = -1;
+
+    // both lookups return NULL when the graph has no tensor with that name
+    struct ggml_tensor * input = ggml_graph_get_tensor(&gfi, "input");
+    struct ggml_tensor * probs = ggml_graph_get_tensor(&gfi, "probs");
+
+    if (ctx0 == NULL) {
+        fprintf(stderr, "%s: failed to create the eval context\n", __func__);
+    } else if (input == NULL || probs == NULL) {
+        fprintf(stderr, "%s: graph '%s' has no \"input\" and/or \"probs\" tensor\n", __func__, fname_cgraph);
+    } else if (digit.size()*sizeof(float) < ggml_nbytes(input)) {
+        // the memcpy below reads ggml_nbytes(input) bytes from digit - never read past its end
+        fprintf(stderr, "%s: expected at least %zu input bytes, got %zu\n",
+                __func__, ggml_nbytes(input), digit.size()*sizeof(float));
+    } else {
+        memcpy(input->data, digit.data(), ggml_nbytes(input));
+
+        ggml_graph_compute(ctx0, &gfi);
+
+        const float * probs_data = ggml_get_data_f32(probs);
+
+        // argmax over the 10 class probabilities
+        prediction = static_cast<int>(std::max_element(probs_data, probs_data + 10) - probs_data);
+    }
+
+    if (ctx0 != NULL) {
+        ggml_free(ctx0);
+    }
+    if (ctx_data != NULL) {
+        ggml_free(ctx_data);
+    }
+    if (ctx_eval != NULL) {
+        ggml_free(ctx_eval);
+    }
+
+    // a caller-supplied mem_buffer stays owned by the caller, so release it here
+    // (free(NULL) is a no-op if malloc returned NULL)
+    free(buf);
+
+    return prediction;
+}
+
+// entry point: pick a random digit from the MNIST test set, print it as ASCII art on stderr
+// and print the digit predicted by mnist_eval() on stdout
+//
+//   - argv[1]: path to the compute graph
+//   - argv[2]: path to the MNIST test images (t10k-images.idx3-ubyte)
+//
+// returns 0 on success, 1 on a usage error, an I/O error or a failed evaluation
+int main(int argc, char ** argv) {
+    srand(time(NULL));
+    ggml_time_init();
+
+    if (argc != 3) {
+        fprintf(stderr, "Usage: %s models/mnist/mnist.ggml models/mnist/t10k-images.idx3-ubyte\n", argv[0]);
+        return 1; // wrong usage is an error - do not report success to the shell
+    }
+
+    uint8_t buf[784];
+    std::vector<float> digit;
+
+    // read a random digit from the test set
+    {
+        std::ifstream fin(argv[2], std::ios::binary);
+        if (!fin) {
+            fprintf(stderr, "%s: failed to open '%s'\n", __func__, argv[2]);
+            return 1;
+        }
+
+        // seek to a random digit: 16-byte header + 28*28 * (random index in [0, 9999])
+        fin.seekg(16 + 784 * (rand() % 10000));
+        fin.read((char *) buf, sizeof(buf));
+
+        // a failed seek or a short read would leave buf (partially) uninitialized
+        if (!fin) {
+            fprintf(stderr, "%s: failed to read a digit from '%s'\n", __func__, argv[2]);
+            return 1;
+        }
+    }
+
+    // render the digit in ASCII
+    {
+        digit.resize(sizeof(buf));
+
+        for (int row = 0; row < 28; row++) {
+            for (int col = 0; col < 28; col++) {
+                fprintf(stderr, "%c ", (float)buf[row*28 + col] > 230 ? '*' : '_');
+                digit[row*28 + col] = ((float)buf[row*28 + col]);
+            }
+
+            fprintf(stderr, "\n");
+        }
+
+        fprintf(stderr, "\n");
+    }
+
+    const int prediction = mnist_eval(argv[1], 1, digit);
+
+    // valid predictions are 0 - 9; a negative value is not a digit
+    if (prediction < 0) {
+        fprintf(stderr, "%s: failed to evaluate '%s'\n", __func__, argv[1]);
+        return 1;
+    }
+
+    fprintf(stdout, "%s: predicted digit is %d\n", __func__, prediction);
+
+    return 0;
+}
diff --git a/examples/mnist/main-mtl.cpp b/examples/mnist/main-mtl.cpp
@@ -0,0 +1,129 @@
+// Use a pre-generated MNIST compute graph for inference on the M1 GPU via MPS
+//
+// You can generate a compute graph using the "mnist" tool:
+//
+// $ ./bin/mnist ./models/mnist/ggml-model-f32.bin ../examples/mnist/models/mnist/t10k-images.idx3-ubyte
+//
+// This command creates the "mnist.ggml" file, which contains the generated compute graph.
+// Now, you can re-use the compute graph on the GPU with the "mnist-mtl" tool:
+//
+// $ ./bin/mnist-mtl ./models/mnist/mnist.ggml ../examples/mnist/models/mnist/t10k-images.idx3-ubyte
+//
+
+#include "ggml/ggml.h"
+
+#include "main-mtl.h"
+
+#include <cmath>
+#include <cstdio>
+#include <cstring>
+#include <ctime>
+#include <fstream>
+#include <vector>
+
+// evaluate the MNIST compute graph on the GPU
+//
+//   - fname_cgraph: path to the exported compute graph
+//   - n_threads:    number of threads to use (stored in the graph)
+//   - digit:        784 pixel values
+//
+// returns the value reported by mnist_mtl_eval() (0 - 9 prediction), or -1 if the graph
+// could not be evaluated (work context not created, "input" tensor missing, or too few pixel values)
+int mnist_eval(
+        const char * fname_cgraph,
+        const int n_threads,
+        std::vector<float> digit
+        ) {
+    // load the compute graph
+    struct ggml_context * ctx_data = NULL;
+    struct ggml_context * ctx_eval = NULL;
+
+    struct ggml_cgraph gf = ggml_graph_import(fname_cgraph, &ctx_data, &ctx_eval);
+    gf.n_threads = n_threads;
+
+    // allocate work context
+    //
+    // the buffer is local to this call (not static): its size always matches the graph that
+    // was just imported, and it is released before returning instead of being leaked
+    const size_t buf_size = gf.work_size; // TODO
+    void * buf = malloc(buf_size);
+
+    struct ggml_init_params params = {
+        .mem_size   = buf_size,
+        .mem_buffer = buf,
+        .no_alloc   = false,
+    };
+
+    struct ggml_context * ctx_work = ggml_init(params);
+
+    int prediction = -1;
+
+    // returns NULL when the graph has no tensor with that name
+    struct ggml_tensor * input = ggml_graph_get_tensor(&gf, "input");
+
+    if (ctx_work == NULL) {
+        fprintf(stderr, "%s: failed to create the work context\n", __func__);
+    } else if (input == NULL) {
+        fprintf(stderr, "%s: graph '%s' has no \"input\" tensor\n", __func__, fname_cgraph);
+    } else if (digit.size()*sizeof(float) < ggml_nbytes(input)) {
+        // the memcpy below reads ggml_nbytes(input) bytes from digit - never read past its end
+        fprintf(stderr, "%s: expected at least %zu input bytes, got %zu\n",
+                __func__, ggml_nbytes(input), digit.size()*sizeof(float));
+    } else {
+        // this allocates all Metal resources and memory buffers
+        // (kept before the input upload, same order as before)
+        auto ctx_mtl = mnist_mtl_init(ctx_data, ctx_eval, ctx_work, &gf);
+
+        memcpy(input->data, digit.data(), ggml_nbytes(input));
+
+        // the actual inference happens here
+        prediction = mnist_mtl_eval(ctx_mtl, &gf);
+
+        mnist_mtl_free(ctx_mtl);
+    }
+
+    if (ctx_work != NULL) {
+        ggml_free(ctx_work);
+    }
+    if (ctx_data != NULL) {
+        ggml_free(ctx_data);
+    }
+    if (ctx_eval != NULL) {
+        ggml_free(ctx_eval);
+    }
+
+    // a caller-supplied mem_buffer stays owned by the caller, so release it here
+    // (free(NULL) is a no-op if malloc returned NULL)
+    free(buf);
+
+    return prediction;
+}
+
+// entry point: pick a random digit from the MNIST test set, print it as ASCII art on stderr
+// and print the digit predicted by mnist_eval() on stdout
+//
+//   - argv[1]: path to the compute graph
+//   - argv[2]: path to the MNIST test images (t10k-images.idx3-ubyte)
+//
+// returns 0 on success, 1 on a usage error, an I/O error or a failed evaluation
+int main(int argc, char ** argv) {
+    srand(time(NULL));
+    ggml_time_init();
+
+    if (argc != 3) {
+        fprintf(stderr, "Usage: %s models/mnist/mnist.ggml models/mnist/t10k-images.idx3-ubyte\n", argv[0]);
+        return 1; // wrong usage is an error - do not report success to the shell
+    }
+
+    uint8_t buf[784];
+    std::vector<float> digit;
+
+    // read a random digit from the test set
+    {
+        std::ifstream fin(argv[2], std::ios::binary);
+        if (!fin) {
+            fprintf(stderr, "%s: failed to open '%s'\n", __func__, argv[2]);
+            return 1;
+        }
+
+        // seek to a random digit: 16-byte header + 28*28 * (random index in [0, 9999])
+        fin.seekg(16 + 784 * (rand() % 10000));
+        fin.read((char *) buf, sizeof(buf));
+
+        // a failed seek or a short read would leave buf (partially) uninitialized
+        if (!fin) {
+            fprintf(stderr, "%s: failed to read a digit from '%s'\n", __func__, argv[2]);
+            return 1;
+        }
+    }
+
+    // render the digit in ASCII
+    {
+        digit.resize(sizeof(buf));
+
+        for (int row = 0; row < 28; row++) {
+            for (int col = 0; col < 28; col++) {
+                fprintf(stderr, "%c ", (float)buf[row*28 + col] > 230 ? '*' : '_');
+                digit[row*28 + col] = ((float)buf[row*28 + col]);
+            }
+
+            fprintf(stderr, "\n");
+        }
+
+        fprintf(stderr, "\n");
+    }
+
+    const int prediction = mnist_eval(argv[1], 1, digit);
+
+    // valid predictions are 0 - 9; a negative value is not a digit
+    if (prediction < 0) {
+        fprintf(stderr, "%s: failed to evaluate '%s'\n", __func__, argv[1]);
+        return 1;
+    }
+
+    fprintf(stdout, "%s: predicted digit is %d\n", __func__, prediction);
+
+    return 0;
+}
diff --git a/examples/mnist/main-mtl.h b/examples/mnist/main-mtl.h
@@ -0,0 +1,26 @@
+#pragma once
+
+// Interface between the C++ driver (main-mtl.cpp) and the Metal side of the MNIST example.
+// The functions have C linkage so that they can be called from C++.
+
+// ggml types, only used through pointers here
+struct ggml_context;
+struct ggml_cgraph;
+
+#if defined(__cplusplus)
+extern "C" {
+#endif
+
+// opaque handle returned by mnist_mtl_init()
+struct ggml_mtl_context;
+
+// create the Metal context for the compute graph gf
+// the caller passes the data, eval and work ggml contexts that belong to the graph
+// release the result with mnist_mtl_free()
+struct ggml_mtl_context * mnist_mtl_init(
+        struct ggml_context * ctx_data,
+        struct ggml_context * ctx_eval,
+        struct ggml_context * ctx_work,
+        struct ggml_cgraph  * gf);
+
+// release a context created by mnist_mtl_init()
+void mnist_mtl_free(struct ggml_mtl_context * ctx);
+
+// run the compute graph gf and return the prediction
+int mnist_mtl_eval(struct ggml_mtl_context * ctx, struct ggml_cgraph * gf);
+
+#if defined(__cplusplus)
+}
+#endif