From 66ccdfed52132320128610a1e9ca55f971482976 Mon Sep 17 00:00:00 2001
From: Andrei Betlen
Date: Sun, 11 Feb 2024 12:43:40 -0500
Subject: [PATCH] Update ggml

---
 ggml/experimental.py           |  23 +-
 ggml/ggml.py                   | 757 ++++++++++++++-------------
 ggml/utils.py                  |  74 +---
 tests/test_experimental_api.py |  10 +-
 tests/test_ggml.py             | 273 +++++-------
 tests/test_ggml_backends.py    | 108 -----
 tests/test_ggml_metal.py       |  33 +-
 tests/test_utils.py            | 158 +++----
 vendor/ggml                    |   2 +-
 9 files changed, 521 insertions(+), 917 deletions(-)
 delete mode 100644 tests/test_ggml_backends.py

diff --git a/ggml/experimental.py b/ggml/experimental.py
index a42a966..9cf3039 100644
--- a/ggml/experimental.py
+++ b/ggml/experimental.py
@@ -86,11 +86,11 @@ def cpu():
             raise ValueError("Failed to initialize CPU backend")
         return Backend(backend=backend)
 
-    def new_measure(self) -> "Allocr":
-        allocr = ggml.ggml_allocr_new_measure_from_backend(self.backend)
-        return Allocr(allocr)
+    def new_graph_allocator(self) -> "GraphAllocator":
+        allocr = ggml.ggml_gallocr_new(ggml.ggml_backend_get_default_buffer_type(self.backend))
+        return GraphAllocator(allocr)
 
-    def alloc_buffer(self, size: int) -> "BackendBuffer":
+    def new_backend_buffer(self, size: int) -> "BackendBuffer":
         buffer = ggml.ggml_backend_alloc_buffer(self.backend, size)
         return BackendBuffer(buffer)
 
@@ -102,20 +102,17 @@ def __init__(self, buffer: ggml.ggml_backend_buffer_t):
     def __del__(self):
         ggml.ggml_backend_buffer_free(self.buffer)
 
-    def new_allocr(self) -> "Allocr":
-        allocr = ggml.ggml_allocr_new_from_buffer(self.buffer)
-        return Allocr(allocr)
-
 
-class Allocr:
-    def __init__(self, allocr: ggml.ggml_allocr_t):
+class GraphAllocator:
+    def __init__(self, allocr: ggml.ggml_gallocr):
         self.allocr = allocr
 
     def __del__(self):
-        ggml.ggml_allocr_free(self.allocr)
+        ggml.ggml_gallocr_free(self.allocr)
 
-    def alloc_graph(self, graph: "CGraph") -> int:
-        return ggml.ggml_allocr_alloc_graph(self.allocr, graph.cgraph)
+    def allocate_graph(self, graph: "CGraph"):
+        ggml.ggml_gallocr_reserve(self.allocr, graph.cgraph)
+        ggml.ggml_gallocr_alloc_graph(self.allocr, graph.cgraph)
 
 
 class GGML_TYPE(enum.IntEnum):
diff --git a/ggml/ggml.py b/ggml/ggml.py
index a5ebf57..0ce9850 100644
--- a/ggml/ggml.py
+++ b/ggml/ggml.py
@@ -96,7 +96,7 @@ def load_shared_library(module_name: str, lib_base_name: str):
 
     # Try to load the shared library, handling potential errors
     try:
-        return ctypes.CDLL(str(path), **cdll_args) # type: ignore
+        return ctypes.CDLL(str(path), **cdll_args)  # type: ignore
     except Exception as e:
         raise RuntimeError(f"Failed to load shared library '{path}': {e}")
 
@@ -543,8 +543,8 @@ def ggml_fp32_to_fp16_row(
 
 # enum ggml_log_level {
 #     GGML_LOG_LEVEL_ERROR = 2,
-#     GGML_LOG_LEVEL_WARN = 3,
-#     GGML_LOG_LEVEL_INFO = 4,
+#     GGML_LOG_LEVEL_WARN = 3,
+#     GGML_LOG_LEVEL_INFO = 4,
 #     GGML_LOG_LEVEL_DEBUG = 5
 # };
 GGML_LOG_LEVEL_ERROR = 2
@@ -552,6 +552,17 @@ def ggml_fp32_to_fp16_row(
 GGML_LOG_LEVEL_INFO = 4
 GGML_LOG_LEVEL_DEBUG = 5
 
+
+# enum ggml_tensor_flag {
+#     GGML_TENSOR_FLAG_INPUT = 1,
+#     GGML_TENSOR_FLAG_OUTPUT = 2,
+#     GGML_TENSOR_FLAG_PARAM = 4,
+# };
+GGML_TENSOR_FLAG_INPUT = 1
+GGML_TENSOR_FLAG_OUTPUT = 2
+GGML_TENSOR_FLAG_PARAM = 4
+
+
 # // ggml object
 # struct ggml_object {
 #     size_t offs;
@@ -600,7 +611,7 @@ class ggml_object(ctypes.Structure):
 #     // op params - allocated as int32_t for alignment
 #     int32_t op_params[GGML_MAX_OP_PARAMS / sizeof(int32_t)];
 
-#     bool is_param;
+#     int32_t flags;
 
 #     struct ggml_tensor * grad;
 #     struct ggml_tensor * src[GGML_MAX_SRC];
@@ -633,7 +644,7 @@
class ggml_tensor(ctypes.Structure): nb (ctypes.Array[ctypes.c_size_t]): stride in bytes for each dimension op (int): ggml operation op_params (ctypes.Array[ctypes.c_int32]): `GGML_MAX_OP_PARAMS`-length array of operation parameters - is_param (bool): is this a parameter tensor + flags (int): tensor flags grad (ggml_tensor_p): reference to gradient tensor src (ctypes.Array[ggml_tensor_p]): `GGML_MAX_SRC`-length array of source tensors perf_runs (int): number of performance runs @@ -660,7 +671,7 @@ class ggml_tensor(ctypes.Structure): "op_params", ctypes.c_int32 * (GGML_MAX_OP_PARAMS // ctypes.sizeof(ctypes.c_int32)), ), - ("is_param", ctypes.c_bool), + ("flags", ctypes.c_int), ("grad", ctypes.POINTER(ggml_tensor)), ("src", ctypes.POINTER(ggml_tensor) * GGML_MAX_SRC), ("perf_runs", ctypes.c_int), @@ -676,7 +687,7 @@ class ggml_tensor(ctypes.Structure): GGML_TENSOR_SIZE = ctypes.sizeof(ggml_tensor) -ggml_tensor_p: TypeAlias = "ctypes._Pointer[ggml_tensor]" # type: ignore +ggml_tensor_p: TypeAlias = "ctypes._Pointer[ggml_tensor]" # type: ignore """ctypes pointer to a [ggml_tensor][ggml.ggml_tensor] Can be dereferenced to a [ggml_tensor][ggml.ggml_tensor] object using @@ -805,7 +816,7 @@ class ggml_cgraph(ctypes.Structure): ] -ggml_cgraph_p: TypeAlias = "ctypes._Pointer[ggml_cgraph]" # type: ignore +ggml_cgraph_p: TypeAlias = "ctypes._Pointer[ggml_cgraph]" # type: ignore """ctypes pointer to a [ggml_cgraph][ggml.ggml_cgraph] Can be dereferenced to a [ggml_cgraph][ggml.ggml_cgraph] object using @@ -4994,7 +5005,8 @@ def ggml_clamp( # int p1, # int d0, # int d1, -# bool is_2D); +# bool is_2D, +# enum ggml_type dst_type); def ggml_im2col( ctx: ggml_context_p, a: ggml_tensor_p, @@ -5006,8 +5018,9 @@ def ggml_im2col( d0: Union[ctypes.c_int, int], d1: Union[ctypes.c_int, int], is_2D: Union[ctypes.c_bool, bool], + dst_type: Union[ctypes.c_int, int], ) -> ggml_tensor_p: - return lib.ggml_im2col(ctx, a, b, s0, s1, p0, p1, d0, d1, is_2D) + return lib.ggml_im2col(ctx, a, b, s0, s1, p0, p1, d0, d1, is_2D, dst_type) lib.ggml_im2col.argtypes = [ @@ -5021,6 +5034,7 @@ def ggml_im2col( ctypes.c_int, ctypes.c_int, ctypes.c_bool, + ctypes.c_int, ] lib.ggml_im2col.restype = ctypes.POINTER(ggml_tensor) @@ -7172,10 +7186,33 @@ def ggml_opt_resume_g( ] lib.ggml_opt_resume_g.restype = ctypes.c_int + +# // +# // tensor flags +# // +# GGML_API void ggml_set_input(struct ggml_tensor * tensor); +def ggml_set_input(tensor: ggml_tensor_p): + return lib.ggml_set_input(tensor) + + +lib.ggml_set_input.argtypes = [ctypes.POINTER(ggml_tensor)] +lib.ggml_set_input.restype = None + + +# GGML_API void ggml_set_output(struct ggml_tensor * tensor); +def ggml_set_output(tensor: ggml_tensor_p): + return lib.ggml_set_output(tensor) + + +lib.ggml_set_output.argtypes = [ctypes.POINTER(ggml_tensor)] +lib.ggml_set_output.restype = None + + # // # // quantization # // + # // - ggml_quantize_init can be called multiple times with the same type # // it will only initialize the quantization tables for the first call or after ggml_quantize_free # // automatically called by ggml_quantize_chunk for convenience @@ -8537,6 +8574,15 @@ def ggml_cpu_has_vulkan() -> int: lib.ggml_cpu_has_vulkan.restype = ctypes.c_int +# GGML_API int ggml_cpu_has_kompute (void); +def ggml_cpu_has_kompute() -> int: + return lib.ggml_cpu_has_kompute() + + +lib.ggml_cpu_has_kompute.argtypes = [] +lib.ggml_cpu_has_kompute.restype = ctypes.c_int + + # GGML_API int ggml_cpu_has_gpublas (void); def ggml_cpu_has_gpublas() -> int: return lib.ggml_cpu_has_gpublas() @@ -8563,6 
+8609,7 @@ def ggml_cpu_has_ssse3() -> int: lib.ggml_cpu_has_ssse3.argtypes = [] lib.ggml_cpu_has_ssse3.restype = ctypes.c_int + # GGML_API int ggml_cpu_has_sycl (void); def ggml_cpu_has_sycl() -> int: return lib.ggml_cpu_has_sycl() @@ -8639,402 +8686,186 @@ def ggml_internal_get_type_traits(type: Union[ctypes.c_int, int]) -> ggml_type_t # source: include/ggml/ggml-alloc.h ##################################################### -# struct ggml_backend; -# struct ggml_backend_buffer; -# struct ggml_backend_buffer_type; -ggml_backend_t: TypeAlias = ctypes.c_void_p -ggml_backend_buffer_p: TypeAlias = ctypes.c_void_p -ggml_backend_buffer_type_p: TypeAlias = ctypes.c_void_p - -# // -# // Legacy API -# // - -# typedef struct ggml_allocr * ggml_allocr_t; -ggml_allocr_t = ctypes.c_void_p - -# // initialize allocator for use with CPU backend only -# GGML_API ggml_allocr_t ggml_allocr_new(void * data, size_t size, size_t alignment); -def ggml_allocr_new( - data: ctypes.c_void_p, - size: Union[ctypes.c_size_t, int], - alignment: Union[ctypes.c_size_t, int], -) -> ggml_allocr_t: - return lib.ggml_allocr_new(data, size, alignment) - - -lib.ggml_allocr_new.argtypes = [ctypes.c_void_p, ctypes.c_size_t, ctypes.c_size_t] -lib.ggml_allocr_new.restype = ggml_allocr_t - - -# GGML_API ggml_allocr_t ggml_allocr_new_measure(size_t alignment); -def ggml_allocr_new_measure(alignment: Union[ctypes.c_size_t, int]) -> ggml_allocr_t: - return lib.ggml_allocr_new_measure(alignment) - - -lib.ggml_allocr_new_measure.argtypes = [ctypes.c_size_t] -lib.ggml_allocr_new_measure.restype = ggml_allocr_t - - -# // initialize allocator for use with ggml-backend -# GGML_API ggml_allocr_t ggml_allocr_new_from_buffer(struct ggml_backend_buffer * buffer); -def ggml_allocr_new_from_buffer(buffer: ggml_backend_buffer_p) -> ggml_allocr_t: - return lib.ggml_allocr_new_from_buffer(buffer) - - -lib.ggml_allocr_new_from_buffer.argtypes = [ggml_backend_buffer_p] -lib.ggml_allocr_new_from_buffer.restype = ggml_allocr_t - - -# GGML_API ggml_allocr_t ggml_allocr_new_from_backend(struct ggml_backend * backend, size_t size); // allocates an owned buffer -def ggml_allocr_new_from_backend( - backend: ggml_backend_t, size: Union[ctypes.c_size_t, int] -) -> ggml_allocr_t: - return lib.ggml_allocr_new_from_backend(backend, size) - - -lib.ggml_allocr_new_from_backend.argtypes = [ggml_backend_t, ctypes.c_size_t] -lib.ggml_allocr_new_from_backend.restype = ggml_allocr_t - - -# GGML_API ggml_allocr_t ggml_allocr_new_measure_from_backend(struct ggml_backend * backend); -def ggml_allocr_new_measure_from_backend(backend: ggml_backend_t) -> ggml_allocr_t: - return lib.ggml_allocr_new_measure_from_backend(backend) - - -lib.ggml_allocr_new_measure_from_backend.argtypes = [ggml_backend_t] -lib.ggml_allocr_new_measure_from_backend.restype = ggml_allocr_t - - -# GGML_API struct ggml_backend_buffer * ggml_allocr_get_buffer(ggml_allocr_t alloc); -def ggml_allocr_get_buffer(alloc: ggml_allocr_t) -> ggml_backend_buffer_p: - return lib.ggml_allocr_get_buffer(alloc) - - -lib.ggml_allocr_get_buffer.argtypes = [ggml_allocr_t] -lib.ggml_allocr_get_buffer.restype = ggml_backend_buffer_p - - -# // tell the allocator to parse nodes following the order described in the list -# // you should call this if your graph are optimized to execute out-of-order -# GGML_API void ggml_allocr_set_parse_seq(ggml_allocr_t alloc, const int * list, int n); -def ggml_allocr_set_parse_seq( - alloc: ggml_allocr_t, - list: "ctypes._Pointer(ctypes.c_int)", # type: ignore - n: Union[ctypes.c_int, int], 
-) -> None: - return lib.ggml_allocr_set_parse_seq(alloc, list, n) - - -lib.ggml_allocr_set_parse_seq.argtypes = [ - ggml_allocr_t, - ctypes.POINTER(ctypes.c_int), - ctypes.c_int, -] -lib.ggml_allocr_set_parse_seq.restype = None - - -# GGML_API void ggml_allocr_free (ggml_allocr_t alloc); -def ggml_allocr_free(alloc: ggml_allocr_t) -> None: - return lib.ggml_allocr_free(alloc) - - -lib.ggml_allocr_free.argtypes = [ggml_allocr_t] -lib.ggml_allocr_free.restype = None - - -# GGML_API bool ggml_allocr_is_measure (ggml_allocr_t alloc); -def ggml_allocr_is_measure(alloc: ggml_allocr_t) -> ctypes.c_bool: - return lib.ggml_allocr_is_measure(alloc) - - -lib.ggml_allocr_is_measure.argtypes = [ggml_allocr_t] -lib.ggml_allocr_is_measure.restype = ctypes.c_bool - - -# GGML_API void ggml_allocr_reset (ggml_allocr_t alloc); -def ggml_allocr_reset(alloc: ggml_allocr_t) -> None: - return lib.ggml_allocr_reset(alloc) - - -lib.ggml_allocr_reset.argtypes = [ggml_allocr_t] -lib.ggml_allocr_reset.restype = None - - -# GGML_API void ggml_allocr_alloc (ggml_allocr_t alloc, struct ggml_tensor * tensor); -def ggml_allocr_alloc(alloc: ggml_allocr_t, tensor: ggml_tensor_p) -> None: - return lib.ggml_allocr_alloc(alloc, tensor) - - -lib.ggml_allocr_alloc.argtypes = [ggml_allocr_t, ctypes.POINTER(ggml_tensor)] -lib.ggml_allocr_alloc.restype = None - - -# GGML_API size_t ggml_allocr_max_size (ggml_allocr_t alloc); -def ggml_allocr_max_size(alloc: ggml_allocr_t) -> Union[ctypes.c_size_t, int]: - return lib.ggml_allocr_max_size(alloc) - - -lib.ggml_allocr_max_size.argtypes = [ggml_allocr_t] -lib.ggml_allocr_max_size.restype = ctypes.c_size_t - - -# GGML_API size_t ggml_allocr_alloc_graph(ggml_allocr_t alloc, struct ggml_cgraph * graph); -def ggml_allocr_alloc_graph(alloc: ggml_allocr_t, graph: ggml_cgraph_p) -> int: - return lib.ggml_allocr_alloc_graph(alloc, graph) - - -lib.ggml_allocr_alloc_graph.argtypes = [ggml_allocr_t, ctypes.POINTER(ggml_cgraph)] -lib.ggml_allocr_alloc_graph.restype = ctypes.c_size_t - -# // -# // ggml-backend v2 API -# // +# typedef struct ggml_backend_buffer_type * ggml_backend_buffer_type_t; +# typedef struct ggml_backend_buffer * ggml_backend_buffer_t; +# typedef struct ggml_backend * ggml_backend_t; +ggml_backend_buffer_type_t: TypeAlias = ctypes.c_void_p +ggml_backend_buffer_t: TypeAlias = ctypes.c_void_p +ggml_backend_t: TypeAlias = ctypes.c_void_p -# // Separate tensor and graph allocator objects -# // This is necessary for multi-backend allocation because the graph allocator needs to use multiple tensor allocators -# // The original API is kept as a wrapper around the new API # // Tensor allocator # typedef struct ggml_tallocr * ggml_tallocr_t; -ggml_tallocr_t = ctypes.c_void_p - - -# GGML_API ggml_tallocr_t ggml_tallocr_new(void * data, size_t size, size_t alignment); -def ggml_tallocr_new( - data: ctypes.c_void_p, - size: Union[ctypes.c_size_t, int], - alignment: Union[ctypes.c_size_t, int], -) -> ggml_tallocr_t: - return lib.ggml_tallocr_new(data, size, alignment) - - -lib.ggml_tallocr_new.argtypes = [ctypes.c_void_p, ctypes.c_size_t, ctypes.c_size_t] -lib.ggml_tallocr_new.restype = ggml_tallocr_t - - -# GGML_API ggml_tallocr_t ggml_tallocr_new_measure(size_t alignment); -def ggml_tallocr_new_measure(alignment: Union[ctypes.c_size_t, int]) -> ggml_tallocr_t: - return lib.ggml_tallocr_new_measure(alignment) - - -lib.ggml_tallocr_new_measure.argtypes = [ctypes.c_size_t] -lib.ggml_tallocr_new_measure.restype = ggml_tallocr_t - - -# GGML_API ggml_tallocr_t ggml_tallocr_new_from_buft(struct 
ggml_backend_buffer_type * buft, size_t size); -def ggml_tallocr_new_from_buft( - buft: ggml_backend_buffer_type_p, size: Union[ctypes.c_size_t, int] -) -> ggml_tallocr_t: - return lib.ggml_tallocr_new_from_buft(buft, size) - - -lib.ggml_tallocr_new_from_buft.argtypes = [ggml_backend_buffer_type_p, ctypes.c_size_t] -lib.ggml_tallocr_new_from_buft.restype = ggml_tallocr_t - - -# GGML_API ggml_tallocr_t ggml_tallocr_new_from_backend(struct ggml_backend * backend, size_t size); // allocates an owned buffer -def ggml_tallocr_new_from_backend( - backend: ggml_backend_t, size: Union[ctypes.c_size_t, int] -) -> ggml_tallocr_t: - return lib.ggml_tallocr_new_from_backend(backend, size) - - -lib.ggml_tallocr_new_from_backend.argtypes = [ggml_backend_t, ctypes.c_size_t] -lib.ggml_tallocr_new_from_backend.restype = ggml_tallocr_t - +ggml_tallocr: TypeAlias = ctypes.c_void_p -# GGML_API ggml_tallocr_t ggml_tallocr_new_from_buffer(struct ggml_backend_buffer * buffer); -def ggml_tallocr_new_from_buffer(buffer: ggml_backend_buffer_p) -> ggml_tallocr_t: - return lib.ggml_tallocr_new_from_buffer(buffer) +# GGML_API ggml_tallocr_t ggml_tallocr_new(ggml_backend_buffer_t buffer); +def ggml_tallocr_new(buffer: ggml_backend_buffer_t) -> ggml_tallocr: + return lib.ggml_tallocr_new(buffer) -lib.ggml_tallocr_new_from_buffer.argtypes = [ggml_backend_buffer_p] -lib.ggml_tallocr_new_from_buffer.restype = ggml_tallocr_t +lib.ggml_tallocr_new.argtypes = [ggml_backend_buffer_t] +lib.ggml_tallocr_new.restype = ggml_tallocr -# GGML_API ggml_tallocr_t ggml_tallocr_new_measure_from_buft(struct ggml_backend_buffer_type * buft); -def ggml_tallocr_new_measure_from_buft( - buft: ggml_backend_buffer_type_p, -) -> ggml_tallocr_t: - return lib.ggml_tallocr_new_measure_from_buft(buft) - -lib.ggml_tallocr_new_measure_from_buft.argtypes = [ggml_backend_buffer_type_p] -lib.ggml_tallocr_new_measure_from_buft.restype = ggml_tallocr_t - - -# GGML_API ggml_tallocr_t ggml_tallocr_new_measure_from_backend(struct ggml_backend * backend); -def ggml_tallocr_new_measure_from_backend(backend: ggml_backend_t) -> ggml_tallocr_t: - return lib.ggml_tallocr_new_measure_from_backend(backend) - - -lib.ggml_tallocr_new_measure_from_backend.argtypes = [ggml_backend_t] -lib.ggml_tallocr_new_measure_from_backend.restype = ggml_tallocr_t - - -# GGML_API struct ggml_backend_buffer * ggml_tallocr_get_buffer(ggml_tallocr_t talloc); -def ggml_tallocr_get_buffer(talloc: ggml_tallocr_t) -> ggml_backend_buffer_p: - return lib.ggml_tallocr_get_buffer(talloc) - - -lib.ggml_tallocr_get_buffer.argtypes = [ggml_tallocr_t] -lib.ggml_tallocr_get_buffer.restype = ggml_backend_buffer_p - - -# GGML_API void ggml_tallocr_free (ggml_tallocr_t talloc); -def ggml_tallocr_free(talloc: ggml_tallocr_t) -> None: +# GGML_API void ggml_tallocr_free(ggml_tallocr_t talloc); +def ggml_tallocr_free(talloc: ggml_tallocr) -> None: return lib.ggml_tallocr_free(talloc) -lib.ggml_tallocr_free.argtypes = [ggml_tallocr_t] +lib.ggml_tallocr_free.argtypes = [ggml_tallocr] lib.ggml_tallocr_free.restype = None -# GGML_API bool ggml_tallocr_is_measure (ggml_tallocr_t talloc); -def ggml_tallocr_is_measure(talloc: ggml_tallocr_t) -> bool: - return lib.ggml_tallocr_is_measure(talloc) - +# GGML_API void ggml_tallocr_alloc(ggml_tallocr_t talloc, struct ggml_tensor * tensor); +def ggml_tallocr_alloc(talloc: ggml_tallocr, tensor: ggml_tensor_p) -> None: + return lib.ggml_tallocr_alloc(talloc, tensor) -lib.ggml_tallocr_is_measure.argtypes = [ggml_tallocr_t] -lib.ggml_tallocr_is_measure.restype = 
ctypes.c_bool +lib.ggml_tallocr_alloc.argtypes = [ggml_tallocr, ctypes.POINTER(ggml_tensor)] +lib.ggml_tallocr_alloc.restype = None -# GGML_API void ggml_tallocr_reset (ggml_tallocr_t talloc); -def ggml_tallocr_reset(talloc: ggml_tallocr_t) -> None: - return lib.ggml_tallocr_reset(talloc) +# // Graph allocator +# /* +# Example usage: +# ggml_gallocr_t galloc = ggml_gallocr_new(ggml_bacckend_cpu_buffer_type()); -lib.ggml_tallocr_reset.argtypes = [ggml_tallocr_t] -lib.ggml_tallocr_reset.restype = None +# // optional: create a worst-case graph and reserve the buffers to avoid reallocations +# ggml_gallocr_reserve(galloc, build_graph(max_batch)); +# // allocate the graph +# struct ggml_cgraph * graph = build_graph(batch); +# ggml_gallocr_alloc_graph(galloc, graph); -# GGML_API void ggml_tallocr_alloc (ggml_tallocr_t talloc, struct ggml_tensor * tensor); -def ggml_tallocr_alloc(talloc: ggml_tallocr_t, tensor: ggml_tensor_p) -> None: - return lib.ggml_tallocr_alloc(talloc, tensor) +# printf("compute buffer size: %zu bytes\n", ggml_gallocr_get_buffer_size(galloc, 0)); +# // evaluate the graph +# ggml_backend_graph_compute(backend, graph); +# */ -lib.ggml_tallocr_alloc.argtypes = [ggml_tallocr_t, ctypes.POINTER(ggml_tensor)] -lib.ggml_tallocr_alloc.restype = None +# // special tensor flags for use with the graph allocator: +# // ggml_set_input(): all input tensors are allocated at the beginning of the graph in non-overlapping addresses +# // ggml_set_output(): output tensors are never freed and never overwritten -# GGML_API size_t ggml_tallocr_max_size (ggml_tallocr_t talloc); -def ggml_tallocr_max_size(talloc: ggml_tallocr_t) -> Union[ctypes.c_size_t, int]: - return lib.ggml_tallocr_max_size(talloc) +# typedef struct ggml_gallocr * ggml_gallocr_t; +ggml_gallocr: TypeAlias = ctypes.c_void_p -lib.ggml_tallocr_max_size.argtypes = [ggml_tallocr_t] -lib.ggml_tallocr_max_size.restype = ctypes.c_size_t +# GGML_API ggml_gallocr_t ggml_gallocr_new(ggml_backend_buffer_type_t buft); +def ggml_gallocr_new(buft: ggml_backend_buffer_type_t) -> ggml_gallocr: + return lib.ggml_gallocr_new(buft) -# // Graph allocator -# typedef struct ggml_gallocr * ggml_gallocr_t; -ggml_gallocr_t = ctypes.c_void_p +lib.ggml_gallocr_new.argtypes = [ggml_backend_buffer_type_t] +lib.ggml_gallocr_new.restype = ggml_gallocr -# GGML_API ggml_gallocr_t ggml_gallocr_new(void); -def ggml_gallocr_new() -> ggml_gallocr_t: - return lib.ggml_gallocr_new() +# GGML_API ggml_gallocr_t ggml_gallocr_new_n(ggml_backend_buffer_type_t * bufts, int n_bufs); +def ggml_gallocr_new_n(bufts: ggml_backend_buffer_type_t, n_bufs: int) -> ggml_gallocr: + return lib.ggml_gallocr_new_n(bufts, n_bufs) -lib.ggml_gallocr_new.argtypes = [] -lib.ggml_gallocr_new.restype = ggml_gallocr_t +lib.ggml_gallocr_new_n.argtypes = [ggml_backend_buffer_type_t, ctypes.c_int] +lib.ggml_gallocr_new_n.restype = ggml_gallocr -# GGML_API void ggml_gallocr_free(ggml_gallocr_t galloc); -def ggml_gallocr_free(galloc: ggml_gallocr_t) -> None: +# GGML_API void ggml_gallocr_free(ggml_gallocr_t galloc); +def ggml_gallocr_free(galloc: ggml_gallocr) -> None: return lib.ggml_gallocr_free(galloc) -lib.ggml_gallocr_free.argtypes = [ggml_gallocr_t] +lib.ggml_gallocr_free.argtypes = [ggml_gallocr] lib.ggml_gallocr_free.restype = None -# GGML_API void ggml_gallocr_set_parse_seq(ggml_gallocr_t galloc, const int * list, int n); -def ggml_gallocr_set_parse_seq( - galloc: ggml_gallocr_t, - list: "ctypes._Pointer(ctypes.c_int)", # type: ignore - n: Union[ctypes.c_int, int], -) -> None: - return 
lib.ggml_gallocr_set_parse_seq(galloc, list, n) +# // pre-allocate buffers from a measure graph - does not allocate or modify the graph +# // call with a worst-case graph to avoid buffer reallocations +# // not strictly required for single buffer usage: ggml_gallocr_alloc_graph will reallocate the buffers automatically if needed +# // returns false if the buffer allocation failed +# GGML_API bool ggml_gallocr_reserve(ggml_gallocr_t galloc, struct ggml_cgraph * graph); +def ggml_gallocr_reserve(galloc: ggml_gallocr, graph: ggml_cgraph_p) -> bool: + """pre-allocate buffers from a measure graph - does not allocate or modify the graph + call with a worst-case graph to avoid buffer reallocations + not strictly required for single buffer usage: ggml_gallocr_alloc_graph will reallocate the buffers automatically if needed + returns false if the buffer allocation failed""" + return lib.ggml_gallocr_reserve(galloc, graph) -lib.ggml_gallocr_set_parse_seq.argtypes = [ - ggml_gallocr_t, - ctypes.POINTER(ctypes.c_int), - ctypes.c_int, -] -lib.ggml_gallocr_set_parse_seq.restype = None +lib.ggml_gallocr_reserve.argtypes = [ggml_gallocr, ctypes.POINTER(ggml_cgraph)] +lib.ggml_gallocr_reserve.restype = ctypes.c_bool -# GGML_API size_t ggml_gallocr_alloc_graph(ggml_gallocr_t galloc, ggml_tallocr_t talloc, struct ggml_cgraph * graph); -def ggml_gallocr_alloc_graph( - galloc: ggml_gallocr_t, talloc: ggml_tallocr_t, graph: ggml_cgraph_p -) -> Union[ctypes.c_size_t, int]: - return lib.ggml_gallocr_alloc_graph(galloc, talloc, graph) +# GGML_API bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, const int * node_buffer_ids); +def ggml_gallocr_reserve_n( + galloc: ggml_gallocr, + graph: ggml_cgraph_p, + node_buffer_ids: "ctypes._Pointer[ctypes.c_int]", # type: ignore +) -> bool: + return lib.ggml_gallocr_reserve_n(galloc, graph, node_buffer_ids) -lib.ggml_gallocr_alloc_graph.argtypes = [ - ggml_gallocr_t, - ggml_tallocr_t, +lib.ggml_gallocr_reserve_n.argtypes = [ + ggml_gallocr, ctypes.POINTER(ggml_cgraph), + ctypes.POINTER(ctypes.c_int), ] -lib.ggml_gallocr_alloc_graph.restype = ctypes.c_size_t +lib.ggml_gallocr_reserve_n.restype = ctypes.c_bool -# // Allocate tensors from the allocators given by the hash table -# GGML_API void ggml_gallocr_alloc_graph_n( -# ggml_gallocr_t galloc, -# struct ggml_cgraph * graph, -# struct ggml_hash_set hash_set, -# ggml_tallocr_t * hash_node_talloc); -def ggml_gallocr_alloc_graph_n( - galloc: ggml_gallocr_t, - graph: ggml_cgraph_p, - hash_set: ggml_hash_set, - hash_node_talloc: "ctypes._Pointer(ggml_tallocr_t)", # type: ignore -) -> None: - return lib.ggml_gallocr_alloc_graph_n(galloc, graph, hash_set, hash_node_talloc) +# // automatic reallocation if the topology changes when using a single buffer +# // returns false if using multiple buffers and a re-allocation is needed (call ggml_gallocr_reserve_n first to set the node buffers) +# GGML_API bool ggml_gallocr_alloc_graph(ggml_gallocr_t galloc, struct ggml_cgraph * graph); +def ggml_gallocr_alloc_graph(galloc: ggml_gallocr, graph: ggml_cgraph_p) -> bool: + """automatic reallocation if the topology changes when using a single buffer + returns false if using multiple buffers and a re-allocation is needed (call ggml_gallocr_reserve_n first to set the node buffers)""" + return lib.ggml_gallocr_alloc_graph(galloc, graph) -lib.ggml_gallocr_alloc_graph_n.argtypes = [ - ggml_gallocr_t, - ctypes.POINTER(ggml_cgraph), - ggml_hash_set, - ctypes.POINTER(ggml_tallocr_t), -] -lib.ggml_gallocr_alloc_graph_n.restype = 
None +lib.ggml_gallocr_alloc_graph.argtypes = [ggml_gallocr, ctypes.POINTER(ggml_cgraph)] +lib.ggml_gallocr_alloc_graph.restype = ctypes.c_bool + + +# GGML_API size_t ggml_gallocr_get_buffer_size(ggml_gallocr_t galloc, int buffer_id); +def ggml_gallocr_get_buffer_size( + galloc: ggml_gallocr, buffer_id: Union[ctypes.c_int, int] +) -> int: + return lib.ggml_gallocr_get_buffer_size(galloc, buffer_id) + + +lib.ggml_gallocr_get_buffer_size.argtypes = [ggml_gallocr, ctypes.c_int] +lib.ggml_gallocr_get_buffer_size.restype = ctypes.c_size_t # // Utils # // Create a buffer and allocate all the tensors in a ggml_context -# GGML_API struct ggml_backend_buffer * ggml_backend_alloc_ctx_tensors_from_buft(struct ggml_context * ctx, struct ggml_backend_buffer_type * buft); +# GGML_API struct ggml_backend_buffer * ggml_backend_alloc_ctx_tensors_from_buft(struct ggml_context * ctx, ggml_backend_buffer_type_t buft); def ggml_backend_alloc_ctx_tensors_from_buft( - ctx: ggml_context_p, buft: ggml_backend_buffer_type_p -) -> ggml_backend_buffer_p: + ctx: ggml_context_p, buft: ggml_backend_buffer_type_t +) -> ggml_backend_buffer_t: + """Create a buffer and allocate all the tensors in a ggml_context""" return lib.ggml_backend_alloc_ctx_tensors_from_buft(ctx, buft) lib.ggml_backend_alloc_ctx_tensors_from_buft.argtypes = [ ggml_context_p, - ggml_backend_buffer_type_p, + ggml_backend_buffer_type_t, ] -lib.ggml_backend_alloc_ctx_tensors_from_buft.restype = ggml_backend_buffer_p +lib.ggml_backend_alloc_ctx_tensors_from_buft.restype = ggml_backend_buffer_t -# GGML_API struct ggml_backend_buffer * ggml_backend_alloc_ctx_tensors(struct ggml_context * ctx, struct ggml_backend * backend); +# GGML_API struct ggml_backend_buffer * ggml_backend_alloc_ctx_tensors(struct ggml_context * ctx, ggml_backend_t backend); def ggml_backend_alloc_ctx_tensors( ctx: ggml_context_p, backend: ggml_backend_t -) -> ggml_backend_buffer_p: +) -> ggml_backend_buffer_t: return lib.ggml_backend_alloc_ctx_tensors(ctx, backend) -lib.ggml_backend_alloc_ctx_tensors.argtypes = [ - ggml_context_p, - ggml_backend_t, -] -lib.ggml_backend_alloc_ctx_tensors.restype = ggml_backend_buffer_p +lib.ggml_backend_alloc_ctx_tensors.argtypes = [ggml_context_p, ggml_backend_t] +lib.ggml_backend_alloc_ctx_tensors.restype = ggml_backend_buffer_t + ##################################################### # GGML Backend API @@ -9045,10 +8876,7 @@ def ggml_backend_alloc_ctx_tensors( # typedef struct ggml_backend_buffer * ggml_backend_buffer_t; # typedef struct ggml_backend * ggml_backend_t; # typedef void * ggml_backend_graph_plan_t; -ggml_backend_buffer_type_t = ctypes.c_void_p -ggml_backend_buffer_t = ctypes.c_void_p -# ggml_backend_t = ctypes.c_void_p -ggml_backend_graph_plan_t = ctypes.c_void_p +ggml_backend_graph_plan_t: TypeAlias = ctypes.c_void_p # // # // Backend buffer @@ -9599,7 +9427,7 @@ def ggml_backend_cpu_set_n_threads( # GGML_API void ggml_backend_cpu_set_abort_callback(ggml_backend_t backend_cpu, ggml_abort_callback abort_callback, void * abort_callback_data); def ggml_backend_cpu_set_abort_callback( backend_cpu: ggml_backend_t, - abort_callback, # type: ignore + abort_callback, # type: ignore abort_callback_data: ctypes.c_void_p, ): return lib.ggml_backend_cpu_set_abort_callback( @@ -9751,11 +9579,7 @@ def ggml_backend_reg_alloc_buffer( # // in build_graph: # build_graph(...) 
{ -# // allocating tensors in a specific backend (optional, recommended: pre-allocate inputs in a different buffer) -# alloc_cpu = ggml_backend_sched_get_allocr(sched, backend_cpu); -# ggml_allocr_alloc(alloc_cpu, tensor); - -# // manually assigning nodes to a backend (optional, shouldn't be needed in most cases) +# // manually assign nodes to a backend (optional, should not be needed in most cases) # struct ggml_tensor * node = ggml_mul_mat(ctx, ...); # ggml_backend_sched_set_node_backend(sched, node, backend_gpu); # } @@ -9783,8 +9607,7 @@ def ggml_backend_reg_alloc_buffer( # // # typedef bool (*ggml_backend_sched_eval_callback)(struct ggml_tensor * t, bool ask, void * user_data); ggml_backend_sched_eval_callback = ctypes.CFUNCTYPE( - ctypes.c_bool, - ctypes.POINTER(ggml_tensor), ctypes.c_bool, ctypes.c_void_p + ctypes.c_bool, ctypes.POINTER(ggml_tensor), ctypes.c_bool, ctypes.c_void_p ) @@ -9820,19 +9643,20 @@ def ggml_backend_sched_free( # // Initialize backend buffers from a measure graph -# GGML_API void ggml_backend_sched_init_measure(ggml_backend_sched_t sched, struct ggml_cgraph * measure_graph); -def ggml_backend_sched_init_measure( +# GGML_API bool ggml_backend_sched_reserve(ggml_backend_sched_t sched, struct ggml_cgraph * measure_graph); +def ggml_backend_sched_reserve( sched: ggml_backend_sched_t, measure_graph: ggml_cgraph_p, -): - return lib.ggml_backend_sched_init_measure(sched, measure_graph) +) -> bool: + """Initialize backend buffers from a measure graph.""" + return lib.ggml_backend_sched_reserve(sched, measure_graph) -lib.ggml_backend_sched_init_measure.argtypes = [ +lib.ggml_backend_sched_reserve.argtypes = [ ggml_backend_sched_t, ctypes.POINTER(ggml_cgraph), ] -lib.ggml_backend_sched_init_measure.restype = None +lib.ggml_backend_sched_reserve.restype = ctypes.c_bool # // Get the number of splits of the last graph @@ -9840,6 +9664,7 @@ def ggml_backend_sched_init_measure( def ggml_backend_sched_get_n_splits( sched: ggml_backend_sched_t, ) -> int: + """Get the number of splits of the last graph.""" return lib.ggml_backend_sched_get_n_splits(sched) @@ -9847,28 +9672,16 @@ def ggml_backend_sched_get_n_splits( lib.ggml_backend_sched_get_n_splits.restype = ctypes.c_int -# GGML_API ggml_tallocr_t ggml_backend_sched_get_tallocr(ggml_backend_sched_t sched, ggml_backend_t backend); -def ggml_backend_sched_get_tallocr( - sched: ggml_backend_sched_t, - backend: ggml_backend_t, -) -> ggml_tallocr_t: - return lib.ggml_backend_sched_get_tallocr(sched, backend) - - -lib.ggml_backend_sched_get_tallocr.argtypes = [ggml_backend_sched_t, ggml_backend_t] -lib.ggml_backend_sched_get_tallocr.restype = ggml_tallocr_t - - -# GGML_API ggml_backend_buffer_t ggml_backend_sched_get_buffer (ggml_backend_sched_t sched, ggml_backend_t backend); -def ggml_backend_sched_get_buffer( +# GGML_API size_t ggml_backend_sched_get_buffer_size(ggml_backend_sched_t sched, ggml_backend_t backend); +def ggml_backend_sched_get_buffer_size( sched: ggml_backend_sched_t, backend: ggml_backend_t, -) -> ggml_backend_buffer_t: - return lib.ggml_backend_sched_get_buffer(sched, backend) +) -> int: + return lib.ggml_backend_sched_get_buffer_size(sched, backend) -lib.ggml_backend_sched_get_buffer.argtypes = [ggml_backend_sched_t, ggml_backend_t] -lib.ggml_backend_sched_get_buffer.restype = ggml_backend_buffer_t +lib.ggml_backend_sched_get_buffer_size.argtypes = [ggml_backend_sched_t, ggml_backend_t] +lib.ggml_backend_sched_get_buffer_size.restype = ctypes.c_size_t # GGML_API void 
ggml_backend_sched_set_node_backend(ggml_backend_sched_t sched, struct ggml_tensor * node, ggml_backend_t backend); @@ -9905,10 +9718,12 @@ def ggml_backend_sched_get_node_backend( # // Allocate and compute graph on the backend scheduler # GGML_API void ggml_backend_sched_graph_compute(ggml_backend_sched_t sched, struct ggml_cgraph * graph); +# GGML_API bool ggml_backend_sched_graph_compute(ggml_backend_sched_t sched, struct ggml_cgraph * graph); def ggml_backend_sched_graph_compute( sched: ggml_backend_sched_t, graph: ggml_cgraph_p, -): +) -> bool: + """Allocate and compute graph on the backend scheduler.""" return lib.ggml_backend_sched_graph_compute(sched, graph) @@ -9916,14 +9731,15 @@ def ggml_backend_sched_graph_compute( ggml_backend_sched_t, ctypes.POINTER(ggml_cgraph), ] -lib.ggml_backend_sched_graph_compute.restype = None +lib.ggml_backend_sched_graph_compute.restype = ctypes.c_bool -# // Reset all assignments and allocators - must be called before using the sched allocators to allocate inputs +# // Reset all assignments and allocators - must be called before changing the node backends # GGML_API void ggml_backend_sched_reset(ggml_backend_sched_t sched); def ggml_backend_sched_reset( sched: ggml_backend_sched_t, ): + """Reset all assignments and allocators - must be called before changing the node backends.""" return lib.ggml_backend_sched_reset(sched) @@ -9935,7 +9751,7 @@ def ggml_backend_sched_reset( # GGML_API void ggml_backend_sched_set_eval_callback(ggml_backend_sched_t sched, ggml_backend_sched_eval_callback callback, void * user_data); def ggml_backend_sched_set_eval_callback( sched: ggml_backend_sched_t, - callback, # type: ignore + callback, # type: ignore user_data: ctypes.c_void_p, ): return lib.ggml_backend_sched_set_eval_callback(sched, callback, user_data) @@ -9943,7 +9759,7 @@ def ggml_backend_sched_set_eval_callback( lib.ggml_backend_sched_set_eval_callback.argtypes = [ ggml_backend_sched_t, - ggml_backend_sched_eval_callback, # TODO: this may need to also accept NULL + ggml_backend_sched_eval_callback, # TODO: this may need to also accept NULL ctypes.c_void_p, ] lib.ggml_backend_sched_set_eval_callback.restype = None @@ -10014,7 +9830,7 @@ def ggml_backend_compare_graph_backend( backend1: ggml_backend_t, backend2: ggml_backend_t, graph: ggml_cgraph_p, - callback, # type: ignore + callback, # type: ignore user_data: ctypes.c_void_p, ) -> bool: return lib.ggml_backend_compare_graph_backend( @@ -10400,7 +10216,7 @@ class ggml_backend(ctypes.Structure): # GGML_CALL void ggml_backend_register(const char * name, ggml_backend_init_fn init_fn, ggml_backend_buffer_type_t default_buffer_type, void * user_data); def ggml_backend_register( name: bytes, - init_fn, # type: ignore + init_fn, # type: ignore default_buffer_type: ggml_backend_buffer_type_t, user_data: ctypes.c_void_p, ): @@ -10664,7 +10480,7 @@ def ggml_backend_cuda_get_device_memory( # GGML_API void ggml_backend_metal_log_set_callback(ggml_log_callback log_callback, void * user_data); def ggml_backend_metal_log_set_callback( - log_callback, # type: ignore + log_callback, # type: ignore user_data: ctypes.c_void_p, ): return lib.ggml_backend_metal_log_set_callback(log_callback, user_data) @@ -10815,6 +10631,7 @@ def ggml_cl_add( ): return lib.ggml_cl_add(src0, src1, dst) + if GGML_USE_CLBLAST: lib.ggml_cl_add.argtypes = [ ctypes.POINTER(ggml_tensor), @@ -10933,118 +10750,181 @@ def ggml_backend_opencl_host_buffer_type() -> ggml_backend_buffer_type_t: ##################################################### # GGML 
Vulkan API -# source: ggml-vulkan.h +# source: src/ggml-vulkan.h ##################################################### +GGML_HAS_VULKAN = hasattr(lib, "ggml_vk_init_cpu_assist") -GGML_USE_VULKAN = hasattr(lib, "ggml_vk_init") +# #define GGML_VK_NAME "Vulkan" +# #define GGML_VK_MAX_DEVICES 16 +GGML_VK_NAME = "Vulkan" +GGML_VK_MAX_DEVICES = 16 -# GGML_API void ggml_vk_init(void); -def ggml_vk_init(): - return lib.ggml_vk_init() +# GGML_API void ggml_vk_init_cpu_assist(void); +def ggml_vk_init_cpu_assist(): + return lib.ggml_vk_init_cpu_assist() -if GGML_USE_VULKAN: - lib.ggml_vk_init.argtypes = [] - lib.ggml_vk_init.restype = None +if GGML_HAS_VULKAN: + lib.ggml_vk_init_cpu_assist.argtypes = [] + lib.ggml_vk_init_cpu_assist.restype = None -# GGML_API void ggml_vk_preallocate_buffers_graph(struct ggml_tensor * node); -def ggml_vk_preallocate_buffers_graph( - node: ggml_tensor_p, -): - return lib.ggml_vk_preallocate_buffers_graph(node) +# GGML_API void ggml_vk_preallocate_buffers_graph_cpu_assist(struct ggml_tensor * node); +def ggml_vk_preallocate_buffers_graph_cpu_assist(node: ggml_tensor_p): + return lib.ggml_vk_preallocate_buffers_graph_cpu_assist(node) -if GGML_USE_VULKAN: - lib.ggml_vk_preallocate_buffers_graph.argtypes = [ - ctypes.POINTER(ggml_tensor), +if GGML_HAS_VULKAN: + lib.ggml_vk_preallocate_buffers_graph_cpu_assist.argtypes = [ + ctypes.POINTER(ggml_tensor) ] - lib.ggml_vk_preallocate_buffers_graph.restype = None + lib.ggml_vk_preallocate_buffers_graph_cpu_assist.restype = None -# GGML_API void ggml_vk_preallocate_buffers(void); -def ggml_vk_preallocate_buffers(): - return lib.ggml_vk_preallocate_buffers() +# GGML_API void ggml_vk_preallocate_buffers_cpu_assist(void); +def ggml_vk_preallocate_buffers_cpu_assist(): + return lib.ggml_vk_preallocate_buffers_cpu_assist() -if GGML_USE_VULKAN: - lib.ggml_vk_preallocate_buffers.argtypes = [] - lib.ggml_vk_preallocate_buffers.restype = None +if GGML_HAS_VULKAN: + lib.ggml_vk_preallocate_buffers_cpu_assist.argtypes = [] + lib.ggml_vk_preallocate_buffers_cpu_assist.restype = None -# GGML_API void ggml_vk_build_graph(struct ggml_tensor * node, bool last_node); -def ggml_vk_build_graph( - node: ggml_tensor_p, - last_node: bool, -): - return lib.ggml_vk_build_graph(node, last_node) +# GGML_API void ggml_vk_build_graph_cpu_assist(struct ggml_tensor * node, bool last_node); +def ggml_vk_build_graph_cpu_assist(node: ggml_tensor_p, last_node: bool): + return lib.ggml_vk_build_graph_cpu_assist(node, last_node) -if GGML_USE_VULKAN: - lib.ggml_vk_build_graph.argtypes = [ +if GGML_HAS_VULKAN: + lib.ggml_vk_build_graph_cpu_assist.argtypes = [ ctypes.POINTER(ggml_tensor), ctypes.c_bool, ] - lib.ggml_vk_build_graph.restype = None + lib.ggml_vk_build_graph_cpu_assist.restype = None -# GGML_API bool ggml_vk_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor); -def ggml_vk_compute_forward( - params: ggml_compute_params_p, - tensor: ggml_tensor_p, +# GGML_API bool ggml_vk_compute_forward_cpu_assist(struct ggml_compute_params * params, struct ggml_tensor * tensor); +def ggml_vk_compute_forward_cpu_assist( + params: ggml_compute_params_p, tensor: ggml_tensor_p ) -> bool: - return lib.ggml_vk_compute_forward(params, tensor) + return lib.ggml_vk_compute_forward_cpu_assist(params, tensor) -if GGML_USE_VULKAN: - lib.ggml_vk_compute_forward.argtypes = [ + +if GGML_HAS_VULKAN: + lib.ggml_vk_compute_forward_cpu_assist.argtypes = [ ctypes.POINTER(ggml_compute_params), ctypes.POINTER(ggml_tensor), ] - lib.ggml_vk_compute_forward.restype 
= ctypes.c_bool + lib.ggml_vk_compute_forward_cpu_assist.restype = ctypes.c_bool + +# #ifdef GGML_VULKAN_CHECK_RESULTS +# void ggml_vk_check_results_1_cpu_assist(struct ggml_compute_params * params, struct ggml_tensor * tensor); +# #endif + + +# GGML_API void ggml_vk_graph_cleanup_cpu_assist(void); +def ggml_vk_graph_cleanup_cpu_assist(): + return lib.ggml_vk_graph_cleanup_cpu_assist() + + +if GGML_HAS_VULKAN: + lib.ggml_vk_graph_cleanup_cpu_assist.argtypes = [] + lib.ggml_vk_graph_cleanup_cpu_assist.restype = None -# GGML_API void ggml_vk_graph_cleanup(void); -def ggml_vk_graph_cleanup(): - return lib.ggml_vk_graph_cleanup() +# GGML_API void ggml_vk_free_cpu_assist(void); +def ggml_vk_free_cpu_assist(): + return lib.ggml_vk_free_cpu_assist() -if GGML_USE_VULKAN: - lib.ggml_vk_graph_cleanup.argtypes = [] - lib.ggml_vk_graph_cleanup.restype = None +if GGML_HAS_VULKAN: + lib.ggml_vk_free_cpu_assist.argtypes = [] + lib.ggml_vk_free_cpu_assist.restype = None # // backend API -# GGML_API GGML_CALL ggml_backend_t ggml_backend_vk_init(void); -def ggml_backend_vk_init() -> Optional[ggml_backend_t]: - return lib.ggml_backend_vk_init() +# GGML_API GGML_CALL ggml_backend_t ggml_backend_vk_init(size_t dev_num); +def ggml_backend_vk_init( + dev_num: Union[ctypes.c_size_t, int] +) -> Optional[ggml_backend_t]: + return lib.ggml_backend_vk_init(dev_num) -if GGML_USE_VULKAN: - lib.ggml_backend_vk_init.argtypes = [] +if GGML_HAS_VULKAN: + lib.ggml_backend_vk_init.argtypes = [ctypes.c_size_t] lib.ggml_backend_vk_init.restype = ggml_backend_t # GGML_API GGML_CALL bool ggml_backend_is_vk(ggml_backend_t backend); -def ggml_backend_is_vk( - backend: ggml_backend_t, -) -> bool: +def ggml_backend_is_vk(backend: ggml_backend_t) -> bool: return lib.ggml_backend_is_vk(backend) -if GGML_USE_VULKAN: +if GGML_HAS_VULKAN: lib.ggml_backend_is_vk.argtypes = [ggml_backend_t] lib.ggml_backend_is_vk.restype = ctypes.c_bool -# GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_vk_buffer_type(void); -def ggml_backend_vk_buffer_type() -> ggml_backend_buffer_type_t: - return lib.ggml_backend_vk_buffer_type() +# GGML_API GGML_CALL int ggml_backend_vk_get_device_count(void); +def ggml_backend_vk_get_device_count() -> int: + return lib.ggml_backend_vk_get_device_count() + + +if GGML_HAS_VULKAN: + lib.ggml_backend_vk_get_device_count.argtypes = [] + lib.ggml_backend_vk_get_device_count.restype = ctypes.c_int -if GGML_USE_VULKAN: - lib.ggml_backend_vk_buffer_type.argtypes = [] +# GGML_API GGML_CALL void ggml_backend_vk_get_device_description(int device, char * description, size_t description_size); +def ggml_backend_vk_get_device_description( + device: Union[ctypes.c_int, int], + description: bytes, + description_size: Union[ctypes.c_size_t, int], +): + return lib.ggml_backend_vk_get_device_description( + device, description, description_size + ) + + +if GGML_HAS_VULKAN: + lib.ggml_backend_vk_get_device_description.argtypes = [ + ctypes.c_int, + ctypes.c_char_p, + ctypes.c_size_t, + ] + lib.ggml_backend_vk_get_device_description.restype = None + + +# GGML_API GGML_CALL void ggml_backend_vk_get_device_memory(int device, size_t * free, size_t * total); +def ggml_backend_vk_get_device_memory( + device: Union[ctypes.c_int, int], + free: "ctypes._Pointer[ctypes.c_size_t]", # type: ignore + total: "ctypes._Pointer[ctypes.c_size_t]", # type: ignore +): + return lib.ggml_backend_vk_get_device_memory(device, free, total) + + +if GGML_HAS_VULKAN: + lib.ggml_backend_vk_get_device_memory.argtypes = [ + ctypes.c_int, + 
ctypes.POINTER(ctypes.c_size_t), + ctypes.POINTER(ctypes.c_size_t), + ] + lib.ggml_backend_vk_get_device_memory.restype = None + + +# GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_vk_buffer_type(size_t dev_num); +def ggml_backend_vk_buffer_type( + dev_num: Union[ctypes.c_size_t, int] +) -> ggml_backend_buffer_type_t: + return lib.ggml_backend_vk_buffer_type(dev_num) + + +if GGML_HAS_VULKAN: + lib.ggml_backend_vk_buffer_type.argtypes = [ctypes.c_size_t] lib.ggml_backend_vk_buffer_type.restype = ggml_backend_buffer_type_t @@ -11054,11 +10934,10 @@ def ggml_backend_vk_host_buffer_type() -> ggml_backend_buffer_type_t: return lib.ggml_backend_vk_host_buffer_type() -if GGML_USE_VULKAN: +if GGML_HAS_VULKAN: lib.ggml_backend_vk_host_buffer_type.argtypes = [] lib.ggml_backend_vk_host_buffer_type.restype = ggml_backend_buffer_type_t - # TODO: Add ggml-sycl.h -# TODO: Add ggml-kompute.h \ No newline at end of file +# TODO: Add ggml-kompute.h diff --git a/ggml/utils.py b/ggml/utils.py index edd713d..307c23c 100644 --- a/ggml/utils.py +++ b/ggml/utils.py @@ -2,9 +2,8 @@ """ import enum import ctypes -import contextlib -from typing import Any, List, Optional, Sequence, Tuple +from typing import Any, Optional, Sequence, Tuple from ggml import ggml @@ -12,7 +11,7 @@ import numpy.typing as npt -class GGML_TYPE(enum.Enum): +class GGML_TYPE(enum.IntEnum): F32 = ggml.GGML_TYPE_F32 F16 = ggml.GGML_TYPE_F16 Q4_0 = ggml.GGML_TYPE_Q4_0 @@ -53,13 +52,16 @@ def to_numpy( ctypes_type = ctypes.c_uint16 else: ctypes_type = np.ctypeslib.as_ctypes_type(GGML_TYPE_TO_NUMPY_DTYPE[ggml_type]) - - array = ctypes.cast(ggml.ggml_get_data(tensor), ctypes.POINTER(ctypes_type)) + + data = ggml.ggml_get_data(tensor) + if data is None: + raise ValueError("tensor data is None") + array = ctypes.cast(data, ctypes.POINTER(ctypes_type)) n_dims = ggml.ggml_n_dims(tensor) shape = tuple(reversed(tensor.contents.ne[:n_dims])) output = np.ctypeslib.as_array(array, shape=shape) if ggml_type == GGML_TYPE.F16: - output.dtype = np.float16 + output.dtype = np.float16 # type: ignore return np.lib.stride_tricks.as_strided( output, strides=tuple(reversed(tensor.contents.nb[:n_dims])) ) @@ -91,33 +93,6 @@ def from_numpy(x: npt.NDArray[Any], ctx: ggml.ggml_context_p) -> ggml.ggml_tenso return tensor -@contextlib.contextmanager -def ggml_context_manager(params: ggml.ggml_init_params): - """Creates a context manager for a new ggml context that free's it after use. - - Example: - ```python - import ggml - from ggml.utils import ggml_context_manager - - params = ggml.ggml_init_params(mem_size=16 * 1024 * 1024) - with ggml_context_manager(params) as ctx: - # do stuff with ctx - ``` - - Parameters: - params: context parameters - - Returns: - (contextlib.AbstractContextManager): ggml_context_p context manager - """ - ctx = ggml.ggml_init(params) - try: - yield ctx - finally: - ggml.ggml_free(ctx) - - def copy_to_cpu( ctx: ggml.ggml_context_p, tensor: ggml.ggml_tensor_p ) -> ggml.ggml_tensor_p: @@ -337,36 +312,3 @@ def slice_tensor( f"ggml tensors with {ndims} dimensions are not supported" ) - -def alloc_graph_measure( - graph: ggml.ggml_cgraph, - alignment: int, - alloc_tensors: Optional[List[ggml.ggml_tensor_p]] = None, -) -> int: - """Returns the number of bytes required by a ggml_allocr allocator to allocate the tensors in the graph. - - NOTE: This implementation saves a copy of the current data pointers of all graph nodes and leafs and restores them - after measuring the allocation size so that the graph can be re-used. 
- - Parameters: - graph: ggml graph - alignment: alignment of the allocation - alloc_tensors: list of tensors to allocate individually using ggml_allocr_alloc - - Returns: - Size of the required allocation buffer in bytes""" - alloc_tensors = alloc_tensors or [] - leaf_data = [ggml.ggml_get_data(graph.leafs[i]) for i in range(graph.n_leafs)] - node_data = [ggml.ggml_get_data(graph.nodes[i]) for i in range(graph.n_nodes)] - alloc = ggml.ggml_allocr_new_measure(alignment) - for tensor in alloc_tensors: - ggml.ggml_allocr_alloc(alloc, tensor) - alloc_size = ( - ggml.ggml_allocr_alloc_graph(alloc, ctypes.byref(graph)) + alignment # type: ignore - ) - ggml.ggml_allocr_free(alloc) - for i in range(graph.n_leafs): - graph.leafs[i].contents.data = leaf_data[i] - for i in range(graph.n_nodes): - graph.nodes[i].contents.data = node_data[i] - return alloc_size diff --git a/tests/test_experimental_api.py b/tests/test_experimental_api.py index ec5473e..4e535e6 100644 --- a/tests/test_experimental_api.py +++ b/tests/test_experimental_api.py @@ -71,16 +71,10 @@ def test_experimental_api(): assert f.shape == (1,) - measure_allocr = backend.new_measure() - graph = ggml_cgraph(f) - mem_size = measure_allocr.alloc_graph(graph) - - buffer = backend.alloc_buffer(mem_size) - - allocr = buffer.new_allocr() - allocr.alloc_graph(graph) + gallocr = backend.new_graph_allocator() + gallocr.allocate_graph(graph) x[0] = 2.0 diff --git a/tests/test_ggml.py b/tests/test_ggml.py index 2405efd..0ae5a69 100644 --- a/tests/test_ggml.py +++ b/tests/test_ggml.py @@ -1,7 +1,11 @@ import ctypes + from typing import Optional + import ggml +import numpy as np + def test_ggml(): assert ggml.GGML_FILE_VERSION == 1 @@ -58,7 +62,10 @@ def double( def test_ggml_min_alloc(): - max_overhead = ggml.ggml_tensor_overhead() * ggml.GGML_DEFAULT_GRAPH_SIZE + ggml.ggml_graph_overhead() + max_overhead = ( + ggml.ggml_tensor_overhead() * ggml.GGML_DEFAULT_GRAPH_SIZE + + ggml.ggml_graph_overhead() + ) assert max_overhead < 16 * 1024 * 1024 # 16MB params = ggml.ggml_init_params( mem_size=max_overhead, mem_buffer=None, no_alloc=True @@ -89,7 +96,12 @@ def build_graph(ctx: ggml.ggml_context_p): n_leafs = gf.contents.n_leafs leafs_size = sum(ggml.ggml_nbytes_pad(gf.contents.leafs[i]) for i in range(n_leafs)) - mem_size = nodes_size + leafs_size + ggml.ggml_tensor_overhead() * (n_nodes + n_leafs) + ggml.ggml_graph_overhead() + mem_size = ( + nodes_size + + leafs_size + + ggml.ggml_tensor_overhead() * (n_nodes + n_leafs) + + ggml.ggml_graph_overhead() + ) ggml.ggml_free(ctx) @@ -119,197 +131,136 @@ def build_graph(ctx: ggml.ggml_context_p): ggml.ggml_free(ctx) -def test_ggml_alloc(): - def build_graph(ctx: ggml.ggml_context_p, alloc: ggml.ggml_allocr_t): - # inputs - x = ggml.ggml_new_tensor_1d(ctx, ggml.GGML_TYPE_F32, 1) - ggml.ggml_set_name(x, b"x") - ggml.ggml_allocr_alloc(alloc, x) - a = ggml.ggml_new_tensor_1d(ctx, ggml.GGML_TYPE_F32, 1) - ggml.ggml_set_name(a, b"a") - ggml.ggml_allocr_alloc(alloc, a) - b = ggml.ggml_new_tensor_1d(ctx, ggml.GGML_TYPE_F32, 1) - ggml.ggml_set_name(b, b"b") - ggml.ggml_allocr_alloc(alloc, b) - - x2 = ggml.ggml_mul(ctx, x, x) - tmp = ggml.ggml_mul(ctx, a, x2) +def test_quantize(): + ne0 = 32 + ne1 = 1 + nelements = ne0 * ne1 + data = [float(i) for i in range(nelements)] + data_f32 = (ctypes.c_float * len(data))(*data) + work = (ctypes.c_float * nelements)(0) + hist = (ctypes.c_int64 * (1 << 4))(0) + cur_size = ggml.ggml_quantize_q8_0( + data_f32, + ctypes.cast(work, ctypes.c_void_p), + nelements, + ne0, + hist, + ) + 
assert cur_size == 34 - # outputs - f = ggml.ggml_add(ctx, tmp, b) - ggml.ggml_set_name(f, b"f") + type_traits = ggml.ggml_internal_get_type_traits(ggml.GGML_TYPE_Q8_0) + work2 = (ctypes.c_float * nelements)(0) + type_traits.to_float( + ctypes.cast(work, ctypes.c_void_p), + ctypes.cast(work2, ctypes.POINTER(ctypes.c_float)), + nelements, + ) - # build graph - gf = ggml.ggml_new_graph(ctx) - ggml.ggml_build_forward_expand(gf, f) + eps = 0.5 + for i in range(nelements): + assert abs(work2[i] - data[i]) < eps - return gf - max_overhead = ggml.ggml_tensor_overhead() * ggml.GGML_DEFAULT_GRAPH_SIZE + ggml.ggml_graph_overhead() - assert max_overhead < 16 * 1024 * 1024 # 16MB +def test_ggml_cpu_backend(): + n_tensors = 1 + 2 # input (x) and weights (a, b) params = ggml.ggml_init_params( - mem_size=max_overhead, mem_buffer=None, no_alloc=True + mem_size=ggml.ggml_tensor_overhead() * n_tensors, mem_buffer=None, no_alloc=True ) ctx = ggml.ggml_init(params=params) assert ctx is not None - tensor_alignment = 32 - alloc = ggml.ggml_allocr_new_measure(tensor_alignment) - assert alloc is not None - assert ggml.ggml_allocr_is_measure(alloc) + backend = ggml.ggml_backend_cpu_init() - gf = build_graph(ctx, alloc) - gp = ggml.ggml_graph_plan(gf, 1) - assert gp.work_size == 0 + assert backend is not None - alloc_size = ( - ggml.ggml_allocr_alloc_graph(alloc, gf) + tensor_alignment - ) + # create the tensors for input and weights + x = ggml.ggml_new_tensor_1d(ctx, ggml.GGML_TYPE_F32, 1) - ggml.ggml_free(ctx) - ggml.ggml_allocr_free(alloc) + a = ggml.ggml_new_tensor_1d(ctx, ggml.GGML_TYPE_F32, 1) + b = ggml.ggml_new_tensor_1d(ctx, ggml.GGML_TYPE_F32, 1) - params = ggml.ggml_init_params( - mem_size=max_overhead, mem_buffer=None, no_alloc=True + # allocate the tensors in the backend + buffer = ggml.ggml_backend_alloc_ctx_tensors(ctx, backend) + + # set the values of the weights + ggml.ggml_backend_tensor_set( + a, + ctypes.cast(np.array([3.0], dtype=np.single).ctypes.data, ctypes.c_void_p), + 0, + ggml.ggml_nbytes(a), ) - ctx = ggml.ggml_init(params=params) - assert ctx is not None - buffer = (ctypes.c_uint8 * alloc_size)() - alloc = ggml.ggml_allocr_new( - ctypes.cast(buffer, ctypes.c_void_p), alloc_size, tensor_alignment + ggml.ggml_backend_tensor_set( + b, + ctypes.cast(np.array([4.0], dtype=np.single).ctypes.data, ctypes.c_void_p), + 0, + ggml.ggml_nbytes(a), ) - gf = build_graph(ctx, alloc) - ggml.ggml_allocr_alloc_graph(alloc, gf) - - a = ggml.ggml_get_tensor(ctx, b"a") - b = ggml.ggml_get_tensor(ctx, b"b") - x = ggml.ggml_get_tensor(ctx, b"x") - f = ggml.ggml_get_tensor(ctx, b"f") - assert a is not None and b is not None and x is not None and f is not None + max_nodes = 4096 - ggml.ggml_set_f32(x, 2.0) - ggml.ggml_set_f32(a, 3.0) - ggml.ggml_set_f32(b, 4.0) + buf_size = ( + ggml.ggml_tensor_overhead() * max_nodes + + ggml.ggml_graph_overhead_custom(max_nodes, False) + ) + buf = (ctypes.c_uint8 * buf_size)() - gp = ggml.ggml_graph_plan(gf, 1) - ggml.ggml_graph_compute(gf, ctypes.pointer(gp)) - output = ggml.ggml_get_f32_1d(f, 0) - assert output == 16.0 - ggml.ggml_free(ctx) - ggml.ggml_allocr_free(alloc) + def build_graph( + x: ggml.ggml_tensor_p, a: ggml.ggml_tensor_p, b: ggml.ggml_tensor_p + ): + params = ggml.ggml_init_params( + mem_size=buf_size, + mem_buffer=ctypes.cast(buf, ctypes.c_void_p), + no_alloc=True, + ) + ctx0 = ggml.ggml_init(params=params) -def test_ggml_alloc_one_pass(): - max_overhead = ggml.ggml_tensor_overhead() * ggml.GGML_DEFAULT_GRAPH_SIZE + ggml.ggml_graph_overhead() - assert 
max_overhead < 16 * 1024 * 1024 # 16MB - params = ggml.ggml_init_params( - mem_size=max_overhead, mem_buffer=None, no_alloc=True - ) - ctx = ggml.ggml_init(params=params) - assert ctx is not None + assert ctx0 is not None - # define the graph - x = ggml.ggml_new_tensor_1d(ctx, ggml.GGML_TYPE_F32, 1) - ggml.ggml_set_name(x, b"x") - a = ggml.ggml_new_tensor_1d(ctx, ggml.GGML_TYPE_F32, 1) - ggml.ggml_set_name(a, b"a") - b = ggml.ggml_new_tensor_1d(ctx, ggml.GGML_TYPE_F32, 1) - ggml.ggml_set_name(b, b"b") + gf = ggml.ggml_new_graph_custom(ctx0, max_nodes, False) - x2 = ggml.ggml_mul(ctx, x, x) - tmp = ggml.ggml_mul(ctx, a, x2) + x2 = ggml.ggml_mul(ctx0, x, x) + ax2 = ggml.ggml_mul(ctx0, a, x2) + f = ggml.ggml_add(ctx0, ax2, b) - # outputs - f = ggml.ggml_add(ctx, tmp, b) - ggml.ggml_set_name(f, b"f") + ggml.ggml_set_name(x2, b"x2") + ggml.ggml_set_name(ax2, b"ax2") + ggml.ggml_set_name(f, b"f") - # build graph + ggml.ggml_build_forward_expand(gf, f) - gf = ggml.ggml_new_graph(ctx) - ggml.ggml_build_forward_expand(gf, f) + ggml.ggml_free(ctx0) - # save old data pointers - leaf_data = [ggml.ggml_get_data(gf.contents.leafs[i]) for i in range(gf.contents.n_leafs)] - node_data = [ggml.ggml_get_data(gf.contents.nodes[i]) for i in range(gf.contents.n_nodes)] - - # create measure allocator - tensor_alignment = 32 - alloc = ggml.ggml_allocr_new_measure(tensor_alignment) - assert alloc is not None - assert ggml.ggml_allocr_is_measure(alloc) - - # allocate input tensors - ggml.ggml_allocr_alloc(alloc, x) - ggml.ggml_allocr_alloc(alloc, a) - ggml.ggml_allocr_alloc(alloc, b) - # allocate graph - alloc_size = ( - ggml.ggml_allocr_alloc_graph(alloc, gf) + tensor_alignment - ) - assert alloc_size > 0 - - # restore old data pointers - for i in range(gf.contents.n_leafs): - gf.contents.leafs[i].contents.data = leaf_data[i] - - for i in range(gf.contents.n_nodes): - gf.contents.nodes[i].contents.data = node_data[i] - - # free measure allocator - ggml.ggml_allocr_free(alloc) - - # allocate tensor memory - buffer = (ctypes.c_uint8 * alloc_size)() - alloc = ggml.ggml_allocr_new( - ctypes.cast(buffer, ctypes.c_void_p), alloc_size, tensor_alignment - ) - ggml.ggml_allocr_alloc(alloc, x) - ggml.ggml_allocr_alloc(alloc, a) - ggml.ggml_allocr_alloc(alloc, b) - ggml.ggml_allocr_alloc_graph(alloc, gf) + return gf - # set input values - ggml.ggml_set_f32(x, 2.0) - ggml.ggml_set_f32(a, 3.0) - ggml.ggml_set_f32(b, 4.0) + allocr = ggml.ggml_gallocr_new(ggml.ggml_backend_get_default_buffer_type(backend)) - gp = ggml.ggml_graph_plan(gf, 1) - assert gp.work_size == 0 + gf = build_graph(x, a, b) - # compute - ggml.ggml_graph_compute(gf, ctypes.pointer(gp)) + ggml.ggml_gallocr_reserve(allocr, gf) - output = ggml.ggml_get_f32_1d(f, 0) - assert output == 16.0 + gf = build_graph(x, a, b) - ggml.ggml_free(ctx) - ggml.ggml_allocr_free(alloc) + ggml.ggml_gallocr_alloc_graph(allocr, gf) -def test_quantize(): - ne0 = 32 - ne1 = 1 - nelements = ne0 * ne1 - data = [float(i) for i in range(nelements)] - data_f32 = (ctypes.c_float * len(data))(*data) - work = (ctypes.c_float * nelements)(0) - hist = (ctypes.c_int64 * (1 << 4))(0) - cur_size = ggml.ggml_quantize_q8_0( - data_f32, - ctypes.cast(work, ctypes.c_void_p), - nelements, - ne0, - hist, + ggml.ggml_backend_tensor_set( + x, + ctypes.cast(np.array([2.0], dtype=np.single).ctypes.data, ctypes.c_void_p), + 0, + ggml.ggml_nbytes(x), ) - assert cur_size == 34 - type_traits = ggml.ggml_internal_get_type_traits(ggml.GGML_TYPE_Q8_0) - work2 = (ctypes.c_float * nelements)(0) - 
type_traits.to_float( - ctypes.cast(work, ctypes.c_void_p), - ctypes.cast(work2, ctypes.POINTER(ctypes.c_float)), - nelements, + ggml.ggml_backend_graph_compute(backend, gf) + + f = ggml.ggml_graph_get_tensor(gf, b"f") + + output = np.zeros(1, dtype=np.single) + ggml.ggml_backend_tensor_get( + f, ctypes.cast(output.ctypes.data, ctypes.c_void_p), 0, ggml.ggml_nbytes(x) ) - eps = 0.5 - for i in range(nelements): - assert abs(work2[i] - data[i]) < eps + assert output[0] == 16.0 + + ggml.ggml_gallocr_free(allocr) + ggml.ggml_backend_buffer_free(buffer) + ggml.ggml_backend_free(backend) + ggml.ggml_free(ctx) diff --git a/tests/test_ggml_backends.py b/tests/test_ggml_backends.py deleted file mode 100644 index 828ae60..0000000 --- a/tests/test_ggml_backends.py +++ /dev/null @@ -1,108 +0,0 @@ -"""Simple example of graph offloading to a non-cpu backend.""" - -import ggml -import ctypes - -import numpy as np - -def test_ggml_cpu_backend(): - n_tensors = 1 + 2 # input (x) and weights (a, b) - params = ggml.ggml_init_params( - mem_size=ggml.ggml_tensor_overhead() * n_tensors, mem_buffer=None, no_alloc=True - ) - ctx = ggml.ggml_init(params=params) - assert ctx is not None - - backend = ggml.ggml_backend_cpu_init() - - assert backend is not None - - # create the tensors for input and weights - x = ggml.ggml_new_tensor_1d(ctx, ggml.GGML_TYPE_F32, 1) - - a = ggml.ggml_new_tensor_1d(ctx, ggml.GGML_TYPE_F32, 1) - b = ggml.ggml_new_tensor_1d(ctx, ggml.GGML_TYPE_F32, 1) - - # allocate the tensors in the backend - buffer = ggml.ggml_backend_alloc_ctx_tensors(ctx, backend) - - # set the values of the weights - ggml.ggml_backend_tensor_set( - a, - ctypes.cast(np.array([3.0], dtype=np.single).ctypes.data, ctypes.c_void_p), - 0, - ggml.ggml_nbytes(a), - ) - ggml.ggml_backend_tensor_set( - b, - ctypes.cast(np.array([4.0], dtype=np.single).ctypes.data, ctypes.c_void_p), - 0, - ggml.ggml_nbytes(a), - ) - - max_nodes = 4096 - - buf_size = ggml.ggml_tensor_overhead() * max_nodes + ggml.ggml_graph_overhead_custom(max_nodes, False) - buf = (ctypes.c_uint8 * buf_size)() - - def build_graph(x: ggml.ggml_tensor_p, a: ggml.ggml_tensor_p, b: ggml.ggml_tensor_p): - params = ggml.ggml_init_params( - mem_size=buf_size, mem_buffer=ctypes.cast(buf, ctypes.c_void_p), no_alloc=True - ) - ctx0 = ggml.ggml_init(params=params) - - assert ctx0 is not None - - gf = ggml.ggml_new_graph_custom(ctx0, max_nodes, False) - - x2 = ggml.ggml_mul(ctx0, x, x) - ax2 = ggml.ggml_mul(ctx0, a, x2) - f = ggml.ggml_add(ctx0, ax2, b) - - ggml.ggml_set_name(x2, b"x2") - ggml.ggml_set_name(ax2, b"ax2") - ggml.ggml_set_name(f, b"f") - - ggml.ggml_build_forward_expand(gf, f) - - ggml.ggml_free(ctx0) - - return gf - - allocr = ggml.ggml_allocr_new_measure_from_backend(backend) - - gf = build_graph(x, a, b) - - mem_size = ggml.ggml_allocr_alloc_graph(allocr, gf) - - ggml.ggml_allocr_free(allocr) - - buf_compute = ggml.ggml_backend_alloc_buffer(backend, mem_size) - allocr = ggml.ggml_allocr_new_from_buffer(buf_compute) - - ggml.ggml_allocr_reset(allocr) - - gf = build_graph(x, a, b) - - ggml.ggml_allocr_alloc_graph(allocr, gf) - - ggml.ggml_backend_tensor_set( - x, - ctypes.cast(np.array([2.0], dtype=np.single).ctypes.data, ctypes.c_void_p), - 0, - ggml.ggml_nbytes(x), - ) - - ggml.ggml_backend_graph_compute(backend, gf) - - f = ggml.ggml_graph_get_tensor(gf, b"f") - - output = np.zeros(1, dtype=np.single) - ggml.ggml_backend_tensor_get(f, ctypes.cast(output.ctypes.data, ctypes.c_void_p), 0, ggml.ggml_nbytes(x)) - - assert output[0] == 16.0 - - 
ggml.ggml_backend_buffer_free(buffer) - ggml.ggml_backend_buffer_free(buf_compute) - ggml.ggml_backend_free(backend) - ggml.ggml_free(ctx) diff --git a/tests/test_ggml_metal.py b/tests/test_ggml_metal.py index 009388e..4fd410b 100644 --- a/tests/test_ggml_metal.py +++ b/tests/test_ggml_metal.py @@ -11,7 +11,6 @@ reason="METAL not available", ) - @run_if_ggml_metal_available def test_metal(): n_tensors = 1 + 2 # input (x) and weights (a, b) @@ -50,12 +49,19 @@ def test_metal(): max_nodes = 4096 - buf_size = ggml.ggml_tensor_overhead() * max_nodes + ggml.ggml_graph_overhead_custom(max_nodes, False) + buf_size = ( + ggml.ggml_tensor_overhead() * max_nodes + + ggml.ggml_graph_overhead_custom(max_nodes, False) + ) buf = (ctypes.c_uint8 * buf_size)() - def build_graph(x: ggml.ggml_tensor_p, a: ggml.ggml_tensor_p, b: ggml.ggml_tensor_p): + def build_graph( + x: ggml.ggml_tensor_p, a: ggml.ggml_tensor_p, b: ggml.ggml_tensor_p + ): params = ggml.ggml_init_params( - mem_size=buf_size, mem_buffer=ctypes.cast(buf, ctypes.c_void_p), no_alloc=True + mem_size=buf_size, + mem_buffer=ctypes.cast(buf, ctypes.c_void_p), + no_alloc=True, ) ctx0 = ggml.ggml_init(params=params) @@ -77,22 +83,15 @@ def build_graph(x: ggml.ggml_tensor_p, a: ggml.ggml_tensor_p, b: ggml.ggml_tenso return gf - allocr = ggml.ggml_allocr_new_measure_from_backend(backend) + allocr = ggml.ggml_gallocr_new(ggml.ggml_backend_get_default_buffer_type(backend)) gf = build_graph(x, a, b) - mem_size = ggml.ggml_allocr_alloc_graph(allocr, gf) - - ggml.ggml_allocr_free(allocr) - - buf_compute = ggml.ggml_backend_alloc_buffer(backend, mem_size) - allocr = ggml.ggml_allocr_new_from_buffer(buf_compute) - - ggml.ggml_allocr_reset(allocr) + ggml.ggml_gallocr_reserve(allocr, gf) gf = build_graph(x, a, b) - ggml.ggml_allocr_alloc_graph(allocr, gf) + ggml.ggml_gallocr_alloc_graph(allocr, gf) ggml.ggml_backend_tensor_set( x, @@ -106,11 +105,13 @@ def build_graph(x: ggml.ggml_tensor_p, a: ggml.ggml_tensor_p, b: ggml.ggml_tenso f = ggml.ggml_graph_get_tensor(gf, b"f") output = np.zeros(1, dtype=np.single) - ggml.ggml_backend_tensor_get(f, ctypes.cast(output.ctypes.data, ctypes.c_void_p), 0, ggml.ggml_nbytes(x)) + ggml.ggml_backend_tensor_get( + f, ctypes.cast(output.ctypes.data, ctypes.c_void_p), 0, ggml.ggml_nbytes(x) + ) assert output[0] == 16.0 + ggml.ggml_gallocr_free(allocr) ggml.ggml_backend_buffer_free(buffer) - ggml.ggml_backend_buffer_free(buf_compute) ggml.ggml_backend_free(backend) ggml.ggml_free(ctx) diff --git a/tests/test_utils.py b/tests/test_utils.py index e3b348f..7da39d0 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -1,127 +1,75 @@ -import ctypes - import ggml import ggml.utils -import pytest - import numpy as np def test_utils(): params = ggml.ggml_init_params(mem_size=16 * 1024 * 1024) - with ggml.utils.ggml_context_manager(params) as ctx: - x = np.ones((3,), dtype=np.float32) - assert x.shape == (3,) - t = ggml.utils.from_numpy(x, ctx) - assert t.contents.ne[:1] == [3] - assert t.contents.type == ggml.GGML_TYPE_F32 - assert np.allclose(ggml.utils.to_numpy(t), x) + ctx = ggml.ggml_init(params=params) + assert ctx is not None + x = np.ones((3,), dtype=np.float32) + assert x.shape == (3,) + t = ggml.utils.from_numpy(x, ctx) + assert t.contents.ne[:1] == [3] + assert t.contents.type == ggml.GGML_TYPE_F32 + assert np.allclose(ggml.utils.to_numpy(t), x) + ggml.ggml_free(ctx) def test_numpy_arrays(): params = ggml.ggml_init_params(mem_size=16 * 1024 * 1024) - with ggml.utils.ggml_context_manager(params) as ctx: - x = 
np.array([[1, 2, 3], [4, 5, 6]], dtype=np.float32, order="F") - assert x.shape == (2, 3) - t = ggml.utils.from_numpy(x, ctx) - assert t.contents.ne[:2] == [3, 2] - y = ggml.utils.to_numpy(t) - assert y.shape == (2, 3) + ctx = ggml.ggml_init(params=params) + assert ctx is not None + x = np.array([[1, 2, 3], [4, 5, 6]], dtype=np.float32, order="F") + assert x.shape == (2, 3) + t = ggml.utils.from_numpy(x, ctx) + assert t.contents.ne[:2] == [3, 2] + y = ggml.utils.to_numpy(t) + assert y.shape == (2, 3) + ggml.ggml_free(ctx) def test_numpy_arrays_transposed(): params = ggml.ggml_init_params(mem_size=16 * 1024 * 1024) - with ggml.utils.ggml_context_manager(params) as ctx: - # 2D - x = np.array([[1, 2, 3], [4, 5, 6]], dtype=np.int32) - t = ggml.utils.from_numpy(x, ctx) - t_t = ggml.ggml_transpose(ctx, t) - x_t = ggml.utils.to_numpy(t_t) - assert np.array_equal(x_t, x.T) - - t = ggml.utils.from_numpy(x.T, ctx) - x_t = ggml.utils.to_numpy(t) - assert np.array_equal(x.T, x_t) - - # 3D - x = np.array( - [[[1, 2], [3, 4], [5, 6]], [[7, 8], [9, 10], [11, 12]]], dtype=np.int32 - ) - t = ggml.utils.from_numpy(x, ctx) - t_t = ggml.ggml_permute(ctx, t, 2, 1, 0, 3) - x_t = ggml.utils.to_numpy(t_t) - assert np.array_equal(x_t, x.T) - - t = ggml.utils.from_numpy(x.T, ctx) - x_t = ggml.utils.to_numpy(t) - assert np.array_equal(x.T, x_t) + ctx = ggml.ggml_init(params=params) + assert ctx is not None + # 2D + x = np.array([[1, 2, 3], [4, 5, 6]], dtype=np.int32) + t = ggml.utils.from_numpy(x, ctx) + t_t = ggml.ggml_transpose(ctx, t) + x_t = ggml.utils.to_numpy(t_t) + assert np.array_equal(x_t, x.T) + + t = ggml.utils.from_numpy(x.T, ctx) + x_t = ggml.utils.to_numpy(t) + assert np.array_equal(x.T, x_t) + + # 3D + x = np.array( + [[[1, 2], [3, 4], [5, 6]], [[7, 8], [9, 10], [11, 12]]], dtype=np.int32 + ) + t = ggml.utils.from_numpy(x, ctx) + t_t = ggml.ggml_permute(ctx, t, 2, 1, 0, 3) + x_t = ggml.utils.to_numpy(t_t) + assert np.array_equal(x_t, x.T) + + t = ggml.utils.from_numpy(x.T, ctx) + x_t = ggml.utils.to_numpy(t) + assert np.array_equal(x.T, x_t) + ggml.ggml_free(ctx) def test_slice_tensor(): params = ggml.ggml_init_params(mem_size=16 * 1024 * 1024) - with ggml.utils.ggml_context_manager(params) as ctx: - x = np.array([[1, 2, 3], [4, 5, 6]], dtype=np.int32) - t = ggml.utils.from_numpy(x, ctx) - t_slice = ggml.utils.slice_tensor(ctx, t, [ - slice(0, 2), - slice(0, 1) - ]) - x_slice = ggml.utils.to_numpy(t_slice) - assert np.array_equal(x_slice, x[:1, :2].squeeze()) - - -def test_alloc_graph_measure(): - max_overhead = ggml.ggml_tensor_overhead() * ggml.GGML_DEFAULT_GRAPH_SIZE + ggml.ggml_graph_overhead() - assert max_overhead < 16 * 1024 * 1024 # 16MB - params = ggml.ggml_init_params( - mem_size=max_overhead, mem_buffer=None, no_alloc=True - ) ctx = ggml.ggml_init(params=params) - - # define the graph - x = ggml.ggml_new_tensor_1d(ctx, ggml.GGML_TYPE_F32, 1) - a = ggml.ggml_new_tensor_1d(ctx, ggml.GGML_TYPE_F32, 1) - b = ggml.ggml_new_tensor_1d(ctx, ggml.GGML_TYPE_F32, 1) - - x2 = ggml.ggml_mul(ctx, x, x) - tmp = ggml.ggml_mul(ctx, a, x2) - - # outputs - f = ggml.ggml_add(ctx, tmp, b) - - # build graph - gf = ggml.ggml_new_graph(ctx) - ggml.ggml_build_forward_expand(gf, f) - - # create measure allocator - tensor_alignment = 32 - input_tensors = [x, a, b] - alloc_size = ggml.utils.alloc_graph_measure(gf.contents, tensor_alignment, input_tensors) - - # allocate tensor memory - buffer = (ctypes.c_uint8 * alloc_size)() - alloc = ggml.ggml_allocr_new( - ctypes.cast(buffer, ctypes.c_void_p), alloc_size, 
tensor_alignment - ) - ggml.ggml_allocr_alloc(alloc, x) - ggml.ggml_allocr_alloc(alloc, a) - ggml.ggml_allocr_alloc(alloc, b) - ggml.ggml_allocr_alloc_graph(alloc, gf) - - # set input values - ggml.ggml_set_f32(x, 2.0) - ggml.ggml_set_f32(a, 3.0) - ggml.ggml_set_f32(b, 4.0) - - gp = ggml.ggml_graph_plan(gf, 1) - assert gp.work_size == 0 - - # compute - ggml.ggml_graph_compute(gf, ctypes.pointer(gp)) - - output = ggml.ggml_get_f32_1d(f, 0) - assert output == 16.0 - + assert ctx is not None + x = np.array([[1, 2, 3], [4, 5, 6]], dtype=np.int32) + t = ggml.utils.from_numpy(x, ctx) + t_slice = ggml.utils.slice_tensor(ctx, t, [ + slice(0, 2), + slice(0, 1) + ]) + x_slice = ggml.utils.to_numpy(t_slice) + assert np.array_equal(x_slice, x[:1, :2].squeeze()) ggml.ggml_free(ctx) - ggml.ggml_allocr_free(alloc) \ No newline at end of file diff --git a/vendor/ggml b/vendor/ggml index 2c7cf49..5070f07 160000 --- a/vendor/ggml +++ b/vendor/ggml @@ -1 +1 @@ -Subproject commit 2c7cf49810d523b9632da393a9e8270b60bf3b24 +Subproject commit 5070f078a67c18c11736e78316ab715ca9afde16
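
For reference, below is a minimal sketch (not part of the patch) of the ggml_gallocr-based allocation flow that the updated tests above migrate to, replacing the old ggml_allocr measure/from_buffer pair. It only uses binding calls that appear in this patch (CPU backend, ggml_gallocr_new / reserve / alloc_graph); the f = a*x^2 + b graph and the 4096-node budget are copied from the tests, not required by the API.

# sketch: graph allocation with ggml_gallocr, assuming the updated ggml-python bindings
import ctypes
import numpy as np
import ggml

# metadata-only context; tensor data lives in backend buffers (no_alloc=True)
n_tensors = 3  # x, a, b
params = ggml.ggml_init_params(
    mem_size=ggml.ggml_tensor_overhead() * n_tensors, mem_buffer=None, no_alloc=True
)
ctx = ggml.ggml_init(params=params)
backend = ggml.ggml_backend_cpu_init()

x = ggml.ggml_new_tensor_1d(ctx, ggml.GGML_TYPE_F32, 1)
a = ggml.ggml_new_tensor_1d(ctx, ggml.GGML_TYPE_F32, 1)
b = ggml.ggml_new_tensor_1d(ctx, ggml.GGML_TYPE_F32, 1)

# place the input/weight tensors in a backend buffer, then set the weights
buffer = ggml.ggml_backend_alloc_ctx_tensors(ctx, backend)
for tensor, value in ((a, 3.0), (b, 4.0)):
    data = np.array([value], dtype=np.single)
    ggml.ggml_backend_tensor_set(
        tensor, ctypes.cast(data.ctypes.data, ctypes.c_void_p), 0, ggml.ggml_nbytes(tensor)
    )

# scratch buffer for graph/node metadata built on every graph construction
max_nodes = 4096
buf_size = ggml.ggml_tensor_overhead() * max_nodes + ggml.ggml_graph_overhead_custom(
    max_nodes, False
)
buf = (ctypes.c_uint8 * buf_size)()

def build_graph(x, a, b):
    gparams = ggml.ggml_init_params(
        mem_size=buf_size, mem_buffer=ctypes.cast(buf, ctypes.c_void_p), no_alloc=True
    )
    ctx0 = ggml.ggml_init(params=gparams)
    gf = ggml.ggml_new_graph_custom(ctx0, max_nodes, False)
    f = ggml.ggml_add(ctx0, ggml.ggml_mul(ctx0, a, ggml.ggml_mul(ctx0, x, x)), b)
    ggml.ggml_set_name(f, b"f")
    ggml.ggml_build_forward_expand(gf, f)
    ggml.ggml_free(ctx0)  # only metadata is freed; gf lives in buf
    return gf

# one allocator replaces the old measure + from_buffer allocators:
# reserve() sizes the compute buffers, alloc_graph() assigns tensor data
allocr = ggml.ggml_gallocr_new(ggml.ggml_backend_get_default_buffer_type(backend))
ggml.ggml_gallocr_reserve(allocr, build_graph(x, a, b))
gf = build_graph(x, a, b)
ggml.ggml_gallocr_alloc_graph(allocr, gf)

# set the input and run the graph on the backend
x_data = np.array([2.0], dtype=np.single)
ggml.ggml_backend_tensor_set(
    x, ctypes.cast(x_data.ctypes.data, ctypes.c_void_p), 0, ggml.ggml_nbytes(x)
)
ggml.ggml_backend_graph_compute(backend, gf)

f = ggml.ggml_graph_get_tensor(gf, b"f")
out = np.zeros(1, dtype=np.single)
ggml.ggml_backend_tensor_get(
    f, ctypes.cast(out.ctypes.data, ctypes.c_void_p), 0, ggml.ggml_nbytes(f)
)
print(out[0])  # 16.0 = 3 * 2^2 + 4

ggml.ggml_gallocr_free(allocr)
ggml.ggml_backend_buffer_free(buffer)
ggml.ggml_backend_free(backend)
ggml.ggml_free(ctx)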