From 66ccdfed52132320128610a1e9ca55f971482976 Mon Sep 17 00:00:00 2001
From: Andrei Betlen
Date: Sun, 11 Feb 2024 12:43:40 -0500
Subject: [PATCH] Update ggml

---
 ggml/experimental.py           |  23 +-
 ggml/ggml.py                   | 757 ++++++++++++++-------------
 ggml/utils.py                  |  74 +---
 tests/test_experimental_api.py |  10 +-
 tests/test_ggml.py             | 273 +++++-------
 tests/test_ggml_backends.py    | 108 -----
 tests/test_ggml_metal.py       |  33 +-
 tests/test_utils.py            | 158 +++----
 vendor/ggml                    |   2 +-
 9 files changed, 521 insertions(+), 917 deletions(-)
 delete mode 100644 tests/test_ggml_backends.py

diff --git a/ggml/experimental.py b/ggml/experimental.py
index a42a966..9cf3039 100644
--- a/ggml/experimental.py
+++ b/ggml/experimental.py
@@ -86,11 +86,11 @@ def cpu():
             raise ValueError("Failed to initialize CPU backend")
         return Backend(backend=backend)
 
-    def new_measure(self) -> "Allocr":
-        allocr = ggml.ggml_allocr_new_measure_from_backend(self.backend)
-        return Allocr(allocr)
+    def new_graph_allocator(self) -> "GraphAllocator":
+        allocr = ggml.ggml_gallocr_new(ggml.ggml_backend_get_default_buffer_type(self.backend))
+        return GraphAllocator(allocr)
 
-    def alloc_buffer(self, size: int) -> "BackendBuffer":
+    def new_backend_buffer(self, size: int) -> "BackendBuffer":
         buffer = ggml.ggml_backend_alloc_buffer(self.backend, size)
         return BackendBuffer(buffer)
 
@@ -102,20 +102,17 @@ def __init__(self, buffer: ggml.ggml_backend_buffer_t):
     def __del__(self):
         ggml.ggml_backend_buffer_free(self.buffer)
 
-    def new_allocr(self) -> "Allocr":
-        allocr = ggml.ggml_allocr_new_from_buffer(self.buffer)
-        return Allocr(allocr)
-
 
-class Allocr:
-    def __init__(self, allocr: ggml.ggml_allocr_t):
+class GraphAllocator:
+    def __init__(self, allocr: ggml.ggml_gallocr):
         self.allocr = allocr
 
     def __del__(self):
-        ggml.ggml_allocr_free(self.allocr)
+        ggml.ggml_gallocr_free(self.allocr)
 
-    def alloc_graph(self, graph: "CGraph") -> int:
-        return ggml.ggml_allocr_alloc_graph(self.allocr, graph.cgraph)
+    def allocate_graph(self, graph: "CGraph"):
+        ggml.ggml_gallocr_reserve(self.allocr, graph.cgraph)
+        ggml.ggml_gallocr_alloc_graph(self.allocr, graph.cgraph)
 
 
 class GGML_TYPE(enum.IntEnum):
diff --git a/ggml/ggml.py b/ggml/ggml.py
index a5ebf57..0ce9850 100644
--- a/ggml/ggml.py
+++ b/ggml/ggml.py
@@ -96,7 +96,7 @@ def load_shared_library(module_name: str, lib_base_name: str):
 
     # Try to load the shared library, handling potential errors
     try:
-        return ctypes.CDLL(str(path), **cdll_args) # type: ignore
+        return ctypes.CDLL(str(path), **cdll_args)  # type: ignore
     except Exception as e:
         raise RuntimeError(f"Failed to load shared library '{path}': {e}")
 
@@ -543,8 +543,8 @@ def ggml_fp32_to_fp16_row(
 
 # enum ggml_log_level {
 #     GGML_LOG_LEVEL_ERROR = 2,
-#     GGML_LOG_LEVEL_WARN = 3,
-#     GGML_LOG_LEVEL_INFO = 4,
+#     GGML_LOG_LEVEL_WARN = 3,
+#     GGML_LOG_LEVEL_INFO = 4,
 #     GGML_LOG_LEVEL_DEBUG = 5
 # };
 GGML_LOG_LEVEL_ERROR = 2
@@ -552,6 +552,17 @@ def ggml_fp32_to_fp16_row(
 GGML_LOG_LEVEL_INFO = 4
 GGML_LOG_LEVEL_DEBUG = 5
 
+
+# enum ggml_tensor_flag {
+#     GGML_TENSOR_FLAG_INPUT = 1,
+#     GGML_TENSOR_FLAG_OUTPUT = 2,
+#     GGML_TENSOR_FLAG_PARAM = 4,
+# };
+GGML_TENSOR_FLAG_INPUT = 1
+GGML_TENSOR_FLAG_OUTPUT = 2
+GGML_TENSOR_FLAG_PARAM = 4
+
+
 # // ggml object
 # struct ggml_object {
 #     size_t offs;
@@ -600,7 +611,7 @@ class ggml_object(ctypes.Structure):
 #     // op params - allocated as int32_t for alignment
 #     int32_t op_params[GGML_MAX_OP_PARAMS / sizeof(int32_t)];
 
-#     bool is_param;
+#     int32_t flags;
 
 #     struct ggml_tensor * grad;
 #     struct ggml_tensor * src[GGML_MAX_SRC];
@@ -633,7 +644,7 @@
class ggml_tensor(ctypes.Structure): nb (ctypes.Array[ctypes.c_size_t]): stride in bytes for each dimension op (int): ggml operation op_params (ctypes.Array[ctypes.c_int32]): `GGML_MAX_OP_PARAMS`-length array of operation parameters - is_param (bool): is this a parameter tensor + flags (int): tensor flags grad (ggml_tensor_p): reference to gradient tensor src (ctypes.Array[ggml_tensor_p]): `GGML_MAX_SRC`-length array of source tensors perf_runs (int): number of performance runs @@ -660,7 +671,7 @@ class ggml_tensor(ctypes.Structure): "op_params", ctypes.c_int32 * (GGML_MAX_OP_PARAMS // ctypes.sizeof(ctypes.c_int32)), ), - ("is_param", ctypes.c_bool), + ("flags", ctypes.c_int), ("grad", ctypes.POINTER(ggml_tensor)), ("src", ctypes.POINTER(ggml_tensor) * GGML_MAX_SRC), ("perf_runs", ctypes.c_int), @@ -676,7 +687,7 @@ class ggml_tensor(ctypes.Structure): GGML_TENSOR_SIZE = ctypes.sizeof(ggml_tensor) -ggml_tensor_p: TypeAlias = "ctypes._Pointer[ggml_tensor]" # type: ignore +ggml_tensor_p: TypeAlias = "ctypes._Pointer[ggml_tensor]" # type: ignore """ctypes pointer to a [ggml_tensor][ggml.ggml_tensor] Can be dereferenced to a [ggml_tensor][ggml.ggml_tensor] object using @@ -805,7 +816,7 @@ class ggml_cgraph(ctypes.Structure): ] -ggml_cgraph_p: TypeAlias = "ctypes._Pointer[ggml_cgraph]" # type: ignore +ggml_cgraph_p: TypeAlias = "ctypes._Pointer[ggml_cgraph]" # type: ignore """ctypes pointer to a [ggml_cgraph][ggml.ggml_cgraph] Can be dereferenced to a [ggml_cgraph][ggml.ggml_cgraph] object using @@ -4994,7 +5005,8 @@ def ggml_clamp( # int p1, # int d0, # int d1, -# bool is_2D); +# bool is_2D, +# enum ggml_type dst_type); def ggml_im2col( ctx: ggml_context_p, a: ggml_tensor_p, @@ -5006,8 +5018,9 @@ def ggml_im2col( d0: Union[ctypes.c_int, int], d1: Union[ctypes.c_int, int], is_2D: Union[ctypes.c_bool, bool], + dst_type: Union[ctypes.c_int, int], ) -> ggml_tensor_p: - return lib.ggml_im2col(ctx, a, b, s0, s1, p0, p1, d0, d1, is_2D) + return lib.ggml_im2col(ctx, a, b, s0, s1, p0, p1, d0, d1, is_2D, dst_type) lib.ggml_im2col.argtypes = [ @@ -5021,6 +5034,7 @@ def ggml_im2col( ctypes.c_int, ctypes.c_int, ctypes.c_bool, + ctypes.c_int, ] lib.ggml_im2col.restype = ctypes.POINTER(ggml_tensor) @@ -7172,10 +7186,33 @@ def ggml_opt_resume_g( ] lib.ggml_opt_resume_g.restype = ctypes.c_int + +# // +# // tensor flags +# // +# GGML_API void ggml_set_input(struct ggml_tensor * tensor); +def ggml_set_input(tensor: ggml_tensor_p): + return lib.ggml_set_input(tensor) + + +lib.ggml_set_input.argtypes = [ctypes.POINTER(ggml_tensor)] +lib.ggml_set_input.restype = None + + +# GGML_API void ggml_set_output(struct ggml_tensor * tensor); +def ggml_set_output(tensor: ggml_tensor_p): + return lib.ggml_set_output(tensor) + + +lib.ggml_set_output.argtypes = [ctypes.POINTER(ggml_tensor)] +lib.ggml_set_output.restype = None + + # // # // quantization # // + # // - ggml_quantize_init can be called multiple times with the same type # // it will only initialize the quantization tables for the first call or after ggml_quantize_free # // automatically called by ggml_quantize_chunk for convenience @@ -8537,6 +8574,15 @@ def ggml_cpu_has_vulkan() -> int: lib.ggml_cpu_has_vulkan.restype = ctypes.c_int +# GGML_API int ggml_cpu_has_kompute (void); +def ggml_cpu_has_kompute() -> int: + return lib.ggml_cpu_has_kompute() + + +lib.ggml_cpu_has_kompute.argtypes = [] +lib.ggml_cpu_has_kompute.restype = ctypes.c_int + + # GGML_API int ggml_cpu_has_gpublas (void); def ggml_cpu_has_gpublas() -> int: return lib.ggml_cpu_has_gpublas() @@ -8563,6 
+8609,7 @@ def ggml_cpu_has_ssse3() -> int: lib.ggml_cpu_has_ssse3.argtypes = [] lib.ggml_cpu_has_ssse3.restype = ctypes.c_int + # GGML_API int ggml_cpu_has_sycl (void); def ggml_cpu_has_sycl() -> int: return lib.ggml_cpu_has_sycl() @@ -8639,402 +8686,186 @@ def ggml_internal_get_type_traits(type: Union[ctypes.c_int, int]) -> ggml_type_t # source: include/ggml/ggml-alloc.h ##################################################### -# struct ggml_backend; -# struct ggml_backend_buffer; -# struct ggml_backend_buffer_type; -ggml_backend_t: TypeAlias = ctypes.c_void_p -ggml_backend_buffer_p: TypeAlias = ctypes.c_void_p -ggml_backend_buffer_type_p: TypeAlias = ctypes.c_void_p - -# // -# // Legacy API -# // - -# typedef struct ggml_allocr * ggml_allocr_t; -ggml_allocr_t = ctypes.c_void_p - -# // initialize allocator for use with CPU backend only -# GGML_API ggml_allocr_t ggml_allocr_new(void * data, size_t size, size_t alignment); -def ggml_allocr_new( - data: ctypes.c_void_p, - size: Union[ctypes.c_size_t, int], - alignment: Union[ctypes.c_size_t, int], -) -> ggml_allocr_t: - return lib.ggml_allocr_new(data, size, alignment) - - -lib.ggml_allocr_new.argtypes = [ctypes.c_void_p, ctypes.c_size_t, ctypes.c_size_t] -lib.ggml_allocr_new.restype = ggml_allocr_t - - -# GGML_API ggml_allocr_t ggml_allocr_new_measure(size_t alignment); -def ggml_allocr_new_measure(alignment: Union[ctypes.c_size_t, int]) -> ggml_allocr_t: - return lib.ggml_allocr_new_measure(alignment) - - -lib.ggml_allocr_new_measure.argtypes = [ctypes.c_size_t] -lib.ggml_allocr_new_measure.restype = ggml_allocr_t - - -# // initialize allocator for use with ggml-backend -# GGML_API ggml_allocr_t ggml_allocr_new_from_buffer(struct ggml_backend_buffer * buffer); -def ggml_allocr_new_from_buffer(buffer: ggml_backend_buffer_p) -> ggml_allocr_t: - return lib.ggml_allocr_new_from_buffer(buffer) - - -lib.ggml_allocr_new_from_buffer.argtypes = [ggml_backend_buffer_p] -lib.ggml_allocr_new_from_buffer.restype = ggml_allocr_t - - -# GGML_API ggml_allocr_t ggml_allocr_new_from_backend(struct ggml_backend * backend, size_t size); // allocates an owned buffer -def ggml_allocr_new_from_backend( - backend: ggml_backend_t, size: Union[ctypes.c_size_t, int] -) -> ggml_allocr_t: - return lib.ggml_allocr_new_from_backend(backend, size) - - -lib.ggml_allocr_new_from_backend.argtypes = [ggml_backend_t, ctypes.c_size_t] -lib.ggml_allocr_new_from_backend.restype = ggml_allocr_t - - -# GGML_API ggml_allocr_t ggml_allocr_new_measure_from_backend(struct ggml_backend * backend); -def ggml_allocr_new_measure_from_backend(backend: ggml_backend_t) -> ggml_allocr_t: - return lib.ggml_allocr_new_measure_from_backend(backend) - - -lib.ggml_allocr_new_measure_from_backend.argtypes = [ggml_backend_t] -lib.ggml_allocr_new_measure_from_backend.restype = ggml_allocr_t - - -# GGML_API struct ggml_backend_buffer * ggml_allocr_get_buffer(ggml_allocr_t alloc); -def ggml_allocr_get_buffer(alloc: ggml_allocr_t) -> ggml_backend_buffer_p: - return lib.ggml_allocr_get_buffer(alloc) - - -lib.ggml_allocr_get_buffer.argtypes = [ggml_allocr_t] -lib.ggml_allocr_get_buffer.restype = ggml_backend_buffer_p - - -# // tell the allocator to parse nodes following the order described in the list -# // you should call this if your graph are optimized to execute out-of-order -# GGML_API void ggml_allocr_set_parse_seq(ggml_allocr_t alloc, const int * list, int n); -def ggml_allocr_set_parse_seq( - alloc: ggml_allocr_t, - list: "ctypes._Pointer(ctypes.c_int)", # type: ignore - n: Union[ctypes.c_int, int], 
-) -> None: - return lib.ggml_allocr_set_parse_seq(alloc, list, n) - - -lib.ggml_allocr_set_parse_seq.argtypes = [ - ggml_allocr_t, - ctypes.POINTER(ctypes.c_int), - ctypes.c_int, -] -lib.ggml_allocr_set_parse_seq.restype = None - - -# GGML_API void ggml_allocr_free (ggml_allocr_t alloc); -def ggml_allocr_free(alloc: ggml_allocr_t) -> None: - return lib.ggml_allocr_free(alloc) - - -lib.ggml_allocr_free.argtypes = [ggml_allocr_t] -lib.ggml_allocr_free.restype = None - - -# GGML_API bool ggml_allocr_is_measure (ggml_allocr_t alloc); -def ggml_allocr_is_measure(alloc: ggml_allocr_t) -> ctypes.c_bool: - return lib.ggml_allocr_is_measure(alloc) - - -lib.ggml_allocr_is_measure.argtypes = [ggml_allocr_t] -lib.ggml_allocr_is_measure.restype = ctypes.c_bool - - -# GGML_API void ggml_allocr_reset (ggml_allocr_t alloc); -def ggml_allocr_reset(alloc: ggml_allocr_t) -> None: - return lib.ggml_allocr_reset(alloc) - - -lib.ggml_allocr_reset.argtypes = [ggml_allocr_t] -lib.ggml_allocr_reset.restype = None - - -# GGML_API void ggml_allocr_alloc (ggml_allocr_t alloc, struct ggml_tensor * tensor); -def ggml_allocr_alloc(alloc: ggml_allocr_t, tensor: ggml_tensor_p) -> None: - return lib.ggml_allocr_alloc(alloc, tensor) - - -lib.ggml_allocr_alloc.argtypes = [ggml_allocr_t, ctypes.POINTER(ggml_tensor)] -lib.ggml_allocr_alloc.restype = None - - -# GGML_API size_t ggml_allocr_max_size (ggml_allocr_t alloc); -def ggml_allocr_max_size(alloc: ggml_allocr_t) -> Union[ctypes.c_size_t, int]: - return lib.ggml_allocr_max_size(alloc) - - -lib.ggml_allocr_max_size.argtypes = [ggml_allocr_t] -lib.ggml_allocr_max_size.restype = ctypes.c_size_t - - -# GGML_API size_t ggml_allocr_alloc_graph(ggml_allocr_t alloc, struct ggml_cgraph * graph); -def ggml_allocr_alloc_graph(alloc: ggml_allocr_t, graph: ggml_cgraph_p) -> int: - return lib.ggml_allocr_alloc_graph(alloc, graph) - - -lib.ggml_allocr_alloc_graph.argtypes = [ggml_allocr_t, ctypes.POINTER(ggml_cgraph)] -lib.ggml_allocr_alloc_graph.restype = ctypes.c_size_t - -# // -# // ggml-backend v2 API -# // +# typedef struct ggml_backend_buffer_type * ggml_backend_buffer_type_t; +# typedef struct ggml_backend_buffer * ggml_backend_buffer_t; +# typedef struct ggml_backend * ggml_backend_t; +ggml_backend_buffer_type_t: TypeAlias = ctypes.c_void_p +ggml_backend_buffer_t: TypeAlias = ctypes.c_void_p +ggml_backend_t: TypeAlias = ctypes.c_void_p -# // Separate tensor and graph allocator objects -# // This is necessary for multi-backend allocation because the graph allocator needs to use multiple tensor allocators -# // The original API is kept as a wrapper around the new API # // Tensor allocator # typedef struct ggml_tallocr * ggml_tallocr_t; -ggml_tallocr_t = ctypes.c_void_p - - -# GGML_API ggml_tallocr_t ggml_tallocr_new(void * data, size_t size, size_t alignment); -def ggml_tallocr_new( - data: ctypes.c_void_p, - size: Union[ctypes.c_size_t, int], - alignment: Union[ctypes.c_size_t, int], -) -> ggml_tallocr_t: - return lib.ggml_tallocr_new(data, size, alignment) - - -lib.ggml_tallocr_new.argtypes = [ctypes.c_void_p, ctypes.c_size_t, ctypes.c_size_t] -lib.ggml_tallocr_new.restype = ggml_tallocr_t - - -# GGML_API ggml_tallocr_t ggml_tallocr_new_measure(size_t alignment); -def ggml_tallocr_new_measure(alignment: Union[ctypes.c_size_t, int]) -> ggml_tallocr_t: - return lib.ggml_tallocr_new_measure(alignment) - - -lib.ggml_tallocr_new_measure.argtypes = [ctypes.c_size_t] -lib.ggml_tallocr_new_measure.restype = ggml_tallocr_t - - -# GGML_API ggml_tallocr_t ggml_tallocr_new_from_buft(struct 
ggml_backend_buffer_type * buft, size_t size); -def ggml_tallocr_new_from_buft( - buft: ggml_backend_buffer_type_p, size: Union[ctypes.c_size_t, int] -) -> ggml_tallocr_t: - return lib.ggml_tallocr_new_from_buft(buft, size) - - -lib.ggml_tallocr_new_from_buft.argtypes = [ggml_backend_buffer_type_p, ctypes.c_size_t] -lib.ggml_tallocr_new_from_buft.restype = ggml_tallocr_t - - -# GGML_API ggml_tallocr_t ggml_tallocr_new_from_backend(struct ggml_backend * backend, size_t size); // allocates an owned buffer -def ggml_tallocr_new_from_backend( - backend: ggml_backend_t, size: Union[ctypes.c_size_t, int] -) -> ggml_tallocr_t: - return lib.ggml_tallocr_new_from_backend(backend, size) - - -lib.ggml_tallocr_new_from_backend.argtypes = [ggml_backend_t, ctypes.c_size_t] -lib.ggml_tallocr_new_from_backend.restype = ggml_tallocr_t - +ggml_tallocr: TypeAlias = ctypes.c_void_p -# GGML_API ggml_tallocr_t ggml_tallocr_new_from_buffer(struct ggml_backend_buffer * buffer); -def ggml_tallocr_new_from_buffer(buffer: ggml_backend_buffer_p) -> ggml_tallocr_t: - return lib.ggml_tallocr_new_from_buffer(buffer) +# GGML_API ggml_tallocr_t ggml_tallocr_new(ggml_backend_buffer_t buffer); +def ggml_tallocr_new(buffer: ggml_backend_buffer_t) -> ggml_tallocr: + return lib.ggml_tallocr_new(buffer) -lib.ggml_tallocr_new_from_buffer.argtypes = [ggml_backend_buffer_p] -lib.ggml_tallocr_new_from_buffer.restype = ggml_tallocr_t +lib.ggml_tallocr_new.argtypes = [ggml_backend_buffer_t] +lib.ggml_tallocr_new.restype = ggml_tallocr -# GGML_API ggml_tallocr_t ggml_tallocr_new_measure_from_buft(struct ggml_backend_buffer_type * buft); -def ggml_tallocr_new_measure_from_buft( - buft: ggml_backend_buffer_type_p, -) -> ggml_tallocr_t: - return lib.ggml_tallocr_new_measure_from_buft(buft) - -lib.ggml_tallocr_new_measure_from_buft.argtypes = [ggml_backend_buffer_type_p] -lib.ggml_tallocr_new_measure_from_buft.restype = ggml_tallocr_t - - -# GGML_API ggml_tallocr_t ggml_tallocr_new_measure_from_backend(struct ggml_backend * backend); -def ggml_tallocr_new_measure_from_backend(backend: ggml_backend_t) -> ggml_tallocr_t: - return lib.ggml_tallocr_new_measure_from_backend(backend) - - -lib.ggml_tallocr_new_measure_from_backend.argtypes = [ggml_backend_t] -lib.ggml_tallocr_new_measure_from_backend.restype = ggml_tallocr_t - - -# GGML_API struct ggml_backend_buffer * ggml_tallocr_get_buffer(ggml_tallocr_t talloc); -def ggml_tallocr_get_buffer(talloc: ggml_tallocr_t) -> ggml_backend_buffer_p: - return lib.ggml_tallocr_get_buffer(talloc) - - -lib.ggml_tallocr_get_buffer.argtypes = [ggml_tallocr_t] -lib.ggml_tallocr_get_buffer.restype = ggml_backend_buffer_p - - -# GGML_API void ggml_tallocr_free (ggml_tallocr_t talloc); -def ggml_tallocr_free(talloc: ggml_tallocr_t) -> None: +# GGML_API void ggml_tallocr_free(ggml_tallocr_t talloc); +def ggml_tallocr_free(talloc: ggml_tallocr) -> None: return lib.ggml_tallocr_free(talloc) -lib.ggml_tallocr_free.argtypes = [ggml_tallocr_t] +lib.ggml_tallocr_free.argtypes = [ggml_tallocr] lib.ggml_tallocr_free.restype = None -# GGML_API bool ggml_tallocr_is_measure (ggml_tallocr_t talloc); -def ggml_tallocr_is_measure(talloc: ggml_tallocr_t) -> bool: - return lib.ggml_tallocr_is_measure(talloc) - +# GGML_API void ggml_tallocr_alloc(ggml_tallocr_t talloc, struct ggml_tensor * tensor); +def ggml_tallocr_alloc(talloc: ggml_tallocr, tensor: ggml_tensor_p) -> None: + return lib.ggml_tallocr_alloc(talloc, tensor) -lib.ggml_tallocr_is_measure.argtypes = [ggml_tallocr_t] -lib.ggml_tallocr_is_measure.restype = 
ctypes.c_bool +lib.ggml_tallocr_alloc.argtypes = [ggml_tallocr, ctypes.POINTER(ggml_tensor)] +lib.ggml_tallocr_alloc.restype = None -# GGML_API void ggml_tallocr_reset (ggml_tallocr_t talloc); -def ggml_tallocr_reset(talloc: ggml_tallocr_t) -> None: - return lib.ggml_tallocr_reset(talloc) +# // Graph allocator +# /* +# Example usage: +# ggml_gallocr_t galloc = ggml_gallocr_new(ggml_bacckend_cpu_buffer_type()); -lib.ggml_tallocr_reset.argtypes = [ggml_tallocr_t] -lib.ggml_tallocr_reset.restype = None +# // optional: create a worst-case graph and reserve the buffers to avoid reallocations +# ggml_gallocr_reserve(galloc, build_graph(max_batch)); +# // allocate the graph +# struct ggml_cgraph * graph = build_graph(batch); +# ggml_gallocr_alloc_graph(galloc, graph); -# GGML_API void ggml_tallocr_alloc (ggml_tallocr_t talloc, struct ggml_tensor * tensor); -def ggml_tallocr_alloc(talloc: ggml_tallocr_t, tensor: ggml_tensor_p) -> None: - return lib.ggml_tallocr_alloc(talloc, tensor) +# printf("compute buffer size: %zu bytes\n", ggml_gallocr_get_buffer_size(galloc, 0)); +# // evaluate the graph +# ggml_backend_graph_compute(backend, graph); +# */ -lib.ggml_tallocr_alloc.argtypes = [ggml_tallocr_t, ctypes.POINTER(ggml_tensor)] -lib.ggml_tallocr_alloc.restype = None +# // special tensor flags for use with the graph allocator: +# // ggml_set_input(): all input tensors are allocated at the beginning of the graph in non-overlapping addresses +# // ggml_set_output(): output tensors are never freed and never overwritten -# GGML_API size_t ggml_tallocr_max_size (ggml_tallocr_t talloc); -def ggml_tallocr_max_size(talloc: ggml_tallocr_t) -> Union[ctypes.c_size_t, int]: - return lib.ggml_tallocr_max_size(talloc) +# typedef struct ggml_gallocr * ggml_gallocr_t; +ggml_gallocr: TypeAlias = ctypes.c_void_p -lib.ggml_tallocr_max_size.argtypes = [ggml_tallocr_t] -lib.ggml_tallocr_max_size.restype = ctypes.c_size_t +# GGML_API ggml_gallocr_t ggml_gallocr_new(ggml_backend_buffer_type_t buft); +def ggml_gallocr_new(buft: ggml_backend_buffer_type_t) -> ggml_gallocr: + return lib.ggml_gallocr_new(buft) -# // Graph allocator -# typedef struct ggml_gallocr * ggml_gallocr_t; -ggml_gallocr_t = ctypes.c_void_p +lib.ggml_gallocr_new.argtypes = [ggml_backend_buffer_type_t] +lib.ggml_gallocr_new.restype = ggml_gallocr -# GGML_API ggml_gallocr_t ggml_gallocr_new(void); -def ggml_gallocr_new() -> ggml_gallocr_t: - return lib.ggml_gallocr_new() +# GGML_API ggml_gallocr_t ggml_gallocr_new_n(ggml_backend_buffer_type_t * bufts, int n_bufs); +def ggml_gallocr_new_n(bufts: ggml_backend_buffer_type_t, n_bufs: int) -> ggml_gallocr: + return lib.ggml_gallocr_new_n(bufts, n_bufs) -lib.ggml_gallocr_new.argtypes = [] -lib.ggml_gallocr_new.restype = ggml_gallocr_t +lib.ggml_gallocr_new_n.argtypes = [ggml_backend_buffer_type_t, ctypes.c_int] +lib.ggml_gallocr_new_n.restype = ggml_gallocr -# GGML_API void ggml_gallocr_free(ggml_gallocr_t galloc); -def ggml_gallocr_free(galloc: ggml_gallocr_t) -> None: +# GGML_API void ggml_gallocr_free(ggml_gallocr_t galloc); +def ggml_gallocr_free(galloc: ggml_gallocr) -> None: return lib.ggml_gallocr_free(galloc) -lib.ggml_gallocr_free.argtypes = [ggml_gallocr_t] +lib.ggml_gallocr_free.argtypes = [ggml_gallocr] lib.ggml_gallocr_free.restype = None -# GGML_API void ggml_gallocr_set_parse_seq(ggml_gallocr_t galloc, const int * list, int n); -def ggml_gallocr_set_parse_seq( - galloc: ggml_gallocr_t, - list: "ctypes._Pointer(ctypes.c_int)", # type: ignore - n: Union[ctypes.c_int, int], -) -> None: - return 
lib.ggml_gallocr_set_parse_seq(galloc, list, n) +# // pre-allocate buffers from a measure graph - does not allocate or modify the graph +# // call with a worst-case graph to avoid buffer reallocations +# // not strictly required for single buffer usage: ggml_gallocr_alloc_graph will reallocate the buffers automatically if needed +# // returns false if the buffer allocation failed +# GGML_API bool ggml_gallocr_reserve(ggml_gallocr_t galloc, struct ggml_cgraph * graph); +def ggml_gallocr_reserve(galloc: ggml_gallocr, graph: ggml_cgraph_p) -> bool: + """pre-allocate buffers from a measure graph - does not allocate or modify the graph + call with a worst-case graph to avoid buffer reallocations + not strictly required for single buffer usage: ggml_gallocr_alloc_graph will reallocate the buffers automatically if needed + returns false if the buffer allocation failed""" + return lib.ggml_gallocr_reserve(galloc, graph) -lib.ggml_gallocr_set_parse_seq.argtypes = [ - ggml_gallocr_t, - ctypes.POINTER(ctypes.c_int), - ctypes.c_int, -] -lib.ggml_gallocr_set_parse_seq.restype = None +lib.ggml_gallocr_reserve.argtypes = [ggml_gallocr, ctypes.POINTER(ggml_cgraph)] +lib.ggml_gallocr_reserve.restype = ctypes.c_bool -# GGML_API size_t ggml_gallocr_alloc_graph(ggml_gallocr_t galloc, ggml_tallocr_t talloc, struct ggml_cgraph * graph); -def ggml_gallocr_alloc_graph( - galloc: ggml_gallocr_t, talloc: ggml_tallocr_t, graph: ggml_cgraph_p -) -> Union[ctypes.c_size_t, int]: - return lib.ggml_gallocr_alloc_graph(galloc, talloc, graph) +# GGML_API bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, const int * node_buffer_ids); +def ggml_gallocr_reserve_n( + galloc: ggml_gallocr, + graph: ggml_cgraph_p, + node_buffer_ids: "ctypes._Pointer[ctypes.c_int]", # type: ignore +) -> bool: + return lib.ggml_gallocr_reserve_n(galloc, graph, node_buffer_ids) -lib.ggml_gallocr_alloc_graph.argtypes = [ - ggml_gallocr_t, - ggml_tallocr_t, +lib.ggml_gallocr_reserve_n.argtypes = [ + ggml_gallocr, ctypes.POINTER(ggml_cgraph), + ctypes.POINTER(ctypes.c_int), ] -lib.ggml_gallocr_alloc_graph.restype = ctypes.c_size_t +lib.ggml_gallocr_reserve_n.restype = ctypes.c_bool -# // Allocate tensors from the allocators given by the hash table -# GGML_API void ggml_gallocr_alloc_graph_n( -# ggml_gallocr_t galloc, -# struct ggml_cgraph * graph, -# struct ggml_hash_set hash_set, -# ggml_tallocr_t * hash_node_talloc); -def ggml_gallocr_alloc_graph_n( - galloc: ggml_gallocr_t, - graph: ggml_cgraph_p, - hash_set: ggml_hash_set, - hash_node_talloc: "ctypes._Pointer(ggml_tallocr_t)", # type: ignore -) -> None: - return lib.ggml_gallocr_alloc_graph_n(galloc, graph, hash_set, hash_node_talloc) +# // automatic reallocation if the topology changes when using a single buffer +# // returns false if using multiple buffers and a re-allocation is needed (call ggml_gallocr_reserve_n first to set the node buffers) +# GGML_API bool ggml_gallocr_alloc_graph(ggml_gallocr_t galloc, struct ggml_cgraph * graph); +def ggml_gallocr_alloc_graph(galloc: ggml_gallocr, graph: ggml_cgraph_p) -> bool: + """automatic reallocation if the topology changes when using a single buffer + returns false if using multiple buffers and a re-allocation is needed (call ggml_gallocr_reserve_n first to set the node buffers)""" + return lib.ggml_gallocr_alloc_graph(galloc, graph) -lib.ggml_gallocr_alloc_graph_n.argtypes = [ - ggml_gallocr_t, - ctypes.POINTER(ggml_cgraph), - ggml_hash_set, - ctypes.POINTER(ggml_tallocr_t), -] -lib.ggml_gallocr_alloc_graph_n.restype = 
None +lib.ggml_gallocr_alloc_graph.argtypes = [ggml_gallocr, ctypes.POINTER(ggml_cgraph)] +lib.ggml_gallocr_alloc_graph.restype = ctypes.c_bool + + +# GGML_API size_t ggml_gallocr_get_buffer_size(ggml_gallocr_t galloc, int buffer_id); +def ggml_gallocr_get_buffer_size( + galloc: ggml_gallocr, buffer_id: Union[ctypes.c_int, int] +) -> int: + return lib.ggml_gallocr_get_buffer_size(galloc, buffer_id) + + +lib.ggml_gallocr_get_buffer_size.argtypes = [ggml_gallocr, ctypes.c_int] +lib.ggml_gallocr_get_buffer_size.restype = ctypes.c_size_t # // Utils # // Create a buffer and allocate all the tensors in a ggml_context -# GGML_API struct ggml_backend_buffer * ggml_backend_alloc_ctx_tensors_from_buft(struct ggml_context * ctx, struct ggml_backend_buffer_type * buft); +# GGML_API struct ggml_backend_buffer * ggml_backend_alloc_ctx_tensors_from_buft(struct ggml_context * ctx, ggml_backend_buffer_type_t buft); def ggml_backend_alloc_ctx_tensors_from_buft( - ctx: ggml_context_p, buft: ggml_backend_buffer_type_p -) -> ggml_backend_buffer_p: + ctx: ggml_context_p, buft: ggml_backend_buffer_type_t +) -> ggml_backend_buffer_t: + """Create a buffer and allocate all the tensors in a ggml_context""" return lib.ggml_backend_alloc_ctx_tensors_from_buft(ctx, buft) lib.ggml_backend_alloc_ctx_tensors_from_buft.argtypes = [ ggml_context_p, - ggml_backend_buffer_type_p, + ggml_backend_buffer_type_t, ] -lib.ggml_backend_alloc_ctx_tensors_from_buft.restype = ggml_backend_buffer_p +lib.ggml_backend_alloc_ctx_tensors_from_buft.restype = ggml_backend_buffer_t -# GGML_API struct ggml_backend_buffer * ggml_backend_alloc_ctx_tensors(struct ggml_context * ctx, struct ggml_backend * backend); +# GGML_API struct ggml_backend_buffer * ggml_backend_alloc_ctx_tensors(struct ggml_context * ctx, ggml_backend_t backend); def ggml_backend_alloc_ctx_tensors( ctx: ggml_context_p, backend: ggml_backend_t -) -> ggml_backend_buffer_p: +) -> ggml_backend_buffer_t: return lib.ggml_backend_alloc_ctx_tensors(ctx, backend) -lib.ggml_backend_alloc_ctx_tensors.argtypes = [ - ggml_context_p, - ggml_backend_t, -] -lib.ggml_backend_alloc_ctx_tensors.restype = ggml_backend_buffer_p +lib.ggml_backend_alloc_ctx_tensors.argtypes = [ggml_context_p, ggml_backend_t] +lib.ggml_backend_alloc_ctx_tensors.restype = ggml_backend_buffer_t + ##################################################### # GGML Backend API @@ -9045,10 +8876,7 @@ def ggml_backend_alloc_ctx_tensors( # typedef struct ggml_backend_buffer * ggml_backend_buffer_t; # typedef struct ggml_backend * ggml_backend_t; # typedef void * ggml_backend_graph_plan_t; -ggml_backend_buffer_type_t = ctypes.c_void_p -ggml_backend_buffer_t = ctypes.c_void_p -# ggml_backend_t = ctypes.c_void_p -ggml_backend_graph_plan_t = ctypes.c_void_p +ggml_backend_graph_plan_t: TypeAlias = ctypes.c_void_p # // # // Backend buffer @@ -9599,7 +9427,7 @@ def ggml_backend_cpu_set_n_threads( # GGML_API void ggml_backend_cpu_set_abort_callback(ggml_backend_t backend_cpu, ggml_abort_callback abort_callback, void * abort_callback_data); def ggml_backend_cpu_set_abort_callback( backend_cpu: ggml_backend_t, - abort_callback, # type: ignore + abort_callback, # type: ignore abort_callback_data: ctypes.c_void_p, ): return lib.ggml_backend_cpu_set_abort_callback( @@ -9751,11 +9579,7 @@ def ggml_backend_reg_alloc_buffer( # // in build_graph: # build_graph(...) 
{ -# // allocating tensors in a specific backend (optional, recommended: pre-allocate inputs in a different buffer) -# alloc_cpu = ggml_backend_sched_get_allocr(sched, backend_cpu); -# ggml_allocr_alloc(alloc_cpu, tensor); - -# // manually assigning nodes to a backend (optional, shouldn't be needed in most cases) +# // manually assign nodes to a backend (optional, should not be needed in most cases) # struct ggml_tensor * node = ggml_mul_mat(ctx, ...); # ggml_backend_sched_set_node_backend(sched, node, backend_gpu); # } @@ -9783,8 +9607,7 @@ def ggml_backend_reg_alloc_buffer( # // # typedef bool (*ggml_backend_sched_eval_callback)(struct ggml_tensor * t, bool ask, void * user_data); ggml_backend_sched_eval_callback = ctypes.CFUNCTYPE( - ctypes.c_bool, - ctypes.POINTER(ggml_tensor), ctypes.c_bool, ctypes.c_void_p + ctypes.c_bool, ctypes.POINTER(ggml_tensor), ctypes.c_bool, ctypes.c_void_p ) @@ -9820,19 +9643,20 @@ def ggml_backend_sched_free( # // Initialize backend buffers from a measure graph -# GGML_API void ggml_backend_sched_init_measure(ggml_backend_sched_t sched, struct ggml_cgraph * measure_graph); -def ggml_backend_sched_init_measure( +# GGML_API bool ggml_backend_sched_reserve(ggml_backend_sched_t sched, struct ggml_cgraph * measure_graph); +def ggml_backend_sched_reserve( sched: ggml_backend_sched_t, measure_graph: ggml_cgraph_p, -): - return lib.ggml_backend_sched_init_measure(sched, measure_graph) +) -> bool: + """Initialize backend buffers from a measure graph.""" + return lib.ggml_backend_sched_reserve(sched, measure_graph) -lib.ggml_backend_sched_init_measure.argtypes = [ +lib.ggml_backend_sched_reserve.argtypes = [ ggml_backend_sched_t, ctypes.POINTER(ggml_cgraph), ] -lib.ggml_backend_sched_init_measure.restype = None +lib.ggml_backend_sched_reserve.restype = ctypes.c_bool # // Get the number of splits of the last graph @@ -9840,6 +9664,7 @@ def ggml_backend_sched_init_measure( def ggml_backend_sched_get_n_splits( sched: ggml_backend_sched_t, ) -> int: + """Get the number of splits of the last graph.""" return lib.ggml_backend_sched_get_n_splits(sched) @@ -9847,28 +9672,16 @@ def ggml_backend_sched_get_n_splits( lib.ggml_backend_sched_get_n_splits.restype = ctypes.c_int -# GGML_API ggml_tallocr_t ggml_backend_sched_get_tallocr(ggml_backend_sched_t sched, ggml_backend_t backend); -def ggml_backend_sched_get_tallocr( - sched: ggml_backend_sched_t, - backend: ggml_backend_t, -) -> ggml_tallocr_t: - return lib.ggml_backend_sched_get_tallocr(sched, backend) - - -lib.ggml_backend_sched_get_tallocr.argtypes = [ggml_backend_sched_t, ggml_backend_t] -lib.ggml_backend_sched_get_tallocr.restype = ggml_tallocr_t - - -# GGML_API ggml_backend_buffer_t ggml_backend_sched_get_buffer (ggml_backend_sched_t sched, ggml_backend_t backend); -def ggml_backend_sched_get_buffer( +# GGML_API size_t ggml_backend_sched_get_buffer_size(ggml_backend_sched_t sched, ggml_backend_t backend); +def ggml_backend_sched_get_buffer_size( sched: ggml_backend_sched_t, backend: ggml_backend_t, -) -> ggml_backend_buffer_t: - return lib.ggml_backend_sched_get_buffer(sched, backend) +) -> int: + return lib.ggml_backend_sched_get_buffer_size(sched, backend) -lib.ggml_backend_sched_get_buffer.argtypes = [ggml_backend_sched_t, ggml_backend_t] -lib.ggml_backend_sched_get_buffer.restype = ggml_backend_buffer_t +lib.ggml_backend_sched_get_buffer_size.argtypes = [ggml_backend_sched_t, ggml_backend_t] +lib.ggml_backend_sched_get_buffer_size.restype = ctypes.c_size_t # GGML_API void 
ggml_backend_sched_set_node_backend(ggml_backend_sched_t sched, struct ggml_tensor * node, ggml_backend_t backend); @@ -9905,10 +9718,12 @@ def ggml_backend_sched_get_node_backend( # // Allocate and compute graph on the backend scheduler # GGML_API void ggml_backend_sched_graph_compute(ggml_backend_sched_t sched, struct ggml_cgraph * graph); +# GGML_API bool ggml_backend_sched_graph_compute(ggml_backend_sched_t sched, struct ggml_cgraph * graph); def ggml_backend_sched_graph_compute( sched: ggml_backend_sched_t, graph: ggml_cgraph_p, -): +) -> bool: + """Allocate and compute graph on the backend scheduler.""" return lib.ggml_backend_sched_graph_compute(sched, graph) @@ -9916,14 +9731,15 @@ def ggml_backend_sched_graph_compute( ggml_backend_sched_t, ctypes.POINTER(ggml_cgraph), ] -lib.ggml_backend_sched_graph_compute.restype = None +lib.ggml_backend_sched_graph_compute.restype = ctypes.c_bool -# // Reset all assignments and allocators - must be called before using the sched allocators to allocate inputs +# // Reset all assignments and allocators - must be called before changing the node backends # GGML_API void ggml_backend_sched_reset(ggml_backend_sched_t sched); def ggml_backend_sched_reset( sched: ggml_backend_sched_t, ): + """Reset all assignments and allocators - must be called before changing the node backends.""" return lib.ggml_backend_sched_reset(sched) @@ -9935,7 +9751,7 @@ def ggml_backend_sched_reset( # GGML_API void ggml_backend_sched_set_eval_callback(ggml_backend_sched_t sched, ggml_backend_sched_eval_callback callback, void * user_data); def ggml_backend_sched_set_eval_callback( sched: ggml_backend_sched_t, - callback, # type: ignore + callback, # type: ignore user_data: ctypes.c_void_p, ): return lib.ggml_backend_sched_set_eval_callback(sched, callback, user_data) @@ -9943,7 +9759,7 @@ def ggml_backend_sched_set_eval_callback( lib.ggml_backend_sched_set_eval_callback.argtypes = [ ggml_backend_sched_t, - ggml_backend_sched_eval_callback, # TODO: this may need to also accept NULL + ggml_backend_sched_eval_callback, # TODO: this may need to also accept NULL ctypes.c_void_p, ] lib.ggml_backend_sched_set_eval_callback.restype = None @@ -10014,7 +9830,7 @@ def ggml_backend_compare_graph_backend( backend1: ggml_backend_t, backend2: ggml_backend_t, graph: ggml_cgraph_p, - callback, # type: ignore + callback, # type: ignore user_data: ctypes.c_void_p, ) -> bool: return lib.ggml_backend_compare_graph_backend( @@ -10400,7 +10216,7 @@ class ggml_backend(ctypes.Structure): # GGML_CALL void ggml_backend_register(const char * name, ggml_backend_init_fn init_fn, ggml_backend_buffer_type_t default_buffer_type, void * user_data); def ggml_backend_register( name: bytes, - init_fn, # type: ignore + init_fn, # type: ignore default_buffer_type: ggml_backend_buffer_type_t, user_data: ctypes.c_void_p, ): @@ -10664,7 +10480,7 @@ def ggml_backend_cuda_get_device_memory( # GGML_API void ggml_backend_metal_log_set_callback(ggml_log_callback log_callback, void * user_data); def ggml_backend_metal_log_set_callback( - log_callback, # type: ignore + log_callback, # type: ignore user_data: ctypes.c_void_p, ): return lib.ggml_backend_metal_log_set_callback(log_callback, user_data) @@ -10815,6 +10631,7 @@ def ggml_cl_add( ): return lib.ggml_cl_add(src0, src1, dst) + if GGML_USE_CLBLAST: lib.ggml_cl_add.argtypes = [ ctypes.POINTER(ggml_tensor), @@ -10933,118 +10750,181 @@ def ggml_backend_opencl_host_buffer_type() -> ggml_backend_buffer_type_t: ##################################################### # GGML 
Vulkan API -# source: ggml-vulkan.h +# source: src/ggml-vulkan.h ##################################################### +GGML_HAS_VULKAN = hasattr(lib, "ggml_vk_init_cpu_assist") -GGML_USE_VULKAN = hasattr(lib, "ggml_vk_init") +# #define GGML_VK_NAME "Vulkan" +# #define GGML_VK_MAX_DEVICES 16 +GGML_VK_NAME = "Vulkan" +GGML_VK_MAX_DEVICES = 16 -# GGML_API void ggml_vk_init(void); -def ggml_vk_init(): - return lib.ggml_vk_init() +# GGML_API void ggml_vk_init_cpu_assist(void); +def ggml_vk_init_cpu_assist(): + return lib.ggml_vk_init_cpu_assist() -if GGML_USE_VULKAN: - lib.ggml_vk_init.argtypes = [] - lib.ggml_vk_init.restype = None +if GGML_HAS_VULKAN: + lib.ggml_vk_init_cpu_assist.argtypes = [] + lib.ggml_vk_init_cpu_assist.restype = None -# GGML_API void ggml_vk_preallocate_buffers_graph(struct ggml_tensor * node); -def ggml_vk_preallocate_buffers_graph( - node: ggml_tensor_p, -): - return lib.ggml_vk_preallocate_buffers_graph(node) +# GGML_API void ggml_vk_preallocate_buffers_graph_cpu_assist(struct ggml_tensor * node); +def ggml_vk_preallocate_buffers_graph_cpu_assist(node: ggml_tensor_p): + return lib.ggml_vk_preallocate_buffers_graph_cpu_assist(node) -if GGML_USE_VULKAN: - lib.ggml_vk_preallocate_buffers_graph.argtypes = [ - ctypes.POINTER(ggml_tensor), +if GGML_HAS_VULKAN: + lib.ggml_vk_preallocate_buffers_graph_cpu_assist.argtypes = [ + ctypes.POINTER(ggml_tensor) ] - lib.ggml_vk_preallocate_buffers_graph.restype = None + lib.ggml_vk_preallocate_buffers_graph_cpu_assist.restype = None -# GGML_API void ggml_vk_preallocate_buffers(void); -def ggml_vk_preallocate_buffers(): - return lib.ggml_vk_preallocate_buffers() +# GGML_API void ggml_vk_preallocate_buffers_cpu_assist(void); +def ggml_vk_preallocate_buffers_cpu_assist(): + return lib.ggml_vk_preallocate_buffers_cpu_assist() -if GGML_USE_VULKAN: - lib.ggml_vk_preallocate_buffers.argtypes = [] - lib.ggml_vk_preallocate_buffers.restype = None +if GGML_HAS_VULKAN: + lib.ggml_vk_preallocate_buffers_cpu_assist.argtypes = [] + lib.ggml_vk_preallocate_buffers_cpu_assist.restype = None -# GGML_API void ggml_vk_build_graph(struct ggml_tensor * node, bool last_node); -def ggml_vk_build_graph( - node: ggml_tensor_p, - last_node: bool, -): - return lib.ggml_vk_build_graph(node, last_node) +# GGML_API void ggml_vk_build_graph_cpu_assist(struct ggml_tensor * node, bool last_node); +def ggml_vk_build_graph_cpu_assist(node: ggml_tensor_p, last_node: bool): + return lib.ggml_vk_build_graph_cpu_assist(node, last_node) -if GGML_USE_VULKAN: - lib.ggml_vk_build_graph.argtypes = [ +if GGML_HAS_VULKAN: + lib.ggml_vk_build_graph_cpu_assist.argtypes = [ ctypes.POINTER(ggml_tensor), ctypes.c_bool, ] - lib.ggml_vk_build_graph.restype = None + lib.ggml_vk_build_graph_cpu_assist.restype = None -# GGML_API bool ggml_vk_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor); -def ggml_vk_compute_forward( - params: ggml_compute_params_p, - tensor: ggml_tensor_p, +# GGML_API bool ggml_vk_compute_forward_cpu_assist(struct ggml_compute_params * params, struct ggml_tensor * tensor); +def ggml_vk_compute_forward_cpu_assist( + params: ggml_compute_params_p, tensor: ggml_tensor_p ) -> bool: - return lib.ggml_vk_compute_forward(params, tensor) + return lib.ggml_vk_compute_forward_cpu_assist(params, tensor) -if GGML_USE_VULKAN: - lib.ggml_vk_compute_forward.argtypes = [ + +if GGML_HAS_VULKAN: + lib.ggml_vk_compute_forward_cpu_assist.argtypes = [ ctypes.POINTER(ggml_compute_params), ctypes.POINTER(ggml_tensor), ] - lib.ggml_vk_compute_forward.restype 
= ctypes.c_bool + lib.ggml_vk_compute_forward_cpu_assist.restype = ctypes.c_bool + +# #ifdef GGML_VULKAN_CHECK_RESULTS +# void ggml_vk_check_results_1_cpu_assist(struct ggml_compute_params * params, struct ggml_tensor * tensor); +# #endif + + +# GGML_API void ggml_vk_graph_cleanup_cpu_assist(void); +def ggml_vk_graph_cleanup_cpu_assist(): + return lib.ggml_vk_graph_cleanup_cpu_assist() + + +if GGML_HAS_VULKAN: + lib.ggml_vk_graph_cleanup_cpu_assist.argtypes = [] + lib.ggml_vk_graph_cleanup_cpu_assist.restype = None -# GGML_API void ggml_vk_graph_cleanup(void); -def ggml_vk_graph_cleanup(): - return lib.ggml_vk_graph_cleanup() +# GGML_API void ggml_vk_free_cpu_assist(void); +def ggml_vk_free_cpu_assist(): + return lib.ggml_vk_free_cpu_assist() -if GGML_USE_VULKAN: - lib.ggml_vk_graph_cleanup.argtypes = [] - lib.ggml_vk_graph_cleanup.restype = None +if GGML_HAS_VULKAN: + lib.ggml_vk_free_cpu_assist.argtypes = [] + lib.ggml_vk_free_cpu_assist.restype = None # // backend API -# GGML_API GGML_CALL ggml_backend_t ggml_backend_vk_init(void); -def ggml_backend_vk_init() -> Optional[ggml_backend_t]: - return lib.ggml_backend_vk_init() +# GGML_API GGML_CALL ggml_backend_t ggml_backend_vk_init(size_t dev_num); +def ggml_backend_vk_init( + dev_num: Union[ctypes.c_size_t, int] +) -> Optional[ggml_backend_t]: + return lib.ggml_backend_vk_init(dev_num) -if GGML_USE_VULKAN: - lib.ggml_backend_vk_init.argtypes = [] +if GGML_HAS_VULKAN: + lib.ggml_backend_vk_init.argtypes = [ctypes.c_size_t] lib.ggml_backend_vk_init.restype = ggml_backend_t # GGML_API GGML_CALL bool ggml_backend_is_vk(ggml_backend_t backend); -def ggml_backend_is_vk( - backend: ggml_backend_t, -) -> bool: +def ggml_backend_is_vk(backend: ggml_backend_t) -> bool: return lib.ggml_backend_is_vk(backend) -if GGML_USE_VULKAN: +if GGML_HAS_VULKAN: lib.ggml_backend_is_vk.argtypes = [ggml_backend_t] lib.ggml_backend_is_vk.restype = ctypes.c_bool -# GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_vk_buffer_type(void); -def ggml_backend_vk_buffer_type() -> ggml_backend_buffer_type_t: - return lib.ggml_backend_vk_buffer_type() +# GGML_API GGML_CALL int ggml_backend_vk_get_device_count(void); +def ggml_backend_vk_get_device_count() -> int: + return lib.ggml_backend_vk_get_device_count() + + +if GGML_HAS_VULKAN: + lib.ggml_backend_vk_get_device_count.argtypes = [] + lib.ggml_backend_vk_get_device_count.restype = ctypes.c_int -if GGML_USE_VULKAN: - lib.ggml_backend_vk_buffer_type.argtypes = [] +# GGML_API GGML_CALL void ggml_backend_vk_get_device_description(int device, char * description, size_t description_size); +def ggml_backend_vk_get_device_description( + device: Union[ctypes.c_int, int], + description: bytes, + description_size: Union[ctypes.c_size_t, int], +): + return lib.ggml_backend_vk_get_device_description( + device, description, description_size + ) + + +if GGML_HAS_VULKAN: + lib.ggml_backend_vk_get_device_description.argtypes = [ + ctypes.c_int, + ctypes.c_char_p, + ctypes.c_size_t, + ] + lib.ggml_backend_vk_get_device_description.restype = None + + +# GGML_API GGML_CALL void ggml_backend_vk_get_device_memory(int device, size_t * free, size_t * total); +def ggml_backend_vk_get_device_memory( + device: Union[ctypes.c_int, int], + free: "ctypes._Pointer[ctypes.c_size_t]", # type: ignore + total: "ctypes._Pointer[ctypes.c_size_t]", # type: ignore +): + return lib.ggml_backend_vk_get_device_memory(device, free, total) + + +if GGML_HAS_VULKAN: + lib.ggml_backend_vk_get_device_memory.argtypes = [ + ctypes.c_int, + 
ctypes.POINTER(ctypes.c_size_t), + ctypes.POINTER(ctypes.c_size_t), + ] + lib.ggml_backend_vk_get_device_memory.restype = None + + +# GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_vk_buffer_type(size_t dev_num); +def ggml_backend_vk_buffer_type( + dev_num: Union[ctypes.c_size_t, int] +) -> ggml_backend_buffer_type_t: + return lib.ggml_backend_vk_buffer_type(dev_num) + + +if GGML_HAS_VULKAN: + lib.ggml_backend_vk_buffer_type.argtypes = [ctypes.c_size_t] lib.ggml_backend_vk_buffer_type.restype = ggml_backend_buffer_type_t @@ -11054,11 +10934,10 @@ def ggml_backend_vk_host_buffer_type() -> ggml_backend_buffer_type_t: return lib.ggml_backend_vk_host_buffer_type() -if GGML_USE_VULKAN: +if GGML_HAS_VULKAN: lib.ggml_backend_vk_host_buffer_type.argtypes = [] lib.ggml_backend_vk_host_buffer_type.restype = ggml_backend_buffer_type_t - # TODO: Add ggml-sycl.h -# TODO: Add ggml-kompute.h \ No newline at end of file +# TODO: Add ggml-kompute.h diff --git a/ggml/utils.py b/ggml/utils.py index edd713d..307c23c 100644 --- a/ggml/utils.py +++ b/ggml/utils.py @@ -2,9 +2,8 @@ """ import enum import ctypes -import contextlib -from typing import Any, List, Optional, Sequence, Tuple +from typing import Any, Optional, Sequence, Tuple from ggml import ggml @@ -12,7 +11,7 @@ import numpy.typing as npt -class GGML_TYPE(enum.Enum): +class GGML_TYPE(enum.IntEnum): F32 = ggml.GGML_TYPE_F32 F16 = ggml.GGML_TYPE_F16 Q4_0 = ggml.GGML_TYPE_Q4_0 @@ -53,13 +52,16 @@ def to_numpy( ctypes_type = ctypes.c_uint16 else: ctypes_type = np.ctypeslib.as_ctypes_type(GGML_TYPE_TO_NUMPY_DTYPE[ggml_type]) - - array = ctypes.cast(ggml.ggml_get_data(tensor), ctypes.POINTER(ctypes_type)) + + data = ggml.ggml_get_data(tensor) + if data is None: + raise ValueError("tensor data is None") + array = ctypes.cast(data, ctypes.POINTER(ctypes_type)) n_dims = ggml.ggml_n_dims(tensor) shape = tuple(reversed(tensor.contents.ne[:n_dims])) output = np.ctypeslib.as_array(array, shape=shape) if ggml_type == GGML_TYPE.F16: - output.dtype = np.float16 + output.dtype = np.float16 # type: ignore return np.lib.stride_tricks.as_strided( output, strides=tuple(reversed(tensor.contents.nb[:n_dims])) ) @@ -91,33 +93,6 @@ def from_numpy(x: npt.NDArray[Any], ctx: ggml.ggml_context_p) -> ggml.ggml_tenso return tensor -@contextlib.contextmanager -def ggml_context_manager(params: ggml.ggml_init_params): - """Creates a context manager for a new ggml context that free's it after use. - - Example: - ```python - import ggml - from ggml.utils import ggml_context_manager - - params = ggml.ggml_init_params(mem_size=16 * 1024 * 1024) - with ggml_context_manager(params) as ctx: - # do stuff with ctx - ``` - - Parameters: - params: context parameters - - Returns: - (contextlib.AbstractContextManager): ggml_context_p context manager - """ - ctx = ggml.ggml_init(params) - try: - yield ctx - finally: - ggml.ggml_free(ctx) - - def copy_to_cpu( ctx: ggml.ggml_context_p, tensor: ggml.ggml_tensor_p ) -> ggml.ggml_tensor_p: @@ -337,36 +312,3 @@ def slice_tensor( f"ggml tensors with {ndims} dimensions are not supported" ) - -def alloc_graph_measure( - graph: ggml.ggml_cgraph, - alignment: int, - alloc_tensors: Optional[List[ggml.ggml_tensor_p]] = None, -) -> int: - """Returns the number of bytes required by a ggml_allocr allocator to allocate the tensors in the graph. - - NOTE: This implementation saves a copy of the current data pointers of all graph nodes and leafs and restores them - after measuring the allocation size so that the graph can be re-used. 
- - Parameters: - graph: ggml graph - alignment: alignment of the allocation - alloc_tensors: list of tensors to allocate individually using ggml_allocr_alloc - - Returns: - Size of the required allocation buffer in bytes""" - alloc_tensors = alloc_tensors or [] - leaf_data = [ggml.ggml_get_data(graph.leafs[i]) for i in range(graph.n_leafs)] - node_data = [ggml.ggml_get_data(graph.nodes[i]) for i in range(graph.n_nodes)] - alloc = ggml.ggml_allocr_new_measure(alignment) - for tensor in alloc_tensors: - ggml.ggml_allocr_alloc(alloc, tensor) - alloc_size = ( - ggml.ggml_allocr_alloc_graph(alloc, ctypes.byref(graph)) + alignment # type: ignore - ) - ggml.ggml_allocr_free(alloc) - for i in range(graph.n_leafs): - graph.leafs[i].contents.data = leaf_data[i] - for i in range(graph.n_nodes): - graph.nodes[i].contents.data = node_data[i] - return alloc_size diff --git a/tests/test_experimental_api.py b/tests/test_experimental_api.py index ec5473e..4e535e6 100644 --- a/tests/test_experimental_api.py +++ b/tests/test_experimental_api.py @@ -71,16 +71,10 @@ def test_experimental_api(): assert f.shape == (1,) - measure_allocr = backend.new_measure() - graph = ggml_cgraph(f) - mem_size = measure_allocr.alloc_graph(graph) - - buffer = backend.alloc_buffer(mem_size) - - allocr = buffer.new_allocr() - allocr.alloc_graph(graph) + gallocr = backend.new_graph_allocator() + gallocr.allocate_graph(graph) x[0] = 2.0 diff --git a/tests/test_ggml.py b/tests/test_ggml.py index 2405efd..0ae5a69 100644 --- a/tests/test_ggml.py +++ b/tests/test_ggml.py @@ -1,7 +1,11 @@ import ctypes + from typing import Optional + import ggml +import numpy as np + def test_ggml(): assert ggml.GGML_FILE_VERSION == 1 @@ -58,7 +62,10 @@ def double( def test_ggml_min_alloc(): - max_overhead = ggml.ggml_tensor_overhead() * ggml.GGML_DEFAULT_GRAPH_SIZE + ggml.ggml_graph_overhead() + max_overhead = ( + ggml.ggml_tensor_overhead() * ggml.GGML_DEFAULT_GRAPH_SIZE + + ggml.ggml_graph_overhead() + ) assert max_overhead < 16 * 1024 * 1024 # 16MB params = ggml.ggml_init_params( mem_size=max_overhead, mem_buffer=None, no_alloc=True @@ -89,7 +96,12 @@ def build_graph(ctx: ggml.ggml_context_p): n_leafs = gf.contents.n_leafs leafs_size = sum(ggml.ggml_nbytes_pad(gf.contents.leafs[i]) for i in range(n_leafs)) - mem_size = nodes_size + leafs_size + ggml.ggml_tensor_overhead() * (n_nodes + n_leafs) + ggml.ggml_graph_overhead() + mem_size = ( + nodes_size + + leafs_size + + ggml.ggml_tensor_overhead() * (n_nodes + n_leafs) + + ggml.ggml_graph_overhead() + ) ggml.ggml_free(ctx) @@ -119,197 +131,136 @@ def build_graph(ctx: ggml.ggml_context_p): ggml.ggml_free(ctx) -def test_ggml_alloc(): - def build_graph(ctx: ggml.ggml_context_p, alloc: ggml.ggml_allocr_t): - # inputs - x = ggml.ggml_new_tensor_1d(ctx, ggml.GGML_TYPE_F32, 1) - ggml.ggml_set_name(x, b"x") - ggml.ggml_allocr_alloc(alloc, x) - a = ggml.ggml_new_tensor_1d(ctx, ggml.GGML_TYPE_F32, 1) - ggml.ggml_set_name(a, b"a") - ggml.ggml_allocr_alloc(alloc, a) - b = ggml.ggml_new_tensor_1d(ctx, ggml.GGML_TYPE_F32, 1) - ggml.ggml_set_name(b, b"b") - ggml.ggml_allocr_alloc(alloc, b) - - x2 = ggml.ggml_mul(ctx, x, x) - tmp = ggml.ggml_mul(ctx, a, x2) +def test_quantize(): + ne0 = 32 + ne1 = 1 + nelements = ne0 * ne1 + data = [float(i) for i in range(nelements)] + data_f32 = (ctypes.c_float * len(data))(*data) + work = (ctypes.c_float * nelements)(0) + hist = (ctypes.c_int64 * (1 << 4))(0) + cur_size = ggml.ggml_quantize_q8_0( + data_f32, + ctypes.cast(work, ctypes.c_void_p), + nelements, + ne0, + hist, + ) + 
assert cur_size == 34 - # outputs - f = ggml.ggml_add(ctx, tmp, b) - ggml.ggml_set_name(f, b"f") + type_traits = ggml.ggml_internal_get_type_traits(ggml.GGML_TYPE_Q8_0) + work2 = (ctypes.c_float * nelements)(0) + type_traits.to_float( + ctypes.cast(work, ctypes.c_void_p), + ctypes.cast(work2, ctypes.POINTER(ctypes.c_float)), + nelements, + ) - # build graph - gf = ggml.ggml_new_graph(ctx) - ggml.ggml_build_forward_expand(gf, f) + eps = 0.5 + for i in range(nelements): + assert abs(work2[i] - data[i]) < eps - return gf - max_overhead = ggml.ggml_tensor_overhead() * ggml.GGML_DEFAULT_GRAPH_SIZE + ggml.ggml_graph_overhead() - assert max_overhead < 16 * 1024 * 1024 # 16MB +def test_ggml_cpu_backend(): + n_tensors = 1 + 2 # input (x) and weights (a, b) params = ggml.ggml_init_params( - mem_size=max_overhead, mem_buffer=None, no_alloc=True + mem_size=ggml.ggml_tensor_overhead() * n_tensors, mem_buffer=None, no_alloc=True ) ctx = ggml.ggml_init(params=params) assert ctx is not None - tensor_alignment = 32 - alloc = ggml.ggml_allocr_new_measure(tensor_alignment) - assert alloc is not None - assert ggml.ggml_allocr_is_measure(alloc) + backend = ggml.ggml_backend_cpu_init() - gf = build_graph(ctx, alloc) - gp = ggml.ggml_graph_plan(gf, 1) - assert gp.work_size == 0 + assert backend is not None - alloc_size = ( - ggml.ggml_allocr_alloc_graph(alloc, gf) + tensor_alignment - ) + # create the tensors for input and weights + x = ggml.ggml_new_tensor_1d(ctx, ggml.GGML_TYPE_F32, 1) - ggml.ggml_free(ctx) - ggml.ggml_allocr_free(alloc) + a = ggml.ggml_new_tensor_1d(ctx, ggml.GGML_TYPE_F32, 1) + b = ggml.ggml_new_tensor_1d(ctx, ggml.GGML_TYPE_F32, 1) - params = ggml.ggml_init_params( - mem_size=max_overhead, mem_buffer=None, no_alloc=True + # allocate the tensors in the backend + buffer = ggml.ggml_backend_alloc_ctx_tensors(ctx, backend) + + # set the values of the weights + ggml.ggml_backend_tensor_set( + a, + ctypes.cast(np.array([3.0], dtype=np.single).ctypes.data, ctypes.c_void_p), + 0, + ggml.ggml_nbytes(a), ) - ctx = ggml.ggml_init(params=params) - assert ctx is not None - buffer = (ctypes.c_uint8 * alloc_size)() - alloc = ggml.ggml_allocr_new( - ctypes.cast(buffer, ctypes.c_void_p), alloc_size, tensor_alignment + ggml.ggml_backend_tensor_set( + b, + ctypes.cast(np.array([4.0], dtype=np.single).ctypes.data, ctypes.c_void_p), + 0, + ggml.ggml_nbytes(a), ) - gf = build_graph(ctx, alloc) - ggml.ggml_allocr_alloc_graph(alloc, gf) - - a = ggml.ggml_get_tensor(ctx, b"a") - b = ggml.ggml_get_tensor(ctx, b"b") - x = ggml.ggml_get_tensor(ctx, b"x") - f = ggml.ggml_get_tensor(ctx, b"f") - assert a is not None and b is not None and x is not None and f is not None + max_nodes = 4096 - ggml.ggml_set_f32(x, 2.0) - ggml.ggml_set_f32(a, 3.0) - ggml.ggml_set_f32(b, 4.0) + buf_size = ( + ggml.ggml_tensor_overhead() * max_nodes + + ggml.ggml_graph_overhead_custom(max_nodes, False) + ) + buf = (ctypes.c_uint8 * buf_size)() - gp = ggml.ggml_graph_plan(gf, 1) - ggml.ggml_graph_compute(gf, ctypes.pointer(gp)) - output = ggml.ggml_get_f32_1d(f, 0) - assert output == 16.0 - ggml.ggml_free(ctx) - ggml.ggml_allocr_free(alloc) + def build_graph( + x: ggml.ggml_tensor_p, a: ggml.ggml_tensor_p, b: ggml.ggml_tensor_p + ): + params = ggml.ggml_init_params( + mem_size=buf_size, + mem_buffer=ctypes.cast(buf, ctypes.c_void_p), + no_alloc=True, + ) + ctx0 = ggml.ggml_init(params=params) -def test_ggml_alloc_one_pass(): - max_overhead = ggml.ggml_tensor_overhead() * ggml.GGML_DEFAULT_GRAPH_SIZE + ggml.ggml_graph_overhead() - assert 
max_overhead < 16 * 1024 * 1024 # 16MB - params = ggml.ggml_init_params( - mem_size=max_overhead, mem_buffer=None, no_alloc=True - ) - ctx = ggml.ggml_init(params=params) - assert ctx is not None + assert ctx0 is not None - # define the graph - x = ggml.ggml_new_tensor_1d(ctx, ggml.GGML_TYPE_F32, 1) - ggml.ggml_set_name(x, b"x") - a = ggml.ggml_new_tensor_1d(ctx, ggml.GGML_TYPE_F32, 1) - ggml.ggml_set_name(a, b"a") - b = ggml.ggml_new_tensor_1d(ctx, ggml.GGML_TYPE_F32, 1) - ggml.ggml_set_name(b, b"b") + gf = ggml.ggml_new_graph_custom(ctx0, max_nodes, False) - x2 = ggml.ggml_mul(ctx, x, x) - tmp = ggml.ggml_mul(ctx, a, x2) + x2 = ggml.ggml_mul(ctx0, x, x) + ax2 = ggml.ggml_mul(ctx0, a, x2) + f = ggml.ggml_add(ctx0, ax2, b) - # outputs - f = ggml.ggml_add(ctx, tmp, b) - ggml.ggml_set_name(f, b"f") + ggml.ggml_set_name(x2, b"x2") + ggml.ggml_set_name(ax2, b"ax2") + ggml.ggml_set_name(f, b"f") - # build graph + ggml.ggml_build_forward_expand(gf, f) - gf = ggml.ggml_new_graph(ctx) - ggml.ggml_build_forward_expand(gf, f) + ggml.ggml_free(ctx0) - # save old data pointers - leaf_data = [ggml.ggml_get_data(gf.contents.leafs[i]) for i in range(gf.contents.n_leafs)] - node_data = [ggml.ggml_get_data(gf.contents.nodes[i]) for i in range(gf.contents.n_nodes)] - - # create measure allocator - tensor_alignment = 32 - alloc = ggml.ggml_allocr_new_measure(tensor_alignment) - assert alloc is not None - assert ggml.ggml_allocr_is_measure(alloc) - - # allocate input tensors - ggml.ggml_allocr_alloc(alloc, x) - ggml.ggml_allocr_alloc(alloc, a) - ggml.ggml_allocr_alloc(alloc, b) - # allocate graph - alloc_size = ( - ggml.ggml_allocr_alloc_graph(alloc, gf) + tensor_alignment - ) - assert alloc_size > 0 - - # restore old data pointers - for i in range(gf.contents.n_leafs): - gf.contents.leafs[i].contents.data = leaf_data[i] - - for i in range(gf.contents.n_nodes): - gf.contents.nodes[i].contents.data = node_data[i] - - # free measure allocator - ggml.ggml_allocr_free(alloc) - - # allocate tensor memory - buffer = (ctypes.c_uint8 * alloc_size)() - alloc = ggml.ggml_allocr_new( - ctypes.cast(buffer, ctypes.c_void_p), alloc_size, tensor_alignment - ) - ggml.ggml_allocr_alloc(alloc, x) - ggml.ggml_allocr_alloc(alloc, a) - ggml.ggml_allocr_alloc(alloc, b) - ggml.ggml_allocr_alloc_graph(alloc, gf) + return gf - # set input values - ggml.ggml_set_f32(x, 2.0) - ggml.ggml_set_f32(a, 3.0) - ggml.ggml_set_f32(b, 4.0) + allocr = ggml.ggml_gallocr_new(ggml.ggml_backend_get_default_buffer_type(backend)) - gp = ggml.ggml_graph_plan(gf, 1) - assert gp.work_size == 0 + gf = build_graph(x, a, b) - # compute - ggml.ggml_graph_compute(gf, ctypes.pointer(gp)) + ggml.ggml_gallocr_reserve(allocr, gf) - output = ggml.ggml_get_f32_1d(f, 0) - assert output == 16.0 + gf = build_graph(x, a, b) - ggml.ggml_free(ctx) - ggml.ggml_allocr_free(alloc) + ggml.ggml_gallocr_alloc_graph(allocr, gf) -def test_quantize(): - ne0 = 32 - ne1 = 1 - nelements = ne0 * ne1 - data = [float(i) for i in range(nelements)] - data_f32 = (ctypes.c_float * len(data))(*data) - work = (ctypes.c_float * nelements)(0) - hist = (ctypes.c_int64 * (1 << 4))(0) - cur_size = ggml.ggml_quantize_q8_0( - data_f32, - ctypes.cast(work, ctypes.c_void_p), - nelements, - ne0, - hist, + ggml.ggml_backend_tensor_set( + x, + ctypes.cast(np.array([2.0], dtype=np.single).ctypes.data, ctypes.c_void_p), + 0, + ggml.ggml_nbytes(x), ) - assert cur_size == 34 - type_traits = ggml.ggml_internal_get_type_traits(ggml.GGML_TYPE_Q8_0) - work2 = (ctypes.c_float * nelements)(0) - 
type_traits.to_float( - ctypes.cast(work, ctypes.c_void_p), - ctypes.cast(work2, ctypes.POINTER(ctypes.c_float)), - nelements, + ggml.ggml_backend_graph_compute(backend, gf) + + f = ggml.ggml_graph_get_tensor(gf, b"f") + + output = np.zeros(1, dtype=np.single) + ggml.ggml_backend_tensor_get( + f, ctypes.cast(output.ctypes.data, ctypes.c_void_p), 0, ggml.ggml_nbytes(x) ) - eps = 0.5 - for i in range(nelements): - assert abs(work2[i] - data[i]) < eps + assert output[0] == 16.0 + + ggml.ggml_gallocr_free(allocr) + ggml.ggml_backend_buffer_free(buffer) + ggml.ggml_backend_free(backend) + ggml.ggml_free(ctx) diff --git a/tests/test_ggml_backends.py b/tests/test_ggml_backends.py deleted file mode 100644 index 828ae60..0000000 --- a/tests/test_ggml_backends.py +++ /dev/null @@ -1,108 +0,0 @@ -"""Simple example of graph offloading to a non-cpu backend.""" - -import ggml -import ctypes - -import numpy as np - -def test_ggml_cpu_backend(): - n_tensors = 1 + 2 # input (x) and weights (a, b) - params = ggml.ggml_init_params( - mem_size=ggml.ggml_tensor_overhead() * n_tensors, mem_buffer=None, no_alloc=True - ) - ctx = ggml.ggml_init(params=params) - assert ctx is not None - - backend = ggml.ggml_backend_cpu_init() - - assert backend is not None - - # create the tensors for input and weights - x = ggml.ggml_new_tensor_1d(ctx, ggml.GGML_TYPE_F32, 1) - - a = ggml.ggml_new_tensor_1d(ctx, ggml.GGML_TYPE_F32, 1) - b = ggml.ggml_new_tensor_1d(ctx, ggml.GGML_TYPE_F32, 1) - - # allocate the tensors in the backend - buffer = ggml.ggml_backend_alloc_ctx_tensors(ctx, backend) - - # set the values of the weights - ggml.ggml_backend_tensor_set( - a, - ctypes.cast(np.array([3.0], dtype=np.single).ctypes.data, ctypes.c_void_p), - 0, - ggml.ggml_nbytes(a), - ) - ggml.ggml_backend_tensor_set( - b, - ctypes.cast(np.array([4.0], dtype=np.single).ctypes.data, ctypes.c_void_p), - 0, - ggml.ggml_nbytes(a), - ) - - max_nodes = 4096 - - buf_size = ggml.ggml_tensor_overhead() * max_nodes + ggml.ggml_graph_overhead_custom(max_nodes, False) - buf = (ctypes.c_uint8 * buf_size)() - - def build_graph(x: ggml.ggml_tensor_p, a: ggml.ggml_tensor_p, b: ggml.ggml_tensor_p): - params = ggml.ggml_init_params( - mem_size=buf_size, mem_buffer=ctypes.cast(buf, ctypes.c_void_p), no_alloc=True - ) - ctx0 = ggml.ggml_init(params=params) - - assert ctx0 is not None - - gf = ggml.ggml_new_graph_custom(ctx0, max_nodes, False) - - x2 = ggml.ggml_mul(ctx0, x, x) - ax2 = ggml.ggml_mul(ctx0, a, x2) - f = ggml.ggml_add(ctx0, ax2, b) - - ggml.ggml_set_name(x2, b"x2") - ggml.ggml_set_name(ax2, b"ax2") - ggml.ggml_set_name(f, b"f") - - ggml.ggml_build_forward_expand(gf, f) - - ggml.ggml_free(ctx0) - - return gf - - allocr = ggml.ggml_allocr_new_measure_from_backend(backend) - - gf = build_graph(x, a, b) - - mem_size = ggml.ggml_allocr_alloc_graph(allocr, gf) - - ggml.ggml_allocr_free(allocr) - - buf_compute = ggml.ggml_backend_alloc_buffer(backend, mem_size) - allocr = ggml.ggml_allocr_new_from_buffer(buf_compute) - - ggml.ggml_allocr_reset(allocr) - - gf = build_graph(x, a, b) - - ggml.ggml_allocr_alloc_graph(allocr, gf) - - ggml.ggml_backend_tensor_set( - x, - ctypes.cast(np.array([2.0], dtype=np.single).ctypes.data, ctypes.c_void_p), - 0, - ggml.ggml_nbytes(x), - ) - - ggml.ggml_backend_graph_compute(backend, gf) - - f = ggml.ggml_graph_get_tensor(gf, b"f") - - output = np.zeros(1, dtype=np.single) - ggml.ggml_backend_tensor_get(f, ctypes.cast(output.ctypes.data, ctypes.c_void_p), 0, ggml.ggml_nbytes(x)) - - assert output[0] == 16.0 - - 
ggml.ggml_backend_buffer_free(buffer) - ggml.ggml_backend_buffer_free(buf_compute) - ggml.ggml_backend_free(backend) - ggml.ggml_free(ctx) diff --git a/tests/test_ggml_metal.py b/tests/test_ggml_metal.py index 009388e..4fd410b 100644 --- a/tests/test_ggml_metal.py +++ b/tests/test_ggml_metal.py @@ -11,7 +11,6 @@ reason="METAL not available", ) - @run_if_ggml_metal_available def test_metal(): n_tensors = 1 + 2 # input (x) and weights (a, b) @@ -50,12 +49,19 @@ def test_metal(): max_nodes = 4096 - buf_size = ggml.ggml_tensor_overhead() * max_nodes + ggml.ggml_graph_overhead_custom(max_nodes, False) + buf_size = ( + ggml.ggml_tensor_overhead() * max_nodes + + ggml.ggml_graph_overhead_custom(max_nodes, False) + ) buf = (ctypes.c_uint8 * buf_size)() - def build_graph(x: ggml.ggml_tensor_p, a: ggml.ggml_tensor_p, b: ggml.ggml_tensor_p): + def build_graph( + x: ggml.ggml_tensor_p, a: ggml.ggml_tensor_p, b: ggml.ggml_tensor_p + ): params = ggml.ggml_init_params( - mem_size=buf_size, mem_buffer=ctypes.cast(buf, ctypes.c_void_p), no_alloc=True + mem_size=buf_size, + mem_buffer=ctypes.cast(buf, ctypes.c_void_p), + no_alloc=True, ) ctx0 = ggml.ggml_init(params=params) @@ -77,22 +83,15 @@ def build_graph(x: ggml.ggml_tensor_p, a: ggml.ggml_tensor_p, b: ggml.ggml_tenso return gf - allocr = ggml.ggml_allocr_new_measure_from_backend(backend) + allocr = ggml.ggml_gallocr_new(ggml.ggml_backend_get_default_buffer_type(backend)) gf = build_graph(x, a, b) - mem_size = ggml.ggml_allocr_alloc_graph(allocr, gf) - - ggml.ggml_allocr_free(allocr) - - buf_compute = ggml.ggml_backend_alloc_buffer(backend, mem_size) - allocr = ggml.ggml_allocr_new_from_buffer(buf_compute) - - ggml.ggml_allocr_reset(allocr) + ggml.ggml_gallocr_reserve(allocr, gf) gf = build_graph(x, a, b) - ggml.ggml_allocr_alloc_graph(allocr, gf) + ggml.ggml_gallocr_alloc_graph(allocr, gf) ggml.ggml_backend_tensor_set( x, @@ -106,11 +105,13 @@ def build_graph(x: ggml.ggml_tensor_p, a: ggml.ggml_tensor_p, b: ggml.ggml_tenso f = ggml.ggml_graph_get_tensor(gf, b"f") output = np.zeros(1, dtype=np.single) - ggml.ggml_backend_tensor_get(f, ctypes.cast(output.ctypes.data, ctypes.c_void_p), 0, ggml.ggml_nbytes(x)) + ggml.ggml_backend_tensor_get( + f, ctypes.cast(output.ctypes.data, ctypes.c_void_p), 0, ggml.ggml_nbytes(x) + ) assert output[0] == 16.0 + ggml.ggml_gallocr_free(allocr) ggml.ggml_backend_buffer_free(buffer) - ggml.ggml_backend_buffer_free(buf_compute) ggml.ggml_backend_free(backend) ggml.ggml_free(ctx) diff --git a/tests/test_utils.py b/tests/test_utils.py index e3b348f..7da39d0 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -1,127 +1,75 @@ -import ctypes - import ggml import ggml.utils -import pytest - import numpy as np def test_utils(): params = ggml.ggml_init_params(mem_size=16 * 1024 * 1024) - with ggml.utils.ggml_context_manager(params) as ctx: - x = np.ones((3,), dtype=np.float32) - assert x.shape == (3,) - t = ggml.utils.from_numpy(x, ctx) - assert t.contents.ne[:1] == [3] - assert t.contents.type == ggml.GGML_TYPE_F32 - assert np.allclose(ggml.utils.to_numpy(t), x) + ctx = ggml.ggml_init(params=params) + assert ctx is not None + x = np.ones((3,), dtype=np.float32) + assert x.shape == (3,) + t = ggml.utils.from_numpy(x, ctx) + assert t.contents.ne[:1] == [3] + assert t.contents.type == ggml.GGML_TYPE_F32 + assert np.allclose(ggml.utils.to_numpy(t), x) + ggml.ggml_free(ctx) def test_numpy_arrays(): params = ggml.ggml_init_params(mem_size=16 * 1024 * 1024) - with ggml.utils.ggml_context_manager(params) as ctx: - x = 
np.array([[1, 2, 3], [4, 5, 6]], dtype=np.float32, order="F") - assert x.shape == (2, 3) - t = ggml.utils.from_numpy(x, ctx) - assert t.contents.ne[:2] == [3, 2] - y = ggml.utils.to_numpy(t) - assert y.shape == (2, 3) + ctx = ggml.ggml_init(params=params) + assert ctx is not None + x = np.array([[1, 2, 3], [4, 5, 6]], dtype=np.float32, order="F") + assert x.shape == (2, 3) + t = ggml.utils.from_numpy(x, ctx) + assert t.contents.ne[:2] == [3, 2] + y = ggml.utils.to_numpy(t) + assert y.shape == (2, 3) + ggml.ggml_free(ctx) def test_numpy_arrays_transposed(): params = ggml.ggml_init_params(mem_size=16 * 1024 * 1024) - with ggml.utils.ggml_context_manager(params) as ctx: - # 2D - x = np.array([[1, 2, 3], [4, 5, 6]], dtype=np.int32) - t = ggml.utils.from_numpy(x, ctx) - t_t = ggml.ggml_transpose(ctx, t) - x_t = ggml.utils.to_numpy(t_t) - assert np.array_equal(x_t, x.T) - - t = ggml.utils.from_numpy(x.T, ctx) - x_t = ggml.utils.to_numpy(t) - assert np.array_equal(x.T, x_t) - - # 3D - x = np.array( - [[[1, 2], [3, 4], [5, 6]], [[7, 8], [9, 10], [11, 12]]], dtype=np.int32 - ) - t = ggml.utils.from_numpy(x, ctx) - t_t = ggml.ggml_permute(ctx, t, 2, 1, 0, 3) - x_t = ggml.utils.to_numpy(t_t) - assert np.array_equal(x_t, x.T) - - t = ggml.utils.from_numpy(x.T, ctx) - x_t = ggml.utils.to_numpy(t) - assert np.array_equal(x.T, x_t) + ctx = ggml.ggml_init(params=params) + assert ctx is not None + # 2D + x = np.array([[1, 2, 3], [4, 5, 6]], dtype=np.int32) + t = ggml.utils.from_numpy(x, ctx) + t_t = ggml.ggml_transpose(ctx, t) + x_t = ggml.utils.to_numpy(t_t) + assert np.array_equal(x_t, x.T) + + t = ggml.utils.from_numpy(x.T, ctx) + x_t = ggml.utils.to_numpy(t) + assert np.array_equal(x.T, x_t) + + # 3D + x = np.array( + [[[1, 2], [3, 4], [5, 6]], [[7, 8], [9, 10], [11, 12]]], dtype=np.int32 + ) + t = ggml.utils.from_numpy(x, ctx) + t_t = ggml.ggml_permute(ctx, t, 2, 1, 0, 3) + x_t = ggml.utils.to_numpy(t_t) + assert np.array_equal(x_t, x.T) + + t = ggml.utils.from_numpy(x.T, ctx) + x_t = ggml.utils.to_numpy(t) + assert np.array_equal(x.T, x_t) + ggml.ggml_free(ctx) def test_slice_tensor(): params = ggml.ggml_init_params(mem_size=16 * 1024 * 1024) - with ggml.utils.ggml_context_manager(params) as ctx: - x = np.array([[1, 2, 3], [4, 5, 6]], dtype=np.int32) - t = ggml.utils.from_numpy(x, ctx) - t_slice = ggml.utils.slice_tensor(ctx, t, [ - slice(0, 2), - slice(0, 1) - ]) - x_slice = ggml.utils.to_numpy(t_slice) - assert np.array_equal(x_slice, x[:1, :2].squeeze()) - - -def test_alloc_graph_measure(): - max_overhead = ggml.ggml_tensor_overhead() * ggml.GGML_DEFAULT_GRAPH_SIZE + ggml.ggml_graph_overhead() - assert max_overhead < 16 * 1024 * 1024 # 16MB - params = ggml.ggml_init_params( - mem_size=max_overhead, mem_buffer=None, no_alloc=True - ) ctx = ggml.ggml_init(params=params) - - # define the graph - x = ggml.ggml_new_tensor_1d(ctx, ggml.GGML_TYPE_F32, 1) - a = ggml.ggml_new_tensor_1d(ctx, ggml.GGML_TYPE_F32, 1) - b = ggml.ggml_new_tensor_1d(ctx, ggml.GGML_TYPE_F32, 1) - - x2 = ggml.ggml_mul(ctx, x, x) - tmp = ggml.ggml_mul(ctx, a, x2) - - # outputs - f = ggml.ggml_add(ctx, tmp, b) - - # build graph - gf = ggml.ggml_new_graph(ctx) - ggml.ggml_build_forward_expand(gf, f) - - # create measure allocator - tensor_alignment = 32 - input_tensors = [x, a, b] - alloc_size = ggml.utils.alloc_graph_measure(gf.contents, tensor_alignment, input_tensors) - - # allocate tensor memory - buffer = (ctypes.c_uint8 * alloc_size)() - alloc = ggml.ggml_allocr_new( - ctypes.cast(buffer, ctypes.c_void_p), alloc_size, 
tensor_alignment - ) - ggml.ggml_allocr_alloc(alloc, x) - ggml.ggml_allocr_alloc(alloc, a) - ggml.ggml_allocr_alloc(alloc, b) - ggml.ggml_allocr_alloc_graph(alloc, gf) - - # set input values - ggml.ggml_set_f32(x, 2.0) - ggml.ggml_set_f32(a, 3.0) - ggml.ggml_set_f32(b, 4.0) - - gp = ggml.ggml_graph_plan(gf, 1) - assert gp.work_size == 0 - - # compute - ggml.ggml_graph_compute(gf, ctypes.pointer(gp)) - - output = ggml.ggml_get_f32_1d(f, 0) - assert output == 16.0 - + assert ctx is not None + x = np.array([[1, 2, 3], [4, 5, 6]], dtype=np.int32) + t = ggml.utils.from_numpy(x, ctx) + t_slice = ggml.utils.slice_tensor(ctx, t, [ + slice(0, 2), + slice(0, 1) + ]) + x_slice = ggml.utils.to_numpy(t_slice) + assert np.array_equal(x_slice, x[:1, :2].squeeze()) ggml.ggml_free(ctx) - ggml.ggml_allocr_free(alloc) \ No newline at end of file diff --git a/vendor/ggml b/vendor/ggml index 2c7cf49..5070f07 160000 --- a/vendor/ggml +++ b/vendor/ggml @@ -1 +1 @@ -Subproject commit 2c7cf49810d523b9632da393a9e8270b60bf3b24 +Subproject commit 5070f078a67c18c11736e78316ab715ca9afde16
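
For reference, below is a minimal sketch (not part of the patch) of the ggml_gallocr-based allocation flow that the updated tests above migrate to, replacing the old ggml_allocr measure/from_buffer pair. It only uses binding calls that appear in this patch (CPU backend, ggml_gallocr_new / reserve / alloc_graph); the f = a*x^2 + b graph and the 4096-node budget are copied from the tests, not required by the API.

# sketch: graph allocation with ggml_gallocr, assuming the updated ggml-python bindings
import ctypes
import numpy as np
import ggml

# metadata-only context; tensor data lives in backend buffers (no_alloc=True)
n_tensors = 3  # x, a, b
params = ggml.ggml_init_params(
    mem_size=ggml.ggml_tensor_overhead() * n_tensors, mem_buffer=None, no_alloc=True
)
ctx = ggml.ggml_init(params=params)
backend = ggml.ggml_backend_cpu_init()

x = ggml.ggml_new_tensor_1d(ctx, ggml.GGML_TYPE_F32, 1)
a = ggml.ggml_new_tensor_1d(ctx, ggml.GGML_TYPE_F32, 1)
b = ggml.ggml_new_tensor_1d(ctx, ggml.GGML_TYPE_F32, 1)

# place the input/weight tensors in a backend buffer, then set the weights
buffer = ggml.ggml_backend_alloc_ctx_tensors(ctx, backend)
for tensor, value in ((a, 3.0), (b, 4.0)):
    data = np.array([value], dtype=np.single)
    ggml.ggml_backend_tensor_set(
        tensor, ctypes.cast(data.ctypes.data, ctypes.c_void_p), 0, ggml.ggml_nbytes(tensor)
    )

# scratch buffer for graph/node metadata built on every graph construction
max_nodes = 4096
buf_size = ggml.ggml_tensor_overhead() * max_nodes + ggml.ggml_graph_overhead_custom(
    max_nodes, False
)
buf = (ctypes.c_uint8 * buf_size)()

def build_graph(x, a, b):
    gparams = ggml.ggml_init_params(
        mem_size=buf_size, mem_buffer=ctypes.cast(buf, ctypes.c_void_p), no_alloc=True
    )
    ctx0 = ggml.ggml_init(params=gparams)
    gf = ggml.ggml_new_graph_custom(ctx0, max_nodes, False)
    f = ggml.ggml_add(ctx0, ggml.ggml_mul(ctx0, a, ggml.ggml_mul(ctx0, x, x)), b)
    ggml.ggml_set_name(f, b"f")
    ggml.ggml_build_forward_expand(gf, f)
    ggml.ggml_free(ctx0)  # only metadata is freed; gf lives in buf
    return gf

# one allocator replaces the old measure + from_buffer allocators:
# reserve() sizes the compute buffers, alloc_graph() assigns tensor data
allocr = ggml.ggml_gallocr_new(ggml.ggml_backend_get_default_buffer_type(backend))
ggml.ggml_gallocr_reserve(allocr, build_graph(x, a, b))
gf = build_graph(x, a, b)
ggml.ggml_gallocr_alloc_graph(allocr, gf)

# set the input and run the graph on the backend
x_data = np.array([2.0], dtype=np.single)
ggml.ggml_backend_tensor_set(
    x, ctypes.cast(x_data.ctypes.data, ctypes.c_void_p), 0, ggml.ggml_nbytes(x)
)
ggml.ggml_backend_graph_compute(backend, gf)

f = ggml.ggml_graph_get_tensor(gf, b"f")
out = np.zeros(1, dtype=np.single)
ggml.ggml_backend_tensor_get(
    f, ctypes.cast(out.ctypes.data, ctypes.c_void_p), 0, ggml.ggml_nbytes(f)
)
print(out[0])  # 16.0 = 3 * 2^2 + 4

ggml.ggml_gallocr_free(allocr)
ggml.ggml_backend_buffer_free(buffer)
ggml.ggml_backend_free(backend)
ggml.ggml_free(ctx)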