diff --git a/binaries/llm-cli/src/cli_args.rs b/binaries/llm-cli/src/cli_args.rs
index 44388dc7..dd107695 100644
--- a/binaries/llm-cli/src/cli_args.rs
+++ b/binaries/llm-cli/src/cli_args.rs
@@ -9,7 +9,8 @@ use clap::{Parser, ValueEnum};
 use color_eyre::eyre::{self, WrapErr};
 use llm::{
     ggml_format, ElementType, InferenceParameters, InferenceSessionConfig, InvalidTokenBias,
-    LoadProgress, Model, ModelKVMemoryType, ModelParameters, TokenBias, TokenizerSource,
+    LoadProgress, Model, ModelKVMemoryType, ModelParameters, RoPEOverrides, TokenBias,
+    TokenizerSource,
 };
 use rand::SeedableRng;
@@ -430,6 +431,29 @@ impl ModelAndTokenizer {
     }
 }
 
+#[derive(Parser, Debug)]
+pub struct RoPEScaling {
+    #[arg(long)]
+    pub rope_freq_base: Option<usize>,
+
+    #[arg(long)]
+    pub rope_freq_scale: Option<f32>,
+}
+
+impl RoPEScaling {
+    pub fn to_rope_arguments(&self) -> Option<RoPEOverrides> {
+        if self.rope_freq_base.is_none() && self.rope_freq_scale.is_none() {
+            return None;
+        }
+
+        let default = RoPEOverrides::default();
+        Some(RoPEOverrides {
+            frequency_base: self.rope_freq_base.unwrap_or(default.frequency_base),
+            frequency_scale: self.rope_freq_scale.unwrap_or(default.frequency_scale),
+        })
+    }
+}
+
 #[derive(Parser, Debug)]
 pub struct ModelLoad {
     #[command(flatten)]
@@ -460,7 +484,11 @@ pub struct ModelLoad {
     /// Number of layers to run on the GPU. If not specified, all layers will be run on the GPU.
     #[arg(long)]
     pub gpu_layers: Option<usize>,
+
+    #[command(flatten)]
+    pub rope_scaling: RoPEScaling,
 }
+
 impl ModelLoad {
     pub fn load(&self, use_gpu: bool) -> eyre::Result<Box<dyn Model>> {
         let params = ModelParameters {
@@ -469,6 +497,7 @@ impl ModelLoad {
             lora_adapters: self.lora_paths.clone(),
             use_gpu,
             gpu_layers: self.gpu_layers,
+            rope_overrides: self.rope_scaling.to_rope_arguments(),
         };
 
         let mut sp = Some(spinoff::Spinner::new(
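The flag-to-override plumbing above is deliberately conservative: when neither `--rope-freq-base` nor `--rope-freq-scale` is given, `to_rope_arguments` returns `None` and models keep their stock RoPE; when only one flag is given, the other half is backfilled from `RoPEOverrides::default()`. A sketch of that contract (illustrative test, not part of the diff):

```rust
#[test]
fn rope_scaling_flags_backfill_defaults() {
    // `--rope-freq-scale 0.5` given, `--rope-freq-base` omitted:
    let scaling = RoPEScaling {
        rope_freq_base: None,
        rope_freq_scale: Some(0.5),
    };
    let overrides = scaling.to_rope_arguments().unwrap();
    assert_eq!(overrides.frequency_base, 10_000); // backfilled from the default
    assert_eq!(overrides.frequency_scale, 0.5);

    // Neither flag given: no override is constructed at all.
    let unset = RoPEScaling {
        rope_freq_base: None,
        rope_freq_scale: None,
    };
    assert!(unset.to_rope_arguments().is_none());
}
```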
diff --git a/crates/ggml/src/context.rs b/crates/ggml/src/context.rs
index 472b58c1..782fb4d6 100644
--- a/crates/ggml/src/context.rs
+++ b/crates/ggml/src/context.rs
@@ -8,7 +8,9 @@ use std::{
 
 use memmap2::Mmap;
 
-use crate::{accelerator::Backend, sys, usize_to_i32, usize_to_i64, Buffer, Tensor, Type};
+use crate::{
+    accelerator::Backend, sys, usize_to_i32, usize_to_i64, Buffer, RoPEOverrides, Tensor, Type,
+};
 
 /// Acts as a RAII-guard over a `sys::ggml_context`, allocating via
 /// `ggml_init` and dropping via `ggml_free`.
@@ -267,7 +269,8 @@ impl Context {
 
     /// Creates a new tensor with the values of `a`, but normalized using RMSNorm.
     pub fn op_rms_norm(&self, a: &Tensor) -> Tensor {
-        let tensor = unsafe { sys::ggml_rms_norm(self.as_ptr(), a.ptr.as_ptr()) };
+        let tensor =
+            unsafe { sys::ggml_rms_norm(self.as_ptr(), a.ptr.as_ptr(), crate::DEFAULT_EPS) };
         self.new_tensor_raw(tensor)
     }
 
@@ -527,16 +530,36 @@ impl Context {
     }
 
     /// In-place; applies ROtary Positional Encoding.
-    pub fn op_rope_inplace(&self, a: &Tensor, npast: usize, ndims: usize, mode: i32) -> Tensor {
+    pub fn op_rope_inplace(
+        &self,
+        a: &Tensor,
+        npast: usize,
+        ndims: usize,
+        mode: i32,
+        overrides: Option<&RoPEOverrides>,
+    ) -> Tensor {
         let tensor = unsafe {
-            sys::ggml_rope_inplace(
-                self.as_ptr(),
-                a.ptr.as_ptr(),
-                usize_to_i32(npast),
-                usize_to_i32(ndims),
-                mode,
-                0,
-            )
+            if let Some(custom_args) = overrides {
+                sys::ggml_rope_custom_inplace(
+                    self.as_ptr(),
+                    a.ptr.as_ptr(),
+                    usize_to_i32(npast),
+                    usize_to_i32(ndims),
+                    mode,
+                    1,
+                    custom_args.frequency_base as f32,
+                    custom_args.frequency_scale,
+                )
+            } else {
+                sys::ggml_rope_inplace(
+                    self.as_ptr(),
+                    a.ptr.as_ptr(),
+                    usize_to_i32(npast),
+                    usize_to_i32(ndims),
+                    mode,
+                    0,
+                )
+            }
         };
         self.new_tensor_raw(tensor)
     }
diff --git a/crates/ggml/src/lib.rs b/crates/ggml/src/lib.rs
index 597447f7..8b3b9a8c 100644
--- a/crates/ggml/src/lib.rs
+++ b/crates/ggml/src/lib.rs
@@ -127,6 +127,29 @@ pub const OBJECT_SIZE: usize = sys::GGML_OBJECT_SIZE;
 /// The maximum length of a `ggml` tensor-name.
 pub const MAX_NAME_LENGTH: usize = sys::GGML_MAX_NAME as usize;
 
+/// Default epsilon to use for RMS computation.
+pub const DEFAULT_EPS: f32 = sys::llama::LLAMA_DEFAULT_RMS_EPS as f32;
+
+/// Value overrides to use for RoPE.
+///
+/// Formula: `theta_i = scale * base^(-2(i-1)/d), for i in [1, 2, ..., d/2]`
+#[derive(Debug, Clone)]
+pub struct RoPEOverrides {
+    /// The frequency scale to use.
+    pub frequency_scale: f32,
+    /// The frequency base value to use.
+    pub frequency_base: usize,
+}
+
+impl Default for RoPEOverrides {
+    fn default() -> Self {
+        Self {
+            frequency_scale: 1.0,
+            frequency_base: 10_000,
+        }
+    }
+}
+
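The two fields only make sense against the formula in the doc comment: RoPE rotates each dimension pair `i` by `p * theta_i` at position `p`, so a smaller `frequency_scale` compresses positions (linear interpolation) and a larger `frequency_base` slows the rotations. As a standalone restatement of that formula (my paraphrase, not code from this diff):

```rust
/// theta_i = scale * base^(-2(i-1)/d) for i in 1..=d/2, where d is the
/// number of rotated dimensions; `base` is `frequency_base` cast to f32.
fn rope_thetas(base: f32, scale: f32, d: usize) -> Vec<f32> {
    (1..=d / 2)
        .map(|i| scale * base.powf(-2.0 * (i as f32 - 1.0) / d as f32))
        .collect()
}
```

With the defaults (`scale = 1.0`, `base = 10_000`) this reproduces standard RoPE, which is why `op_rope_inplace` can route the no-override case through plain `sys::ggml_rope_inplace`.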
 #[derive(Debug, Copy, Clone, PartialEq, Eq, Default)]
 /// The type of a value in `ggml`.
 pub enum Type {
diff --git a/crates/ggml/sys/llama-cpp b/crates/ggml/sys/llama-cpp
index b7647436..1a941869 160000
--- a/crates/ggml/sys/llama-cpp
+++ b/crates/ggml/sys/llama-cpp
@@ -1 +1 @@
-Subproject commit b7647436ccc80970b44a270f70f4f2ea139054d1
+Subproject commit 1a941869cbef8e9cc351a6c6987e4ae3b0f021f7
diff --git a/crates/ggml/sys/src/lib.rs b/crates/ggml/sys/src/lib.rs
index 04a999df..10412720 100644
--- a/crates/ggml/sys/src/lib.rs
+++ b/crates/ggml/sys/src/lib.rs
@@ -23,9 +23,11 @@ pub const GGML_MAX_PARAMS: u32 = 256;
 pub const GGML_MAX_CONTEXTS: u32 = 64;
 pub const GGML_MAX_SRC: u32 = 6;
 pub const GGML_MAX_NAME: u32 = 48;
+pub const GGML_MAX_OP_PARAMS: u32 = 32;
 pub const GGML_DEFAULT_N_THREADS: u32 = 4;
 pub const GGML_EXIT_SUCCESS: u32 = 0;
 pub const GGML_EXIT_ABORTED: u32 = 1;
+pub const GGML_GRAPH_HASHTABLE_SIZE: u32 = 8273;
 pub const QK_K: u32 = 256;
 pub const K_SCALE_SIZE: u32 = 12;
 pub type ggml_fp16_t = u16;
@@ -101,66 +103,73 @@ pub const ggml_op_GGML_OP_MEAN: ggml_op = 13;
 pub const ggml_op_GGML_OP_ARGMAX: ggml_op = 14;
 pub const ggml_op_GGML_OP_REPEAT: ggml_op = 15;
 pub const ggml_op_GGML_OP_REPEAT_BACK: ggml_op = 16;
-pub const ggml_op_GGML_OP_ABS: ggml_op = 17;
-pub const ggml_op_GGML_OP_SGN: ggml_op = 18;
-pub const ggml_op_GGML_OP_NEG: ggml_op = 19;
-pub const ggml_op_GGML_OP_STEP: ggml_op = 20;
-pub const ggml_op_GGML_OP_TANH: ggml_op = 21;
-pub const ggml_op_GGML_OP_ELU: ggml_op = 22;
-pub const ggml_op_GGML_OP_RELU: ggml_op = 23;
-pub const ggml_op_GGML_OP_GELU: ggml_op = 24;
-pub const ggml_op_GGML_OP_GELU_QUICK: ggml_op = 25;
-pub const ggml_op_GGML_OP_SILU: ggml_op = 26;
-pub const ggml_op_GGML_OP_SILU_BACK: ggml_op = 27;
-pub const ggml_op_GGML_OP_NORM: ggml_op = 28;
-pub const ggml_op_GGML_OP_RMS_NORM: ggml_op = 29;
-pub const ggml_op_GGML_OP_RMS_NORM_BACK: ggml_op = 30;
-pub const ggml_op_GGML_OP_MUL_MAT: ggml_op = 31;
-pub const ggml_op_GGML_OP_OUT_PROD: ggml_op = 32;
-pub const ggml_op_GGML_OP_SCALE: ggml_op = 33;
-pub const ggml_op_GGML_OP_SET: ggml_op = 34;
-pub const ggml_op_GGML_OP_CPY: ggml_op = 35;
-pub const ggml_op_GGML_OP_CONT: ggml_op = 36;
-pub const ggml_op_GGML_OP_RESHAPE: ggml_op = 37;
-pub const ggml_op_GGML_OP_VIEW: ggml_op = 38;
-pub const ggml_op_GGML_OP_PERMUTE: ggml_op = 39;
-pub const ggml_op_GGML_OP_TRANSPOSE: ggml_op = 40;
-pub const ggml_op_GGML_OP_GET_ROWS: ggml_op = 41;
-pub const ggml_op_GGML_OP_GET_ROWS_BACK: ggml_op = 42;
-pub const ggml_op_GGML_OP_DIAG: ggml_op = 43;
-pub const ggml_op_GGML_OP_DIAG_MASK_INF: ggml_op = 44;
-pub const ggml_op_GGML_OP_DIAG_MASK_ZERO: ggml_op = 45;
-pub const ggml_op_GGML_OP_SOFT_MAX: ggml_op = 46;
-pub const ggml_op_GGML_OP_SOFT_MAX_BACK: ggml_op = 47;
-pub const ggml_op_GGML_OP_ROPE: ggml_op = 48;
-pub const ggml_op_GGML_OP_ROPE_BACK: ggml_op = 49;
-pub const ggml_op_GGML_OP_ALIBI: ggml_op = 50;
-pub const ggml_op_GGML_OP_CLAMP: ggml_op = 51;
-pub const ggml_op_GGML_OP_CONV_1D: ggml_op = 52;
-pub const ggml_op_GGML_OP_CONV_2D: ggml_op = 53;
-pub const ggml_op_GGML_OP_POOL_1D: ggml_op = 54;
-pub const ggml_op_GGML_OP_POOL_2D: ggml_op = 55;
-pub const ggml_op_GGML_OP_FLASH_ATTN: ggml_op = 56;
-pub const ggml_op_GGML_OP_FLASH_FF: ggml_op = 57;
-pub const ggml_op_GGML_OP_FLASH_ATTN_BACK: ggml_op = 58;
-pub const ggml_op_GGML_OP_WIN_PART: ggml_op = 59;
-pub const ggml_op_GGML_OP_WIN_UNPART: ggml_op = 60;
-pub const ggml_op_GGML_OP_MAP_UNARY: ggml_op = 61;
-pub const ggml_op_GGML_OP_MAP_BINARY: ggml_op = 62;
-pub const ggml_op_GGML_OP_MAP_CUSTOM1: ggml_op = 63;
-pub const ggml_op_GGML_OP_MAP_CUSTOM2: ggml_op = 64;
-pub const ggml_op_GGML_OP_MAP_CUSTOM3: ggml_op = 65;
-pub const ggml_op_GGML_OP_CROSS_ENTROPY_LOSS: ggml_op = 66;
-pub const ggml_op_GGML_OP_CROSS_ENTROPY_LOSS_BACK: ggml_op = 67;
-pub const ggml_op_GGML_OP_COUNT: ggml_op = 68;
+pub const ggml_op_GGML_OP_SILU_BACK: ggml_op = 17;
+pub const ggml_op_GGML_OP_NORM: ggml_op = 18;
+pub const ggml_op_GGML_OP_RMS_NORM: ggml_op = 19;
+pub const ggml_op_GGML_OP_RMS_NORM_BACK: ggml_op = 20;
+pub const ggml_op_GGML_OP_MUL_MAT: ggml_op = 21;
+pub const ggml_op_GGML_OP_OUT_PROD: ggml_op = 22;
+pub const ggml_op_GGML_OP_SCALE: ggml_op = 23;
+pub const ggml_op_GGML_OP_SET: ggml_op = 24;
+pub const ggml_op_GGML_OP_CPY: ggml_op = 25;
+pub const ggml_op_GGML_OP_CONT: ggml_op = 26;
+pub const ggml_op_GGML_OP_RESHAPE: ggml_op = 27;
+pub const ggml_op_GGML_OP_VIEW: ggml_op = 28;
+pub const ggml_op_GGML_OP_PERMUTE: ggml_op = 29;
+pub const ggml_op_GGML_OP_TRANSPOSE: ggml_op = 30;
+pub const ggml_op_GGML_OP_GET_ROWS: ggml_op = 31;
+pub const ggml_op_GGML_OP_GET_ROWS_BACK: ggml_op = 32;
+pub const ggml_op_GGML_OP_DIAG: ggml_op = 33;
+pub const ggml_op_GGML_OP_DIAG_MASK_INF: ggml_op = 34;
+pub const ggml_op_GGML_OP_DIAG_MASK_ZERO: ggml_op = 35;
+pub const ggml_op_GGML_OP_SOFT_MAX: ggml_op = 36;
+pub const ggml_op_GGML_OP_SOFT_MAX_BACK: ggml_op = 37;
+pub const ggml_op_GGML_OP_ROPE: ggml_op = 38;
+pub const ggml_op_GGML_OP_ROPE_BACK: ggml_op = 39;
+pub const ggml_op_GGML_OP_ALIBI: ggml_op = 40;
+pub const ggml_op_GGML_OP_CLAMP: ggml_op = 41;
+pub const ggml_op_GGML_OP_CONV_1D: ggml_op = 42;
+pub const ggml_op_GGML_OP_CONV_2D: ggml_op = 43;
+pub const ggml_op_GGML_OP_POOL_1D: ggml_op = 44;
+pub const ggml_op_GGML_OP_POOL_2D: ggml_op = 45;
+pub const ggml_op_GGML_OP_FLASH_ATTN: ggml_op = 46;
+pub const ggml_op_GGML_OP_FLASH_FF: ggml_op = 47;
+pub const ggml_op_GGML_OP_FLASH_ATTN_BACK: ggml_op = 48;
+pub const ggml_op_GGML_OP_WIN_PART: ggml_op = 49;
+pub const ggml_op_GGML_OP_WIN_UNPART: ggml_op = 50;
+pub const ggml_op_GGML_OP_UNARY: ggml_op = 51;
+pub const ggml_op_GGML_OP_MAP_UNARY: ggml_op = 52;
+pub const ggml_op_GGML_OP_MAP_BINARY: ggml_op = 53;
+pub const ggml_op_GGML_OP_MAP_CUSTOM1: ggml_op = 54;
+pub const ggml_op_GGML_OP_MAP_CUSTOM2: ggml_op = 55;
+pub const ggml_op_GGML_OP_MAP_CUSTOM3: ggml_op = 56;
+pub const ggml_op_GGML_OP_CROSS_ENTROPY_LOSS: ggml_op = 57;
+pub const ggml_op_GGML_OP_CROSS_ENTROPY_LOSS_BACK: ggml_op = 58;
+pub const ggml_op_GGML_OP_COUNT: ggml_op = 59;
 pub type ggml_op = ::std::os::raw::c_uint;
+pub const ggml_unary_op_GGML_UNARY_OP_ABS: ggml_unary_op = 0;
+pub const ggml_unary_op_GGML_UNARY_OP_SGN: ggml_unary_op = 1;
+pub const ggml_unary_op_GGML_UNARY_OP_NEG: ggml_unary_op = 2;
+pub const ggml_unary_op_GGML_UNARY_OP_STEP: ggml_unary_op = 3;
+pub const ggml_unary_op_GGML_UNARY_OP_TANH: ggml_unary_op = 4;
+pub const ggml_unary_op_GGML_UNARY_OP_ELU: ggml_unary_op = 5;
+pub const ggml_unary_op_GGML_UNARY_OP_RELU: ggml_unary_op = 6;
+pub const ggml_unary_op_GGML_UNARY_OP_GELU: ggml_unary_op = 7;
+pub const ggml_unary_op_GGML_UNARY_OP_GELU_QUICK: ggml_unary_op = 8;
+pub const ggml_unary_op_GGML_UNARY_OP_SILU: ggml_unary_op = 9;
+pub type ggml_unary_op = ::std::os::raw::c_int;
+pub const ggml_object_type_GGML_OBJECT_TENSOR: ggml_object_type = 0;
+pub const ggml_object_type_GGML_OBJECT_GRAPH: ggml_object_type = 1;
+pub const ggml_object_type_GGML_OBJECT_WORK_BUFFER: ggml_object_type = 2;
+pub type ggml_object_type = ::std::os::raw::c_int;
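The renumbering above tracks an upstream restructuring: the elementwise ops (`ABS` through `SILU`) left `ggml_op` and became the new `ggml_unary_op` enum, dispatched through the `ggml_unary`/`ggml_unary_inplace` bindings declared further down in this file. A hedged sketch of what that means for a caller of the raw bindings (the `ggml_sys` crate alias is an assumption, not confirmed by the diff):

```rust
use ggml_sys as sys; // assumed name for this bindings crate

/// A ReLU node is no longer a first-class `GGML_OP_RELU`; it is
/// `GGML_OP_UNARY` parameterized by `GGML_UNARY_OP_RELU`.
unsafe fn relu_node(
    ctx: *mut sys::ggml_context,
    a: *mut sys::ggml_tensor,
) -> *mut sys::ggml_tensor {
    sys::ggml_unary(ctx, a, sys::ggml_unary_op_GGML_UNARY_OP_RELU)
}
```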
 #[repr(C)]
 #[derive(Debug, Copy, Clone)]
 pub struct ggml_object {
     pub offs: usize,
     pub size: usize,
     pub next: *mut ggml_object,
-    pub padding: [::std::os::raw::c_char; 8usize],
+    pub type_: ggml_object_type,
+    pub padding: [::std::os::raw::c_char; 4usize],
 }
 #[test]
 fn bindgen_test_layout_ggml_object() {
@@ -207,8 +216,18 @@
         )
     );
     assert_eq!(
-        unsafe { ::std::ptr::addr_of!((*ptr).padding) as usize - ptr as usize },
+        unsafe { ::std::ptr::addr_of!((*ptr).type_) as usize - ptr as usize },
         24usize,
+        concat!(
+            "Offset of field: ",
+            stringify!(ggml_object),
+            "::",
+            stringify!(type_)
+        )
+    );
+    assert_eq!(
+        unsafe { ::std::ptr::addr_of!((*ptr).padding) as usize - ptr as usize },
+        28usize,
         concat!(
             "Offset of field: ",
             stringify!(ggml_object),
@@ -227,6 +246,7 @@ pub struct ggml_tensor {
     pub ne: [i64; 4usize],
     pub nb: [usize; 4usize],
     pub op: ggml_op,
+    pub op_params: [i32; 8usize],
     pub is_param: bool,
     pub grad: *mut ggml_tensor,
     pub src: [*mut ggml_tensor; 6usize],
@@ -236,7 +256,7 @@ pub struct ggml_tensor {
     pub data: *mut ::std::os::raw::c_void,
     pub name: [::std::os::raw::c_char; 48usize],
     pub extra: *mut ::std::os::raw::c_void,
-    pub padding: [::std::os::raw::c_char; 8usize],
+    pub padding: [::std::os::raw::c_char; 4usize],
 }
 #[test]
 fn bindgen_test_layout_ggml_tensor() {
@@ -244,7 +264,7 @@ fn bindgen_test_layout_ggml_tensor() {
     let ptr = UNINIT.as_ptr();
     assert_eq!(
         ::std::mem::size_of::<ggml_tensor>(),
-        240usize,
+        272usize,
         concat!("Size of: ", stringify!(ggml_tensor))
     );
     assert_eq!(
@@ -313,8 +333,18 @@
         )
     );
     assert_eq!(
-        unsafe { ::std::ptr::addr_of!((*ptr).is_param) as usize - ptr as usize },
+        unsafe { ::std::ptr::addr_of!((*ptr).op_params) as usize - ptr as usize },
         84usize,
+        concat!(
+            "Offset of field: ",
+            stringify!(ggml_tensor),
+            "::",
+            stringify!(op_params)
+        )
+    );
+    assert_eq!(
+        unsafe { ::std::ptr::addr_of!((*ptr).is_param) as usize - ptr as usize },
+        116usize,
         concat!(
             "Offset of field: ",
             stringify!(ggml_tensor),
@@ -324,7 +354,7 @@
     );
     assert_eq!(
         unsafe { ::std::ptr::addr_of!((*ptr).grad) as usize - ptr as usize },
-        88usize,
+        120usize,
         concat!(
             "Offset of field: ",
             stringify!(ggml_tensor),
@@ -334,7 +364,7 @@
     );
     assert_eq!(
         unsafe { ::std::ptr::addr_of!((*ptr).src) as usize - ptr as usize },
-        96usize,
+        128usize,
         concat!(
             "Offset of field: ",
             stringify!(ggml_tensor),
@@ -344,7 +374,7 @@
     );
     assert_eq!(
         unsafe { ::std::ptr::addr_of!((*ptr).perf_runs) as usize - ptr as usize },
-        144usize,
+        176usize,
         concat!(
             "Offset of field: ",
             stringify!(ggml_tensor),
@@ -354,7 +384,7 @@
     );
     assert_eq!(
         unsafe { ::std::ptr::addr_of!((*ptr).perf_cycles) as usize - ptr as usize },
-        152usize,
+        184usize,
         concat!(
             "Offset of field: ",
             stringify!(ggml_tensor),
@@ -364,7 +394,7 @@
     );
     assert_eq!(
         unsafe { ::std::ptr::addr_of!((*ptr).perf_time_us) as usize - ptr as usize },
-        160usize,
+        192usize,
         concat!(
             "Offset of field: ",
             stringify!(ggml_tensor),
@@ -374,7 +404,7 @@
     );
     assert_eq!(
         unsafe { ::std::ptr::addr_of!((*ptr).data) as usize - ptr as usize },
-        168usize,
+        200usize,
         concat!(
             "Offset of field: ",
             stringify!(ggml_tensor),
@@ -384,7 +414,7 @@
     );
     assert_eq!(
         unsafe { ::std::ptr::addr_of!((*ptr).name) as usize - ptr as usize },
-        176usize,
+        208usize,
         concat!(
             "Offset of field: ",
             stringify!(ggml_tensor),
@@ -394,7 +424,7 @@
     );
     assert_eq!(
         unsafe { ::std::ptr::addr_of!((*ptr).extra) as usize - ptr as usize },
-        224usize,
+        256usize,
         concat!(
             "Offset of field: ",
             stringify!(ggml_tensor),
@@ -404,7 +434,7 @@
     );
     assert_eq!(
         unsafe { ::std::ptr::addr_of!((*ptr).padding) as usize - ptr as usize },
-        232usize,
+        264usize,
         concat!(
             "Offset of field: ",
             stringify!(ggml_tensor),
@@ -413,7 +443,7 @@
         )
     );
 }
-pub const GGML_TENSOR_SIZE: usize = 240;
+pub const GGML_TENSOR_SIZE: usize = 272;
 #[repr(C)]
 #[derive(Debug, Copy, Clone)]
 pub struct ggml_cplan {
@@ -508,6 +538,7 @@ pub struct ggml_cgraph {
     pub nodes: [*mut ggml_tensor; 4096usize],
     pub grads: [*mut ggml_tensor; 4096usize],
     pub leafs: [*mut ggml_tensor; 4096usize],
+    pub visited_hash_table: [*mut ::std::os::raw::c_void; 8273usize],
     pub perf_runs: ::std::os::raw::c_int,
     pub perf_cycles: i64,
     pub perf_time_us: i64,
@@ -518,7 +549,7 @@ fn bindgen_test_layout_ggml_cgraph() {
     let ptr = UNINIT.as_ptr();
     assert_eq!(
         ::std::mem::size_of::<ggml_cgraph>(),
-        98336usize,
+        164520usize,
         concat!("Size of: ", stringify!(ggml_cgraph))
     );
     assert_eq!(
@@ -577,8 +608,18 @@ fn bindgen_test_layout_ggml_cgraph() {
         )
     );
     assert_eq!(
-        unsafe { ::std::ptr::addr_of!((*ptr).perf_runs) as usize - ptr as usize },
+        unsafe { ::std::ptr::addr_of!((*ptr).visited_hash_table) as usize - ptr as usize },
         98312usize,
+        concat!(
+            "Offset of field: ",
+            stringify!(ggml_cgraph),
+            "::",
+            stringify!(visited_hash_table)
+        )
+    );
+    assert_eq!(
+        unsafe { ::std::ptr::addr_of!((*ptr).perf_runs) as usize - ptr as usize },
+        164496usize,
         concat!(
             "Offset of field: ",
             stringify!(ggml_cgraph),
@@ -588,7 +629,7 @@
     );
     assert_eq!(
         unsafe { ::std::ptr::addr_of!((*ptr).perf_cycles) as usize - ptr as usize },
-        98320usize,
+        164504usize,
         concat!(
             "Offset of field: ",
             stringify!(ggml_cgraph),
@@ -598,7 +639,7 @@
     );
     assert_eq!(
         unsafe { ::std::ptr::addr_of!((*ptr).perf_time_us) as usize - ptr as usize },
-        98328usize,
+        164512usize,
         concat!(
             "Offset of field: ",
             stringify!(ggml_cgraph),
@@ -607,6 +648,7 @@
         )
     );
 }
+pub const GGML_GRAPH_SIZE: usize = 164520;
 #[repr(C)]
 #[derive(Debug, Copy, Clone)]
 pub struct ggml_scratch {
@@ -846,6 +888,9 @@ extern "C" {
 extern "C" {
     pub fn ggml_op_name(op: ggml_op) -> *const ::std::os::raw::c_char;
 }
+extern "C" {
+    pub fn ggml_op_symbol(op: ggml_op) -> *const ::std::os::raw::c_char;
+}
 extern "C" {
     pub fn ggml_element_size(tensor: *const ggml_tensor) -> usize;
 }
@@ -879,6 +924,9 @@ extern "C" {
 extern "C" {
     pub fn ggml_set_scratch(ctx: *mut ggml_context, scratch: ggml_scratch) -> usize;
 }
+extern "C" {
+    pub fn ggml_get_no_alloc(ctx: *mut ggml_context) -> bool;
+}
 extern "C" {
     pub fn ggml_set_no_alloc(ctx: *mut ggml_context, no_alloc: bool);
 }
@@ -978,6 +1026,9 @@ extern "C" {
 extern "C" {
     pub fn ggml_get_data_f32(tensor: *const ggml_tensor) -> *mut f32;
 }
+extern "C" {
+    pub fn ggml_get_unary_op(tensor: *const ggml_tensor) -> ggml_unary_op;
+}
 extern "C" {
     pub fn ggml_get_name(tensor: *const ggml_tensor) -> *const ::std::os::raw::c_char;
 }
@@ -997,6 +1048,9 @@ extern "C" {
 extern "C" {
     pub fn ggml_dup(ctx: *mut ggml_context, a: *mut ggml_tensor) -> *mut ggml_tensor;
 }
+extern "C" {
+    pub fn ggml_dup_inplace(ctx: *mut ggml_context, a: *mut ggml_tensor) -> *mut ggml_tensor;
+}
 extern "C" {
     pub fn ggml_add(
         ctx: *mut ggml_context,
@@ -1208,10 +1262,15 @@ extern "C" {
     pub fn ggml_norm_inplace(ctx: *mut ggml_context, a: *mut ggml_tensor) -> *mut ggml_tensor;
 }
 extern "C" {
-    pub fn ggml_rms_norm(ctx: *mut ggml_context, a: *mut ggml_tensor) -> *mut ggml_tensor;
+    pub fn ggml_rms_norm(ctx: *mut ggml_context, a: *mut ggml_tensor, eps: f32)
+        -> *mut ggml_tensor;
 }
 extern "C" {
-    pub fn ggml_rms_norm_inplace(ctx: *mut ggml_context, a: *mut ggml_tensor) -> *mut ggml_tensor;
+    pub fn ggml_rms_norm_inplace(
+        ctx: *mut ggml_context,
+        a: *mut ggml_tensor,
+        eps: f32,
+    ) -> *mut ggml_tensor;
 }
 extern "C" {
     pub fn ggml_rms_norm_back(
@@ -1311,9 +1370,19 @@ extern "C" {
         b: *mut ggml_tensor,
     ) -> *mut ggml_tensor;
 }
+extern "C" {
+    pub fn ggml_cpy_inplace(
+        ctx: *mut ggml_context,
+        a: *mut ggml_tensor,
+        b: *mut ggml_tensor,
+    ) -> *mut ggml_tensor;
+}
 extern "C" {
     pub fn ggml_cont(ctx: *mut ggml_context, a: *mut ggml_tensor) -> *mut ggml_tensor;
 }
+extern "C" {
+    pub fn ggml_cont_inplace(ctx: *mut ggml_context, a: *mut ggml_tensor) -> *mut ggml_tensor;
+}
 extern "C" {
     pub fn ggml_reshape(
         ctx: *mut ggml_context,
@@ -1505,9 +1574,9 @@ extern "C" {
         n_past: ::std::os::raw::c_int,
         n_dims: ::std::os::raw::c_int,
         mode: ::std::os::raw::c_int,
+        n_ctx: ::std::os::raw::c_int,
         freq_base: f32,
         freq_scale: f32,
-        n_ctx: ::std::os::raw::c_int,
     ) -> *mut ggml_tensor;
 }
 extern "C" {
@@ -1517,6 +1586,7 @@ extern "C" {
         n_past: ::std::os::raw::c_int,
         n_dims: ::std::os::raw::c_int,
         mode: ::std::os::raw::c_int,
+        n_ctx: ::std::os::raw::c_int,
     ) -> *mut ggml_tensor;
 }
 extern "C" {
@@ -1668,6 +1738,20 @@ pub type ggml_custom3_op_f32_t = ::std::option::Option<
         arg4: *const ggml_tensor,
     ),
 >;
+extern "C" {
+    pub fn ggml_unary(
+        ctx: *mut ggml_context,
+        a: *mut ggml_tensor,
+        op: ggml_unary_op,
+    ) -> *mut ggml_tensor;
+}
+extern "C" {
+    pub fn ggml_unary_inplace(
+        ctx: *mut ggml_context,
+        a: *mut ggml_tensor,
+        op: ggml_unary_op,
+    ) -> *mut ggml_tensor;
+}
 extern "C" {
     pub fn ggml_map_unary_f32(
         ctx: *mut ggml_context,
@@ -1777,6 +1861,18 @@ extern "C" {
         keep: bool,
     ) -> ggml_cgraph;
 }
+extern "C" {
+    pub fn ggml_new_graph(ctx: *mut ggml_context) -> *mut ggml_cgraph;
+}
+extern "C" {
+    pub fn ggml_build_forward_ctx(
+        ctx: *mut ggml_context,
+        tensor: *mut ggml_tensor,
+    ) -> *mut ggml_cgraph;
+}
+extern "C" {
+    pub fn ggml_graph_overhead() -> usize;
+}
 extern "C" {
     pub fn ggml_graph_plan(
         cgraph: *mut ggml_cgraph,
diff --git a/crates/ggml/sys/src/llama.rs b/crates/ggml/sys/src/llama.rs
index 2d5a9a6f..a8aa42ef 100644
--- a/crates/ggml/sys/src/llama.rs
+++ b/crates/ggml/sys/src/llama.rs
@@ -12,6 +12,7 @@ pub const LLAMA_FILE_MAGIC_UNVERSIONED: u32 = 1734831468;
 pub const LLAMA_SESSION_MAGIC: u32 = 1734833006;
 pub const LLAMA_SESSION_VERSION: u32 = 1;
 pub const LLAMA_DEFAULT_SEED: u32 = 4294967295;
+pub const LLAMA_DEFAULT_RMS_EPS: f64 = 0.000005;
 pub const LLAMA_FTYPE_ALL_F32: llama_ftype = 0;
 pub const LLAMA_FTYPE_MOSTLY_F16: llama_ftype = 1;
 pub const LLAMA_FTYPE_MOSTLY_Q4_0: llama_ftype = 2;
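`ggml_rms_norm` and `ggml_rms_norm_inplace` now take their epsilon explicitly; `DEFAULT_EPS` (re-exporting `LLAMA_DEFAULT_RMS_EPS`, i.e. 5e-6) is what `op_rms_norm` feeds in. For intuition, a scalar sketch of the row-wise operation the epsilon stabilizes (a restatement of RMSNorm, not this crate's kernel):

```rust
/// RMSNorm over one row: x / sqrt(mean(x^2) + eps). The epsilon keeps the
/// division finite when a row is all (near-)zero.
fn rms_norm(row: &[f32], eps: f32) -> Vec<f32> {
    let mean_sq = row.iter().map(|x| x * x).sum::<f32>() / row.len() as f32;
    let inv_rms = 1.0 / (mean_sq + eps).sqrt();
    row.iter().map(|x| x * inv_rms).collect()
}
```

Note also the reordered tail of `ggml_rope_custom_inplace`: `n_ctx` now precedes `freq_base`/`freq_scale`, matching the argument order used by the safe wrapper in `context.rs`.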
diff --git a/crates/ggml/sys/src/metal.rs b/crates/ggml/sys/src/metal.rs
index 37d97e68..bbd16034 100644
--- a/crates/ggml/sys/src/metal.rs
+++ b/crates/ggml/sys/src/metal.rs
@@ -40,6 +40,12 @@ extern "C" {
 extern "C" {
     pub fn ggml_metal_get_tensor(ctx: *mut ggml_metal_context, t: *mut ggml_tensor);
 }
+extern "C" {
+    pub fn ggml_metal_graph_find_concurrency(ctx: *mut ggml_metal_context, gf: *mut ggml_cgraph);
+}
+extern "C" {
+    pub fn ggml_metal_if_optimized(ctx: *mut ggml_metal_context) -> bool;
+}
 extern "C" {
     pub fn ggml_metal_graph_compute(ctx: *mut ggml_metal_context, gf: *mut ggml_cgraph);
 }
diff --git a/crates/llm-base/src/model/mod.rs b/crates/llm-base/src/model/mod.rs
index 2fffbace..3d5bc163 100644
--- a/crates/llm-base/src/model/mod.rs
+++ b/crates/llm-base/src/model/mod.rs
@@ -207,6 +207,8 @@ pub struct ModelParameters {
     pub use_gpu: bool,
     /// If `use_gpu` is active this defines the number of layers to offload to the gpu. If `None`, all layers will be offloaded.
     pub gpu_layers: Option<usize>,
+    /// The arguments/overrides to pass to the [custom RoPE](https://arxiv.org/pdf/2306.15595.pdf) function, if it is used by the model.
+    pub rope_overrides: Option<RoPEOverrides>,
 }
 
 impl Default for ModelParameters {
@@ -217,6 +219,7 @@ impl Default for ModelParameters {
             lora_adapters: None,
             use_gpu: false,
             gpu_layers: None,
+            rope_overrides: None,
         }
     }
 }
diff --git a/crates/llm/src/lib.rs b/crates/llm/src/lib.rs
index 94aa1260..37514511 100644
--- a/crates/llm/src/lib.rs
+++ b/crates/llm/src/lib.rs
@@ -79,14 +79,15 @@ use std::{
 pub use llm_base::{
     conversation_inference_callback, feed_prompt_callback,
     ggml::accelerator::get_accelerator as ggml_get_accelerator,
-    ggml::accelerator::Accelerator as GgmlAccelerator, ggml::format as ggml_format, load,
-    load_progress_callback_stdout, quantize, samplers, ElementType, FileType, FileTypeFormat,
-    FormatMagic, Hyperparameters, InferenceError, InferenceFeedback, InferenceParameters,
-    InferenceRequest, InferenceResponse, InferenceSession, InferenceSessionConfig,
-    InferenceSnapshot, InferenceSnapshotRef, InferenceStats, InvalidTokenBias, KnownModel,
-    LoadError, LoadProgress, Loader, Model, ModelKVMemoryType, ModelParameters, OutputRequest,
-    Prompt, QuantizeError, QuantizeProgress, RewindError, Sampler, SnapshotError, TokenBias,
-    TokenId, TokenUtf8Buffer, TokenizationError, Tokenizer, TokenizerSource,
+    ggml::accelerator::Accelerator as GgmlAccelerator, ggml::format as ggml_format,
+    ggml::RoPEOverrides, load, load_progress_callback_stdout, quantize, samplers, ElementType,
+    FileType, FileTypeFormat, FormatMagic, Hyperparameters, InferenceError, InferenceFeedback,
+    InferenceParameters, InferenceRequest, InferenceResponse, InferenceSession,
+    InferenceSessionConfig, InferenceSnapshot, InferenceSnapshotRef, InferenceStats,
+    InvalidTokenBias, KnownModel, LoadError, LoadProgress, Loader, Model, ModelKVMemoryType,
+    ModelParameters, OutputRequest, Prompt, QuantizeError, QuantizeProgress, RewindError, Sampler,
+    SnapshotError, TokenBias, TokenId, TokenUtf8Buffer, TokenizationError, Tokenizer,
+    TokenizerSource,
 };
 use serde::Serialize;
diff --git a/crates/models/falcon/src/lib.rs b/crates/models/falcon/src/lib.rs
index 1c4c11c4..e83d2252 100644
--- a/crates/models/falcon/src/lib.rs
+++ b/crates/models/falcon/src/lib.rs
@@ -193,8 +193,9 @@ impl KnownModel for Falcon {
         );
 
         // using mode = 2 for neox mode
-        qcur = ctx0.op_rope_inplace(&qcur, session_len, head_dim, 2);
-        kcur = ctx0.op_rope_inplace(&kcur, session_len, head_dim, 2);
+        let overrides = self.params.rope_overrides.as_ref();
+        qcur = ctx0.op_rope_inplace(&qcur, session_len, head_dim, 2, overrides);
+        kcur = ctx0.op_rope_inplace(&kcur, session_len, head_dim, 2, overrides);
 
         // store key and value to memory
 
diff --git a/crates/models/gptj/src/lib.rs b/crates/models/gptj/src/lib.rs
index 6464cf72..690fa643 100644
--- a/crates/models/gptj/src/lib.rs
+++ b/crates/models/gptj/src/lib.rs
@@ -147,6 +147,7 @@ impl KnownModel for GptJ {
         let input_sa = current.share();
 
         // self-attention
+        let overrides = self.params.rope_overrides.as_ref();
         let qcur = ctx0.op_rope_inplace(
             &ctx0.op_reshape_3d(
                 &ctx0.op_mul_mat(&self.layers[il].c_attn_q_proj_w, &current),
@@ -157,6 +158,7 @@
             session_len,
             n_rot,
             0,
+            overrides,
         );
         let kcur = ctx0.op_rope_inplace(
             &ctx0.op_reshape_3d(
@@ -168,6 +170,7 @@
             session_len,
             n_rot,
             0,
+            overrides,
         );
 
         // self-attention store key and value to memory
diff --git a/crates/models/gptneox/src/lib.rs b/crates/models/gptneox/src/lib.rs
index c64bce50..b7f5f2bf 100644
--- a/crates/models/gptneox/src/lib.rs
+++ b/crates/models/gptneox/src/lib.rs
@@ -193,8 +193,9 @@ impl KnownModel for GptNeoX {
         ));
 
         // self-attention using mode = 2 for GPT-NeoX mode
-        qcur = ctx0.op_rope_inplace(&qcur, n_past, n_rot, 2);
-        kcur = ctx0.op_rope_inplace(&kcur, n_past, n_rot, 2);
+        let overrides = self.params.rope_overrides.as_ref();
+        qcur = ctx0.op_rope_inplace(&qcur, n_past, n_rot, 2, overrides);
+        kcur = ctx0.op_rope_inplace(&kcur, n_past, n_rot, 2, overrides);
 
         // store key and value to memory
         vcur = ctx0.op_transpose(&ctx0.op_reshape_2d(&vcur, n_embd, n));
diff --git a/crates/models/llama/src/lib.rs b/crates/models/llama/src/lib.rs
index 24fb8e02..6c0f5136 100644
--- a/crates/models/llama/src/lib.rs
+++ b/crates/models/llama/src/lib.rs
@@ -162,6 +162,7 @@ impl KnownModel for Llama {
 
         // self-attention
         // compute Q and K and RoPE them
+        let overrides = self.params.rope_overrides.as_ref();
         let q_current = ctx0
             .op_rope_inplace(
                 &ctx0.op_reshape_3d(
@@ -173,6 +174,7 @@
                 session_len,
                 n_rot,
                 0,
+                overrides,
             )
             .set_name("Qcur");
         let k_current = ctx0
@@ -186,6 +188,7 @@
                 session_len,
                 n_rot,
                 0,
+                overrides,
             )
             .set_name("Kcur");
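Taken together, library consumers opt in through `ModelParameters` and every RoPE-using model picks the override up automatically. A hedged usage sketch (field and type names as introduced in this diff; the values are illustrative, not recommendations):

```rust
use llm::{ModelParameters, RoPEOverrides};

fn extended_context_params() -> ModelParameters {
    ModelParameters {
        // Halve every rotation frequency: the linear-interpolation trick from
        // the paper linked on `rope_overrides`, letting a model trained with a
        // 2048-token window address roughly twice that.
        rope_overrides: Some(RoPEOverrides {
            frequency_scale: 0.5,
            ..Default::default()
        }),
        ..Default::default()
    }
}
```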