diff --git a/binaries/llm-cli/src/cli_args.rs b/binaries/llm-cli/src/cli_args.rs
index 44388dc7..dd107695 100644
--- a/binaries/llm-cli/src/cli_args.rs
+++ b/binaries/llm-cli/src/cli_args.rs
@@ -9,7 +9,8 @@ use clap::{Parser, ValueEnum};
 use color_eyre::eyre::{self, WrapErr};
 use llm::{
     ggml_format, ElementType, InferenceParameters, InferenceSessionConfig, InvalidTokenBias,
-    LoadProgress, Model, ModelKVMemoryType, ModelParameters, TokenBias, TokenizerSource,
+    LoadProgress, Model, ModelKVMemoryType, ModelParameters, RoPEOverrides, TokenBias,
+    TokenizerSource,
 };
 use rand::SeedableRng;
@@ -430,6 +431,29 @@ impl ModelAndTokenizer {
     }
 }
 
+#[derive(Parser, Debug)]
+pub struct RoPEScaling {
+    #[arg(long)]
+    pub rope_freq_base: Option<usize>,
+
+    #[arg(long)]
+    pub rope_freq_scale: Option<f32>,
+}
+
+impl RoPEScaling {
+    pub fn to_rope_arguments(&self) -> Option<RoPEOverrides> {
+        if self.rope_freq_base.is_none() && self.rope_freq_scale.is_none() {
+            return None;
+        }
+
+        let default = RoPEOverrides::default();
+        Some(RoPEOverrides {
+            frequency_base: self.rope_freq_base.unwrap_or(default.frequency_base),
+            frequency_scale: self.rope_freq_scale.unwrap_or(default.frequency_scale),
+        })
+    }
+}
+
 #[derive(Parser, Debug)]
 pub struct ModelLoad {
     #[command(flatten)]
@@ -460,7 +484,11 @@ pub struct ModelLoad {
     /// Number of layers to run on the GPU. If not specified, all layers will be run on the GPU.
     #[arg(long)]
     pub gpu_layers: Option<usize>,
+
+    #[command(flatten)]
+    pub rope_scaling: RoPEScaling,
 }
+
 impl ModelLoad {
     pub fn load(&self, use_gpu: bool) -> eyre::Result<Box<dyn Model>> {
         let params = ModelParameters {
@@ -469,6 +497,7 @@ impl ModelLoad {
             lora_adapters: self.lora_paths.clone(),
             use_gpu,
             gpu_layers: self.gpu_layers,
+            rope_overrides: self.rope_scaling.to_rope_arguments(),
         };
 
         let mut sp = Some(spinoff::Spinner::new(
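The flag-to-override plumbing above is deliberately conservative: when neither `--rope-freq-base` nor `--rope-freq-scale` is given, `to_rope_arguments` returns `None` and models keep their stock RoPE; when only one flag is given, the other half is backfilled from `RoPEOverrides::default()`. A sketch of that contract (illustrative test, not part of the diff):

```rust
#[test]
fn rope_scaling_flags_backfill_defaults() {
    // `--rope-freq-scale 0.5` given, `--rope-freq-base` omitted:
    let scaling = RoPEScaling {
        rope_freq_base: None,
        rope_freq_scale: Some(0.5),
    };
    let overrides = scaling.to_rope_arguments().unwrap();
    assert_eq!(overrides.frequency_base, 10_000); // backfilled from the default
    assert_eq!(overrides.frequency_scale, 0.5);

    // Neither flag given: no override is constructed at all.
    let unset = RoPEScaling {
        rope_freq_base: None,
        rope_freq_scale: None,
    };
    assert!(unset.to_rope_arguments().is_none());
}
```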
diff --git a/crates/ggml/src/context.rs b/crates/ggml/src/context.rs
index 472b58c1..782fb4d6 100644
--- a/crates/ggml/src/context.rs
+++ b/crates/ggml/src/context.rs
@@ -8,7 +8,9 @@ use std::{
 
 use memmap2::Mmap;
 
-use crate::{accelerator::Backend, sys, usize_to_i32, usize_to_i64, Buffer, Tensor, Type};
+use crate::{
+    accelerator::Backend, sys, usize_to_i32, usize_to_i64, Buffer, RoPEOverrides, Tensor, Type,
+};
 
 /// Acts as a RAII-guard over a `sys::ggml_context`, allocating via
 /// `ggml_init` and dropping via `ggml_free`.
@@ -267,7 +269,8 @@ impl Context {
 
     /// Creates a new tensor with the values of `a`, but normalized using RMSNorm.
     pub fn op_rms_norm(&self, a: &Tensor) -> Tensor {
-        let tensor = unsafe { sys::ggml_rms_norm(self.as_ptr(), a.ptr.as_ptr()) };
+        let tensor =
+            unsafe { sys::ggml_rms_norm(self.as_ptr(), a.ptr.as_ptr(), crate::DEFAULT_EPS) };
         self.new_tensor_raw(tensor)
     }
 
@@ -527,16 +530,36 @@ impl Context {
     }
 
     /// In-place; applies ROtary Positional Encoding.
-    pub fn op_rope_inplace(&self, a: &Tensor, npast: usize, ndims: usize, mode: i32) -> Tensor {
+    pub fn op_rope_inplace(
+        &self,
+        a: &Tensor,
+        npast: usize,
+        ndims: usize,
+        mode: i32,
+        overrides: Option<&RoPEOverrides>,
+    ) -> Tensor {
         let tensor = unsafe {
-            sys::ggml_rope_inplace(
-                self.as_ptr(),
-                a.ptr.as_ptr(),
-                usize_to_i32(npast),
-                usize_to_i32(ndims),
-                mode,
-                0,
-            )
+            if let Some(custom_args) = overrides {
+                sys::ggml_rope_custom_inplace(
+                    self.as_ptr(),
+                    a.ptr.as_ptr(),
+                    usize_to_i32(npast),
+                    usize_to_i32(ndims),
+                    mode,
+                    1,
+                    custom_args.frequency_base as f32,
+                    custom_args.frequency_scale,
+                )
+            } else {
+                sys::ggml_rope_inplace(
+                    self.as_ptr(),
+                    a.ptr.as_ptr(),
+                    usize_to_i32(npast),
+                    usize_to_i32(ndims),
+                    mode,
+                    0,
+                )
+            }
         };
         self.new_tensor_raw(tensor)
     }
diff --git a/crates/ggml/src/lib.rs b/crates/ggml/src/lib.rs
index 597447f7..8b3b9a8c 100644
--- a/crates/ggml/src/lib.rs
+++ b/crates/ggml/src/lib.rs
@@ -127,6 +127,29 @@ pub const OBJECT_SIZE: usize = sys::GGML_OBJECT_SIZE;
 /// The maximum length of a `ggml` tensor-name.
 pub const MAX_NAME_LENGTH: usize = sys::GGML_MAX_NAME as usize;
 
+/// Default epsilon to use for RMS computation.
+pub const DEFAULT_EPS: f32 = sys::llama::LLAMA_DEFAULT_RMS_EPS as f32;
+
+/// Value overrides to use for RoPE.
+///
+/// Formula: `theta_i = scale * base^(-2(i-1)/d), for i in [1, 2, ..., d/2]`
+#[derive(Debug, Clone)]
+pub struct RoPEOverrides {
+    /// The frequency scale to use.
+    pub frequency_scale: f32,
+    /// The frequency base value to use.
+    pub frequency_base: usize,
+}
+
+impl Default for RoPEOverrides {
+    fn default() -> Self {
+        Self {
+            frequency_scale: 1.0,
+            frequency_base: 10_000,
+        }
+    }
+}
+
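The two fields only make sense against the formula in the doc comment: RoPE rotates each dimension pair `i` by `p * theta_i` at position `p`, so a smaller `frequency_scale` compresses positions (linear interpolation) and a larger `frequency_base` slows the rotations. As a standalone restatement of that formula (my paraphrase, not code from this diff):

```rust
/// theta_i = scale * base^(-2(i-1)/d) for i in 1..=d/2, where d is the
/// number of rotated dimensions; `base` is `frequency_base` cast to f32.
fn rope_thetas(base: f32, scale: f32, d: usize) -> Vec<f32> {
    (1..=d / 2)
        .map(|i| scale * base.powf(-2.0 * (i as f32 - 1.0) / d as f32))
        .collect()
}
```

With the defaults (`scale = 1.0`, `base = 10_000`) this reproduces standard RoPE, which is why `op_rope_inplace` can route the no-override case through plain `sys::ggml_rope_inplace`.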
 #[derive(Debug, Copy, Clone, PartialEq, Eq, Default)]
 /// The type of a value in `ggml`.
 pub enum Type {
diff --git a/crates/ggml/sys/llama-cpp b/crates/ggml/sys/llama-cpp
index b7647436..1a941869 160000
--- a/crates/ggml/sys/llama-cpp
+++ b/crates/ggml/sys/llama-cpp
@@ -1 +1 @@
-Subproject commit b7647436ccc80970b44a270f70f4f2ea139054d1
+Subproject commit 1a941869cbef8e9cc351a6c6987e4ae3b0f021f7
diff --git a/crates/ggml/sys/src/lib.rs b/crates/ggml/sys/src/lib.rs
index 04a999df..10412720 100644
--- a/crates/ggml/sys/src/lib.rs
+++ b/crates/ggml/sys/src/lib.rs
@@ -23,9 +23,11 @@ pub const GGML_MAX_PARAMS: u32 = 256;
 pub const GGML_MAX_CONTEXTS: u32 = 64;
 pub const GGML_MAX_SRC: u32 = 6;
 pub const GGML_MAX_NAME: u32 = 48;
+pub const GGML_MAX_OP_PARAMS: u32 = 32;
 pub const GGML_DEFAULT_N_THREADS: u32 = 4;
 pub const GGML_EXIT_SUCCESS: u32 = 0;
 pub const GGML_EXIT_ABORTED: u32 = 1;
+pub const GGML_GRAPH_HASHTABLE_SIZE: u32 = 8273;
 pub const QK_K: u32 = 256;
 pub const K_SCALE_SIZE: u32 = 12;
 pub type ggml_fp16_t = u16;
@@ -101,66 +103,73 @@ pub const ggml_op_GGML_OP_MEAN: ggml_op = 13;
 pub const ggml_op_GGML_OP_ARGMAX: ggml_op = 14;
 pub const ggml_op_GGML_OP_REPEAT: ggml_op = 15;
 pub const ggml_op_GGML_OP_REPEAT_BACK: ggml_op = 16;
-pub const ggml_op_GGML_OP_ABS: ggml_op = 17;
-pub const ggml_op_GGML_OP_SGN: ggml_op = 18;
-pub const ggml_op_GGML_OP_NEG: ggml_op = 19;
-pub const ggml_op_GGML_OP_STEP: ggml_op = 20;
-pub const ggml_op_GGML_OP_TANH: ggml_op = 21;
-pub const ggml_op_GGML_OP_ELU: ggml_op = 22;
-pub const ggml_op_GGML_OP_RELU: ggml_op = 23;
-pub const ggml_op_GGML_OP_GELU: ggml_op = 24;
-pub const ggml_op_GGML_OP_GELU_QUICK: ggml_op = 25;
-pub const ggml_op_GGML_OP_SILU: ggml_op = 26;
-pub const ggml_op_GGML_OP_SILU_BACK: ggml_op = 27;
-pub const ggml_op_GGML_OP_NORM: ggml_op = 28;
-pub const ggml_op_GGML_OP_RMS_NORM: ggml_op = 29;
-pub const ggml_op_GGML_OP_RMS_NORM_BACK: ggml_op = 30;
-pub const ggml_op_GGML_OP_MUL_MAT: ggml_op = 31;
-pub const ggml_op_GGML_OP_OUT_PROD: ggml_op = 32;
-pub const ggml_op_GGML_OP_SCALE: ggml_op = 33;
-pub const ggml_op_GGML_OP_SET: ggml_op = 34;
-pub const ggml_op_GGML_OP_CPY: ggml_op = 35;
-pub const ggml_op_GGML_OP_CONT: ggml_op = 36;
-pub const ggml_op_GGML_OP_RESHAPE: ggml_op = 37;
-pub const ggml_op_GGML_OP_VIEW: ggml_op = 38;
-pub const ggml_op_GGML_OP_PERMUTE: ggml_op = 39;
-pub const ggml_op_GGML_OP_TRANSPOSE: ggml_op = 40;
-pub const ggml_op_GGML_OP_GET_ROWS: ggml_op = 41;
-pub const ggml_op_GGML_OP_GET_ROWS_BACK: ggml_op = 42;
-pub const ggml_op_GGML_OP_DIAG: ggml_op = 43;
-pub const ggml_op_GGML_OP_DIAG_MASK_INF: ggml_op = 44;
-pub const ggml_op_GGML_OP_DIAG_MASK_ZERO: ggml_op = 45;
-pub const ggml_op_GGML_OP_SOFT_MAX: ggml_op = 46;
-pub const ggml_op_GGML_OP_SOFT_MAX_BACK: ggml_op = 47;
-pub const ggml_op_GGML_OP_ROPE: ggml_op = 48;
-pub const ggml_op_GGML_OP_ROPE_BACK: ggml_op = 49;
-pub const ggml_op_GGML_OP_ALIBI: ggml_op = 50;
-pub const ggml_op_GGML_OP_CLAMP: ggml_op = 51;
-pub const ggml_op_GGML_OP_CONV_1D: ggml_op = 52;
-pub const ggml_op_GGML_OP_CONV_2D: ggml_op = 53;
-pub const ggml_op_GGML_OP_POOL_1D: ggml_op = 54;
-pub const ggml_op_GGML_OP_POOL_2D: ggml_op = 55;
-pub const ggml_op_GGML_OP_FLASH_ATTN: ggml_op = 56;
-pub const ggml_op_GGML_OP_FLASH_FF: ggml_op = 57;
-pub const ggml_op_GGML_OP_FLASH_ATTN_BACK: ggml_op = 58;
-pub const ggml_op_GGML_OP_WIN_PART: ggml_op = 59;
-pub const ggml_op_GGML_OP_WIN_UNPART: ggml_op = 60;
-pub const ggml_op_GGML_OP_MAP_UNARY: ggml_op = 61;
-pub const ggml_op_GGML_OP_MAP_BINARY: ggml_op = 62;
-pub const ggml_op_GGML_OP_MAP_CUSTOM1: ggml_op = 63;
-pub const ggml_op_GGML_OP_MAP_CUSTOM2: ggml_op = 64;
-pub const ggml_op_GGML_OP_MAP_CUSTOM3: ggml_op = 65;
-pub const ggml_op_GGML_OP_CROSS_ENTROPY_LOSS: ggml_op = 66;
-pub const ggml_op_GGML_OP_CROSS_ENTROPY_LOSS_BACK: ggml_op = 67;
-pub const ggml_op_GGML_OP_COUNT: ggml_op = 68;
+pub const ggml_op_GGML_OP_SILU_BACK: ggml_op = 17;
+pub const ggml_op_GGML_OP_NORM: ggml_op = 18;
+pub const ggml_op_GGML_OP_RMS_NORM: ggml_op = 19;
+pub const ggml_op_GGML_OP_RMS_NORM_BACK: ggml_op = 20;
+pub const ggml_op_GGML_OP_MUL_MAT: ggml_op = 21;
+pub const ggml_op_GGML_OP_OUT_PROD: ggml_op = 22;
+pub const ggml_op_GGML_OP_SCALE: ggml_op = 23;
+pub const ggml_op_GGML_OP_SET: ggml_op = 24;
+pub const ggml_op_GGML_OP_CPY: ggml_op = 25;
+pub const ggml_op_GGML_OP_CONT: ggml_op = 26;
+pub const ggml_op_GGML_OP_RESHAPE: ggml_op = 27;
+pub const ggml_op_GGML_OP_VIEW: ggml_op = 28;
+pub const ggml_op_GGML_OP_PERMUTE: ggml_op = 29;
+pub const ggml_op_GGML_OP_TRANSPOSE: ggml_op = 30;
+pub const ggml_op_GGML_OP_GET_ROWS: ggml_op = 31;
+pub const ggml_op_GGML_OP_GET_ROWS_BACK: ggml_op = 32;
+pub const ggml_op_GGML_OP_DIAG: ggml_op = 33;
+pub const ggml_op_GGML_OP_DIAG_MASK_INF: ggml_op = 34;
+pub const ggml_op_GGML_OP_DIAG_MASK_ZERO: ggml_op = 35;
+pub const ggml_op_GGML_OP_SOFT_MAX: ggml_op = 36;
+pub const ggml_op_GGML_OP_SOFT_MAX_BACK: ggml_op = 37;
+pub const ggml_op_GGML_OP_ROPE: ggml_op = 38;
+pub const ggml_op_GGML_OP_ROPE_BACK: ggml_op = 39;
+pub const ggml_op_GGML_OP_ALIBI: ggml_op = 40;
+pub const ggml_op_GGML_OP_CLAMP: ggml_op = 41;
+pub const ggml_op_GGML_OP_CONV_1D: ggml_op = 42;
+pub const ggml_op_GGML_OP_CONV_2D: ggml_op = 43;
+pub const ggml_op_GGML_OP_POOL_1D: ggml_op = 44;
+pub const ggml_op_GGML_OP_POOL_2D: ggml_op = 45;
+pub const ggml_op_GGML_OP_FLASH_ATTN: ggml_op = 46;
+pub const ggml_op_GGML_OP_FLASH_FF: ggml_op = 47;
+pub const ggml_op_GGML_OP_FLASH_ATTN_BACK: ggml_op = 48;
+pub const ggml_op_GGML_OP_WIN_PART: ggml_op = 49;
+pub const ggml_op_GGML_OP_WIN_UNPART: ggml_op = 50;
+pub const ggml_op_GGML_OP_UNARY: ggml_op = 51;
+pub const ggml_op_GGML_OP_MAP_UNARY: ggml_op = 52;
+pub const ggml_op_GGML_OP_MAP_BINARY: ggml_op = 53;
+pub const ggml_op_GGML_OP_MAP_CUSTOM1: ggml_op = 54;
+pub const ggml_op_GGML_OP_MAP_CUSTOM2: ggml_op = 55;
+pub const ggml_op_GGML_OP_MAP_CUSTOM3: ggml_op = 56;
+pub const ggml_op_GGML_OP_CROSS_ENTROPY_LOSS: ggml_op = 57;
+pub const ggml_op_GGML_OP_CROSS_ENTROPY_LOSS_BACK: ggml_op = 58;
+pub const ggml_op_GGML_OP_COUNT: ggml_op = 59;
 pub type ggml_op = ::std::os::raw::c_uint;
+pub const ggml_unary_op_GGML_UNARY_OP_ABS: ggml_unary_op = 0;
+pub const ggml_unary_op_GGML_UNARY_OP_SGN: ggml_unary_op = 1;
+pub const ggml_unary_op_GGML_UNARY_OP_NEG: ggml_unary_op = 2;
+pub const ggml_unary_op_GGML_UNARY_OP_STEP: ggml_unary_op = 3;
+pub const ggml_unary_op_GGML_UNARY_OP_TANH: ggml_unary_op = 4;
+pub const ggml_unary_op_GGML_UNARY_OP_ELU: ggml_unary_op = 5;
+pub const ggml_unary_op_GGML_UNARY_OP_RELU: ggml_unary_op = 6;
+pub const ggml_unary_op_GGML_UNARY_OP_GELU: ggml_unary_op = 7;
+pub const ggml_unary_op_GGML_UNARY_OP_GELU_QUICK: ggml_unary_op = 8;
+pub const ggml_unary_op_GGML_UNARY_OP_SILU: ggml_unary_op = 9;
+pub type ggml_unary_op = ::std::os::raw::c_int;
+pub const ggml_object_type_GGML_OBJECT_TENSOR: ggml_object_type = 0;
+pub const ggml_object_type_GGML_OBJECT_GRAPH: ggml_object_type = 1;
+pub const ggml_object_type_GGML_OBJECT_WORK_BUFFER: ggml_object_type = 2;
+pub type ggml_object_type = ::std::os::raw::c_int;
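The renumbering above tracks an upstream restructuring: the elementwise ops (`ABS` through `SILU`) left `ggml_op` and became the new `ggml_unary_op` enum, dispatched through the `ggml_unary`/`ggml_unary_inplace` bindings declared further down in this file. A hedged sketch of what that means for a caller of the raw bindings (the `ggml_sys` crate alias is an assumption, not confirmed by the diff):

```rust
use ggml_sys as sys; // assumed name for this bindings crate

/// A ReLU node is no longer a first-class `GGML_OP_RELU`; it is
/// `GGML_OP_UNARY` parameterized by `GGML_UNARY_OP_RELU`.
unsafe fn relu_node(
    ctx: *mut sys::ggml_context,
    a: *mut sys::ggml_tensor,
) -> *mut sys::ggml_tensor {
    sys::ggml_unary(ctx, a, sys::ggml_unary_op_GGML_UNARY_OP_RELU)
}
```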
 #[repr(C)]
 #[derive(Debug, Copy, Clone)]
 pub struct ggml_object {
     pub offs: usize,
     pub size: usize,
     pub next: *mut ggml_object,
-    pub padding: [::std::os::raw::c_char; 8usize],
+    pub type_: ggml_object_type,
+    pub padding: [::std::os::raw::c_char; 4usize],
 }
 #[test]
 fn bindgen_test_layout_ggml_object() {
@@ -207,8 +216,18 @@
         )
     );
     assert_eq!(
-        unsafe { ::std::ptr::addr_of!((*ptr).padding) as usize - ptr as usize },
+        unsafe { ::std::ptr::addr_of!((*ptr).type_) as usize - ptr as usize },
         24usize,
+        concat!(
+            "Offset of field: ",
+            stringify!(ggml_object),
+            "::",
+            stringify!(type_)
+        )
+    );
+    assert_eq!(
+        unsafe { ::std::ptr::addr_of!((*ptr).padding) as usize - ptr as usize },
+        28usize,
         concat!(
             "Offset of field: ",
             stringify!(ggml_object),
@@ -227,6 +246,7 @@ pub struct ggml_tensor {
     pub ne: [i64; 4usize],
     pub nb: [usize; 4usize],
     pub op: ggml_op,
+    pub op_params: [i32; 8usize],
     pub is_param: bool,
     pub grad: *mut ggml_tensor,
     pub src: [*mut ggml_tensor; 6usize],
@@ -236,7 +256,7 @@ pub struct ggml_tensor {
     pub data: *mut ::std::os::raw::c_void,
     pub name: [::std::os::raw::c_char; 48usize],
     pub extra: *mut ::std::os::raw::c_void,
-    pub padding: [::std::os::raw::c_char; 8usize],
+    pub padding: [::std::os::raw::c_char; 4usize],
 }
 #[test]
 fn bindgen_test_layout_ggml_tensor() {
@@ -244,7 +264,7 @@ fn bindgen_test_layout_ggml_tensor() {
     let ptr = UNINIT.as_ptr();
     assert_eq!(
         ::std::mem::size_of::<ggml_tensor>(),
-        240usize,
+        272usize,
         concat!("Size of: ", stringify!(ggml_tensor))
     );
     assert_eq!(
@@ -313,8 +333,18 @@
         )
     );
     assert_eq!(
-        unsafe { ::std::ptr::addr_of!((*ptr).is_param) as usize - ptr as usize },
+        unsafe { ::std::ptr::addr_of!((*ptr).op_params) as usize - ptr as usize },
         84usize,
+        concat!(
+            "Offset of field: ",
+            stringify!(ggml_tensor),
+            "::",
+            stringify!(op_params)
+        )
+    );
+    assert_eq!(
+        unsafe { ::std::ptr::addr_of!((*ptr).is_param) as usize - ptr as usize },
+        116usize,
         concat!(
             "Offset of field: ",
             stringify!(ggml_tensor),
@@ -324,7 +354,7 @@
     );
     assert_eq!(
         unsafe { ::std::ptr::addr_of!((*ptr).grad) as usize - ptr as usize },
-        88usize,
+        120usize,
         concat!(
             "Offset of field: ",
             stringify!(ggml_tensor),
@@ -334,7 +364,7 @@
     );
     assert_eq!(
         unsafe { ::std::ptr::addr_of!((*ptr).src) as usize - ptr as usize },
-        96usize,
+        128usize,
         concat!(
             "Offset of field: ",
             stringify!(ggml_tensor),
@@ -344,7 +374,7 @@
     );
     assert_eq!(
         unsafe { ::std::ptr::addr_of!((*ptr).perf_runs) as usize - ptr as usize },
-        144usize,
+        176usize,
         concat!(
             "Offset of field: ",
             stringify!(ggml_tensor),
@@ -354,7 +384,7 @@
     );
     assert_eq!(
         unsafe { ::std::ptr::addr_of!((*ptr).perf_cycles) as usize - ptr as usize },
-        152usize,
+        184usize,
         concat!(
             "Offset of field: ",
             stringify!(ggml_tensor),
@@ -364,7 +394,7 @@
     );
     assert_eq!(
         unsafe { ::std::ptr::addr_of!((*ptr).perf_time_us) as usize - ptr as usize },
-        160usize,
+        192usize,
         concat!(
             "Offset of field: ",
             stringify!(ggml_tensor),
@@ -374,7 +404,7 @@
     );
     assert_eq!(
         unsafe { ::std::ptr::addr_of!((*ptr).data) as usize - ptr as usize },
-        168usize,
+        200usize,
         concat!(
             "Offset of field: ",
             stringify!(ggml_tensor),
@@ -384,7 +414,7 @@
     );
     assert_eq!(
         unsafe { ::std::ptr::addr_of!((*ptr).name) as usize - ptr as usize },
-        176usize,
+        208usize,
         concat!(
             "Offset of field: ",
             stringify!(ggml_tensor),
@@ -394,7 +424,7 @@
     );
     assert_eq!(
         unsafe { ::std::ptr::addr_of!((*ptr).extra) as usize - ptr as usize },
-        224usize,
+        256usize,
         concat!(
             "Offset of field: ",
             stringify!(ggml_tensor),
@@ -404,7 +434,7 @@
     );
     assert_eq!(
         unsafe { ::std::ptr::addr_of!((*ptr).padding) as usize - ptr as usize },
-        232usize,
+        264usize,
         concat!(
             "Offset of field: ",
             stringify!(ggml_tensor),
@@ -413,7 +443,7 @@
         )
     );
 }
-pub const GGML_TENSOR_SIZE: usize = 240;
+pub const GGML_TENSOR_SIZE: usize = 272;
 #[repr(C)]
 #[derive(Debug, Copy, Clone)]
 pub struct ggml_cplan {
@@ -508,6 +538,7 @@ pub struct ggml_cgraph {
     pub nodes: [*mut ggml_tensor; 4096usize],
     pub grads: [*mut ggml_tensor; 4096usize],
     pub leafs: [*mut ggml_tensor; 4096usize],
+    pub visited_hash_table: [*mut ::std::os::raw::c_void; 8273usize],
     pub perf_runs: ::std::os::raw::c_int,
     pub perf_cycles: i64,
     pub perf_time_us: i64,
@@ -518,7 +549,7 @@ fn bindgen_test_layout_ggml_cgraph() {
     let ptr = UNINIT.as_ptr();
     assert_eq!(
         ::std::mem::size_of::<ggml_cgraph>(),
-        98336usize,
+        164520usize,
         concat!("Size of: ", stringify!(ggml_cgraph))
     );
     assert_eq!(
@@ -577,8 +608,18 @@ fn bindgen_test_layout_ggml_cgraph() {
         )
     );
     assert_eq!(
-        unsafe { ::std::ptr::addr_of!((*ptr).perf_runs) as usize - ptr as usize },
+        unsafe { ::std::ptr::addr_of!((*ptr).visited_hash_table) as usize - ptr as usize },
         98312usize,
+        concat!(
+            "Offset of field: ",
+            stringify!(ggml_cgraph),
+            "::",
+            stringify!(visited_hash_table)
+        )
+    );
+    assert_eq!(
+        unsafe { ::std::ptr::addr_of!((*ptr).perf_runs) as usize - ptr as usize },
+        164496usize,
         concat!(
             "Offset of field: ",
             stringify!(ggml_cgraph),
@@ -588,7 +629,7 @@
     );
     assert_eq!(
         unsafe { ::std::ptr::addr_of!((*ptr).perf_cycles) as usize - ptr as usize },
-        98320usize,
+        164504usize,
         concat!(
             "Offset of field: ",
             stringify!(ggml_cgraph),
@@ -598,7 +639,7 @@
     );
     assert_eq!(
         unsafe { ::std::ptr::addr_of!((*ptr).perf_time_us) as usize - ptr as usize },
-        98328usize,
+        164512usize,
         concat!(
             "Offset of field: ",
             stringify!(ggml_cgraph),
@@ -607,6 +648,7 @@
         )
     );
 }
+pub const GGML_GRAPH_SIZE: usize = 164520;
 #[repr(C)]
 #[derive(Debug, Copy, Clone)]
 pub struct ggml_scratch {
@@ -846,6 +888,9 @@ extern "C" {
 extern "C" {
     pub fn ggml_op_name(op: ggml_op) -> *const ::std::os::raw::c_char;
 }
+extern "C" {
+    pub fn ggml_op_symbol(op: ggml_op) -> *const ::std::os::raw::c_char;
+}
 extern "C" {
     pub fn ggml_element_size(tensor: *const ggml_tensor) -> usize;
 }
@@ -879,6 +924,9 @@ extern "C" {
 extern "C" {
     pub fn ggml_set_scratch(ctx: *mut ggml_context, scratch: ggml_scratch) -> usize;
 }
+extern "C" {
+    pub fn ggml_get_no_alloc(ctx: *mut ggml_context) -> bool;
+}
 extern "C" {
     pub fn ggml_set_no_alloc(ctx: *mut ggml_context, no_alloc: bool);
 }
@@ -978,6 +1026,9 @@ extern "C" {
 extern "C" {
     pub fn ggml_get_data_f32(tensor: *const ggml_tensor) -> *mut f32;
 }
+extern "C" {
+    pub fn ggml_get_unary_op(tensor: *const ggml_tensor) -> ggml_unary_op;
+}
 extern "C" {
     pub fn ggml_get_name(tensor: *const ggml_tensor) -> *const ::std::os::raw::c_char;
 }
@@ -997,6 +1048,9 @@ extern "C" {
 extern "C" {
     pub fn ggml_dup(ctx: *mut ggml_context, a: *mut ggml_tensor) -> *mut ggml_tensor;
 }
+extern "C" {
+    pub fn ggml_dup_inplace(ctx: *mut ggml_context, a: *mut ggml_tensor) -> *mut ggml_tensor;
+}
 extern "C" {
     pub fn ggml_add(
         ctx: *mut ggml_context,
@@ -1208,10 +1262,15 @@ extern "C" {
     pub fn ggml_norm_inplace(ctx: *mut ggml_context, a: *mut ggml_tensor) -> *mut ggml_tensor;
 }
 extern "C" {
-    pub fn ggml_rms_norm(ctx: *mut ggml_context, a: *mut ggml_tensor) -> *mut ggml_tensor;
+    pub fn ggml_rms_norm(ctx: *mut ggml_context, a: *mut ggml_tensor, eps: f32)
+        -> *mut ggml_tensor;
 }
 extern "C" {
-    pub fn ggml_rms_norm_inplace(ctx: *mut ggml_context, a: *mut ggml_tensor) -> *mut ggml_tensor;
+    pub fn ggml_rms_norm_inplace(
+        ctx: *mut ggml_context,
+        a: *mut ggml_tensor,
+        eps: f32,
+    ) -> *mut ggml_tensor;
 }
 extern "C" {
     pub fn ggml_rms_norm_back(
@@ -1311,9 +1370,19 @@ extern "C" {
         b: *mut ggml_tensor,
     ) -> *mut ggml_tensor;
 }
+extern "C" {
+    pub fn ggml_cpy_inplace(
+        ctx: *mut ggml_context,
+        a: *mut ggml_tensor,
+        b: *mut ggml_tensor,
+    ) -> *mut ggml_tensor;
+}
 extern "C" {
     pub fn ggml_cont(ctx: *mut ggml_context, a: *mut ggml_tensor) -> *mut ggml_tensor;
 }
+extern "C" {
+    pub fn ggml_cont_inplace(ctx: *mut ggml_context, a: *mut ggml_tensor) -> *mut ggml_tensor;
+}
 extern "C" {
     pub fn ggml_reshape(
         ctx: *mut ggml_context,
@@ -1505,9 +1574,9 @@ extern "C" {
         n_past: ::std::os::raw::c_int,
         n_dims: ::std::os::raw::c_int,
         mode: ::std::os::raw::c_int,
+        n_ctx: ::std::os::raw::c_int,
         freq_base: f32,
         freq_scale: f32,
-        n_ctx: ::std::os::raw::c_int,
     ) -> *mut ggml_tensor;
 }
 extern "C" {
@@ -1517,6 +1586,7 @@ extern "C" {
         n_past: ::std::os::raw::c_int,
         n_dims: ::std::os::raw::c_int,
         mode: ::std::os::raw::c_int,
+        n_ctx: ::std::os::raw::c_int,
     ) -> *mut ggml_tensor;
 }
 extern "C" {
@@ -1668,6 +1738,20 @@ pub type ggml_custom3_op_f32_t = ::std::option::Option<
         arg4: *const ggml_tensor,
     ),
 >;
+extern "C" {
+    pub fn ggml_unary(
+        ctx: *mut ggml_context,
+        a: *mut ggml_tensor,
+        op: ggml_unary_op,
+    ) -> *mut ggml_tensor;
+}
+extern "C" {
+    pub fn ggml_unary_inplace(
+        ctx: *mut ggml_context,
+        a: *mut ggml_tensor,
+        op: ggml_unary_op,
+    ) -> *mut ggml_tensor;
+}
 extern "C" {
     pub fn ggml_map_unary_f32(
         ctx: *mut ggml_context,
@@ -1777,6 +1861,18 @@ extern "C" {
         keep: bool,
     ) -> ggml_cgraph;
 }
+extern "C" {
+    pub fn ggml_new_graph(ctx: *mut ggml_context) -> *mut ggml_cgraph;
+}
+extern "C" {
+    pub fn ggml_build_forward_ctx(
+        ctx: *mut ggml_context,
+        tensor: *mut ggml_tensor,
+    ) -> *mut ggml_cgraph;
+}
+extern "C" {
+    pub fn ggml_graph_overhead() -> usize;
+}
 extern "C" {
     pub fn ggml_graph_plan(
         cgraph: *mut ggml_cgraph,
diff --git a/crates/ggml/sys/src/llama.rs b/crates/ggml/sys/src/llama.rs
index 2d5a9a6f..a8aa42ef 100644
--- a/crates/ggml/sys/src/llama.rs
+++ b/crates/ggml/sys/src/llama.rs
@@ -12,6 +12,7 @@ pub const LLAMA_FILE_MAGIC_UNVERSIONED: u32 = 1734831468;
 pub const LLAMA_SESSION_MAGIC: u32 = 1734833006;
 pub const LLAMA_SESSION_VERSION: u32 = 1;
 pub const LLAMA_DEFAULT_SEED: u32 = 4294967295;
+pub const LLAMA_DEFAULT_RMS_EPS: f64 = 0.000005;
 pub const LLAMA_FTYPE_ALL_F32: llama_ftype = 0;
 pub const LLAMA_FTYPE_MOSTLY_F16: llama_ftype = 1;
 pub const LLAMA_FTYPE_MOSTLY_Q4_0: llama_ftype = 2;
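`ggml_rms_norm` and `ggml_rms_norm_inplace` now take their epsilon explicitly; `DEFAULT_EPS` (re-exporting `LLAMA_DEFAULT_RMS_EPS`, i.e. 5e-6) is what `op_rms_norm` feeds in. For intuition, a scalar sketch of the row-wise operation the epsilon stabilizes (a restatement of RMSNorm, not this crate's kernel):

```rust
/// RMSNorm over one row: x / sqrt(mean(x^2) + eps). The epsilon keeps the
/// division finite when a row is all (near-)zero.
fn rms_norm(row: &[f32], eps: f32) -> Vec<f32> {
    let mean_sq = row.iter().map(|x| x * x).sum::<f32>() / row.len() as f32;
    let inv_rms = 1.0 / (mean_sq + eps).sqrt();
    row.iter().map(|x| x * inv_rms).collect()
}
```

Note also the reordered tail of `ggml_rope_custom_inplace`: `n_ctx` now precedes `freq_base`/`freq_scale`, matching the argument order used by the safe wrapper in `context.rs`.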
diff --git a/crates/ggml/sys/src/metal.rs b/crates/ggml/sys/src/metal.rs
index 37d97e68..bbd16034 100644
--- a/crates/ggml/sys/src/metal.rs
+++ b/crates/ggml/sys/src/metal.rs
@@ -40,6 +40,12 @@ extern "C" {
 extern "C" {
     pub fn ggml_metal_get_tensor(ctx: *mut ggml_metal_context, t: *mut ggml_tensor);
 }
+extern "C" {
+    pub fn ggml_metal_graph_find_concurrency(ctx: *mut ggml_metal_context, gf: *mut ggml_cgraph);
+}
+extern "C" {
+    pub fn ggml_metal_if_optimized(ctx: *mut ggml_metal_context) -> bool;
+}
 extern "C" {
     pub fn ggml_metal_graph_compute(ctx: *mut ggml_metal_context, gf: *mut ggml_cgraph);
 }
diff --git a/crates/llm-base/src/model/mod.rs b/crates/llm-base/src/model/mod.rs
index 2fffbace..3d5bc163 100644
--- a/crates/llm-base/src/model/mod.rs
+++ b/crates/llm-base/src/model/mod.rs
@@ -207,6 +207,8 @@ pub struct ModelParameters {
     pub use_gpu: bool,
     /// If `use_gpu` is active this defines the number of layers to offload to the gpu. If `None`, all layers will be offloaded.
     pub gpu_layers: Option<usize>,
+    /// The arguments/overrides to pass to the [custom RoPE](https://arxiv.org/pdf/2306.15595.pdf) function, if it is used by the model.
+    pub rope_overrides: Option<RoPEOverrides>,
 }
 
 impl Default for ModelParameters {
@@ -217,6 +219,7 @@ impl Default for ModelParameters {
             lora_adapters: None,
             use_gpu: false,
             gpu_layers: None,
+            rope_overrides: None,
         }
     }
 }
diff --git a/crates/llm/src/lib.rs b/crates/llm/src/lib.rs
index 94aa1260..37514511 100644
--- a/crates/llm/src/lib.rs
+++ b/crates/llm/src/lib.rs
@@ -79,14 +79,15 @@ use std::{
 pub use llm_base::{
     conversation_inference_callback, feed_prompt_callback,
     ggml::accelerator::get_accelerator as ggml_get_accelerator,
-    ggml::accelerator::Accelerator as GgmlAccelerator, ggml::format as ggml_format, load,
-    load_progress_callback_stdout, quantize, samplers, ElementType, FileType, FileTypeFormat,
-    FormatMagic, Hyperparameters, InferenceError, InferenceFeedback, InferenceParameters,
-    InferenceRequest, InferenceResponse, InferenceSession, InferenceSessionConfig,
-    InferenceSnapshot, InferenceSnapshotRef, InferenceStats, InvalidTokenBias, KnownModel,
-    LoadError, LoadProgress, Loader, Model, ModelKVMemoryType, ModelParameters, OutputRequest,
-    Prompt, QuantizeError, QuantizeProgress, RewindError, Sampler, SnapshotError, TokenBias,
-    TokenId, TokenUtf8Buffer, TokenizationError, Tokenizer, TokenizerSource,
+    ggml::accelerator::Accelerator as GgmlAccelerator, ggml::format as ggml_format,
+    ggml::RoPEOverrides, load, load_progress_callback_stdout, quantize, samplers, ElementType,
+    FileType, FileTypeFormat, FormatMagic, Hyperparameters, InferenceError, InferenceFeedback,
+    InferenceParameters, InferenceRequest, InferenceResponse, InferenceSession,
+    InferenceSessionConfig, InferenceSnapshot, InferenceSnapshotRef, InferenceStats,
+    InvalidTokenBias, KnownModel, LoadError, LoadProgress, Loader, Model, ModelKVMemoryType,
+    ModelParameters, OutputRequest, Prompt, QuantizeError, QuantizeProgress, RewindError, Sampler,
+    SnapshotError, TokenBias, TokenId, TokenUtf8Buffer, TokenizationError, Tokenizer,
+    TokenizerSource,
 };
 use serde::Serialize;
diff --git a/crates/models/falcon/src/lib.rs b/crates/models/falcon/src/lib.rs
index 1c4c11c4..e83d2252 100644
--- a/crates/models/falcon/src/lib.rs
+++ b/crates/models/falcon/src/lib.rs
@@ -193,8 +193,9 @@ impl KnownModel for Falcon {
         );
 
         // using mode = 2 for neox mode
-        qcur = ctx0.op_rope_inplace(&qcur, session_len, head_dim, 2);
-        kcur = ctx0.op_rope_inplace(&kcur, session_len, head_dim, 2);
+        let overrides = self.params.rope_overrides.as_ref();
+        qcur = ctx0.op_rope_inplace(&qcur, session_len, head_dim, 2, overrides);
+        kcur = ctx0.op_rope_inplace(&kcur, session_len, head_dim, 2, overrides);
 
         // store key and value to memory
 
diff --git a/crates/models/gptj/src/lib.rs b/crates/models/gptj/src/lib.rs
index 6464cf72..690fa643 100644
--- a/crates/models/gptj/src/lib.rs
+++ b/crates/models/gptj/src/lib.rs
@@ -147,6 +147,7 @@ impl KnownModel for GptJ {
         let input_sa = current.share();
 
         // self-attention
+        let overrides = self.params.rope_overrides.as_ref();
         let qcur = ctx0.op_rope_inplace(
             &ctx0.op_reshape_3d(
                 &ctx0.op_mul_mat(&self.layers[il].c_attn_q_proj_w, &current),
@@ -157,6 +158,7 @@
             session_len,
             n_rot,
             0,
+            overrides,
         );
         let kcur = ctx0.op_rope_inplace(
             &ctx0.op_reshape_3d(
@@ -168,6 +170,7 @@
             session_len,
             n_rot,
             0,
+            overrides,
         );
 
         // self-attention store key and value to memory
diff --git a/crates/models/gptneox/src/lib.rs b/crates/models/gptneox/src/lib.rs
index c64bce50..b7f5f2bf 100644
--- a/crates/models/gptneox/src/lib.rs
+++ b/crates/models/gptneox/src/lib.rs
@@ -193,8 +193,9 @@ impl KnownModel for GptNeoX {
         ));
 
         // self-attention using mode = 2 for GPT-NeoX mode
-        qcur = ctx0.op_rope_inplace(&qcur, n_past, n_rot, 2);
-        kcur = ctx0.op_rope_inplace(&kcur, n_past, n_rot, 2);
+        let overrides = self.params.rope_overrides.as_ref();
+        qcur = ctx0.op_rope_inplace(&qcur, n_past, n_rot, 2, overrides);
+        kcur = ctx0.op_rope_inplace(&kcur, n_past, n_rot, 2, overrides);
 
         // store key and value to memory
         vcur = ctx0.op_transpose(&ctx0.op_reshape_2d(&vcur, n_embd, n));
diff --git a/crates/models/llama/src/lib.rs b/crates/models/llama/src/lib.rs
index 24fb8e02..6c0f5136 100644
--- a/crates/models/llama/src/lib.rs
+++ b/crates/models/llama/src/lib.rs
@@ -162,6 +162,7 @@ impl KnownModel for Llama {
 
         // self-attention
         // compute Q and K and RoPE them
+        let overrides = self.params.rope_overrides.as_ref();
         let q_current = ctx0
             .op_rope_inplace(
                 &ctx0.op_reshape_3d(
@@ -173,6 +174,7 @@
                 session_len,
                 n_rot,
                 0,
+                overrides,
             )
             .set_name("Qcur");
         let k_current = ctx0
@@ -186,6 +188,7 @@
                 session_len,
                 n_rot,
                 0,
+                overrides,
             )
             .set_name("Kcur");
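Taken together, library consumers opt in through `ModelParameters` and every RoPE-using model picks the override up automatically. A hedged usage sketch (field and type names as introduced in this diff; the values are illustrative, not recommendations):

```rust
use llm::{ModelParameters, RoPEOverrides};

fn extended_context_params() -> ModelParameters {
    ModelParameters {
        // Halve every rotation frequency: the linear-interpolation trick from
        // the paper linked on `rope_overrides`, letting a model trained with a
        // 2048-token window address roughly twice that.
        rope_overrides: Some(RoPEOverrides {
            frequency_scale: 0.5,
            ..Default::default()
        }),
        ..Default::default()
    }
}
```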