From aacb9d4e3e86495a3fbe1bd831acea9e0b4c284c Mon Sep 17 00:00:00 2001
From: Philippe Laferriere <plafer@protonmail.com>
Date: Tue, 10 Dec 2024 08:06:47 -0500
Subject: [PATCH 01/19] feat: update memory chiplet to be element-addressable

---
 air/src/constraints/chiplets/memory/mod.rs    |   6 +-
 air/src/constraints/chiplets/memory/tests.rs  |  13 +-
 air/src/trace/chiplets/memory.rs              |  41 +-
 air/src/trace/chiplets/mod.rs                 |   2 +-
 air/src/trace/mod.rs                          |   2 +-
 assembly/src/assembler/instruction/mem_ops.rs |   9 +-
 assembly/src/assembler/mod.rs                 |  14 +-
 assembly/src/tests.rs                         |  11 +-
 miden/src/cli/debug/executor.rs               |   9 +-
 miden/src/repl/mod.rs                         |  21 +-
 miden/tests/integration/exec_iters.rs         |  50 +-
 processor/src/chiplets/memory/mod.rs          | 213 ++++--
 processor/src/chiplets/memory/segment.rs      | 285 ++++++--
 processor/src/chiplets/memory/tests.rs        | 660 ++++++++++++------
 processor/src/chiplets/mod.rs                 | 123 +---
 processor/src/chiplets/tests.rs               |   3 +-
 processor/src/debug.rs                        |  21 +-
 processor/src/decoder/mod.rs                  |  10 +-
 processor/src/decoder/tests.rs                |  10 +-
 processor/src/errors.rs                       |   8 +
 processor/src/host/debug.rs                   |  60 +-
 processor/src/lib.rs                          |  18 +-
 processor/src/operations/comb_ops.rs          |  12 +-
 processor/src/operations/io_ops.rs            | 347 +++++----
 .../operations/sys_ops/sys_event_handlers.rs  |  19 +-
 processor/src/trace/tests/chiplets/memory.rs  |  10 +-
 stdlib/tests/mem/mod.rs                       |  21 +-
 test-utils/src/lib.rs                         |  24 +-
 28 files changed, 1252 insertions(+), 770 deletions(-)

diff --git a/air/src/constraints/chiplets/memory/mod.rs b/air/src/constraints/chiplets/memory/mod.rs
index 1f1289be8e..709fe0a42f 100644
--- a/air/src/constraints/chiplets/memory/mod.rs
+++ b/air/src/constraints/chiplets/memory/mod.rs
@@ -5,7 +5,7 @@ use winter_air::TransitionConstraintDegree;
 use super::{EvaluationFrame, FieldElement};
 use crate::{
     trace::chiplets::{
-        memory::NUM_ELEMENTS, MEMORY_ADDR_COL_IDX, MEMORY_CLK_COL_IDX, MEMORY_CTX_COL_IDX,
+        memory::NUM_ELEMENTS_IN_BATCH, MEMORY_ADDR_COL_IDX, MEMORY_CLK_COL_IDX, MEMORY_CTX_COL_IDX,
         MEMORY_D0_COL_IDX, MEMORY_D1_COL_IDX, MEMORY_D_INV_COL_IDX, MEMORY_TRACE_OFFSET,
         MEMORY_V_COL_RANGE,
     },
@@ -152,13 +152,13 @@ fn enforce_values<E: FieldElement>(
     let mut index = 0;
 
     // initialize memory to zero when reading from new context and address pair.
-    for i in 0..NUM_ELEMENTS {
+    for i in 0..NUM_ELEMENTS_IN_BATCH {
         result[index] = memory_flag * frame.init_read_flag() * frame.v(i);
         index += 1;
     }
 
     // copy previous values when reading memory that was previously accessed.
-    for i in 0..NUM_ELEMENTS {
+    for i in 0..NUM_ELEMENTS_IN_BATCH {
         result[index] = memory_flag * frame.copy_read_flag() * (frame.v_next(i) - frame.v(i));
         index += 1;
     }
diff --git a/air/src/constraints/chiplets/memory/tests.rs b/air/src/constraints/chiplets/memory/tests.rs
index cdb34b117d..077a2ef89b 100644
--- a/air/src/constraints/chiplets/memory/tests.rs
+++ b/air/src/constraints/chiplets/memory/tests.rs
@@ -4,13 +4,14 @@ use rand_utils::rand_value;
 
 use super::{
     EvaluationFrame, MEMORY_ADDR_COL_IDX, MEMORY_CLK_COL_IDX, MEMORY_CTX_COL_IDX,
-    MEMORY_D0_COL_IDX, MEMORY_D1_COL_IDX, MEMORY_D_INV_COL_IDX, MEMORY_V_COL_RANGE, NUM_ELEMENTS,
+    MEMORY_D0_COL_IDX, MEMORY_D1_COL_IDX, MEMORY_D_INV_COL_IDX, MEMORY_V_COL_RANGE,
+    NUM_ELEMENTS_IN_BATCH,
 };
 use crate::{
     chiplets::memory,
     trace::{
         chiplets::{
-            memory::{Selectors, MEMORY_COPY_READ, MEMORY_INIT_READ, MEMORY_WRITE},
+            memory::{Selectors, MEMORY_COPY_READ, MEMORY_INIT_READ, MEMORY_WRITE_SELECTOR},
             MEMORY_TRACE_OFFSET,
         },
         TRACE_WIDTH,
@@ -30,7 +31,7 @@ fn test_memory_write() {
 
     // Write to a new context.
     let result = get_constraint_evaluation(
-        MEMORY_WRITE,
+        MEMORY_WRITE_SELECTOR,
         MemoryTestDeltaType::Context,
         &old_values,
         &new_values,
@@ -39,7 +40,7 @@ fn test_memory_write() {
 
     // Write to a new address in the same context.
     let result = get_constraint_evaluation(
-        MEMORY_WRITE,
+        MEMORY_WRITE_SELECTOR,
         MemoryTestDeltaType::Address,
         &old_values,
         &new_values,
@@ -48,7 +49,7 @@ fn test_memory_write() {
 
     // Write to the same context and address at a new clock cycle.
     let result = get_constraint_evaluation(
-        MEMORY_WRITE,
+        MEMORY_WRITE_SELECTOR,
         MemoryTestDeltaType::Clock,
         &old_values,
         &new_values,
@@ -160,7 +161,7 @@ fn get_test_frame(
     next[MEMORY_CLK_COL_IDX] = Felt::new(delta_row[2]);
 
     // Set the old and new values.
-    for idx in 0..NUM_ELEMENTS {
+    for idx in 0..NUM_ELEMENTS_IN_BATCH {
         let old_value = Felt::new(old_values[idx] as u64);
         // Add a write for the old values to the current row.
         current[MEMORY_V_COL_RANGE.start + idx] = old_value;
diff --git a/air/src/trace/chiplets/memory.rs b/air/src/trace/chiplets/memory.rs
index 6e531d30d1..7f62ea5be3 100644
--- a/air/src/trace/chiplets/memory.rs
+++ b/air/src/trace/chiplets/memory.rs
@@ -4,8 +4,9 @@ use super::{create_range, Felt, Range, ONE, ZERO};
 // ================================================================================================
 
 /// Number of columns needed to record an execution trace of the memory chiplet.
-pub const TRACE_WIDTH: usize = 12;
+pub const TRACE_WIDTH: usize = 15;
 
+// TODO(plafer): get rid of all "selector" constants
 /// Number of selector columns in the trace.
 pub const NUM_SELECTORS: usize = 2;
 
@@ -15,8 +16,6 @@ pub const NUM_SELECTORS: usize = 2;
 /// read / write) is to be applied at a specific row of the memory execution trace.
 pub type Selectors = [Felt; NUM_SELECTORS];
 
-// --- OPERATION SELECTORS ------------------------------------------------------------------------
-
 /// Specifies an operation that initializes new memory and then reads it.
 pub const MEMORY_INIT_READ: Selectors = [ONE, ZERO];
 
@@ -24,7 +23,20 @@ pub const MEMORY_INIT_READ: Selectors = [ONE, ZERO];
 pub const MEMORY_COPY_READ: Selectors = [ONE, ONE];
 
 /// Specifies a memory write operation.
-pub const MEMORY_WRITE: Selectors = [ZERO, ZERO];
+pub const MEMORY_WRITE_SELECTOR: Selectors = [ZERO, ZERO];
+
+// --- OPERATION SELECTORS ------------------------------------------------------------------------
+
+/// Specifies the value of the `READ_WRITE` column when the operation is a write.
+pub const MEMORY_WRITE: Felt = ZERO;
+/// Specifies the value of the `READ_WRITE` column when the operation is a read.
+pub const MEMORY_READ: Felt = ONE;
+/// Specifies the value of the `ELEMENT_OR_WORD` column when the operation is over an element.
+pub const MEMORY_ACCESS_ELEMENT: Felt = ZERO;
+/// Specifies the value of the `ELEMENT_OR_WORD` column when the operation is over a word.
+pub const MEMORY_ACCESS_WORD: Felt = ONE;
+
+// TODO(plafer): figure out the new labels
 
 /// Unique label computed as 1 plus the full chiplet selector with the bits reversed.
 /// mem_read selector=[1, 1, 0, 1], rev(selector)=[1, 0, 1, 1], +1=[1, 1, 0, 0]
@@ -37,17 +49,25 @@ pub const MEMORY_WRITE_LABEL: u8 = 0b0100;
 // --- COLUMN ACCESSOR INDICES WITHIN THE CHIPLET -------------------------------------------------
 
 /// The number of elements accessible in one read or write memory access.
-pub const NUM_ELEMENTS: usize = 4;
+pub const NUM_ELEMENTS_IN_BATCH: usize = 4;
 
+/// Column to hold whether the operation is a read or a write.
+pub const READ_WRITE_COL_IDX: usize = 0;
+/// Column to hold whether the operation is over an element or a word.
+pub const ELEMENT_OR_WORD_COL_IDX: usize = READ_WRITE_COL_IDX + 1;
 /// Column to hold the context ID of the current memory context.
-pub const CTX_COL_IDX: usize = NUM_SELECTORS;
+pub const CTX_COL_IDX: usize = ELEMENT_OR_WORD_COL_IDX + 1;
 /// Column to hold the memory address.
-pub const ADDR_COL_IDX: usize = CTX_COL_IDX + 1;
+pub const BATCH_COL_IDX: usize = CTX_COL_IDX + 1;
+/// Column to hold the first bit of the index of the address in the batch.
+pub const IDX0_COL_IDX: usize = BATCH_COL_IDX + 1;
+/// Column to hold the second bit of the index of the address in the batch.
+pub const IDX1_COL_IDX: usize = IDX0_COL_IDX + 1;
 /// Column for the clock cycle in which the memory operation occurred.
-pub const CLK_COL_IDX: usize = ADDR_COL_IDX + 1;
+pub const CLK_COL_IDX: usize = IDX1_COL_IDX + 1;
 /// Columns to hold the values stored at a given memory context, address, and clock cycle after
 /// the memory operation. When reading from a new address, these are initialized to zero.
-pub const V_COL_RANGE: Range<usize> = create_range(CLK_COL_IDX + 1, NUM_ELEMENTS);
+pub const V_COL_RANGE: Range<usize> = create_range(CLK_COL_IDX + 1, NUM_ELEMENTS_IN_BATCH);
 /// Column for the lower 16-bits of the delta between two consecutive context IDs, addresses, or
 /// clock cycles.
 pub const D0_COL_IDX: usize = V_COL_RANGE.end;
@@ -57,3 +77,6 @@ pub const D1_COL_IDX: usize = D0_COL_IDX + 1;
 /// Column for the inverse of the delta between two consecutive context IDs, addresses, or clock
 /// cycles, used to enforce that changes are correctly constrained.
 pub const D_INV_COL_IDX: usize = D1_COL_IDX + 1;
+/// Column to hold the flag indicating whether the current memory operation is in the same batch and
+/// same context as the previous operation.
+pub const FLAG_SAME_BATCH_AND_CONTEXT: usize = D_INV_COL_IDX + 1;
diff --git a/air/src/trace/chiplets/mod.rs b/air/src/trace/chiplets/mod.rs
index d892c67e36..91a82b19b1 100644
--- a/air/src/trace/chiplets/mod.rs
+++ b/air/src/trace/chiplets/mod.rs
@@ -92,7 +92,7 @@ pub const MEMORY_SELECTORS_COL_IDX: usize = MEMORY_TRACE_OFFSET;
 /// The index within the main trace of the column containing the memory context.
 pub const MEMORY_CTX_COL_IDX: usize = MEMORY_TRACE_OFFSET + memory::CTX_COL_IDX;
 /// The index within the main trace of the column containing the memory address.
-pub const MEMORY_ADDR_COL_IDX: usize = MEMORY_TRACE_OFFSET + memory::ADDR_COL_IDX;
+pub const MEMORY_ADDR_COL_IDX: usize = MEMORY_TRACE_OFFSET + memory::BATCH_COL_IDX;
 /// The index within the main trace of the column containing the clock cycle of the memory
 /// access.
 pub const MEMORY_CLK_COL_IDX: usize = MEMORY_TRACE_OFFSET + memory::CLK_COL_IDX;
diff --git a/air/src/trace/mod.rs b/air/src/trace/mod.rs
index 62c7a910f5..c4eef3e810 100644
--- a/air/src/trace/mod.rs
+++ b/air/src/trace/mod.rs
@@ -51,7 +51,7 @@ pub const RANGE_CHECK_TRACE_RANGE: Range<usize> =
 
 // Chiplets trace
 pub const CHIPLETS_OFFSET: usize = RANGE_CHECK_TRACE_RANGE.end;
-pub const CHIPLETS_WIDTH: usize = 17;
+pub const CHIPLETS_WIDTH: usize = 18;
 pub const CHIPLETS_RANGE: Range<usize> = range(CHIPLETS_OFFSET, CHIPLETS_WIDTH);
 
 pub const TRACE_WIDTH: usize = CHIPLETS_OFFSET + CHIPLETS_WIDTH;
diff --git a/assembly/src/assembler/instruction/mem_ops.rs b/assembly/src/assembler/instruction/mem_ops.rs
index cdfbdc79c5..a20a424988 100644
--- a/assembly/src/assembler/instruction/mem_ops.rs
+++ b/assembly/src/assembler/instruction/mem_ops.rs
@@ -1,6 +1,6 @@
 use alloc::string::ToString;
 
-use vm_core::{Felt, Operation::*};
+use vm_core::{Felt, Operation::*, WORD_SIZE};
 
 use super::{push_felt, push_u32_value, validate_param, BasicBlockBuilder};
 use crate::{assembler::ProcedureContext, diagnostics::Report, AssemblyError};
@@ -111,7 +111,7 @@ pub fn mem_write_imm(
 /// Returns an error if index is greater than the number of procedure locals.
 pub fn local_to_absolute_addr(
     block_builder: &mut BasicBlockBuilder,
-    index: u16,
+    index_of_local: u16,
     num_proc_locals: u16,
 ) -> Result<(), AssemblyError> {
     if num_proc_locals == 0 {
@@ -125,9 +125,10 @@ pub fn local_to_absolute_addr(
     }
 
     let max = num_proc_locals - 1;
-    validate_param(index, 0..=max)?;
+    validate_param(index_of_local, 0..=max)?;
 
-    push_felt(block_builder, -Felt::from(max - index));
+    let fmp_offset_of_local = (max - index_of_local) * WORD_SIZE as u16;
+    push_felt(block_builder, -Felt::from(fmp_offset_of_local));
     block_builder.push_op(FmpAdd);
 
     Ok(())
diff --git a/assembly/src/assembler/mod.rs b/assembly/src/assembler/mod.rs
index 2ddc94e31a..01804d83fa 100644
--- a/assembly/src/assembler/mod.rs
+++ b/assembly/src/assembler/mod.rs
@@ -7,7 +7,7 @@ use vm_core::{
     crypto::hash::RpoDigest,
     debuginfo::SourceSpan,
     mast::{DecoratorId, MastNodeId},
-    DecoratorList, Felt, Kernel, Operation, Program,
+    DecoratorList, Felt, Kernel, Operation, Program, WORD_SIZE,
 };
 
 use crate::{
@@ -574,12 +574,14 @@ impl Assembler {
         let proc_body_id = if num_locals > 0 {
             // for procedures with locals, we need to update fmp register before and after the
             // procedure body is executed. specifically:
-            // - to allocate procedure locals we need to increment fmp by the number of locals
-            // - to deallocate procedure locals we need to decrement it by the same amount
-            let num_locals = Felt::from(num_locals);
+            // - to allocate procedure locals we need to increment fmp by 4 times the number of
+            //   locals
+            // - to deallocate procedure locals we need to decrement it by the same amount. We leave
+            // 4 elements between locals to properly support reading and writing words to locals.
+            let locals_frame = Felt::from(num_locals * WORD_SIZE as u16);
             let wrapper = BodyWrapper {
-                prologue: vec![Operation::Push(num_locals), Operation::FmpUpdate],
-                epilogue: vec![Operation::Push(-num_locals), Operation::FmpUpdate],
+                prologue: vec![Operation::Push(locals_frame), Operation::FmpUpdate],
+                epilogue: vec![Operation::Push(-locals_frame), Operation::FmpUpdate],
             };
             self.compile_body(proc.iter(), &mut proc_ctx, Some(wrapper), mast_forest_builder)?
         } else {
diff --git a/assembly/src/tests.rs b/assembly/src/tests.rs
index 0cc537e3a2..f9d578e5bd 100644
--- a/assembly/src/tests.rs
+++ b/assembly/src/tests.rs
@@ -1800,18 +1800,19 @@ fn program_with_proc_locals() -> TestResult {
             mul \
         end \
         begin \
-            push.4 push.3 push.2 \
+            push.10 push.9 push.8 \
             exec.foo \
         end"
     );
     let program = context.assemble(source)?;
+    // Note: 18446744069414584317 == -4 (mod 2^64 - 2^32 + 1)
     let expected = "\
 begin
     basic_block
+        push(10)
+        push(9)
+        push(8)
         push(4)
-        push(3)
-        push(2)
-        push(1)
         fmpupdate
         pad
         fmpadd
@@ -1822,7 +1823,7 @@ begin
         fmpadd
         mload
         mul
-        push(18446744069414584320)
+        push(18446744069414584317)
         fmpupdate
     end
 end";
diff --git a/miden/src/cli/debug/executor.rs b/miden/src/cli/debug/executor.rs
index d2a145fe03..c755f3ead0 100644
--- a/miden/src/cli/debug/executor.rs
+++ b/miden/src/cli/debug/executor.rs
@@ -154,7 +154,7 @@ impl DebugExecutor {
 
     /// print all memory entries.
     pub fn print_memory(&self) {
-        for (address, mem) in self.vm_state.memory.iter() {
+        for &(address, mem) in self.vm_state.memory.iter() {
             Self::print_memory_data(address, mem)
         }
     }
@@ -167,7 +167,7 @@ impl DebugExecutor {
         });
 
         match entry {
-            Some(mem) => Self::print_memory_data(&address, mem),
+            Some(&mem) => Self::print_memory_data(address, mem),
             None => println!("memory at address '{address}' not found"),
         }
     }
@@ -176,9 +176,8 @@ impl DebugExecutor {
     // --------------------------------------------------------------------------------------------
 
     /// print memory data.
-    fn print_memory_data(address: &u64, memory: &[Felt]) {
-        let mem_int = memory.iter().map(|&x| x.as_int()).collect::<Vec<_>>();
-        println!("{address} {mem_int:?}");
+    fn print_memory_data(address: u64, mem_value: Felt) {
+        println!("{address} {mem_value:?}");
     }
 
     /// print help message
diff --git a/miden/src/repl/mod.rs b/miden/src/repl/mod.rs
index 84dfe8df47..53cf3cecdb 100644
--- a/miden/src/repl/mod.rs
+++ b/miden/src/repl/mod.rs
@@ -1,7 +1,7 @@
 use std::{collections::BTreeSet, path::PathBuf};
 
 use assembly::{Assembler, Library};
-use miden_vm::{math::Felt, DefaultHost, StackInputs, Word};
+use miden_vm::{math::Felt, DefaultHost, StackInputs};
 use processor::ContextId;
 use rustyline::{error::ReadlineError, DefaultEditor};
 use stdlib::StdLibrary;
@@ -171,7 +171,7 @@ pub fn start_repl(library_paths: &Vec<PathBuf>, use_stdlib: bool) {
     let mut should_print_stack = false;
 
     // state of the entire memory at the latest clock cycle.
-    let mut memory: Vec<(u64, Word)> = Vec::new();
+    let mut memory: Vec<(u64, Felt)> = Vec::new();
 
     // initializing readline.
     let mut rl = DefaultEditor::new().expect("Readline couldn't be initialized");
@@ -224,9 +224,9 @@ pub fn start_repl(library_paths: &Vec<PathBuf>, use_stdlib: bool) {
                         println!("The memory has not been initialized yet");
                         continue;
                     }
-                    for (addr, mem) in &memory {
+                    for &(addr, mem) in &memory {
                         // prints out the address and memory value at that address.
-                        print_mem_address(*addr, mem);
+                        print_mem_address(addr, mem);
                     }
                 } else if line.len() > 6 && &line[..5] == "!mem[" {
                     // if user wants to see the state of a particular address in a memory, the input
@@ -238,8 +238,8 @@ pub fn start_repl(library_paths: &Vec<PathBuf>, use_stdlib: bool) {
                     // extracts the address from user input.
                     match read_mem_address(&line) {
                         Ok(addr) => {
-                            for (i, memory_value) in &memory {
-                                if *i == addr {
+                            for &(i, memory_value) in &memory {
+                                if i == addr {
                                     // prints the address and memory value at that address.
                                     print_mem_address(addr, memory_value);
                                     // sets the flag to true as the address has been initialized.
@@ -305,7 +305,7 @@ pub fn start_repl(library_paths: &Vec<PathBuf>, use_stdlib: bool) {
 fn execute(
     program: String,
     provided_libraries: &[Library],
-) -> Result<(Vec<(u64, Word)>, Vec<Felt>), String> {
+) -> Result<(Vec<(u64, Felt)>, Vec<Felt>), String> {
     // compile program
     let mut assembler = Assembler::default();
 
@@ -329,7 +329,7 @@ fn execute(
     }
 
     // loads the memory at the latest clock cycle.
-    let mem_state = chiplets.get_mem_state_at(ContextId::root(), system.clk());
+    let mem_state = chiplets.memory().get_state_at(ContextId::root(), system.clk());
     // loads the stack along with the overflow values at the latest clock cycle.
     let stack_state = stack.get_state_at(system.clk());
 
@@ -404,7 +404,6 @@ fn print_stack(stack: Vec<Felt>) {
 
 /// Accepts and returns a memory at an address by converting its register into integer
 /// from Felt.
-fn print_mem_address(addr: u64, mem: &Word) {
-    let mem_int = mem.iter().map(|&x| x.as_int()).collect::<Vec<_>>();
-    println!("{} {:?}", addr, mem_int)
+fn print_mem_address(addr: u64, mem_value: Felt) {
+    println!("{addr} {mem_value}")
 }
diff --git a/miden/tests/integration/exec_iters.rs b/miden/tests/integration/exec_iters.rs
index dd9b4f1208..4f05bd82ee 100644
--- a/miden/tests/integration/exec_iters.rs
+++ b/miden/tests/integration/exec_iters.rs
@@ -5,6 +5,7 @@ use vm_core::{debuginfo::Location, AssemblyOp, Operation};
 // EXEC ITER TESTS
 // =================================================================
 /// TODO: Reenable (and fix) after we stabilized the assembler
+/// Note: expect the memory values to be very wrong.
 #[test]
 #[ignore]
 fn test_exec_iter() {
@@ -18,7 +19,8 @@ fn test_exec_iter() {
     let traces = test.execute_iter();
     let fmp = Felt::new(2u64.pow(30));
     let next_fmp = fmp + ONE;
-    let mem = vec![(1_u64, slice_to_word(&[13, 14, 15, 16]))];
+    // TODO: double check this value
+    let mem = vec![(1_u64, Felt::from(13_u32))];
     let mem_storew1_loc = Some(Location {
         path: path.clone(),
         start: 33.into(),
@@ -309,10 +311,7 @@ fn test_exec_iter() {
             )),
             stack: [17, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, 0, 0].to_elements(),
             fmp: next_fmp,
-            memory: vec![
-                (1_u64, slice_to_word(&[13, 14, 15, 16])),
-                (2u64.pow(30) + 1, slice_to_word(&[17, 0, 0, 0])),
-            ],
+            memory: vec![(1_u64, 13_u32.into()), (2u64.pow(30) + 1, 17_u32.into())],
         },
         VmState {
             clk: RowIndex::from(19),
@@ -330,10 +329,7 @@ fn test_exec_iter() {
             )),
             stack: [12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, 0, 0, 0, 0].to_elements(),
             fmp: next_fmp,
-            memory: vec![
-                (1_u64, slice_to_word(&[13, 14, 15, 16])),
-                (2u64.pow(30) + 1, slice_to_word(&[17, 0, 0, 0])),
-            ],
+            memory: vec![(1_u64, 13_u32.into()), (2u64.pow(30) + 1, 17_u32.into())],
         },
         VmState {
             clk: RowIndex::from(20),
@@ -343,10 +339,7 @@ fn test_exec_iter() {
             stack: [18446744069414584320, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, 0, 0]
                 .to_elements(),
             fmp: next_fmp,
-            memory: vec![
-                (1_u64, slice_to_word(&[13, 14, 15, 16])),
-                (2u64.pow(30) + 1, slice_to_word(&[17, 0, 0, 0])),
-            ],
+            memory: vec![(1_u64, 13_u32.into()), (2u64.pow(30) + 1, 17_u32.into())],
         },
         VmState {
             clk: RowIndex::from(21),
@@ -355,10 +348,7 @@ fn test_exec_iter() {
             asmop: None,
             stack: [12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, 0, 0, 0].to_elements(),
             fmp,
-            memory: vec![
-                (1_u64, slice_to_word(&[13, 14, 15, 16])),
-                (2u64.pow(30) + 1, slice_to_word(&[17, 0, 0, 0])),
-            ],
+            memory: vec![(1_u64, 13_u32.into()), (2u64.pow(30) + 1, 17_u32.into())],
         },
         VmState {
             clk: RowIndex::from(22),
@@ -367,10 +357,7 @@ fn test_exec_iter() {
             asmop: None,
             stack: [12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, 0, 0, 0].to_elements(),
             fmp,
-            memory: vec![
-                (1_u64, slice_to_word(&[13, 14, 15, 16])),
-                (2u64.pow(30) + 1, slice_to_word(&[17, 0, 0, 0])),
-            ],
+            memory: vec![(1_u64, 13_u32.into()), (2u64.pow(30) + 1, 17_u32.into())],
         },
         VmState {
             clk: RowIndex::from(23),
@@ -379,10 +366,7 @@ fn test_exec_iter() {
             asmop: None,
             stack: [12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, 0, 0, 0].to_elements(),
             fmp,
-            memory: vec![
-                (1_u64, slice_to_word(&[13, 14, 15, 16])),
-                (2u64.pow(30) + 1, slice_to_word(&[17, 0, 0, 0])),
-            ],
+            memory: vec![(1_u64, 13_u32.into()), (2u64.pow(30) + 1, 17_u32.into())],
         },
         VmState {
             clk: RowIndex::from(24),
@@ -391,10 +375,7 @@ fn test_exec_iter() {
             asmop: None,
             stack: [12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, 0, 0, 0].to_elements(),
             fmp,
-            memory: vec![
-                (1_u64, slice_to_word(&[13, 14, 15, 16])),
-                (2u64.pow(30) + 1, slice_to_word(&[17, 0, 0, 0])),
-            ],
+            memory: vec![(1_u64, 13_u32.into()), (2u64.pow(30) + 1, 17_u32.into())],
         },
     ];
     for (expected, t) in expected_states.iter().zip(traces) {
@@ -402,14 +383,3 @@ fn test_exec_iter() {
         assert_eq!(*expected, *state);
     }
 }
-
-// HELPER FUNCTIONS
-// =================================================================
-fn slice_to_word(values: &[i32]) -> [Felt; 4] {
-    [
-        Felt::new(values[0] as u64),
-        Felt::new(values[1] as u64),
-        Felt::new(values[2] as u64),
-        Felt::new(values[3] as u64),
-    ]
-}
diff --git a/processor/src/chiplets/memory/mod.rs b/processor/src/chiplets/memory/mod.rs
index 99f78d0a62..0b03df31fb 100644
--- a/processor/src/chiplets/memory/mod.rs
+++ b/processor/src/chiplets/memory/mod.rs
@@ -2,10 +2,14 @@ use alloc::{collections::BTreeMap, vec::Vec};
 
 use miden_air::{
     trace::chiplets::memory::{
-        ADDR_COL_IDX, CLK_COL_IDX, CTX_COL_IDX, D0_COL_IDX, D1_COL_IDX, D_INV_COL_IDX, V_COL_RANGE,
+        BATCH_COL_IDX, CLK_COL_IDX, CTX_COL_IDX, D0_COL_IDX, D1_COL_IDX, D_INV_COL_IDX,
+        ELEMENT_OR_WORD_COL_IDX, FLAG_SAME_BATCH_AND_CONTEXT, IDX0_COL_IDX, IDX1_COL_IDX,
+        MEMORY_ACCESS_ELEMENT, MEMORY_ACCESS_WORD, MEMORY_READ, MEMORY_WRITE, READ_WRITE_COL_IDX,
+        V_COL_RANGE,
     },
     RowIndex,
 };
+use vm_core::{WORD_SIZE, ZERO};
 
 use super::{
     utils::{split_element_u32_into_u16, split_u32_into_u16},
@@ -14,7 +18,7 @@ use super::{
 use crate::{system::ContextId, ExecutionError};
 
 mod segment;
-use segment::MemorySegmentTrace;
+use segment::{MemoryOperation, MemorySegmentTrace};
 
 #[cfg(test)]
 mod tests;
@@ -34,46 +38,49 @@ const INIT_MEM_VALUE: Word = EMPTY_WORD;
 /// building an execution trace of all memory accesses.
 ///
 /// The memory is comprised of one or more segments, each segment accessible from a specific
-/// execution context. The root (kernel) context has context ID 0, and all additional contexts
-/// have increasing IDs. Within each segment, the memory is word-addressable. That is, four field
-/// elements are located at each memory address, and we can read and write elements to/from memory
-/// in batches of four.
+/// execution context. The root (kernel) context has context ID 0, and all additional contexts have
+/// increasing IDs. Within each segment, the memory is element-addressable, even though the trace
+/// tracks batches of four elements for optimization purposes. That is, a single field element is
+/// located at each memory address, and we can read and write elements to/from memory either
+/// individually or in batches of four.
 ///
-/// Memory for a a given address is always initialized to zeros. That is, reading from an address
-/// before writing to it will return four ZERO elements.
+/// Memory for a given address is always initialized to zero. That is, reading from an address
+/// before writing to it will return ZERO.
 ///
 /// ## Execution trace
 /// The layout of the memory access trace is shown below.
 ///
-///   s0   s1   ctx  addr   clk   v0   v1   v2   v3   d0   d1   d_inv
-/// ├────┴────┴────┴──────┴─────┴────┴────┴────┴────┴────┴────┴───────┤
+///   rw   ew   ctx  batch   idx0   idx1  clk   v0   v1   v2   v3   d0   d1   d_inv   f_scb
+/// ├────┴────┴────┴───────┴──────┴──────┴────┴────┴────┴────┴────┴────┴────┴───────┴───────┤
 ///
 /// In the above, the meaning of the columns is as follows:
-/// - `s0` is a selector column used to identify whether the memory access is a read or a write. A
-///   value of ZERO indicates a write, and ONE indicates a read.
-/// - `s1` is a selector column used to identify whether the memory access is a read of an existing
-///   memory value or not (i.e., this context/addr combination already existed and is being read). A
-///   value of ONE indicates a read of existing memory, meaning the previous value must be copied.
+/// - `rw` is a selector column used to identify whether the memory operation is a read or a write.
+/// - `ew` is a selector column used to identify whether the memory operation is over an element or
+///   a word.
 /// - `ctx` contains execution context ID. Values in this column must increase monotonically but
 ///   there can be gaps between two consecutive context IDs of up to 2^32. Also, two consecutive
 ///   values can be the same.
-/// - `addr` contains memory address. Values in this column must increase monotonically for a given
-///   context but there can be gaps between two consecutive values of up to 2^32. Also, two
-///   consecutive values can be the same.
-/// - `clk` contains clock cycle at which a memory operation happened. Values in this column must
-///   increase monotonically for a given context and memory address but there can be gaps between
-///   two consecutive values of up to 2^32.
-/// - Columns `v0`, `v1`, `v2`, `v3` contain field elements stored at a given context/address/clock
+/// - `batch` contains the index of the batch of addresses, which is the address of the first
+///   element in the batch. For example, the value of `batch` for the batch of addresses 40, 41, 42,
+///   and 43 is 40. Note then that the first address of a batch *must* be divisible by 4. Values in
+///   this column must increase monotonically for a given context but there can be gaps between two
+///   consecutive values of up to 2^32. Also, two consecutive values can be the same.
+/// - `clk` contains the clock cycle at which a memory operation happened. Values in this column
+///   must increase monotonically for a given context and batch but there can be gaps between two
+///   consecutive values of up to 2^32.
+/// - Columns `v0`, `v1`, `v2`, `v3` contain field elements stored at a given context/batch/clock
 ///   cycle after the memory operation.
 /// - Columns `d0` and `d1` contain lower and upper 16 bits of the delta between two consecutive
-///   context IDs, addresses, or clock cycles. Specifically:
+///   context IDs, batches, or clock cycles. Specifically:
 ///   - When the context changes, these columns contain (`new_ctx` - `old_ctx`).
-///   - When the context remains the same but the address changes, these columns contain (`new_addr`
-///     - `old-addr`).
-///   - When both the context and the address remain the same, these columns contain (`new_clk` -
+///   - When the context remains the same but the batch changes, these columns contain (`new_batch`
+///     - `old_batch`).
+///   - When both the context and the batch remain the same, these columns contain (`new_clk` -
 ///     `old_clk` - 1).
-/// - `d_inv` contains the inverse of the delta between two consecutive context IDs, addresses, or
+/// - `d_inv` contains the inverse of the delta between two consecutive context IDs, batches, or
 ///   clock cycles computed as described above.
+/// - `f_scb` is a flag indicating whether the context and the batch are the same as in the next
+///   row.
 ///
 /// For the first row of the trace, values in `d0`, `d1`, and `d_inv` are set to zeros.
 #[derive(Debug, Default)]
@@ -101,25 +108,31 @@ impl Memory {
     ///
     /// Unlike read() which modifies the memory access trace, this method returns the value at the
     /// specified address (if one exists) without altering the memory access trace.
-    pub fn get_value(&self, ctx: ContextId, addr: u32) -> Option<Word> {
+    pub fn get_value(&self, ctx: ContextId, addr: u32) -> Option<Felt> {
         match self.trace.get(&ctx) {
             Some(segment) => segment.get_value(addr),
             None => None,
         }
     }
 
-    /// Returns the word at the specified context/address which should be used as the "old value"
-    /// for a write request. It will be the previously stored value, if one exists, or
-    /// initialized memory.
-    pub fn get_old_value(&self, ctx: ContextId, addr: u32) -> Word {
-        // get the stored word or return [0, 0, 0, 0], since the memory is initialized with zeros
-        self.get_value(ctx, addr).unwrap_or(INIT_MEM_VALUE)
+    /// Returns the word located in memory starting at the specified address, which must be word
+    /// aligned.
+    ///
+    /// # Errors
+    /// - Returns an error if `addr` is not word aligned.
+    pub fn get_word(&self, ctx: ContextId, addr: u32) -> Result<Option<Word>, ExecutionError> {
+        match self.trace.get(&ctx) {
+            Some(segment) => segment
+                .get_word(addr)
+                .map_err(|_| ExecutionError::UnalignedMemoryWordAccess { addr, ctx }),
+            None => Ok(None),
+        }
     }
 
     /// Returns the entire memory state for the specified execution context at the specified cycle.
     /// The state is returned as a vector of (address, value) tuples, and includes addresses which
     /// have been accessed at least once.
-    pub fn get_state_at(&self, ctx: ContextId, clk: RowIndex) -> Vec<(u64, Word)> {
+    pub fn get_state_at(&self, ctx: ContextId, clk: RowIndex) -> Vec<(u64, Felt)> {
         if clk == 0 {
             return vec![];
         }
@@ -130,41 +143,111 @@ impl Memory {
         }
     }
 
-    // STATE ACCESSORS AND MUTATORS
+    // STATE MUTATORS
     // --------------------------------------------------------------------------------------------
 
+    /// Returns the field element located in memory at the specified context/address.
+    ///
+    /// If the specified address hasn't been previously written to, ZERO is returned. This
+    /// effectively implies that memory is initialized to ZERO.
+    ///
+    /// # Errors
+    /// - Returns an error if the address is greater than or equal to 2^32.
+    /// - Returns an error if the same address is accessed more than once in the same clock cycle.
+    pub fn read(
+        &mut self,
+        ctx: ContextId,
+        addr: Felt,
+        clk: RowIndex,
+    ) -> Result<Felt, ExecutionError> {
+        let addr: u32 = addr
+            .as_int()
+            .try_into()
+            .map_err(|_| ExecutionError::MemoryAddressOutOfBounds(addr.as_int()))?;
+        self.num_trace_rows += 1;
+        self.trace.entry(ctx).or_default().read(ctx, addr, Felt::from(clk))
+    }
+
     /// Returns a word located in memory at the specified context/address.
     ///
     /// If the specified address hasn't been previously written to, four ZERO elements are
     /// returned. This effectively implies that memory is initialized to ZERO.
     ///
     /// # Errors
+    /// - Returns an error if the address is greater than or equal to 2^32.
+    /// - Returns an error if the address is not aligned to a word boundary.
     /// - Returns an error if the same address is accessed more than once in the same clock cycle.
-    pub fn read(
+    pub fn read_word(
         &mut self,
         ctx: ContextId,
-        addr: u32,
+        addr: Felt,
         clk: RowIndex,
     ) -> Result<Word, ExecutionError> {
+        let addr: u32 = addr
+            .as_int()
+            .try_into()
+            .map_err(|_| ExecutionError::MemoryAddressOutOfBounds(addr.as_int()))?;
+        if addr % WORD_SIZE as u32 != 0 {
+            return Err(ExecutionError::MemoryUnalignedWordAccess {
+                addr,
+                ctx,
+                clk: Felt::from(clk),
+            });
+        }
+
         self.num_trace_rows += 1;
-        self.trace.entry(ctx).or_default().read(ctx, addr, Felt::from(clk))
+        self.trace.entry(ctx).or_default().read_word(ctx, addr, Felt::from(clk))
     }
 
-    /// Writes the provided word at the specified context/address.
+    /// Writes the provided field element at the specified context/address.
     ///
     /// # Errors
+    /// - Returns an error if the address is greater than or equal to 2^32.
     /// - Returns an error if the same address is accessed more than once in the same clock cycle.
     pub fn write(
         &mut self,
         ctx: ContextId,
-        addr: u32,
+        addr: Felt,
         clk: RowIndex,
-        value: Word,
+        value: Felt,
     ) -> Result<(), ExecutionError> {
+        let addr: u32 = addr
+            .as_int()
+            .try_into()
+            .map_err(|_| ExecutionError::MemoryAddressOutOfBounds(addr.as_int()))?;
         self.num_trace_rows += 1;
         self.trace.entry(ctx).or_default().write(ctx, addr, Felt::from(clk), value)
     }
 
+    /// Writes the provided word at the specified context/address.
+    ///
+    /// # Errors
+    /// - Returns an error if the address is greater than or equal to 2^32.
+    /// - Returns an error if the address is not aligned to a word boundary.
+    /// - Returns an error if the same address is accessed more than once in the same clock cycle.
+    pub fn write_word(
+        &mut self,
+        ctx: ContextId,
+        addr: Felt,
+        clk: RowIndex,
+        value: Word,
+    ) -> Result<(), ExecutionError> {
+        let addr: u32 = addr
+            .as_int()
+            .try_into()
+            .map_err(|_| ExecutionError::MemoryAddressOutOfBounds(addr.as_int()))?;
+        if addr % WORD_SIZE as u32 != 0 {
+            return Err(ExecutionError::MemoryUnalignedWordAccess {
+                addr,
+                ctx,
+                clk: Felt::from(clk),
+            });
+        }
+
+        self.num_trace_rows += 1;
+        self.trace.entry(ctx).or_default().write_word(ctx, addr, Felt::from(clk), value)
+    }
+
     // EXECUTION TRACE GENERATION
     // --------------------------------------------------------------------------------------------
 
@@ -224,7 +307,7 @@ impl Memory {
         };
 
         // iterate through addresses in ascending order, and write trace row for each memory access
-        // into the trace. we expect the trace to be 14 columns wide.
+        // into the trace. we expect the trace to be 15 columns wide.
         let mut row: RowIndex = 0.into();
 
         for (ctx, segment) in self.trace {
@@ -235,13 +318,35 @@ impl Memory {
                 let felt_addr = Felt::from(addr);
                 for memory_access in addr_trace {
                     let clk = memory_access.clk();
-                    let value = memory_access.value();
+                    let value = memory_access.batch();
 
-                    let selectors = memory_access.op_selectors();
-                    trace.set(row, 0, selectors[0]);
-                    trace.set(row, 1, selectors[1]);
+                    match memory_access.operation() {
+                        MemoryOperation::Read => trace.set(row, READ_WRITE_COL_IDX, MEMORY_READ),
+                        MemoryOperation::Write => trace.set(row, READ_WRITE_COL_IDX, MEMORY_WRITE),
+                    }
+                    let (idx1, idx0) = match memory_access.access_type() {
+                        segment::MemoryAccessType::Element {
+                            addr_idx_in_batch: addr_idx_in_word,
+                        } => {
+                            trace.set(row, ELEMENT_OR_WORD_COL_IDX, MEMORY_ACCESS_ELEMENT);
+
+                            match addr_idx_in_word {
+                                0 => (ZERO, ZERO),
+                                1 => (ZERO, ONE),
+                                2 => (ONE, ZERO),
+                                3 => (ONE, ONE),
+                                _ => panic!("invalid address index in word: {addr_idx_in_word}"),
+                            }
+                        },
+                        segment::MemoryAccessType::Word => {
+                            trace.set(row, ELEMENT_OR_WORD_COL_IDX, MEMORY_ACCESS_WORD);
+                            (ZERO, ZERO)
+                        },
+                    };
                     trace.set(row, CTX_COL_IDX, ctx);
-                    trace.set(row, ADDR_COL_IDX, felt_addr);
+                    trace.set(row, BATCH_COL_IDX, felt_addr);
+                    trace.set(row, IDX0_COL_IDX, idx0);
+                    trace.set(row, IDX1_COL_IDX, idx1);
                     trace.set(row, CLK_COL_IDX, clk);
                     for (idx, col) in V_COL_RANGE.enumerate() {
                         trace.set(row, col, value[idx]);
@@ -262,6 +367,12 @@ impl Memory {
                     // TODO: switch to batch inversion to improve efficiency.
                     trace.set(row, D_INV_COL_IDX, delta.inv());
 
+                    if prev_ctx == ctx && prev_addr == felt_addr {
+                        trace.set(row, FLAG_SAME_BATCH_AND_CONTEXT, ONE);
+                    } else {
+                        trace.set(row, FLAG_SAME_BATCH_AND_CONTEXT, ZERO);
+                    };
+
                     // update values for the next iteration of the loop
                     prev_ctx = ctx;
                     prev_addr = felt_addr;
@@ -291,9 +402,9 @@ impl Memory {
     // TEST HELPERS
     // --------------------------------------------------------------------------------------------
 
-    /// Returns current size of the memory (in words) across all contexts.
+    /// Returns the number of batches that were accessed at least once across all contexts.
     #[cfg(test)]
-    pub fn size(&self) -> usize {
-        self.trace.iter().fold(0, |acc, (_, s)| acc + s.size())
+    pub fn num_accessed_batches(&self) -> usize {
+        self.trace.iter().fold(0, |acc, (_, s)| acc + s.num_accessed_batches())
     }
 }
diff --git a/processor/src/chiplets/memory/segment.rs b/processor/src/chiplets/memory/segment.rs
index 4957286564..dd9b014217 100644
--- a/processor/src/chiplets/memory/segment.rs
+++ b/processor/src/chiplets/memory/segment.rs
@@ -3,10 +3,8 @@ use alloc::{
     vec::Vec,
 };
 
-use miden_air::{
-    trace::chiplets::memory::{Selectors, MEMORY_COPY_READ, MEMORY_INIT_READ, MEMORY_WRITE},
-    RowIndex,
-};
+use miden_air::RowIndex;
+use vm_core::WORD_SIZE;
 
 use super::{Felt, Word, INIT_MEM_VALUE};
 use crate::{ContextId, ExecutionError};
@@ -26,21 +24,43 @@ impl MemorySegmentTrace {
     // PUBLIC ACCESSORS
     // --------------------------------------------------------------------------------------------
 
-    /// Returns a word located at the specified address, or None if the address hasn't been
+    /// Returns the element located at the specified address, or None if the address hasn't been
     /// accessed previously.
     ///
     /// Unlike read() which modifies the memory access trace, this method returns the value at the
     /// specified address (if one exists) without altering the memory access trace.
-    pub fn get_value(&self, addr: u32) -> Option<Word> {
-        match self.0.get(&addr) {
-            Some(addr_trace) => addr_trace.last().map(|access| access.value()),
+    pub fn get_value(&self, addr: u32) -> Option<Felt> {
+        let (batch, addr_idx_in_word) = addr_to_batch_and_idx(addr);
+
+        match self.0.get(&batch) {
+            Some(addr_trace) => {
+                addr_trace.last().map(|access| access.batch()[addr_idx_in_word as usize])
+            },
             None => None,
         }
     }
 
+    /// Returns the word located in memory starting at the specified address, which must be word
+    /// aligned.
+    ///
+    /// # Errors
+    /// - Returns an error if `addr` is not word aligned.
+    pub fn get_word(&self, addr: u32) -> Result<Option<Word>, ()> {
+        if addr % WORD_SIZE as u32 != 0 {
+            return Err(());
+        }
+
+        let (batch, _) = addr_to_batch_and_idx(addr);
+
+        match self.0.get(&batch) {
+            Some(addr_trace) => Ok(addr_trace.last().map(|access| access.batch())),
+            None => Ok(None),
+        }
+    }
+
     /// Returns the entire memory state at the beginning of the specified cycle.
-    pub fn get_state_at(&self, clk: RowIndex) -> Vec<(u64, Word)> {
-        let mut result: Vec<(u64, Word)> = Vec::new();
+    pub fn get_state_at(&self, clk: RowIndex) -> Vec<(u64, Felt)> {
+        let mut result: Vec<(u64, Felt)> = Vec::new();
 
         if clk == 0 {
             return result;
@@ -53,13 +73,29 @@ impl MemorySegmentTrace {
 
         for (&addr, addr_trace) in self.0.iter() {
             match addr_trace.binary_search_by(|access| access.clk().as_int().cmp(&search_clk)) {
-                Ok(i) => result.push((addr.into(), addr_trace[i].value())),
+                Ok(i) => {
+                    let batch = addr_trace[i].batch();
+                    let addr: u64 = addr.into();
+                    result.extend([
+                        (addr, batch[0]),
+                        (addr + 1, batch[1]),
+                        (addr + 2, batch[2]),
+                        (addr + 3, batch[3]),
+                    ]);
+                },
                 Err(i) => {
                     // Binary search finds the index of the data with the specified clock cycle.
                     // Decrement the index to get the trace from the previously accessed clock
                     // cycle to insert into the results.
                     if i > 0 {
-                        result.push((addr.into(), addr_trace[i - 1].value()));
+                        let batch = addr_trace[i - 1].batch();
+                        let addr: u64 = addr.into();
+                        result.extend([
+                            (addr, batch[0]),
+                            (addr + 1, batch[1]),
+                            (addr + 2, batch[2]),
+                            (addr + 3, batch[3]),
+                        ]);
                     }
                 },
             }
@@ -71,62 +107,136 @@ impl MemorySegmentTrace {
     // STATE MUTATORS
     // --------------------------------------------------------------------------------------------
 
-    /// Returns a word located in memory at the specified address. The memory access is assumed
-    /// to happen at the provided clock cycle.
+    /// Returns the element located at the specified address. The memory access is assumed to happen
+    /// at the provided clock cycle.
+    ///
+    /// If the element at the specified address hasn't been previously written to, ZERO is returned.
+    ///
+    /// # Errors
+    /// - Returns an error if the same address is accessed more than once in the same clock cycle.
+    pub fn read(&mut self, ctx: ContextId, addr: u32, clk: Felt) -> Result<Felt, ExecutionError> {
+        let (batch, addr_idx_in_word) = addr_to_batch_and_idx(addr);
+
+        let batch_values = self.read_batch(
+            ctx,
+            batch,
+            clk,
+            MemoryAccessType::Element { addr_idx_in_batch: addr_idx_in_word },
+        )?;
+
+        Ok(batch_values[addr_idx_in_word as usize])
+    }
+
+    /// Returns a word located in memory starting at the specified address, which must be word
+    /// aligned. The memory access is assumed to happen at the provided clock cycle.
+    ///
+    /// If the word starting at the specified address hasn't been previously written to, four ZERO
+    /// elements are returned. This effectively implies that memory is initialized to ZERO.
+    ///
+    /// # Errors
+    /// - Returns an error if the same address is accessed more than once in the same clock cycle.
+    pub fn read_word(
+        &mut self,
+        ctx: ContextId,
+        addr: u32,
+        clk: Felt,
+    ) -> Result<Word, ExecutionError> {
+        debug_assert!(addr % 4 == 0, "unaligned word access: {addr}");
+
+        let (batch, _) = addr_to_batch_and_idx(addr);
+        self.read_batch(ctx, batch, clk, MemoryAccessType::Word)
+    }
+
+    /// Writes the element located at the specified address. The memory access is assumed to happen
+    /// at the provided clock cycle.
     ///
-    /// If the specified address hasn't been previously written to, four ZERO elements are
-    /// returned. This effectively implies that memory is initialized to ZERO.
+    /// If the enclosing batch hasn't been accessed before, its other elements are set to ZERO.
     ///
     /// # Errors
     /// - Returns an error if the same address is accessed more than once in the same clock cycle.
-    pub fn read(&mut self, ctx: ContextId, addr: u32, clk: Felt) -> Result<Word, ExecutionError> {
-        // look up the previous value in the appropriate address trace and add (clk, prev_value)
-        // to it; if this is the first time we access this address, create address trace for it
-        // with entry (clk, [ZERO, 4]). in both cases, return the last value in the address trace.
-        match self.0.entry(addr) {
+    pub fn write(
+        &mut self,
+        ctx: ContextId,
+        addr: u32,
+        clk: Felt,
+        value: Felt,
+    ) -> Result<(), ExecutionError> {
+        let (batch, addr_idx_in_word) = addr_to_batch_and_idx(addr);
+
+        match self.0.entry(batch) {
             Entry::Vacant(vacant_entry) => {
-                let access =
-                    MemorySegmentAccess::new(clk, MemoryOperation::InitRead, INIT_MEM_VALUE);
+                // If this is the first access to the ctx/batch pair, then all values in the batch
+                // are initialized to 0, except for the address being written.
+                let batch = {
+                    let mut batch = Word::default();
+                    batch[addr_idx_in_word as usize] = value;
+                    batch
+                };
+
+                let access = MemorySegmentAccess::new(
+                    clk,
+                    MemoryOperation::Write,
+                    MemoryAccessType::Element { addr_idx_in_batch: addr_idx_in_word },
+                    batch,
+                );
                 vacant_entry.insert(vec![access]);
-                Ok(INIT_MEM_VALUE)
+                Ok(())
             },
             Entry::Occupied(mut occupied_entry) => {
+                // If the ctx/batch pair has been accessed before, then the values in the batch are
+                // the same as the previous access, except for the address being written.
                 let addr_trace = occupied_entry.get_mut();
                 if addr_trace.last().expect("empty address trace").clk() == clk {
                     Err(ExecutionError::DuplicateMemoryAccess { ctx, addr, clk })
                 } else {
-                    let last_value = addr_trace.last().expect("empty address trace").value();
-                    let access =
-                        MemorySegmentAccess::new(clk, MemoryOperation::CopyRead, last_value);
+                    let batch = {
+                        let mut last_batch =
+                            addr_trace.last().expect("empty address trace").batch();
+                        last_batch[addr_idx_in_word as usize] = value;
+
+                        last_batch
+                    };
+
+                    let access = MemorySegmentAccess::new(
+                        clk,
+                        MemoryOperation::Write,
+                        MemoryAccessType::Element { addr_idx_in_batch: addr_idx_in_word },
+                        batch,
+                    );
                     addr_trace.push(access);
 
-                    Ok(last_value)
+                    Ok(())
                 }
             },
         }
     }
 
-    /// Writes the provided word at the specified address. The memory access is assumed to happen
-    /// at the provided clock cycle.
+    /// Writes the provided word starting at the specified address. The memory access is assumed to
+    /// happen at the provided clock cycle.
     ///
     /// # Errors
     /// - Returns an error if the same address is accessed more than once in the same clock cycle.
-    pub fn write(
+    pub fn write_word(
         &mut self,
         ctx: ContextId,
         addr: u32,
         clk: Felt,
-        value: Word,
+        word: Word,
     ) -> Result<(), ExecutionError> {
-        // add a memory access to the appropriate address trace; if this is the first time
-        // we access this address, initialize address trace.
-        let access = MemorySegmentAccess::new(clk, MemoryOperation::Write, value);
-        match self.0.entry(addr) {
+        debug_assert!(addr % 4 == 0, "unaligned memory access: {addr}");
+
+        let (batch, _) = addr_to_batch_and_idx(addr);
+
+        let access =
+            MemorySegmentAccess::new(clk, MemoryOperation::Write, MemoryAccessType::Word, word);
+        match self.0.entry(batch) {
             Entry::Vacant(vacant_entry) => {
+                // All values in the batch are set to the word being written.
                 vacant_entry.insert(vec![access]);
                 Ok(())
             },
             Entry::Occupied(mut occupied_entry) => {
+                // All values in the batch are set to the word being written.
                 let addr_trace = occupied_entry.get_mut();
                 if addr_trace.last().expect("empty address trace").clk() == clk {
                     Err(ExecutionError::DuplicateMemoryAccess { ctx, addr, clk })
@@ -154,9 +264,55 @@ impl MemorySegmentTrace {
     // HELPER FUNCTIONS
     // --------------------------------------------------------------------------------------------
 
-    /// Returns current size (in words) of this memory segment.
+    /// Records a read operation on the specified batch at the specified clock cycle.
+    ///
+    /// The access type either specifies the element in batch that was read, or that the entire word
+    /// was read.
+    fn read_batch(
+        &mut self,
+        ctx: ContextId,
+        batch: u32,
+        clk: Felt,
+        access_type: MemoryAccessType,
+    ) -> Result<Word, ExecutionError> {
+        match self.0.entry(batch) {
+            Entry::Vacant(vacant_entry) => {
+                // If this is the first access to the ctx/batch pair, then all values in the batch
+                // are initialized to 0.
+                let access = MemorySegmentAccess::new(
+                    clk,
+                    MemoryOperation::Read,
+                    access_type,
+                    INIT_MEM_VALUE,
+                );
+                vacant_entry.insert(vec![access]);
+                Ok(INIT_MEM_VALUE)
+            },
+            Entry::Occupied(mut occupied_entry) => {
+                // If the ctx/batch pair has been accessed before, then the values in the batch are
+                // the same as the previous access.
+                let addr_trace = occupied_entry.get_mut();
+                if addr_trace.last().expect("empty address trace").clk() == clk {
+                    Err(ExecutionError::DuplicateMemoryAccess { ctx, addr: batch, clk })
+                } else {
+                    let last_batch = addr_trace.last().expect("empty address trace").batch();
+                    let access = MemorySegmentAccess::new(
+                        clk,
+                        MemoryOperation::Read,
+                        access_type,
+                        last_batch,
+                    );
+                    addr_trace.push(access);
+
+                    Ok(last_batch)
+                }
+            },
+        }
+    }
+
+    /// Returns the number of batches that were accessed at least once.
     #[cfg(test)]
-    pub fn size(&self) -> usize {
+    pub fn num_accessed_batches(&self) -> usize {
         self.0.len()
     }
 }
@@ -166,23 +322,29 @@ impl MemorySegmentTrace {
 
 #[derive(Debug, Copy, Clone, PartialEq, Eq)]
 pub enum MemoryOperation {
-    InitRead,
-    CopyRead,
+    Read,
     Write,
 }
 
+#[derive(Debug, Copy, Clone, PartialEq, Eq)]
+pub enum MemoryAccessType {
+    Element { addr_idx_in_batch: u8 },
+    Word,
+}
+
 /// A single memory access representing the specified memory operation with the specified value at
 /// the specified clock cycle.
 #[derive(Copy, Debug, Clone)]
 pub struct MemorySegmentAccess {
     clk: Felt,
-    op: MemoryOperation,
-    value: Word,
+    operation: MemoryOperation,
+    access_type: MemoryAccessType,
+    batch: Word,
 }
 
 impl MemorySegmentAccess {
-    fn new(clk: Felt, op: MemoryOperation, value: Word) -> Self {
-        Self { clk, op, value }
+    fn new(clk: Felt, op: MemoryOperation, access_type: MemoryAccessType, batch: Word) -> Self {
+        Self { clk, operation: op, access_type, batch }
     }
 
     /// Returns the clock cycle at which this memory access happened.
@@ -190,17 +352,32 @@ impl MemorySegmentAccess {
         self.clk
     }
 
-    /// Returns the selector values matching the operation used in this memory access.
-    pub(super) fn op_selectors(&self) -> Selectors {
-        match self.op {
-            MemoryOperation::InitRead => MEMORY_INIT_READ,
-            MemoryOperation::CopyRead => MEMORY_COPY_READ,
-            MemoryOperation::Write => MEMORY_WRITE,
-        }
+    /// Returns the operation associated with this memory access.
+    pub(super) fn operation(&self) -> MemoryOperation {
+        self.operation
+    }
+
+    /// Returns the access type associated with this memory access.
+    pub(super) fn access_type(&self) -> MemoryAccessType {
+        self.access_type
     }
 
-    /// Returns the word value for this memory access.
-    pub(super) fn value(&self) -> Word {
-        self.value
+    /// Returns the batch associated with this memory access.
+    ///
+    /// For example, if the memory access is an element read of address 42, the batch will contain
+    /// the values of addresses 40, 41, 42, and 43.
+    pub(super) fn batch(&self) -> Word {
+        self.batch
     }
 }
+
+// HELPERS
+// ================================================================================================
+
+/// Splits an address into two components:
+/// 1. a batch, which is the largest word-aligned value less than or equal to `addr`, and
+/// 2. the index within the batch which `addr` represents.
+pub fn addr_to_batch_and_idx(addr: u32) -> (u32, u8) {
+    let idx = addr % WORD_SIZE as u32;
+    (addr - idx, idx as u8)
+}
diff --git a/processor/src/chiplets/memory/tests.rs b/processor/src/chiplets/memory/tests.rs
index 5c169507f5..56cbce5850 100644
--- a/processor/src/chiplets/memory/tests.rs
+++ b/processor/src/chiplets/memory/tests.rs
@@ -2,23 +2,26 @@ use alloc::vec::Vec;
 
 use miden_air::{
     trace::chiplets::memory::{
-        Selectors, MEMORY_COPY_READ, MEMORY_INIT_READ, MEMORY_WRITE,
+        ELEMENT_OR_WORD_COL_IDX, FLAG_SAME_BATCH_AND_CONTEXT, IDX0_COL_IDX, IDX1_COL_IDX,
+        MEMORY_ACCESS_ELEMENT, MEMORY_ACCESS_WORD, MEMORY_READ, MEMORY_WRITE, READ_WRITE_COL_IDX,
         TRACE_WIDTH as MEMORY_TRACE_WIDTH,
     },
     RowIndex,
 };
-use vm_core::Word;
+use vm_core::{assert_matches, Word, WORD_SIZE};
 
 use super::{
-    super::ZERO, Felt, FieldElement, Memory, TraceFragment, ADDR_COL_IDX, CLK_COL_IDX, CTX_COL_IDX,
-    D0_COL_IDX, D1_COL_IDX, D_INV_COL_IDX, EMPTY_WORD, ONE, V_COL_RANGE,
+    super::ZERO,
+    segment::{MemoryAccessType, MemoryOperation},
+    Felt, FieldElement, Memory, TraceFragment, BATCH_COL_IDX, CLK_COL_IDX, CTX_COL_IDX, D0_COL_IDX,
+    D1_COL_IDX, D_INV_COL_IDX, EMPTY_WORD, ONE, V_COL_RANGE,
 };
-use crate::ContextId;
+use crate::{ContextId, ExecutionError};
 
 #[test]
 fn mem_init() {
     let mem = Memory::default();
-    assert_eq!(0, mem.size());
+    assert_eq!(0, mem.num_accessed_batches());
     assert_eq!(0, mem.trace_len());
 }
 
@@ -27,51 +30,98 @@ fn mem_read() {
     let mut mem = Memory::default();
 
     // read a value from address 0; clk = 1
-    let addr0 = 0;
+    let addr0 = ZERO;
     let value = mem.read(ContextId::root(), addr0, 1.into()).unwrap();
-    assert_eq!(EMPTY_WORD, value);
-    assert_eq!(1, mem.size());
+    assert_eq!(ZERO, value);
+    assert_eq!(1, mem.num_accessed_batches());
     assert_eq!(1, mem.trace_len());
 
     // read a value from address 3; clk = 2
-    let addr3 = 3;
+    let addr3 = Felt::from(3_u32);
     let value = mem.read(ContextId::root(), addr3, 2.into()).unwrap();
-    assert_eq!(EMPTY_WORD, value);
-    assert_eq!(2, mem.size());
+    assert_eq!(ZERO, value);
+    assert_eq!(1, mem.num_accessed_batches());
     assert_eq!(2, mem.trace_len());
 
     // read a value from address 0 again; clk = 3
     let value = mem.read(ContextId::root(), addr0, 3.into()).unwrap();
-    assert_eq!(EMPTY_WORD, value);
-    assert_eq!(2, mem.size());
+    assert_eq!(ZERO, value);
+    assert_eq!(1, mem.num_accessed_batches());
     assert_eq!(3, mem.trace_len());
 
     // read a value from address 2; clk = 4
-    let addr2 = 2;
+    let addr2 = Felt::from(2_u32);
     let value = mem.read(ContextId::root(), addr2, 4.into()).unwrap();
-    assert_eq!(EMPTY_WORD, value);
-    assert_eq!(3, mem.size());
+    assert_eq!(ZERO, value);
+    assert_eq!(1, mem.num_accessed_batches());
     assert_eq!(4, mem.trace_len());
 
-    // check generated trace and memory data provided to the ChipletsBus; rows should be sorted by
-    // address and then clock cycle
+    // check generated trace and memory data provided to the ChipletsBus; rows should be sorted only
+    // by clock cycle, since they all access the same batch
     let trace = build_trace(mem, 4);
 
-    // address 0
+    // clk 1
     let mut prev_row = [ZERO; MEMORY_TRACE_WIDTH];
-    let memory_access = MemoryAccess::new(ContextId::root(), addr0, 1.into(), EMPTY_WORD);
-    prev_row = verify_memory_access(&trace, 0, MEMORY_INIT_READ, &memory_access, prev_row);
-
-    let memory_access = MemoryAccess::new(ContextId::root(), addr0, 3.into(), EMPTY_WORD);
-    prev_row = verify_memory_access(&trace, 1, MEMORY_COPY_READ, &memory_access, prev_row);
+    let memory_access = MemoryAccess::new(
+        MemoryOperation::Read,
+        MemoryAccessType::Element { addr_idx_in_batch: 0 },
+        ContextId::root(),
+        addr0,
+        1.into(),
+        EMPTY_WORD,
+    );
+    prev_row = verify_memory_access(&trace, 0, memory_access, prev_row);
+
+    // clk 2
+    let memory_access = MemoryAccess::new(
+        MemoryOperation::Read,
+        MemoryAccessType::Element { addr_idx_in_batch: 3 },
+        ContextId::root(),
+        addr3,
+        2.into(),
+        EMPTY_WORD,
+    );
+    prev_row = verify_memory_access(&trace, 1, memory_access, prev_row);
+
+    // clk 3
+    let memory_access = MemoryAccess::new(
+        MemoryOperation::Read,
+        MemoryAccessType::Element { addr_idx_in_batch: 0 },
+        ContextId::root(),
+        addr0,
+        3.into(),
+        EMPTY_WORD,
+    );
+    prev_row = verify_memory_access(&trace, 2, memory_access, prev_row);
+
+    // clk 4
+    let memory_access = MemoryAccess::new(
+        MemoryOperation::Read,
+        MemoryAccessType::Element { addr_idx_in_batch: 2 },
+        ContextId::root(),
+        addr2,
+        4.into(),
+        EMPTY_WORD,
+    );
+    verify_memory_access(&trace, 3, memory_access, prev_row);
+}
 
-    // address 2
-    let memory_access = MemoryAccess::new(ContextId::root(), addr2, 4.into(), EMPTY_WORD);
-    prev_row = verify_memory_access(&trace, 2, MEMORY_INIT_READ, &memory_access, prev_row);
+/// Tests that reading a word from an address that is not aligned with the word boundary results in
+/// an error.
+#[test]
+fn mem_read_word_unaligned() {
+    let mut mem = Memory::default();
 
-    // address 3
-    let memory_access = MemoryAccess::new(ContextId::root(), addr3, 2.into(), EMPTY_WORD);
-    verify_memory_access(&trace, 3, MEMORY_INIT_READ, &memory_access, prev_row);
+    // attempt to read a word from address 1, which is not word aligned; clk = 1
+    let addr = ONE;
+    let clk = 1.into();
+    let ctx = ContextId::root();
+    let ret = mem.read_word(ctx, addr, clk);
+
+    assert_matches!(
+        ret,
+        Err(ExecutionError::MemoryUnalignedWordAccess { addr: _, ctx: _, clk: _ })
+    );
 }
 
 #[test]
@@ -79,224 +129,330 @@ fn mem_write() {
     let mut mem = Memory::default();
 
     // write a value into address 0; clk = 1
-    let addr0 = 0;
-    let value1 = [ONE, ZERO, ZERO, ZERO];
-    mem.write(ContextId::root(), addr0, 1.into(), value1).unwrap();
-    assert_eq!(value1, mem.get_value(ContextId::root(), addr0).unwrap());
-    assert_eq!(1, mem.size());
+    let addr0 = 0_u32;
+    let word1 = [ONE, ZERO, ZERO, ZERO];
+    mem.write_word(ContextId::root(), addr0.into(), 1.into(), word1).unwrap();
+    assert_eq!(word1, mem.get_word(ContextId::root(), addr0).unwrap().unwrap());
+    assert_eq!(1, mem.num_accessed_batches());
     assert_eq!(1, mem.trace_len());
 
     // write a value into address 2; clk = 2
-    let addr2 = 2;
-    let value5 = [Felt::new(5), ZERO, ZERO, ZERO];
-    mem.write(ContextId::root(), addr2, 2.into(), value5).unwrap();
+    let addr2 = 2_u32;
+    let value5 = Felt::new(5);
+    mem.write(ContextId::root(), addr2.into(), 2.into(), value5).unwrap();
     assert_eq!(value5, mem.get_value(ContextId::root(), addr2).unwrap());
-    assert_eq!(2, mem.size());
+    assert_eq!(1, mem.num_accessed_batches());
     assert_eq!(2, mem.trace_len());
 
     // write a value into address 1; clk = 3
-    let addr1 = 1;
-    let value7 = [Felt::new(7), ZERO, ZERO, ZERO];
-    mem.write(ContextId::root(), addr1, 3.into(), value7).unwrap();
+    let addr1 = 1_u32;
+    let value7 = Felt::new(7);
+    mem.write(ContextId::root(), addr1.into(), 3.into(), value7).unwrap();
     assert_eq!(value7, mem.get_value(ContextId::root(), addr1).unwrap());
-    assert_eq!(3, mem.size());
+    assert_eq!(1, mem.num_accessed_batches());
     assert_eq!(3, mem.trace_len());
 
-    // write a value into address 0; clk = 4
-    let value9 = [Felt::new(9), ZERO, ZERO, ZERO];
-    mem.write(ContextId::root(), addr0, 4.into(), value9).unwrap();
-    assert_eq!(value7, mem.get_value(ContextId::root(), addr1).unwrap());
-    assert_eq!(3, mem.size());
+    // write a value into address 3; clk = 4
+    let addr3 = 3_u32;
+    let value9 = Felt::new(9);
+    mem.write(ContextId::root(), addr3.into(), 4.into(), value9).unwrap();
+    assert_eq!(value9, mem.get_value(ContextId::root(), addr3).unwrap());
+    assert_eq!(1, mem.num_accessed_batches());
     assert_eq!(4, mem.trace_len());
 
+    // write a word into address 4; clk = 5
+    let addr4 = 4_u32;
+    let word1234 = [ONE, 2_u32.into(), 3_u32.into(), 4_u32.into()];
+    mem.write_word(ContextId::root(), addr4.into(), 5.into(), word1234).unwrap();
+    assert_eq!(word1234, mem.get_word(ContextId::root(), addr4).unwrap().unwrap());
+    assert_eq!(2, mem.num_accessed_batches());
+    assert_eq!(5, mem.trace_len());
+
+    // write a word into address 0; clk = 6
+    let word5678: [Felt; 4] = [5_u32.into(), 6_u32.into(), 7_u32.into(), 8_u32.into()];
+    mem.write_word(ContextId::root(), addr0.into(), 6.into(), word5678).unwrap();
+    assert_eq!(word5678, mem.get_word(ContextId::root(), addr0).unwrap().unwrap());
+    assert_eq!(2, mem.num_accessed_batches());
+    assert_eq!(6, mem.trace_len());
+
     // check generated trace and memory data provided to the ChipletsBus; rows should be sorted by
     // address and then clock cycle
-    let trace = build_trace(mem, 4);
+    let trace = build_trace(mem, 6);
 
-    // address 0
+    // batch 0
     let mut prev_row = [ZERO; MEMORY_TRACE_WIDTH];
-    let memory_access = MemoryAccess::new(ContextId::root(), addr0, 1.into(), value1);
-    prev_row = verify_memory_access(&trace, 0, MEMORY_WRITE, &memory_access, prev_row);
-
-    let memory_access = MemoryAccess::new(ContextId::root(), addr0, 4.into(), value9);
-    prev_row = verify_memory_access(&trace, 1, MEMORY_WRITE, &memory_access, prev_row);
-
-    // address 1
-    let memory_access = MemoryAccess::new(ContextId::root(), addr1, 3.into(), value7);
-    prev_row = verify_memory_access(&trace, 2, MEMORY_WRITE, &memory_access, prev_row);
-
-    // address 2
-    let memory_access = MemoryAccess::new(ContextId::root(), addr2, 2.into(), value5);
-    verify_memory_access(&trace, 3, MEMORY_WRITE, &memory_access, prev_row);
+    let memory_access = MemoryAccess::new(
+        MemoryOperation::Write,
+        MemoryAccessType::Word,
+        ContextId::root(),
+        addr0.into(),
+        1.into(),
+        word1,
+    );
+    prev_row = verify_memory_access(&trace, 0, memory_access, prev_row);
+
+    let memory_access = MemoryAccess::new(
+        MemoryOperation::Write,
+        MemoryAccessType::Element { addr_idx_in_batch: 2 },
+        ContextId::root(),
+        addr2.into(),
+        2.into(),
+        [ONE, ZERO, value5, ZERO],
+    );
+    prev_row = verify_memory_access(&trace, 1, memory_access, prev_row);
+
+    let memory_access = MemoryAccess::new(
+        MemoryOperation::Write,
+        MemoryAccessType::Element { addr_idx_in_batch: 1 },
+        ContextId::root(),
+        addr1.into(),
+        3.into(),
+        [ONE, value7, value5, ZERO],
+    );
+    prev_row = verify_memory_access(&trace, 2, memory_access, prev_row);
+
+    let memory_access = MemoryAccess::new(
+        MemoryOperation::Write,
+        MemoryAccessType::Element { addr_idx_in_batch: 3 },
+        ContextId::root(),
+        addr3.into(),
+        4.into(),
+        [ONE, value7, value5, value9],
+    );
+    prev_row = verify_memory_access(&trace, 3, memory_access, prev_row);
+
+    let memory_access = MemoryAccess::new(
+        MemoryOperation::Write,
+        MemoryAccessType::Word,
+        ContextId::root(),
+        addr0.into(),
+        6.into(),
+        word5678,
+    );
+    prev_row = verify_memory_access(&trace, 4, memory_access, prev_row);
+
+    // batch 1
+    let memory_access = MemoryAccess::new(
+        MemoryOperation::Write,
+        MemoryAccessType::Word,
+        ContextId::root(),
+        addr4.into(),
+        5.into(),
+        word1234,
+    );
+    verify_memory_access(&trace, 5, memory_access, prev_row);
 }
 
+/// Tests that writing a word to an address that is not aligned with the word boundary results in an
+/// error.
 #[test]
-fn mem_write_read() {
+fn mem_write_word_unaligned() {
     let mut mem = Memory::default();
 
-    // write 1 into address 5; clk = 1
-    let addr5 = 5;
-    let value1 = [ONE, ZERO, ZERO, ZERO];
-    mem.write(ContextId::root(), addr5, 1.into(), value1).unwrap();
-
-    // write 4 into address 2; clk = 2
-    let addr2 = 2;
-    let value4 = [Felt::new(4), ZERO, ZERO, ZERO];
-    mem.write(ContextId::root(), addr2, 2.into(), value4).unwrap();
-
-    // read a value from address 5; clk = 3
-    mem.read(ContextId::root(), addr5, 3.into()).unwrap();
-
-    // write 2 into address 5; clk = 4
-    let value2 = [Felt::new(2), ZERO, ZERO, ZERO];
-    mem.write(ContextId::root(), addr5, 4.into(), value2).unwrap();
-
-    // read a value from address 2; clk = 5
-    mem.read(ContextId::root(), addr2, 5.into()).unwrap();
-
-    // write 7 into address 2; clk = 6
-    let value7 = [Felt::new(7), ZERO, ZERO, ZERO];
-    mem.write(ContextId::root(), addr2, 6.into(), value7).unwrap();
-
-    // read a value from address 5; clk = 7
-    mem.read(ContextId::root(), addr5, 7.into()).unwrap();
-
-    // read a value from address 2; clk = 8
-    mem.read(ContextId::root(), addr2, 8.into()).unwrap();
-
-    // read a value from address 5; clk = 9
-    mem.read(ContextId::root(), addr5, 9.into()).unwrap();
-
-    // check generated trace and memory data provided to the ChipletsBus; rows should be sorted by
-    // address and then clock cycle
-    let trace = build_trace(mem, 9);
-
-    // address 2
-    let mut prev_row = [ZERO; MEMORY_TRACE_WIDTH];
-    let memory_access = MemoryAccess::new(ContextId::root(), addr2, 2.into(), value4);
-    prev_row = verify_memory_access(&trace, 0, MEMORY_WRITE, &memory_access, prev_row);
-
-    let memory_access = MemoryAccess::new(ContextId::root(), addr2, 5.into(), value4);
-    prev_row = verify_memory_access(&trace, 1, MEMORY_COPY_READ, &memory_access, prev_row);
-
-    let memory_access = MemoryAccess::new(ContextId::root(), addr2, 6.into(), value7);
-    prev_row = verify_memory_access(&trace, 2, MEMORY_WRITE, &memory_access, prev_row);
-
-    let memory_access = MemoryAccess::new(ContextId::root(), addr2, 8.into(), value7);
-    prev_row = verify_memory_access(&trace, 3, MEMORY_COPY_READ, &memory_access, prev_row);
-
-    // address 5
-    let memory_access = MemoryAccess::new(ContextId::root(), addr5, 1.into(), value1);
-    prev_row = verify_memory_access(&trace, 4, MEMORY_WRITE, &memory_access, prev_row);
-
-    let memory_access = MemoryAccess::new(ContextId::root(), addr5, 3.into(), value1);
-    prev_row = verify_memory_access(&trace, 5, MEMORY_COPY_READ, &memory_access, prev_row);
-
-    let memory_access = MemoryAccess::new(ContextId::root(), addr5, 4.into(), value2);
-    prev_row = verify_memory_access(&trace, 6, MEMORY_WRITE, &memory_access, prev_row);
-
-    let memory_access = MemoryAccess::new(ContextId::root(), addr5, 7.into(), value2);
-    prev_row = verify_memory_access(&trace, 7, MEMORY_COPY_READ, &memory_access, prev_row);
-
-    let memory_access = MemoryAccess::new(ContextId::root(), addr5, 9.into(), value2);
-    verify_memory_access(&trace, 8, MEMORY_COPY_READ, &memory_access, prev_row);
+    // write a word into address 1, which is not word-aligned; clk = 1
+    let addr = ONE;
+    let word1 = [ONE, ZERO, ZERO, ZERO];
+    let clk = 1.into();
+    let ctx = ContextId::root();
+    let ret = mem.write_word(ctx, addr, clk, word1);
+
+    assert_matches!(
+        ret,
+        Err(ExecutionError::MemoryUnalignedWordAccess { addr: _, ctx: _, clk: _ })
+    );
 }
 
+/// Tests that values written are properly read back.
 #[test]
-fn mem_multi_context() {
+fn mem_write_read() {
     let mut mem = Memory::default();
-
-    // write a value into ctx = ContextId::root(), addr = 0; clk = 1
-    let value1 = [ONE, ZERO, ZERO, ZERO];
-    mem.write(ContextId::root(), 0, 1.into(), value1).unwrap();
-    assert_eq!(value1, mem.get_value(ContextId::root(), 0).unwrap());
-    assert_eq!(1, mem.size());
-    assert_eq!(1, mem.trace_len());
-
-    // write a value into ctx = 3, addr = 1; clk = 4
-    let value2 = [ZERO, ONE, ZERO, ZERO];
-    mem.write(3.into(), 1, 4.into(), value2).unwrap();
-    assert_eq!(value2, mem.get_value(3.into(), 1).unwrap());
-    assert_eq!(2, mem.size());
-    assert_eq!(2, mem.trace_len());
-
-    // read a value from ctx = 3, addr = 1; clk = 6
-    let value = mem.read(3.into(), 1, 6.into()).unwrap();
-    assert_eq!(value2, value);
-    assert_eq!(2, mem.size());
-    assert_eq!(3, mem.trace_len());
-
-    // write a value into ctx = 3, addr = 0; clk = 7
-    let value3 = [ZERO, ZERO, ONE, ZERO];
-    mem.write(3.into(), 0, 7.into(), value3).unwrap();
-    assert_eq!(value3, mem.get_value(3.into(), 0).unwrap());
-    assert_eq!(3, mem.size());
-    assert_eq!(4, mem.trace_len());
-
-    // read a value from ctx = 0, addr = 0; clk = 9
-    let value = mem.read(ContextId::root(), 0, 9.into()).unwrap();
-    assert_eq!(value1, value);
-    assert_eq!(3, mem.size());
-    assert_eq!(5, mem.trace_len());
+    let mut clk: RowIndex = 1.into();
+
+    // write [1,2,3,4] starting at address 0; clk = 1
+    let word1234 = [ONE, 2_u32.into(), 3_u32.into(), 4_u32.into()];
+    mem.write_word(ContextId::root(), ZERO, clk, word1234).unwrap();
+    clk += 1;
+
+    // read individual values from addresses 3,2,1,0; clk = 2,3,4,5
+    let value_read = mem.read(ContextId::root(), 3_u32.into(), clk).unwrap();
+    assert_eq!(value_read, 4_u32.into());
+    clk += 1;
+    let value_read = mem.read(ContextId::root(), 2_u32.into(), clk).unwrap();
+    assert_eq!(value_read, 3_u32.into());
+    clk += 1;
+    let value_read = mem.read(ContextId::root(), 1_u32.into(), clk).unwrap();
+    assert_eq!(value_read, 2_u32.into());
+    clk += 1;
+    let value_read = mem.read(ContextId::root(), ZERO, clk).unwrap();
+    assert_eq!(value_read, 1_u32.into());
+    clk += 1;
+
+    // read word from address 0; clk = 6
+    let word_read = mem.read_word(ContextId::root(), ZERO, clk).unwrap();
+    assert_eq!(word_read, word1234);
+    clk += 1;
+
+    // write 42 into address 2; clk = 7
+    mem.write(ContextId::root(), 2_u32.into(), clk, 42_u32.into()).unwrap();
+    clk += 1;
+
+    // read element from address 2; clk = 8
+    let value_read = mem.read(ContextId::root(), 2_u32.into(), clk).unwrap();
+    assert_eq!(value_read, 42_u32.into());
+    clk += 1;
+
+    // read word from address 0; clk = 9
+    let word_read = mem.read_word(ContextId::root(), ZERO, clk).unwrap();
+    assert_eq!(word_read, [ONE, 2_u32.into(), 42_u32.into(), 4_u32.into()]);
+    clk += 1;
 
     // check generated trace and memory data provided to the ChipletsBus; rows should be sorted by
     // address and then clock cycle
-    let trace = build_trace(mem, 5);
+    let trace = build_trace(mem, 9);
+    let mut clk: RowIndex = 1.into();
 
-    // ctx = 0, addr = 0
+    // batch 0
     let mut prev_row = [ZERO; MEMORY_TRACE_WIDTH];
-    let memory_access = MemoryAccess::new(ContextId::root(), 0, 1.into(), value1);
-    prev_row = verify_memory_access(&trace, 0, MEMORY_WRITE, &memory_access, prev_row);
-
-    let memory_access = MemoryAccess::new(ContextId::root(), 0, 9.into(), value1);
-    prev_row = verify_memory_access(&trace, 1, MEMORY_COPY_READ, &memory_access, prev_row);
-
-    // ctx = 3, addr = 0
-    let memory_access = MemoryAccess::new(3.into(), 0, 7.into(), value3);
-    prev_row = verify_memory_access(&trace, 2, MEMORY_WRITE, &memory_access, prev_row);
-
-    // ctx = 3, addr = 1
-    let memory_access = MemoryAccess::new(3.into(), 1, 4.into(), value2);
-    prev_row = verify_memory_access(&trace, 3, MEMORY_WRITE, &memory_access, prev_row);
-
-    let memory_access = MemoryAccess::new(3.into(), 1, 6.into(), value2);
-    verify_memory_access(&trace, 4, MEMORY_COPY_READ, &memory_access, prev_row);
+    let memory_access = MemoryAccess::new(
+        MemoryOperation::Write,
+        MemoryAccessType::Word,
+        ContextId::root(),
+        ZERO,
+        clk,
+        word1234,
+    );
+    prev_row = verify_memory_access(&trace, 0, memory_access, prev_row);
+    clk += 1;
+
+    let memory_access = MemoryAccess::new(
+        MemoryOperation::Read,
+        MemoryAccessType::Element { addr_idx_in_batch: 3 },
+        ContextId::root(),
+        3_u32.into(),
+        clk,
+        word1234,
+    );
+    prev_row = verify_memory_access(&trace, 1, memory_access, prev_row);
+    clk += 1;
+
+    let memory_access = MemoryAccess::new(
+        MemoryOperation::Read,
+        MemoryAccessType::Element { addr_idx_in_batch: 2 },
+        ContextId::root(),
+        2_u32.into(),
+        clk,
+        word1234,
+    );
+    prev_row = verify_memory_access(&trace, 2, memory_access, prev_row);
+    clk += 1;
+
+    let memory_access = MemoryAccess::new(
+        MemoryOperation::Read,
+        MemoryAccessType::Element { addr_idx_in_batch: 1 },
+        ContextId::root(),
+        1_u32.into(),
+        clk,
+        word1234,
+    );
+    prev_row = verify_memory_access(&trace, 3, memory_access, prev_row);
+    clk += 1;
+
+    let memory_access = MemoryAccess::new(
+        MemoryOperation::Read,
+        MemoryAccessType::Element { addr_idx_in_batch: 0 },
+        ContextId::root(),
+        ZERO,
+        clk,
+        word1234,
+    );
+    prev_row = verify_memory_access(&trace, 4, memory_access, prev_row);
+    clk += 1;
+
+    let memory_access = MemoryAccess::new(
+        MemoryOperation::Read,
+        MemoryAccessType::Word,
+        ContextId::root(),
+        ZERO,
+        clk,
+        word1234,
+    );
+    prev_row = verify_memory_access(&trace, 5, memory_access, prev_row);
+    clk += 1;
+
+    let memory_access = MemoryAccess::new(
+        MemoryOperation::Write,
+        MemoryAccessType::Element { addr_idx_in_batch: 2 },
+        ContextId::root(),
+        2_u32.into(),
+        clk,
+        [ONE, 2_u32.into(), 42_u32.into(), 4_u32.into()],
+    );
+    prev_row = verify_memory_access(&trace, 6, memory_access, prev_row);
+    clk += 1;
+
+    let memory_access = MemoryAccess::new(
+        MemoryOperation::Read,
+        MemoryAccessType::Element { addr_idx_in_batch: 2 },
+        ContextId::root(),
+        2_u32.into(),
+        clk,
+        [ONE, 2_u32.into(), 42_u32.into(), 4_u32.into()],
+    );
+    prev_row = verify_memory_access(&trace, 7, memory_access, prev_row);
+    clk += 1;
+
+    let memory_access = MemoryAccess::new(
+        MemoryOperation::Read,
+        MemoryAccessType::Word,
+        ContextId::root(),
+        ZERO,
+        clk,
+        [ONE, 2_u32.into(), 42_u32.into(), 4_u32.into()],
+    );
+    verify_memory_access(&trace, 8, memory_access, prev_row);
 }
 
 #[test]
 fn mem_get_state_at() {
     let mut mem = Memory::default();
 
-    // Write 1 into (ctx = 0, addr = 5) at clk = 1.
-    // This means that mem[5] = 1 at the beginning of clk = 2
-    let value1 = [ONE, ZERO, ZERO, ZERO];
-    mem.write(ContextId::root(), 5, 1.into(), value1).unwrap();
+    let addr_start: u32 = 40_u32;
 
-    // Write 4 into (ctx = 0, addr = 2) at clk = 2.
-    // This means that mem[2] = 4 at the beginning of clk = 3
-    let value4 = [Felt::new(4), ZERO, ZERO, ZERO];
-    mem.write(ContextId::root(), 2, 2.into(), value4).unwrap();
+    // Write word starting at (ctx = 0, addr = 40) at clk = 1.
+    // This means that mem[40..=43] is set at the beginning of clk = 2
+    let word1234 = [ONE, 2_u32.into(), 3_u32.into(), 4_u32.into()];
+    mem.write_word(ContextId::root(), addr_start.into(), 1.into(), word1234)
+        .unwrap();
 
-    // write 7 into (ctx = 3, addr = 3) at clk = 4
-    // This means that mem[3] = 7 at the beginning of clk = 4
-    let value7 = [Felt::new(7), ZERO, ZERO, ZERO];
-    mem.write(3.into(), 3, 4.into(), value7).unwrap();
+    let word4567: [Felt; 4] = [4_u32.into(), 5_u32.into(), 6_u32.into(), 7_u32.into()];
+    mem.write_word(ContextId::root(), addr_start.into(), 2.into(), word4567)
+        .unwrap();
 
     // Check memory state at clk = 2
-    assert_eq!(mem.get_state_at(ContextId::root(), 2.into()), vec![(5, value1)]);
-    assert_eq!(mem.get_state_at(3.into(), 2.into()), vec![]);
+    let clk: RowIndex = 2.into();
+    assert_eq!(
+        mem.get_state_at(ContextId::root(), clk),
+        vec![
+            (addr_start.into(), word1234[0]),
+            (u64::from(addr_start) + 1_u64, word1234[1]),
+            (u64::from(addr_start) + 2_u64, word1234[2]),
+            (u64::from(addr_start) + 3_u64, word1234[3])
+        ]
+    );
+    assert_eq!(mem.get_state_at(3.into(), clk), vec![]);
 
     // Check memory state at clk = 3
-    assert_eq!(mem.get_state_at(ContextId::root(), 3.into()), vec![(2, value4), (5, value1)]);
-    assert_eq!(mem.get_state_at(3.into(), 3.into()), vec![]);
-
-    // Check memory state at clk = 4
-    assert_eq!(mem.get_state_at(ContextId::root(), 4.into()), vec![(2, value4), (5, value1)]);
-    assert_eq!(mem.get_state_at(3.into(), 4.into()), vec![]);
-
-    // Check memory state at clk = 5
-    assert_eq!(mem.get_state_at(ContextId::root(), 5.into()), vec![(2, value4), (5, value1)]);
-    assert_eq!(mem.get_state_at(3.into(), 5.into()), vec![(3, value7)]);
+    let clk: RowIndex = 3.into();
+    assert_eq!(
+        mem.get_state_at(ContextId::root(), clk),
+        vec![
+            (addr_start.into(), word4567[0]),
+            (u64::from(addr_start) + 1_u64, word4567[1]),
+            (u64::from(addr_start) + 2_u64, word4567[2]),
+            (u64::from(addr_start) + 3_u64, word4567[3])
+        ]
+    );
+    assert_eq!(mem.get_state_at(3.into(), clk), vec![]);
 }
 
 // HELPER STRUCT & FUNCTIONS
@@ -304,19 +460,35 @@ fn mem_get_state_at() {
 
 /// Contains data representing a memory access.
 pub struct MemoryAccess {
+    operation: MemoryOperation,
+    access_type: MemoryAccessType,
     ctx: ContextId,
     addr: Felt,
     clk: Felt,
-    word: [Felt; 4],
+    batch_values: [Felt; 4],
 }
 
 impl MemoryAccess {
-    pub fn new(ctx: ContextId, addr: u32, clk: RowIndex, word: Word) -> Self {
+    pub fn new(
+        operation: MemoryOperation,
+        access_type: MemoryAccessType,
+        ctx: ContextId,
+        addr: Felt,
+        clk: RowIndex,
+        batch_values: Word,
+    ) -> Self {
+        if let MemoryAccessType::Element { addr_idx_in_batch: addr_idx_in_word } = access_type {
+            let addr: u32 = addr.try_into().unwrap();
+            assert_eq!(addr_idx_in_word as u32, addr % WORD_SIZE as u32);
+        }
+
         Self {
+            operation,
+            access_type,
             ctx,
-            addr: Felt::from(addr),
+            addr,
             clk: Felt::from(clk),
-            word,
+            batch_values,
         }
     }
 }
@@ -339,29 +511,57 @@ fn read_trace_row(trace: &[Vec<Felt>], step: usize) -> [Felt; MEMORY_TRACE_WIDTH
 }
 
 fn build_trace_row(
-    memory_access: &MemoryAccess,
-    op_selectors: Selectors,
+    memory_access: MemoryAccess,
     prev_row: [Felt; MEMORY_TRACE_WIDTH],
 ) -> [Felt; MEMORY_TRACE_WIDTH] {
-    let MemoryAccess { ctx, addr, clk, word: new_val } = *memory_access;
+    let MemoryAccess {
+        operation,
+        access_type,
+        ctx,
+        addr,
+        clk,
+        batch_values,
+    } = memory_access;
+
+    let (batch, idx1, idx0) = {
+        let addr: u32 = addr.try_into().unwrap();
+        let remainder = addr % WORD_SIZE as u32;
+        let batch = Felt::from(addr - remainder);
+
+        match remainder {
+            0 => (batch, ZERO, ZERO),
+            1 => (batch, ZERO, ONE),
+            2 => (batch, ONE, ZERO),
+            3 => (batch, ONE, ONE),
+            _ => unreachable!(),
+        }
+    };
 
     let mut row = [ZERO; MEMORY_TRACE_WIDTH];
 
-    row[0] = op_selectors[0];
-    row[1] = op_selectors[1];
+    row[READ_WRITE_COL_IDX] = match operation {
+        MemoryOperation::Read => MEMORY_READ,
+        MemoryOperation::Write => MEMORY_WRITE,
+    };
+    row[ELEMENT_OR_WORD_COL_IDX] = match access_type {
+        MemoryAccessType::Element { .. } => MEMORY_ACCESS_ELEMENT,
+        MemoryAccessType::Word => MEMORY_ACCESS_WORD,
+    };
     row[CTX_COL_IDX] = ctx.into();
-    row[ADDR_COL_IDX] = addr;
+    row[BATCH_COL_IDX] = batch;
+    row[IDX0_COL_IDX] = idx0;
+    row[IDX1_COL_IDX] = idx1;
     row[CLK_COL_IDX] = clk;
-    row[V_COL_RANGE.start] = new_val[0];
-    row[V_COL_RANGE.start + 1] = new_val[1];
-    row[V_COL_RANGE.start + 2] = new_val[2];
-    row[V_COL_RANGE.start + 3] = new_val[3];
+    row[V_COL_RANGE.start] = batch_values[0];
+    row[V_COL_RANGE.start + 1] = batch_values[1];
+    row[V_COL_RANGE.start + 2] = batch_values[2];
+    row[V_COL_RANGE.start + 3] = batch_values[3];
 
     if prev_row != [ZERO; MEMORY_TRACE_WIDTH] {
         let delta = if row[CTX_COL_IDX] != prev_row[CTX_COL_IDX] {
             row[CTX_COL_IDX] - prev_row[CTX_COL_IDX]
-        } else if row[ADDR_COL_IDX] != prev_row[ADDR_COL_IDX] {
-            row[ADDR_COL_IDX] - prev_row[ADDR_COL_IDX]
+        } else if row[BATCH_COL_IDX] != prev_row[BATCH_COL_IDX] {
+            row[BATCH_COL_IDX] - prev_row[BATCH_COL_IDX]
         } else {
             row[CLK_COL_IDX] - prev_row[CLK_COL_IDX] - ONE
         };
@@ -372,18 +572,24 @@ fn build_trace_row(
         row[D_INV_COL_IDX] = delta.inv();
     }
 
+    if row[BATCH_COL_IDX] == prev_row[BATCH_COL_IDX] && row[CTX_COL_IDX] == prev_row[CTX_COL_IDX] {
+        row[FLAG_SAME_BATCH_AND_CONTEXT] = ONE;
+    } else {
+        row[FLAG_SAME_BATCH_AND_CONTEXT] = ZERO;
+    }
+
     row
 }
 
 fn verify_memory_access(
     trace: &[Vec<Felt>],
     row: u32,
-    op_selectors: Selectors,
-    memory_access: &MemoryAccess,
+    mem_access: MemoryAccess,
     prev_row: [Felt; MEMORY_TRACE_WIDTH],
 ) -> [Felt; MEMORY_TRACE_WIDTH] {
-    let expected_row = build_trace_row(memory_access, op_selectors, prev_row);
-    assert_eq!(expected_row, read_trace_row(trace, row as usize));
+    let expected_row = build_trace_row(mem_access, prev_row);
+    let actual_row = read_trace_row(trace, row as usize);
+    assert_eq!(expected_row, actual_row);
 
     expected_row
 }
diff --git a/processor/src/chiplets/mod.rs b/processor/src/chiplets/mod.rs
index d2c8c13aae..c36dda1945 100644
--- a/processor/src/chiplets/mod.rs
+++ b/processor/src/chiplets/mod.rs
@@ -10,7 +10,6 @@ use super::{
     crypto::MerklePath, utils, ChipletsTrace, ExecutionError, Felt, FieldElement, RangeChecker,
     TraceFragment, Word, CHIPLETS_WIDTH, EMPTY_WORD, ONE, ZERO,
 };
-use crate::system::ContextId;
 
 mod bitwise;
 use bitwise::Bitwise;
@@ -44,7 +43,8 @@ mod tests;
 /// * Hasher segment: contains the trace and selector for the hasher chiplet. This segment fills the
 ///   first rows of the trace up to the length of the hasher `trace_len`.
 ///   - column 0: selector column with values set to ZERO
-///   - columns 1-17: execution trace of hash chiplet
+///   - columns 1-16: execution trace of hash chiplet
+///   - column 17: unused column padded with ZERO
 /// * Bitwise segment: contains the trace and selectors for the bitwise chiplet. This segment begins
 ///   at the end of the hasher segment and fills the next rows of the trace for the `trace_len` of
 ///   the bitwise chiplet.
@@ -52,13 +52,12 @@ mod tests;
 ///   - column 1: selector column with values set to ZERO
 ///   - columns 2-14: execution trace of bitwise chiplet
 ///   - columns 15-17: unused columns padded with ZERO
-/// * Memory segment: contains the trace and selectors for the memory chiplet * This segment begins
+/// * Memory segment: contains the trace and selectors for the memory chiplet. This segment begins
 ///   at the end of the bitwise segment and fills the next rows of the trace for the `trace_len` of
 ///   the memory chiplet.
 ///   - column 0-1: selector columns with values set to ONE
 ///   - column 2: selector column with values set to ZERO
-///   - columns 3-14: execution trace of memory chiplet
-///   - columns 15-17: unused column padded with ZERO
+///   - columns 3-17: execution trace of memory chiplet
 /// * Kernel ROM segment: contains the trace and selectors for the kernel ROM chiplet * This segment
 ///   begins at the end of the memory segment and fills the next rows of the trace for the
 ///   `trace_len` of the kernel ROM chiplet.
@@ -89,11 +88,11 @@ mod tests;
 ///             | . | . |   selectors   |                                   |-------------|
 ///             | . | 0 |               |                                   |-------------|
 ///             | . +---+---+-----------------------------------------------+-------------+
-///             | . | 1 | 0 |                |                              |-------------|
-///             | . | . | . | Memory chiplet |      Memory chiplet          |-------------|
-///             | . | . | . | internal       |      12 columns              |-- Padding --|
-///             | . | . | . | selectors      |      constraint degree 9     |-------------|
-///             | . | . | 0 |                |                              |-------------|
+///             | . | 1 | 0 |                                               |-------------|
+///             | . | . | . |            Memory chiplet                     |-------------|
+///             | . | . | . |              15 columns                       |-- Padding --|
+///             | . | . | . |          constraint degree 9                  |-------------|
+///             | . | . | 0 |                                               |-------------|
 ///             | . + . |---+---+-------------------------------------------+-------------+
 ///             | . | . | 1 | 0 |                   |                       |-------------|
 ///             | . | . | . | . |  Kernel ROM       |   Kernel ROM chiplet  |-------------|
@@ -286,91 +285,14 @@ impl Chiplets {
     // MEMORY CHIPLET ACCESSORS
     // --------------------------------------------------------------------------------------------
 
-    /// Returns a word located in memory at the specified context/address while recording the
-    /// memory access in the memory trace.
-    ///
-    /// If the specified address hasn't been previously written to, four ZERO elements are
-    /// returned. This effectively implies that memory is initialized to ZERO.
-    pub fn read_mem(&mut self, ctx: ContextId, addr: u32) -> Result<Word, ExecutionError> {
-        // read the word from memory
-        self.memory.read(ctx, addr, self.clk)
-    }
-
-    /// Returns two words read from consecutive addresses started with `addr` in the specified
-    /// context while recording memory accesses in the memory trace.
-    ///
-    /// If either of the accessed addresses hasn't been previously written to, ZERO elements are
-    /// returned. This effectively implies that memory is initialized to ZERO.
-    pub fn read_mem_double(
-        &mut self,
-        ctx: ContextId,
-        addr: u32,
-    ) -> Result<[Word; 2], ExecutionError> {
-        // read two words from memory: from addr and from addr + 1
-        let addr2 = addr + 1;
-        Ok([self.memory.read(ctx, addr, self.clk)?, self.memory.read(ctx, addr2, self.clk)?])
-    }
-
-    /// Writes the provided word at the specified context/address.
-    pub fn write_mem(
-        &mut self,
-        ctx: ContextId,
-        addr: u32,
-        word: Word,
-    ) -> Result<(), ExecutionError> {
-        self.memory.write(ctx, addr, self.clk, word)
-    }
-
-    /// Writes the provided element into the specified context/address leaving the remaining 3
-    /// elements of the word previously stored at that address unchanged.
-    pub fn write_mem_element(
-        &mut self,
-        ctx: ContextId,
-        addr: u32,
-        value: Felt,
-    ) -> Result<Word, ExecutionError> {
-        let old_word = self.memory.get_old_value(ctx, addr);
-        let new_word = [value, old_word[1], old_word[2], old_word[3]];
-
-        self.memory.write(ctx, addr, self.clk, new_word)?;
-
-        Ok(old_word)
+    /// Returns a reference to the Memory chiplet.
+    pub fn memory(&self) -> &Memory {
+        &self.memory
     }
 
-    /// Writes the two provided words to two consecutive addresses in memory in the specified
-    /// context, starting at the specified address.
-    pub fn write_mem_double(
-        &mut self,
-        ctx: ContextId,
-        addr: u32,
-        words: [Word; 2],
-    ) -> Result<(), ExecutionError> {
-        let addr2 = addr + 1;
-        // write two words to memory at addr and addr + 1
-        self.memory.write(ctx, addr, self.clk, words[0])?;
-        self.memory.write(ctx, addr2, self.clk, words[1])
-    }
-
-    /// Returns a word located at the specified context/address, or None if the address hasn't
-    /// been accessed previously.
-    ///
-    /// Unlike mem_read() which modifies the memory access trace, this method returns the value at
-    /// the specified address (if one exists) without altering the memory access trace.
-    pub fn get_mem_value(&self, ctx: ContextId, addr: u32) -> Option<Word> {
-        self.memory.get_value(ctx, addr)
-    }
-
-    /// Returns the entire memory state for the specified execution context at the specified cycle.
-    /// The state is returned as a vector of (address, value) tuples, and includes addresses which
-    /// have been accessed at least once.
-    pub fn get_mem_state_at(&self, ctx: ContextId, clk: RowIndex) -> Vec<(u64, Word)> {
-        self.memory.get_state_at(ctx, clk)
-    }
-
-    /// Returns current size of the memory (in words) across all execution contexts.
-    #[cfg(test)]
-    pub fn get_mem_size(&self) -> usize {
-        self.memory.size()
+    /// Returns a mutable reference to the Memory chiplet.
+    pub fn memory_mut(&mut self) -> &mut Memory {
+        &mut self.memory
     }
 
     // KERNEL ROM ACCESSORS
@@ -469,7 +391,7 @@ impl Chiplets {
         // so they can be filled with the chiplet traces
         for (column_num, column) in trace.iter_mut().enumerate().skip(1) {
             match column_num {
-                1 | 15..=17 => {
+                1 => {
                     // columns 1 and 15 - 17 are relevant only for the hasher
                     hasher_fragment.push_column_slice(column, hasher.trace_len());
                 },
@@ -491,6 +413,19 @@ impl Chiplets {
                     let rest = memory_fragment.push_column_slice(rest, memory.trace_len());
                     kernel_rom_fragment.push_column_slice(rest, kernel_rom.trace_len());
                 },
+                15 | 16 => {
+                    // columns 15 and 16 are relevant only for the hasher and memory chiplets
+                    let rest = hasher_fragment.push_column_slice(column, hasher.trace_len());
+                    // skip bitwise chiplet
+                    let (_, rest) = rest.split_at_mut(bitwise.trace_len());
+                    memory_fragment.push_column_slice(rest, memory.trace_len());
+                },
+                17 => {
+                    // column 17 is relevant only for the memory chiplet
+                    // skip the hasher and bitwise chiplets
+                    let (_, rest) = column.split_at_mut(hasher.trace_len() + bitwise.trace_len());
+                    memory_fragment.push_column_slice(rest, memory.trace_len());
+                },
                 _ => panic!("invalid column index"),
             }
         }
diff --git a/processor/src/chiplets/tests.rs b/processor/src/chiplets/tests.rs
index 89253f6916..28606b983b 100644
--- a/processor/src/chiplets/tests.rs
+++ b/processor/src/chiplets/tests.rs
@@ -51,8 +51,9 @@ fn bitwise_chiplet_trace() {
 #[test]
 fn memory_chiplet_trace() {
     // --- single memory operation with no stack manipulation -------------------------------------
+    let addr = Felt::from(4_u32);
     let stack = [1, 2, 3, 4];
-    let operations = vec![Operation::Push(Felt::new(2)), Operation::MStoreW];
+    let operations = vec![Operation::Push(addr), Operation::MStoreW];
     let (chiplets_trace, trace_len) = build_trace(&stack, operations, Kernel::default());
     let memory_trace_len = 1;
 
diff --git a/processor/src/debug.rs b/processor/src/debug.rs
index ab908b5609..805b7f64a6 100644
--- a/processor/src/debug.rs
+++ b/processor/src/debug.rs
@@ -5,7 +5,7 @@ use alloc::{
 use core::fmt;
 
 use miden_air::RowIndex;
-use vm_core::{AssemblyOp, Operation, StackOutputs, Word};
+use vm_core::{AssemblyOp, Operation, StackOutputs};
 
 use crate::{
     range::RangeChecker, system::ContextId, Chiplets, ChipletsLengths, Decoder, ExecutionError,
@@ -21,17 +21,15 @@ pub struct VmState {
     pub asmop: Option<AsmOpInfo>,
     pub fmp: Felt,
     pub stack: Vec<Felt>,
-    pub memory: Vec<(u64, Word)>,
+    pub memory: Vec<(u64, Felt)>,
 }
 
 impl fmt::Display for VmState {
     fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
         let stack: Vec<u64> = self.stack.iter().map(|x| x.as_int()).collect();
-        let memory: Vec<(u64, [u64; 4])> =
-            self.memory.iter().map(|x| (x.0, word_to_ints(&x.1))).collect();
         write!(
             f,
-            "clk={}{}{}, fmp={}, stack={stack:?}, memory={memory:?}",
+            "clk={}{}{}, fmp={}, stack={stack:?}, memory={:?}",
             self.clk,
             match self.op {
                 Some(op) => format!(", op={op}"),
@@ -41,7 +39,8 @@ impl fmt::Display for VmState {
                 Some(op) => format!(", {op}"),
                 None => "".to_string(),
             },
-            self.fmp
+            self.fmp,
+            self.memory
         )
     }
 }
@@ -166,7 +165,7 @@ impl VmStateIterator {
             asmop,
             fmp: self.system.get_fmp_at(self.clk),
             stack: self.stack.get_state_at(self.clk),
-            memory: self.chiplets.get_mem_state_at(ctx, self.clk),
+            memory: self.chiplets.memory().get_state_at(ctx, self.clk),
         });
 
         self.clk -= 1;
@@ -236,7 +235,7 @@ impl Iterator for VmStateIterator {
             asmop,
             fmp: self.system.get_fmp_at(self.clk),
             stack: self.stack.get_state_at(self.clk),
-            memory: self.chiplets.get_mem_state_at(ctx, self.clk),
+            memory: self.chiplets.memory().get_state_at(ctx, self.clk),
         }));
 
         self.clk += 1;
@@ -245,12 +244,6 @@ impl Iterator for VmStateIterator {
     }
 }
 
-// HELPER FUNCTIONS
-// ================================================================================================
-fn word_to_ints(word: &Word) -> [u64; 4] {
-    [word[0].as_int(), word[1].as_int(), word[2].as_int(), word[3].as_int()]
-}
-
 /// Contains assembly instruction and operation index in the sequence corresponding to the specified
 /// AsmOp decorator. This index starts from 1 instead of 0.
 #[derive(Clone, Debug, Eq, PartialEq)]
diff --git a/processor/src/decoder/mod.rs b/processor/src/decoder/mod.rs
index 2b7ee125ad..c7575e5b45 100644
--- a/processor/src/decoder/mod.rs
+++ b/processor/src/decoder/mod.rs
@@ -319,7 +319,10 @@ impl Process {
         let mem_addr = self.stack.get(0);
         // The callee hash is stored in memory, and the address is specified on the top of the
         // stack.
-        let callee_hash = self.read_mem_word(mem_addr)?;
+        let callee_hash =
+            self.chiplets
+                .memory_mut()
+                .read_word(self.system.ctx(), mem_addr, self.system.clk())?;
 
         let addr = self.chiplets.hash_control_block(
             EMPTY_WORD,
@@ -350,7 +353,10 @@ impl Process {
         let mem_addr = self.stack.get(0);
         // The callee hash is stored in memory, and the address is specified on the top of the
         // stack.
-        let callee_hash = self.read_mem_word(mem_addr)?;
+        let callee_hash =
+            self.chiplets
+                .memory_mut()
+                .read_word(self.system.ctx(), mem_addr, self.system.clk())?;
 
         // Note: other functions end in "executing a Noop", which
         // 1. ensures trace capacity,
diff --git a/processor/src/decoder/tests.rs b/processor/src/decoder/tests.rs
index 664517b7a2..ef7c810b88 100644
--- a/processor/src/decoder/tests.rs
+++ b/processor/src/decoder/tests.rs
@@ -1289,14 +1289,14 @@ fn dyn_block() {
     // end
     //
     // begin
-    //   # stack: [42, DIGEST]
+    //   # stack: [40, DIGEST]
     //   mstorew
     //   push.42
     //   dynexec
     // end
 
-    const FOO_ROOT_NODE_ADDR: u64 = 42;
-    const PUSH_42_OP: Operation = Operation::Push(Felt::new(FOO_ROOT_NODE_ADDR));
+    const FOO_ROOT_NODE_ADDR: u64 = 40;
+    const PUSH_40_OP: Operation = Operation::Push(Felt::new(FOO_ROOT_NODE_ADDR));
 
     let mut mast_forest = MastForest::new();
 
@@ -1308,7 +1308,7 @@ fn dyn_block() {
     let mstorew_node = MastNode::new_basic_block(vec![Operation::MStoreW], None).unwrap();
     let mstorew_node_id = mast_forest.add_node(mstorew_node.clone()).unwrap();
 
-    let push_node = MastNode::new_basic_block(vec![PUSH_42_OP], None).unwrap();
+    let push_node = MastNode::new_basic_block(vec![PUSH_40_OP], None).unwrap();
     let push_node_id = mast_forest.add_node(push_node.clone()).unwrap();
 
     let join_node = MastNode::new_join(mstorew_node_id, push_node_id, &mast_forest).unwrap();
@@ -1348,7 +1348,7 @@ fn dyn_block() {
     // starting second span
     let push_basic_block_addr = mstorew_basic_block_addr + EIGHT;
     check_op_decoding(&trace, 5, join_addr, Operation::Span, 2, 0, 0);
-    check_op_decoding(&trace, 6, push_basic_block_addr, PUSH_42_OP, 1, 0, 1);
+    check_op_decoding(&trace, 6, push_basic_block_addr, PUSH_40_OP, 1, 0, 1);
     check_op_decoding(&trace, 7, push_basic_block_addr, Operation::Noop, 0, 1, 1);
     check_op_decoding(&trace, 8, push_basic_block_addr, Operation::End, 0, 0, 0);
     // end inner join
diff --git a/processor/src/errors.rs b/processor/src/errors.rs
index 61d7319300..902b1df115 100644
--- a/processor/src/errors.rs
+++ b/processor/src/errors.rs
@@ -92,6 +92,10 @@ pub enum ExecutionError {
     NoMastForestWithProcedure { root_digest: Digest },
     #[error("memory address cannot exceed 2^32 but was {0}")]
     MemoryAddressOutOfBounds(u64),
+    #[error(
+        "word memory access at address {addr} in context {ctx} is unaligned at clock cycle {clk}"
+    )]
+    MemoryUnalignedWordAccess { addr: u32, ctx: ContextId, clk: Felt },
     #[error("merkle path verification failed for value {value} at index {index} in the Merkle tree with root {root} (error code: {err_code})", 
       value = to_hex(Felt::elements_as_bytes(value)),
       root = to_hex(root.as_bytes()),
@@ -129,6 +133,8 @@ pub enum ExecutionError {
       hex = to_hex(.0.as_bytes())
     )]
     SyscallTargetNotInKernel(Digest),
+    #[error("word access at memory address {addr} in context {ctx} is unaligned")]
+    UnalignedMemoryWordAccess { addr: u32, ctx: ContextId },
 }
 
 impl From<Ext2InttError> for ExecutionError {
@@ -152,6 +158,8 @@ pub enum Ext2InttError {
     InputSizeTooBig(u64),
     #[error("address of the first input must be smaller than 2^32, but was {0}")]
     InputStartAddressTooBig(u64),
+    #[error("address of the first input is not word aligned: {0}")]
+    InputStartNotWordAligned(u64),
     #[error("output size ({0}) cannot be greater than the input size ({1})")]
     OutputSizeTooBig(usize, usize),
     #[error("output size must be greater than 0")]
diff --git a/processor/src/host/debug.rs b/processor/src/host/debug.rs
index 6812d5bc65..bb4c7ad46d 100644
--- a/processor/src/host/debug.rs
+++ b/processor/src/host/debug.rs
@@ -2,7 +2,7 @@ use alloc::vec::Vec;
 use std::{print, println};
 
 use miden_air::RowIndex;
-use vm_core::{DebugOptions, Word};
+use vm_core::{DebugOptions, Felt};
 
 use super::ProcessState;
 use crate::system::ContextId;
@@ -74,19 +74,22 @@ impl Printer {
     /// Prints the whole memory state at the cycle `clk` in context `ctx`.
     fn print_mem_all(&self, process: ProcessState) {
         let mem = process.get_mem_state(self.ctx);
-        let padding =
-            mem.iter().fold(0, |max, value| word_elem_max_len(Some(value.1)).max(max)) as usize;
+        let ele_width = mem
+            .iter()
+            .map(|(_addr, value)| element_printed_width(Some(*value)))
+            .max()
+            .unwrap_or(0) as usize;
 
         println!("Memory state before step {} for the context {}:", self.clk, self.ctx);
 
         // print the main part of the memory (wihtout the last value)
         for (addr, value) in mem.iter().take(mem.len() - 1) {
-            print_mem_address(*addr as u32, Some(*value), false, false, padding);
+            print_mem_address(*addr as u32, Some(*value), false, false, ele_width);
         }
 
         // print the last memory value
         if let Some((addr, value)) = mem.last() {
-            print_mem_address(*addr as u32, Some(*value), true, false, padding);
+            print_mem_address(*addr as u32, Some(*value), true, false, ele_width);
         }
     }
 
@@ -150,18 +153,21 @@ impl Printer {
 ///
 /// If `is_local` is true, the output addresses are formatted as decimal values, otherwise as hex
 /// strings.
-fn print_interval(mem_interval: Vec<(u32, Option<Word>)>, is_local: bool) {
-    let padding =
-        mem_interval.iter().fold(0, |max, value| word_elem_max_len(value.1).max(max)) as usize;
+fn print_interval(mem_interval: Vec<(u32, Option<Felt>)>, is_local: bool) {
+    let ele_width = mem_interval
+        .iter()
+        .map(|(_addr, value)| element_printed_width(*value))
+        .max()
+        .unwrap_or(0) as usize;
 
     // print the main part of the memory (wihtout the last value)
-    for (addr, value) in mem_interval.iter().take(mem_interval.len() - 1) {
-        print_mem_address(*addr, *value, false, is_local, padding)
+    for (addr, mem_value) in mem_interval.iter().take(mem_interval.len() - 1) {
+        print_mem_address(*addr, *mem_value, false, is_local, ele_width)
     }
 
     // print the last memory value
     if let Some((addr, value)) = mem_interval.last() {
-        print_mem_address(*addr, *value, true, is_local, padding);
+        print_mem_address(*addr, *value, true, is_local, ele_width);
     }
 }
 
@@ -171,27 +177,26 @@ fn print_interval(mem_interval: Vec<(u32, Option<Word>)>, is_local: bool) {
 /// string.
 fn print_mem_address(
     addr: u32,
-    value: Option<Word>,
+    mem_value: Option<Felt>,
     is_last: bool,
     is_local: bool,
-    padding: usize,
+    ele_width: usize,
 ) {
-    if let Some(value) = value {
+    if let Some(value) = mem_value {
         if is_last {
             if is_local {
                 print!("└── {addr:>5}: ");
             } else {
                 print!("└── {addr:#010x}: ");
             }
-            print_word(value, padding);
-            println!();
+            println!("{:>width$}\n", value.as_int(), width = ele_width);
         } else {
             if is_local {
                 print!("├── {addr:>5}: ");
             } else {
                 print!("├── {addr:#010x}: ");
             }
-            print_word(value, padding);
+            println!("{:>width$}", value.as_int(), width = ele_width);
         }
     } else if is_last {
         if is_local {
@@ -206,23 +211,10 @@ fn print_mem_address(
     }
 }
 
-/// Prints the provided Word with specified padding.
-fn print_word(value: Word, padding: usize) {
-    println!(
-        "[{:>width$}, {:>width$}, {:>width$}, {:>width$}]",
-        value[0].as_int(),
-        value[1].as_int(),
-        value[2].as_int(),
-        value[3].as_int(),
-        width = padding
-    )
-}
-
-/// Returns the maximum length among the word elements.
-fn word_elem_max_len(word: Option<Word>) -> u32 {
-    if let Some(word) = word {
-        word.iter()
-            .fold(0, |max, value| (value.as_int().checked_ilog10().unwrap_or(1) + 1).max(max))
+/// Returns the number of decimal digits to reserve when printing the element (0 for `None`).
+fn element_printed_width(element: Option<Felt>) -> u32 {
+    if let Some(element) = element {
+        element.as_int().checked_ilog10().unwrap_or(1) + 1
     } else {
         0
     }
diff --git a/processor/src/lib.rs b/processor/src/lib.rs
index 2f186fd728..c1acc513a2 100644
--- a/processor/src/lib.rs
+++ b/processor/src/lib.rs
@@ -676,10 +676,18 @@ impl ProcessState<'_> {
         self.stack.get_state_at(self.system.clk())
     }
 
-    /// Returns a word located at the specified context/address, or None if the address hasn't
+    /// Returns the element located at the specified context/address, or None if the address hasn't
     /// been accessed previously.
-    pub fn get_mem_value(&self, ctx: ContextId, addr: u32) -> Option<Word> {
-        self.chiplets.get_mem_value(ctx, addr)
+    pub fn get_mem_value(&self, ctx: ContextId, addr: u32) -> Option<Felt> {
+        self.chiplets.memory().get_value(ctx, addr)
+    }
+
+    /// Returns the batch of elements starting at the specified context/address.
+    ///
+    /// # Errors
+    /// - If the address is not word aligned.
+    pub fn get_mem_word(&self, ctx: ContextId, addr: u32) -> Result<Option<Word>, ExecutionError> {
+        self.chiplets.memory().get_word(ctx, addr)
     }
 
     /// Returns the entire memory state for the specified execution context at the current clock
@@ -687,8 +695,8 @@ impl ProcessState<'_> {
     ///
     /// The state is returned as a vector of (address, value) tuples, and includes addresses which
     /// have been accessed at least once.
-    pub fn get_mem_state(&self, ctx: ContextId) -> Vec<(u64, Word)> {
-        self.chiplets.get_mem_state_at(ctx, self.system.clk())
+    pub fn get_mem_state(&self, ctx: ContextId) -> Vec<(u64, Felt)> {
+        self.chiplets.memory().get_state_at(ctx, self.system.clk())
     }
 }
 
diff --git a/processor/src/operations/comb_ops.rs b/processor/src/operations/comb_ops.rs
index 498026f038..73bece2e4b 100644
--- a/processor/src/operations/comb_ops.rs
+++ b/processor/src/operations/comb_ops.rs
@@ -125,7 +125,7 @@ impl Process {
     fn get_randomness(&mut self) -> Result<QuadFelt, ExecutionError> {
         let ctx = self.system.ctx();
         let addr = self.stack.get(14);
-        let word = self.chiplets.read_mem(ctx, addr.as_int() as u32)?;
+        let word = self.chiplets.memory_mut().read_word(ctx, addr, self.system.clk())?;
         let a0 = word[0];
         let a1 = word[1];
 
@@ -136,7 +136,7 @@ impl Process {
     fn get_ood_values(&mut self) -> Result<[QuadFelt; 2], ExecutionError> {
         let ctx = self.system.ctx();
         let addr = self.stack.get(13);
-        let word = self.chiplets.read_mem(ctx, addr.as_int() as u32)?;
+        let word = self.chiplets.memory_mut().read_word(ctx, addr, self.system.clk())?;
 
         Ok([QuadFelt::new(word[0], word[1]), QuadFelt::new(word[2], word[3])])
     }
@@ -203,9 +203,11 @@ mod tests {
         let tztgz = rand_array::<Felt, 4>();
         process
             .chiplets
-            .write_mem(
+            .memory_mut()
+            .write_word(
                 ctx,
                 inputs[2].as_int().try_into().expect("Shouldn't fail by construction"),
+                process.system.clk(),
                 tztgz,
             )
             .unwrap();
@@ -213,9 +215,11 @@ mod tests {
         let a = rand_array::<Felt, 4>();
         process
             .chiplets
-            .write_mem(
+            .memory_mut()
+            .write_word(
                 ctx,
                 inputs[1].as_int().try_into().expect("Shouldn't fail by construction"),
+                process.system.clk(),
                 a,
             )
             .unwrap();
diff --git a/processor/src/operations/io_ops.rs b/processor/src/operations/io_ops.rs
index 96c98c0b81..b1ded47a2c 100644
--- a/processor/src/operations/io_ops.rs
+++ b/processor/src/operations/io_ops.rs
@@ -1,4 +1,6 @@
-use super::{ExecutionError, Felt, Operation, Process};
+use vm_core::WORD_SIZE;
+
+use super::{ExecutionError, Felt, Process};
 use crate::{AdviceProvider, Host, Word};
 
 // INPUT / OUTPUT OPERATIONS
@@ -20,19 +22,23 @@ impl Process {
     // MEMORY READING AND WRITING
     // --------------------------------------------------------------------------------------------
 
-    /// Loads a word (4 elements) from the specified memory address onto the stack.
+    /// Loads a word (4 elements) starting at the specified memory address onto the stack.
     ///
     /// The operation works as follows:
     /// - The memory address is popped off the stack.
-    /// - A word is retrieved from memory at the specified address. The memory is always initialized
-    ///   to ZEROs, and thus, if the specified address has never been written to, four ZERO elements
-    ///   are returned.
+    /// - A word is retrieved from memory starting at the specified address, which must be aligned
+    ///   to a word boundary. The memory is always initialized to ZEROs, and thus, for any of the
+    ///   four addresses which were not previously written to, a ZERO element is returned.
     /// - The top four elements of the stack are overwritten with values retrieved from memory.
     ///
     /// Thus, the net result of the operation is that the stack is shifted left by one item.
     pub(super) fn op_mloadw(&mut self) -> Result<(), ExecutionError> {
         // get the address from the stack and read the word from current memory context
-        let mut word = self.read_mem_word(self.stack.get(0))?;
+        let mut word = self.chiplets.memory_mut().read_word(
+            self.system.ctx(),
+            self.stack.get(0),
+            self.system.clk(),
+        )?;
         word.reverse();
 
         // update the stack state
@@ -44,68 +50,24 @@ impl Process {
         Ok(())
     }
 
-    /// Loads the first element from the specified memory address onto the stack.
+    /// Loads the element from the specified memory address onto the stack.
     ///
     /// The operation works as follows:
     /// - The memory address is popped off the stack.
-    /// - A word is retrieved from memory at the specified address. The memory is always initialized
-    ///   to ZEROs, and thus, if the specified address has never been written to, four ZERO elements
-    ///   are returned.
-    /// - The first element of the word retrieved from memory is pushed to the top of the stack.
-    ///
-    /// The first 3 helper registers are filled with the elements of the word which were not pushed
-    /// to the stack. They are stored in stack order, with the last element of the word in helper
-    /// register 0.
+    /// - The element is retrieved from memory at the specified address. The memory is always
+    ///   initialized to ZEROs, and thus, if the specified address has never been written to, the
+    ///   ZERO element is returned.
+    /// - The element retrieved from memory is pushed to the top of the stack.
     pub(super) fn op_mload(&mut self) -> Result<(), ExecutionError> {
-        // get the address from the stack and read the word from memory
-        let mut word = self.read_mem_word(self.stack.get(0))?;
-        word.reverse();
+        let element = self.chiplets.memory_mut().read(
+            self.system.ctx(),
+            self.stack.get(0),
+            self.system.clk(),
+        )?;
 
-        // update the stack state
-        self.stack.set(0, word[3]);
+        self.stack.set(0, element);
         self.stack.copy_state(1);
 
-        // write the 3 unused elements to the helpers so they're available for constraint evaluation
-        self.decoder.set_user_op_helpers(Operation::MLoad, &word[..3]);
-
-        Ok(())
-    }
-
-    /// Loads two words from memory and replaces the top 8 elements of the stack with their
-    /// contents.
-    ///
-    /// The operation works as follows:
-    /// - The memory address of the first word is retrieved from 13th stack element (position 12).
-    /// - Two consecutive words, starting at this address, are loaded from memory.
-    /// - Elements of these words are written to the top 8 elements of the stack (element-wise, in
-    ///   stack order).
-    /// - Memory address (in position 12) is incremented by 2.
-    /// - All other stack elements remain the same.
-    pub(super) fn op_mstream(&mut self) -> Result<(), ExecutionError> {
-        // get the address from position 12 on the stack
-        let ctx = self.system.ctx();
-        let addr = Self::get_valid_address(self.stack.get(12))?;
-
-        // load two words from memory
-        let words = self.chiplets.read_mem_double(ctx, addr)?;
-
-        // replace the stack elements with the elements from memory (in stack order)
-        for (i, &mem_value) in words.iter().flat_map(|word| word.iter()).rev().enumerate() {
-            self.stack.set(i, mem_value);
-        }
-
-        // copy over the next 4 elements
-        for i in 8..12 {
-            let stack_value = self.stack.get(i);
-            self.stack.set(i, stack_value);
-        }
-
-        // increment the address by 2
-        self.stack.set(12, Felt::from(addr + 2));
-
-        // copy over the rest of the stack
-        self.stack.copy_state(13);
-
         Ok(())
     }
 
@@ -113,20 +75,21 @@ impl Process {
     ///
     /// The operation works as follows:
     /// - The memory address is popped off the stack.
-    /// - The top four stack items are saved into the specified memory address. The items are not
-    ///   removed from the stack.
+    /// - The top four stack items are saved starting at the specified memory address, which must be
+    ///   aligned on a word boundary. The items are not removed from the stack.
     ///
     /// Thus, the net result of the operation is that the stack is shifted left by one item.
     pub(super) fn op_mstorew(&mut self) -> Result<(), ExecutionError> {
         // get the address from the stack and build the word to be saved from the stack values
-        let ctx = self.system.ctx();
-        let addr = Self::get_valid_address(self.stack.get(0))?;
+        let addr = self.stack.get(0);
 
         // build the word in memory order (reverse of stack order)
         let word = [self.stack.get(4), self.stack.get(3), self.stack.get(2), self.stack.get(1)];
 
-        // write the word to memory and get the previous word
-        self.chiplets.write_mem(ctx, addr, word)?;
+        // write the word to memory
+        self.chiplets
+            .memory_mut()
+            .write_word(self.system.ctx(), addr, self.system.clk(), word)?;
 
         // reverse the order of the memory word & update the stack state
         for (i, &value) in word.iter().rev().enumerate() {
@@ -141,28 +104,18 @@ impl Process {
     ///
     /// The operation works as follows:
     /// - The memory address is popped off the stack.
-    /// - The top stack element is saved into the first element of the word located at the specified
-    ///   memory address. The remaining 3 elements of the word are not affected. The element is not
-    ///   removed from the stack.
+    /// - The top stack element is saved at the specified memory address. The element is not removed
+    ///   from the stack.
     ///
     /// Thus, the net result of the operation is that the stack is shifted left by one item.
-    ///
-    /// The first 3 helper registers are filled with the remaining elements of the word which were
-    /// previously stored in memory and not overwritten by the operation. They are stored in stack
-    /// order, with the last element at helper register 0.
     pub(super) fn op_mstore(&mut self) -> Result<(), ExecutionError> {
         // get the address and the value from the stack
         let ctx = self.system.ctx();
-        let addr = Self::get_valid_address(self.stack.get(0))?;
+        let addr = self.stack.get(0);
         let value = self.stack.get(1);
 
         // write the value to the memory and get the previous word
-        let mut old_word = self.chiplets.write_mem_element(ctx, addr, value)?;
-        // put the retrieved word into stack order
-        old_word.reverse();
-
-        // write the 3 unused elements to the helpers so they're available for constraint evaluation
-        self.decoder.set_user_op_helpers(Operation::MStore, &old_word[..3]);
+        self.chiplets.memory_mut().write(ctx, addr, self.system.clk(), value)?;
 
         // update the stack state
         self.stack.shift_left(1);
@@ -170,6 +123,51 @@ impl Process {
         Ok(())
     }
 
+    /// Loads two words from memory and replaces the top 8 elements of the stack with their
+    /// contents.
+    ///
+    /// The operation works as follows:
+    /// - The memory address of the first word is retrieved from 13th stack element (position 12).
+    /// - Two consecutive words, starting at this address, are loaded from memory.
+    /// - Elements of these words are written to the top 8 elements of the stack (element-wise, in
+    ///   stack order).
+    /// - Memory address (in position 12) is incremented by 8.
+    /// - All other stack elements remain the same.
+    pub(super) fn op_mstream(&mut self) -> Result<(), ExecutionError> {
+        const MEM_ADDR_STACK_IDX: usize = 12;
+
+        let ctx = self.system.ctx();
+        let clk = self.system.clk();
+        let addr_first_word = self.stack.get(MEM_ADDR_STACK_IDX);
+        let addr_second_word = addr_first_word + Felt::from(WORD_SIZE as u32);
+
+        // load two words from memory
+        let words = [
+            self.chiplets.memory_mut().read_word(ctx, addr_first_word, clk)?,
+            self.chiplets.memory_mut().read_word(ctx, addr_second_word, clk)?,
+        ];
+
+        // replace the stack elements with the elements from memory (in stack order)
+        for (i, &mem_value) in words.iter().flat_map(|word| word.iter()).rev().enumerate() {
+            self.stack.set(i, mem_value);
+        }
+
+        // copy over the next 4 elements
+        for i in 8..MEM_ADDR_STACK_IDX {
+            let stack_value = self.stack.get(i);
+            self.stack.set(i, stack_value);
+        }
+
+        // increment the address by 8 (2 words)
+        self.stack
+            .set(MEM_ADDR_STACK_IDX, addr_first_word + Felt::from(WORD_SIZE as u32 * 2));
+
+        // copy over the rest of the stack
+        self.stack.copy_state(13);
+
+        Ok(())
+    }
+
     /// Moves 8 elements from the advice stack to the memory, via the operand stack.
     ///
     /// The operation works as follows:
@@ -178,18 +176,23 @@ impl Process {
     ///   (position 12).
     /// - The two words are written to memory consecutively, starting at this address.
     /// - These words replace the top 8 elements of the stack (element-wise, in stack order).
-    /// - Memory address (in position 12) is incremented by 2.
+    /// - Memory address (in position 12) is incremented by 8.
     /// - All other stack elements remain the same.
     pub(super) fn op_pipe(&mut self, host: &mut impl Host) -> Result<(), ExecutionError> {
+        const MEM_ADDR_STACK_IDX: usize = 12;
+
         // get the address from position 12 on the stack
         let ctx = self.system.ctx();
-        let addr = Self::get_valid_address(self.stack.get(12))?;
+        let clk = self.system.clk();
+        let addr_first_word = self.stack.get(MEM_ADDR_STACK_IDX);
+        let addr_second_word = addr_first_word + Felt::from(WORD_SIZE as u32);
 
         // pop two words from the advice stack
         let words = host.advice_provider_mut().pop_stack_dword(self.into())?;
 
         // write the words memory
-        self.chiplets.write_mem_double(ctx, addr, words)?;
+        self.chiplets.memory_mut().write_word(ctx, addr_first_word, clk, words[0])?;
+        self.chiplets.memory_mut().write_word(ctx, addr_second_word, clk, words[1])?;
 
         // replace the elements on the stack with the word elements (in stack order)
         for (i, &adv_value) in words.iter().flat_map(|word| word.iter()).rev().enumerate() {
@@ -202,8 +205,9 @@ impl Process {
             self.stack.set(i, stack_value);
         }
 
-        // increment the address by 2
-        self.stack.set(12, Felt::from(addr + 2));
+        // increment the address by 8 (2 words)
+        self.stack
+            .set(MEM_ADDR_STACK_IDX, addr_first_word + Felt::from(WORD_SIZE as u32 * 2));
 
         // copy over the rest of the stack
         self.stack.copy_state(13);
@@ -241,30 +245,6 @@ impl Process {
 
         Ok(())
     }
-
-    // HELPER FUNCTIONS
-    // --------------------------------------------------------------------------------------------
-
-    /// Returns the memory word at address `addr` in the current context.
-    pub(crate) fn read_mem_word(&mut self, addr: Felt) -> Result<Word, ExecutionError> {
-        let ctx = self.system.ctx();
-        let mem_addr = Self::get_valid_address(addr)?;
-        let word_at_addr = self.chiplets.read_mem(ctx, mem_addr)?;
-
-        Ok(word_at_addr)
-    }
-
-    /// Checks that provided address is less than u32::MAX and returns it cast to u32.
-    ///
-    /// # Errors
-    /// Returns an error if the provided address is greater than u32::MAX.
-    fn get_valid_address(addr: Felt) -> Result<u32, ExecutionError> {
-        let addr = addr.as_int();
-        if addr > u32::MAX as u64 {
-            return Err(ExecutionError::MemoryAddressOutOfBounds(addr));
-        }
-        Ok(addr as u32)
-    }
 }
 
 // TESTS
@@ -316,11 +296,11 @@ mod tests {
     fn op_mloadw() {
         let mut host = DefaultHost::default();
         let mut process = Process::new_dummy_with_decoder_helpers_and_empty_stack();
-        assert_eq!(0, process.chiplets.get_mem_size());
+        assert_eq!(0, process.chiplets.memory().num_accessed_batches());
 
-        // push a word onto the stack and save it at address 1
+        // push a word onto the stack and save it at address 4
         let word = [1, 3, 5, 7].to_elements().try_into().unwrap();
-        store_value(&mut process, 1, word, &mut host);
+        store_value(&mut process, 4, word, &mut host);
 
         // push four zeros onto the stack
         for _ in 0..4 {
@@ -328,15 +308,18 @@ mod tests {
         }
 
         // push the address onto the stack and load the word
-        process.execute_op(Operation::Push(ONE), &mut host).unwrap();
+        process.execute_op(Operation::Push(4_u32.into()), &mut host).unwrap();
         process.execute_op(Operation::MLoadW, &mut host).unwrap();
 
         let expected_stack = build_expected_stack(&[7, 5, 3, 1, 7, 5, 3, 1]);
         assert_eq!(expected_stack, process.stack.trace_state());
 
         // check memory state
-        assert_eq!(1, process.chiplets.get_mem_size());
-        assert_eq!(word, process.chiplets.get_mem_value(ContextId::root(), 1).unwrap());
+        assert_eq!(1, process.chiplets.memory().num_accessed_batches());
+        assert_eq!(
+            word,
+            process.chiplets.memory().get_word(ContextId::root(), 4).unwrap().unwrap()
+        );
 
         // --- calling MLOADW with address greater than u32::MAX leads to an error ----------------
         process.execute_op(Operation::Push(Felt::new(u64::MAX / 2)), &mut host).unwrap();
@@ -351,22 +334,25 @@ mod tests {
     fn op_mload() {
         let mut host = DefaultHost::default();
         let mut process = Process::new_dummy_with_decoder_helpers_and_empty_stack();
-        assert_eq!(0, process.chiplets.get_mem_size());
+        assert_eq!(0, process.chiplets.memory().num_accessed_batches());
 
-        // push a word onto the stack and save it at address 2
+        // push a word onto the stack and save it at address 4
         let word = [1, 3, 5, 7].to_elements().try_into().unwrap();
-        store_value(&mut process, 2, word, &mut host);
+        store_value(&mut process, 4, word, &mut host);
 
         // push the address onto the stack and load the element
-        process.execute_op(Operation::Push(Felt::new(2)), &mut host).unwrap();
+        process.execute_op(Operation::Push(Felt::new(4)), &mut host).unwrap();
         process.execute_op(Operation::MLoad, &mut host).unwrap();
 
         let expected_stack = build_expected_stack(&[1, 7, 5, 3, 1]);
         assert_eq!(expected_stack, process.stack.trace_state());
 
         // check memory state
-        assert_eq!(1, process.chiplets.get_mem_size());
-        assert_eq!(word, process.chiplets.get_mem_value(ContextId::root(), 2).unwrap());
+        assert_eq!(1, process.chiplets.memory().num_accessed_batches());
+        assert_eq!(
+            word,
+            process.chiplets.memory().get_word(ContextId::root(), 4).unwrap().unwrap()
+        );
 
         // --- calling MLOAD with address greater than u32::MAX leads to an error -----------------
         process.execute_op(Operation::Push(Felt::new(u64::MAX / 2)), &mut host).unwrap();
@@ -382,18 +368,24 @@ mod tests {
         let mut host = DefaultHost::default();
         let mut process = Process::new_dummy_with_decoder_helpers_and_empty_stack();
 
-        // save two words into memory addresses 1 and 2
+        // save two words into memory addresses 4 and 8
         let word1 = [30, 29, 28, 27];
         let word2 = [26, 25, 24, 23];
         let word1_felts: Word = word1.to_elements().try_into().unwrap();
         let word2_felts: Word = word2.to_elements().try_into().unwrap();
-        store_value(&mut process, 1, word1_felts, &mut host);
-        store_value(&mut process, 2, word2_felts, &mut host);
+        store_value(&mut process, 4, word1_felts, &mut host);
+        store_value(&mut process, 8, word2_felts, &mut host);
 
         // check memory state
-        assert_eq!(2, process.chiplets.get_mem_size());
-        assert_eq!(word1_felts, process.chiplets.get_mem_value(ContextId::root(), 1).unwrap());
-        assert_eq!(word2_felts, process.chiplets.get_mem_value(ContextId::root(), 2).unwrap());
+        assert_eq!(2, process.chiplets.memory().num_accessed_batches());
+        assert_eq!(
+            word1_felts,
+            process.chiplets.memory().get_word(ContextId::root(), 4).unwrap().unwrap()
+        );
+        assert_eq!(
+            word2_felts,
+            process.chiplets.memory().get_word(ContextId::root(), 8).unwrap().unwrap()
+        );
 
         // clear the stack
         for _ in 0..8 {
@@ -402,11 +394,11 @@ mod tests {
 
         // arrange the stack such that:
         // - 101 is at position 13 (to make sure it is not overwritten)
-        // - 1 (the address) is at position 12
+        // - 4 (the address) is at position 12
         // - values 1 - 12 are at positions 0 - 11. Adding the first 8 of these values to the values
         //   stored in memory should result in 35.
         process.execute_op(Operation::Push(Felt::new(101)), &mut host).unwrap();
-        process.execute_op(Operation::Push(ONE), &mut host).unwrap();
+        process.execute_op(Operation::Push(4_u32.into()), &mut host).unwrap();
         for i in 1..13 {
             process.execute_op(Operation::Push(Felt::new(i)), &mut host).unwrap();
         }
@@ -417,8 +409,20 @@ mod tests {
         // the first 8 values should contain the values from memory. the next 4 values should remain
         // unchanged, and the address should be incremented by 2 (i.e., 1 -> 3).
         let stack_values = [
-            word2[3], word2[2], word2[1], word2[0], word1[3], word1[2], word1[1], word1[0], 4, 3,
-            2, 1, 3, 101,
+            word2[3],
+            word2[2],
+            word2[1],
+            word2[0],
+            word1[3],
+            word1[2],
+            word1[1],
+            word1[0],
+            4,
+            3,
+            2,
+            1,
+            4 + 8, // initial address + 2 words
+            101,   // rest of stack
         ];
         let expected_stack = build_expected_stack(&stack_values);
         assert_eq!(expected_stack, process.stack.trace_state());
@@ -428,7 +432,7 @@ mod tests {
     fn op_mstorew() {
         let mut host = DefaultHost::default();
         let mut process = Process::new_dummy_with_decoder_helpers_and_empty_stack();
-        assert_eq!(0, process.chiplets.get_mem_size());
+        assert_eq!(0, process.chiplets.memory().num_accessed_batches());
 
         // push the first word onto the stack and save it at address 0
         let word1 = [1, 3, 5, 7].to_elements().try_into().unwrap();
@@ -439,21 +443,30 @@ mod tests {
         assert_eq!(expected_stack, process.stack.trace_state());
 
         // check memory state
-        assert_eq!(1, process.chiplets.get_mem_size());
-        assert_eq!(word1, process.chiplets.get_mem_value(ContextId::root(), 0).unwrap());
+        assert_eq!(1, process.chiplets.memory().num_accessed_batches());
+        assert_eq!(
+            word1,
+            process.chiplets.memory().get_word(ContextId::root(), 0).unwrap().unwrap()
+        );
 
-        // push the second word onto the stack and save it at address 3
+        // push the second word onto the stack and save it at address 4
         let word2 = [2, 4, 6, 8].to_elements().try_into().unwrap();
-        store_value(&mut process, 3, word2, &mut host);
+        store_value(&mut process, 4, word2, &mut host);
 
         // check stack state
         let expected_stack = build_expected_stack(&[8, 6, 4, 2, 7, 5, 3, 1]);
         assert_eq!(expected_stack, process.stack.trace_state());
 
         // check memory state
-        assert_eq!(2, process.chiplets.get_mem_size());
-        assert_eq!(word1, process.chiplets.get_mem_value(ContextId::root(), 0).unwrap());
-        assert_eq!(word2, process.chiplets.get_mem_value(ContextId::root(), 3).unwrap());
+        assert_eq!(2, process.chiplets.memory().num_accessed_batches());
+        assert_eq!(
+            word1,
+            process.chiplets.memory().get_word(ContextId::root(), 0).unwrap().unwrap()
+        );
+        assert_eq!(
+            word2,
+            process.chiplets.memory().get_word(ContextId::root(), 4).unwrap().unwrap()
+        );
 
         // --- calling MSTOREW with address greater than u32::MAX leads to an error ----------------
         process.execute_op(Operation::Push(Felt::new(u64::MAX / 2)), &mut host).unwrap();
@@ -468,7 +481,7 @@ mod tests {
     fn op_mstore() {
         let mut host = DefaultHost::default();
         let mut process = Process::new_dummy_with_decoder_helpers_and_empty_stack();
-        assert_eq!(0, process.chiplets.get_mem_size());
+        assert_eq!(0, process.chiplets.memory().num_accessed_batches());
 
         // push new element onto the stack and save it as first element of the word on
         // uninitialized memory at address 0
@@ -481,16 +494,19 @@ mod tests {
 
         // check memory state
         let mem_0 = [element, ZERO, ZERO, ZERO];
-        assert_eq!(1, process.chiplets.get_mem_size());
-        assert_eq!(mem_0, process.chiplets.get_mem_value(ContextId::root(), 0).unwrap());
+        assert_eq!(1, process.chiplets.memory().num_accessed_batches());
+        assert_eq!(
+            mem_0,
+            process.chiplets.memory().get_word(ContextId::root(), 0).unwrap().unwrap()
+        );
 
-        // push the word onto the stack and save it at address 2
+        // push the word onto the stack and save it at address 4
         let word_2 = [1, 3, 5, 7].to_elements().try_into().unwrap();
-        store_value(&mut process, 2, word_2, &mut host);
+        store_value(&mut process, 4, word_2, &mut host);
 
         // push new element onto the stack and save it as first element of the word at address 2
         let element = Felt::new(12);
-        store_element(&mut process, 2, element, &mut host);
+        store_element(&mut process, 4, element, &mut host);
 
         // check stack state
         let expected_stack = build_expected_stack(&[12, 7, 5, 3, 1, 10]);
@@ -498,8 +514,11 @@ mod tests {
 
         // check memory state to make sure the other 3 elements were not affected
         let mem_2 = [element, Felt::new(3), Felt::new(5), Felt::new(7)];
-        assert_eq!(2, process.chiplets.get_mem_size());
-        assert_eq!(mem_2, process.chiplets.get_mem_value(ContextId::root(), 2).unwrap());
+        assert_eq!(2, process.chiplets.memory().num_accessed_batches());
+        assert_eq!(
+            mem_2,
+            process.chiplets.memory().get_word(ContextId::root(), 4).unwrap().unwrap()
+        );
 
         // --- calling MSTORE with address greater than u32::MAX leads to an error ----------------
         process.execute_op(Operation::Push(Felt::new(u64::MAX / 2)), &mut host).unwrap();
@@ -527,12 +546,12 @@ mod tests {
 
         // arrange the stack such that:
         // - 101 is at position 13 (to make sure it is not overwritten)
-        // - 1 (the address) is at position 12
+        // - 4 (the address) is at position 12
         // - values 1 - 12 are at positions 0 - 11. Replacing the first 8 of these values with the
         //   values from the advice stack should result in 30 through 23 in stack order (with 23 at
         //   stack[0]).
         process.execute_op(Operation::Push(Felt::new(101)), &mut host).unwrap();
-        process.execute_op(Operation::Push(ONE), &mut host).unwrap();
+        process.execute_op(Operation::Push(4_u32.into()), &mut host).unwrap();
         for i in 1..13 {
             process.execute_op(Operation::Push(Felt::new(i)), &mut host).unwrap();
         }
@@ -541,15 +560,33 @@ mod tests {
         process.execute_op(Operation::Pipe, &mut host).unwrap();
 
         // check memory state contains the words from the advice stack
-        assert_eq!(2, process.chiplets.get_mem_size());
-        assert_eq!(word1_felts, process.chiplets.get_mem_value(ContextId::root(), 1).unwrap());
-        assert_eq!(word2_felts, process.chiplets.get_mem_value(ContextId::root(), 2).unwrap());
+        assert_eq!(2, process.chiplets.memory().num_accessed_batches());
+        assert_eq!(
+            word1_felts,
+            process.chiplets.memory().get_word(ContextId::root(), 4).unwrap().unwrap()
+        );
+        assert_eq!(
+            word2_felts,
+            process.chiplets.memory().get_word(ContextId::root(), 8).unwrap().unwrap()
+        );
 
         // the first 8 values should be the values from the advice stack. the next 4 values should
         // remain unchanged, and the address should be incremented by 2 (i.e., 1 -> 3).
         let stack_values = [
-            word2[3], word2[2], word2[1], word2[0], word1[3], word1[2], word1[1], word1[0], 4, 3,
-            2, 1, 3, 101,
+            word2[3],
+            word2[2],
+            word2[1],
+            word2[0],
+            word1[3],
+            word1[2],
+            word1[1],
+            word1[0],
+            4,
+            3,
+            2,
+            1,
+            4 + 8, // initial address + 2 words
+            101,   // rest of stack
         ];
         let expected_stack = build_expected_stack(&stack_values);
         assert_eq!(expected_stack, process.stack.trace_state());
diff --git a/processor/src/operations/sys_ops/sys_event_handlers.rs b/processor/src/operations/sys_ops/sys_event_handlers.rs
index 13e4a796be..7ff647945b 100644
--- a/processor/src/operations/sys_ops/sys_event_handlers.rs
+++ b/processor/src/operations/sys_ops/sys_event_handlers.rs
@@ -6,7 +6,7 @@ use vm_core::{
         merkle::{EmptySubtreeRoots, Smt, SMT_DEPTH},
     },
     sys_events::SystemEvent,
-    Felt, FieldElement, SignatureKind, Word, EMPTY_WORD, WORD_SIZE, ZERO,
+    Felt, FieldElement, SignatureKind, Word, WORD_SIZE, ZERO,
 };
 use winter_prover::math::fft;
 
@@ -91,8 +91,8 @@ pub fn insert_mem_values_into_adv_map(
 
     let mut values = Vec::with_capacity(((end_addr - start_addr) as usize) * WORD_SIZE);
     for addr in start_addr..end_addr {
-        let mem_value = process.get_mem_value(ctx, addr).unwrap_or(EMPTY_WORD);
-        values.extend_from_slice(&mem_value);
+        let mem_value = process.get_mem_value(ctx, addr).unwrap_or(ZERO);
+        values.push(mem_value);
     }
 
     let key = process.get_stack_word(0);
@@ -403,8 +403,8 @@ pub fn push_ext2_inv_result(
 /// Returns an error if:
 /// - `input_size` less than or equal to 1, or is not a power of 2.
 /// - `output_size` is 0 or is greater than the `input_size`.
-/// - `input_ptr` is greater than 2^32.
-/// - `input_ptr + input_size / 2` is greater than 2^32.
+/// - `input_ptr` is greater than 2^32, or is not aligned on a word boundary.
+/// - `input_ptr + input_size * 2` is greater than 2^32.
 pub fn push_ext2_intt_result(
     advice_provider: &mut impl AdviceProvider,
     process: ProcessState,
@@ -422,11 +422,14 @@ pub fn push_ext2_intt_result(
     if input_start_ptr >= u32::MAX as u64 {
         return Err(Ext2InttError::InputStartAddressTooBig(input_start_ptr).into());
     }
+    if input_start_ptr % WORD_SIZE as u64 != 0 {
+        return Err(Ext2InttError::InputStartNotWordAligned(input_start_ptr).into());
+    }
     if input_size > u32::MAX as usize {
         return Err(Ext2InttError::InputSizeTooBig(input_size as u64).into());
     }
 
-    let input_end_ptr = input_start_ptr + (input_size / 2) as u64;
+    let input_end_ptr = input_start_ptr + (input_size * 2) as u64;
     if input_end_ptr > u32::MAX as u64 {
         return Err(Ext2InttError::InputEndAddressTooBig(input_end_ptr).into());
     }
@@ -439,9 +442,9 @@ pub fn push_ext2_intt_result(
     }
 
     let mut poly = Vec::with_capacity(input_size);
-    for addr in (input_start_ptr as u32)..(input_end_ptr as u32) {
+    for addr in ((input_start_ptr as u32)..(input_end_ptr as u32)).step_by(4) {
         let word = process
-            .get_mem_value(process.ctx(), addr)
+            .get_mem_word(process.ctx(), addr)?
             .ok_or(Ext2InttError::UninitializedMemoryAddress(addr))?;
 
         poly.push(QuadFelt::new(word[0], word[1]));
diff --git a/processor/src/trace/tests/chiplets/memory.rs b/processor/src/trace/tests/chiplets/memory.rs
index 5415a38def..d4617145d7 100644
--- a/processor/src/trace/tests/chiplets/memory.rs
+++ b/processor/src/trace/tests/chiplets/memory.rs
@@ -1,6 +1,8 @@
 use miden_air::{
     trace::chiplets::{
-        memory::{MEMORY_READ_LABEL, MEMORY_WRITE, MEMORY_WRITE_LABEL, NUM_ELEMENTS},
+        memory::{
+            MEMORY_READ_LABEL, MEMORY_WRITE_LABEL, MEMORY_WRITE_SELECTOR, NUM_ELEMENTS_IN_BATCH,
+        },
         MEMORY_ADDR_COL_IDX, MEMORY_CLK_COL_IDX, MEMORY_CTX_COL_IDX, MEMORY_SELECTORS_COL_IDX,
         MEMORY_V_COL_RANGE,
     },
@@ -156,7 +158,7 @@ fn build_expected_memory(
     word: Word,
 ) -> Felt {
     let mut word_value = ZERO;
-    for i in 0..NUM_ELEMENTS {
+    for i in 0..NUM_ELEMENTS_IN_BATCH {
         word_value += alphas[i + 5] * word[i];
     }
 
@@ -176,7 +178,7 @@ fn build_expected_memory_from_trace(
     // get the memory access operation
     let s0 = trace.main_trace.get_column(MEMORY_SELECTORS_COL_IDX)[row];
     let s1 = trace.main_trace.get_column(MEMORY_SELECTORS_COL_IDX + 1)[row];
-    let op_label = if s0 == MEMORY_WRITE[0] {
+    let op_label = if s0 == MEMORY_WRITE_SELECTOR[0] {
         debug_assert!(s1 == ZERO);
         MEMORY_WRITE_LABEL
     } else {
@@ -189,7 +191,7 @@ fn build_expected_memory_from_trace(
     let clk = trace.main_trace.get_column(MEMORY_CLK_COL_IDX)[row];
 
     // get the memory value
-    let mut word = [ZERO; NUM_ELEMENTS];
+    let mut word = [ZERO; NUM_ELEMENTS_IN_BATCH];
     for (i, element) in word.iter_mut().enumerate() {
         *element = trace.main_trace.get_column(MEMORY_V_COL_RANGE.start + i)[row];
     }
diff --git a/stdlib/tests/mem/mod.rs b/stdlib/tests/mem/mod.rs
index 8cfe9e4226..773a0ade11 100644
--- a/stdlib/tests/mem/mod.rs
+++ b/stdlib/tests/mem/mod.rs
@@ -37,54 +37,55 @@ fn test_memcopy() {
         Process::new(program.kernel().clone(), StackInputs::default(), ExecutionOptions::default());
     process.execute(&program, &mut host).unwrap();
 
+    // TODO(plafer): this will fail due to addresses being too close to each other
     assert_eq!(
-        process.chiplets.get_mem_value(ContextId::root(), 1000),
+        process.chiplets.memory().get_word(ContextId::root(), 1000).unwrap(),
         Some([ZERO, ZERO, ZERO, ONE]),
         "Address 1000"
     );
     assert_eq!(
-        process.chiplets.get_mem_value(ContextId::root(), 1001),
+        process.chiplets.memory().get_word(ContextId::root(), 1001).unwrap(),
         Some([ZERO, ZERO, ONE, ZERO]),
         "Address 1001"
     );
     assert_eq!(
-        process.chiplets.get_mem_value(ContextId::root(), 1002),
+        process.chiplets.memory().get_word(ContextId::root(), 1002).unwrap(),
         Some([ZERO, ZERO, ONE, ONE]),
         "Address 1002"
     );
     assert_eq!(
-        process.chiplets.get_mem_value(ContextId::root(), 1003),
+        process.chiplets.memory().get_word(ContextId::root(), 1003).unwrap(),
         Some([ZERO, ONE, ZERO, ZERO]),
         "Address 1003"
     );
     assert_eq!(
-        process.chiplets.get_mem_value(ContextId::root(), 1004),
+        process.chiplets.memory().get_word(ContextId::root(), 1004).unwrap(),
         Some([ZERO, ONE, ZERO, ONE]),
         "Address 1004"
     );
 
     assert_eq!(
-        process.chiplets.get_mem_value(ContextId::root(), 2000),
+        process.chiplets.memory().get_word(ContextId::root(), 2000).unwrap(),
         Some([ZERO, ZERO, ZERO, ONE]),
         "Address 2000"
     );
     assert_eq!(
-        process.chiplets.get_mem_value(ContextId::root(), 2001),
+        process.chiplets.memory().get_word(ContextId::root(), 2001).unwrap(),
         Some([ZERO, ZERO, ONE, ZERO]),
         "Address 2001"
     );
     assert_eq!(
-        process.chiplets.get_mem_value(ContextId::root(), 2002),
+        process.chiplets.memory().get_word(ContextId::root(), 2002).unwrap(),
         Some([ZERO, ZERO, ONE, ONE]),
         "Address 2002"
     );
     assert_eq!(
-        process.chiplets.get_mem_value(ContextId::root(), 2003),
+        process.chiplets.memory().get_word(ContextId::root(), 2003).unwrap(),
         Some([ZERO, ONE, ZERO, ZERO]),
         "Address 2003"
     );
     assert_eq!(
-        process.chiplets.get_mem_value(ContextId::root(), 2004),
+        process.chiplets.memory().get_word(ContextId::root(), 2004).unwrap(),
         Some([ZERO, ONE, ZERO, ONE]),
         "Address 2004"
     );
diff --git a/test-utils/src/lib.rs b/test-utils/src/lib.rs
index 2d048db53f..ae5b934873 100644
--- a/test-utils/src/lib.rs
+++ b/test-utils/src/lib.rs
@@ -23,6 +23,7 @@ pub use processor::{
 };
 #[cfg(not(target_family = "wasm"))]
 use proptest::prelude::{Arbitrary, Strategy};
+use prover::utils::range;
 pub use prover::{prove, MemAdviceProvider, MerkleTreeVC, ProvingOptions};
 pub use test_case::test_case;
 pub use verifier::{verify, AcceptableOptions, VerifierError};
@@ -221,7 +222,7 @@ impl Test {
     pub fn expect_stack_and_memory(
         &self,
         final_stack: &[u64],
-        mut mem_start_addr: u32,
+        mem_start_addr: u32,
         expected_mem: &[u64],
     ) {
         // compile the program
@@ -243,21 +244,22 @@ impl Test {
         process.execute(&program, &mut host).unwrap();
 
         // validate the memory state
-        for data in expected_mem.chunks(WORD_SIZE) {
-            // Main memory is zeroed by default, use zeros as a fallback when unwrap to make testing
-            // easier
+        for (addr, mem_value) in
+            (range(mem_start_addr as usize, expected_mem.len())).zip(expected_mem.iter())
+        {
             let mem_state = process
                 .chiplets
-                .get_mem_value(ContextId::root(), mem_start_addr)
-                .unwrap_or(EMPTY_WORD);
-
-            let mem_state = felt_slice_to_ints(&mem_state);
+                .memory()
+                .get_value(ContextId::root(), addr as u32)
+                .unwrap_or(ZERO);
             assert_eq!(
-                data, mem_state,
+                *mem_value,
+                mem_state.as_int(),
                 "Expected memory [{}] => {:?}, found {:?}",
-                mem_start_addr, data, mem_state
+                addr,
+                mem_value,
+                mem_state
             );
-            mem_start_addr += 1;
         }
 
         // validate the stack states

From 3de84415b7deaa52acbe481923486aa22c8d7cda Mon Sep 17 00:00:00 2001
From: Philippe Laferriere <plafer@protonmail.com>
Date: Mon, 16 Dec 2024 08:47:30 -0500
Subject: [PATCH 02/19] feat(air): fix air constraints for new memory chiplet

---
 air/src/constraints/chiplets/memory/mod.rs   | 271 +++++++++++--------
 air/src/constraints/chiplets/memory/tests.rs | 106 ++++----
 air/src/constraints/chiplets/mod.rs          |  52 +++-
 air/src/trace/chiplets/mod.rs                |  16 +-
 air/src/trace/main_trace.rs                  |   4 +-
 processor/src/operations/comb_ops.rs         |  22 +-
 processor/src/trace/tests/chiplets/memory.rs |   4 +-
 7 files changed, 289 insertions(+), 186 deletions(-)

diff --git a/air/src/constraints/chiplets/memory/mod.rs b/air/src/constraints/chiplets/memory/mod.rs
index 709fe0a42f..f9318ec615 100644
--- a/air/src/constraints/chiplets/memory/mod.rs
+++ b/air/src/constraints/chiplets/memory/mod.rs
@@ -5,9 +5,10 @@ use winter_air::TransitionConstraintDegree;
 use super::{EvaluationFrame, FieldElement};
 use crate::{
     trace::chiplets::{
-        memory::NUM_ELEMENTS_IN_BATCH, MEMORY_ADDR_COL_IDX, MEMORY_CLK_COL_IDX, MEMORY_CTX_COL_IDX,
-        MEMORY_D0_COL_IDX, MEMORY_D1_COL_IDX, MEMORY_D_INV_COL_IDX, MEMORY_TRACE_OFFSET,
-        MEMORY_V_COL_RANGE,
+        MEMORY_BATCH_COL_IDX, MEMORY_CLK_COL_IDX, MEMORY_CTX_COL_IDX, MEMORY_D0_COL_IDX,
+        MEMORY_D1_COL_IDX, MEMORY_D_INV_COL_IDX, MEMORY_ELEMENT_OR_WORD_COL_IDX,
+        MEMORY_FLAG_SAME_BATCH_AND_CONTEXT, MEMORY_IDX0_COL_IDX, MEMORY_IDX1_COL_IDX,
+        MEMORY_READ_WRITE_COL_IDX, MEMORY_V_COL_RANGE,
     },
     utils::{binary_not, is_binary, EvaluationResult},
 };
@@ -19,16 +20,17 @@ mod tests;
 // ================================================================================================
 
 /// The number of constraints on the management of the memory chiplet.
-pub const NUM_CONSTRAINTS: usize = 17;
+pub const NUM_CONSTRAINTS: usize = 22;
 /// The degrees of constraints on the management of the memory chiplet. All constraint degrees are
 /// increased by 3 due to the selectors for the memory chiplet.
 pub const CONSTRAINT_DEGREES: [usize; NUM_CONSTRAINTS] = [
-    5, 5, // Enforce that the memory selectors are binary.
-    9, 8, // Enforce s1 is set to 1 when reading existing memory and 0 otherwise.
+    5, 5, 5, 5, // Enforce that rw, ew, idx0 and idx1 are binary.
     7, 6, 9, 8, // Constrain the values in the d inverse column.
-    8, // Enforce values in ctx, addr, clk transition correctly.
-    6, 6, 6, 6, // Enforce correct memory initialization when reading from new memory.
-    5, 5, 5, 5, // Enforce correct memory copy when reading from existing memory
+    8, // Enforce values in ctx, batch, clk transition correctly.
+    7, // Enforce the correct value for the f_scb flag.
+    9, 9, 9, 9, // Constrain the values in the first row of the chiplet.
+    9, 9, 9, 9, // Constrain the values in non-first rows, new batch or context is started.
+    9, 9, 9, 9, // Constrain the values in non-first rows, same batch and context.
 ];
 
 // MEMORY TRANSITION CONSTRAINTS
@@ -52,68 +54,60 @@ pub fn enforce_constraints<E: FieldElement>(
     frame: &EvaluationFrame<E>,
     result: &mut [E],
     memory_flag: E,
+    memory_flag_no_last: E,
+    memory_flag_first_row: E,
 ) {
-    // Constrain the operation selectors.
-    let mut index = enforce_selectors(frame, result, memory_flag);
+    // Constrain the binary columns.
+    let mut index = enforce_binary_columns(frame, result, memory_flag);
 
     // Constrain the values in the d inverse column.
-    index += enforce_d_inv(frame, &mut result[index..], memory_flag);
+    index += enforce_d_inv(frame, &mut result[index..], memory_flag_no_last);
 
     // Enforce values in ctx, addr, clk transition correctly.
-    index += enforce_delta(frame, &mut result[index..], memory_flag);
+    index += enforce_delta(frame, &mut result[index..], memory_flag_no_last);
+
+    // Enforce the correct value for the f_scb flag.
+    index += enforce_flag_same_context_and_batch(frame, &mut result[index..], memory_flag_no_last);
 
     // Constrain the memory values.
-    enforce_values(frame, &mut result[index..], memory_flag);
+    enforce_values(frame, &mut result[index..], memory_flag_no_last, memory_flag_first_row);
 }
 
 // TRANSITION CONSTRAINT HELPERS
 // ================================================================================================
 
-fn enforce_selectors<E: FieldElement>(
+fn enforce_binary_columns<E: FieldElement>(
     frame: &EvaluationFrame<E>,
     result: &mut [E],
     memory_flag: E,
 ) -> usize {
-    let mut index = 0;
-
-    // s0 and s1 are binary.
-    result[index] = memory_flag * is_binary(frame.selector(0));
-    index += 1;
-    result[index] = memory_flag * is_binary(frame.selector(1));
-    index += 1;
-
-    // s1 is set to 1 when existing memory is being read. this happens when ctx and addr haven't
-    // changed, and the next operation is a read (s0 is set).
-    result[index] = memory_flag
-        * frame.reaccess_flag()
-        * frame.selector_next(0)
-        * binary_not(frame.selector_next(1));
-    index += 1;
-
-    // s1 is set to 0 in all other cases. this happens when ctx changed, or ctx stayed the same but
-    // addr changed, or the operation was a write.
-    result[index] = memory_flag
-        * (frame.n0() + frame.not_n0() * frame.n1() + binary_not(frame.selector_next(0)))
-        * frame.selector_next(1);
-    index += 1;
-
-    index
+    result[0] = memory_flag * is_binary(frame.read_write());
+    result[1] = memory_flag * is_binary(frame.element_or_word());
+    result[2] = memory_flag * is_binary(frame.idx0());
+    result[3] = memory_flag * is_binary(frame.idx1());
+
+    4
 }
 
+// TODO(plafer): review these constraints
 /// A constraint evaluation function to enforce that the `d_inv` "delta inverse" column used to
 /// constrain the delta between two consecutive contexts, addresses, or clock cycles is updated
 /// correctly.
 fn enforce_d_inv<E: FieldElement>(
     frame: &EvaluationFrame<E>,
     result: &mut [E],
-    memory_flag: E,
+    memory_flag_no_last: E,
 ) -> usize {
     let constraint_count = 4;
 
-    result.agg_constraint(0, memory_flag, is_binary(frame.n0()));
-    result.agg_constraint(1, memory_flag * frame.not_n0(), frame.ctx_change());
-    result.agg_constraint(2, memory_flag * frame.not_n0(), is_binary(frame.n1()));
-    result.agg_constraint(3, memory_flag * frame.reaccess_flag(), frame.addr_change());
+    // n0 is binary
+    result[0] = memory_flag_no_last * is_binary(frame.n0());
+    // when the context changes, n0 should be set to 1.
+    result[1] = memory_flag_no_last * frame.not_n0() * frame.ctx_change();
+    // when n0 is 0, n1 is binary.
+    result[2] = memory_flag_no_last * frame.not_n0() * is_binary(frame.n1());
+    // TODO(plafer)
+    result[3] = memory_flag_no_last * frame.not_n0() * frame.not_n1() * frame.addr_change();
 
     constraint_count
 }
@@ -123,47 +117,86 @@ fn enforce_d_inv<E: FieldElement>(
 fn enforce_delta<E: FieldElement>(
     frame: &EvaluationFrame<E>,
     result: &mut [E],
-    memory_flag: E,
+    memory_flag_no_last: E,
 ) -> usize {
     let constraint_count = 1;
 
     // If the context changed, include the difference.
-    result.agg_constraint(0, memory_flag * frame.n0(), frame.ctx_change());
-    // If the context is the same, include the address difference if it changed or else include the
+    result[0] = memory_flag_no_last * frame.n0() * frame.ctx_change();
+    // If the context is the same, include the batch difference if it changed or else include the
     // clock change.
     result.agg_constraint(
         0,
-        memory_flag * frame.not_n0(),
+        memory_flag_no_last * frame.not_n0(),
         frame.n1() * frame.addr_change() + frame.not_n1() * frame.clk_change(),
     );
     // Always subtract the delta. It should offset the other changes.
-    result[0] -= memory_flag * frame.delta_next();
+    result[0] -= memory_flag_no_last * frame.delta_next();
 
     constraint_count
 }
 
+/// A constraint evaluation function to enforce that the `f_scb` flag is set to 1 when the next row
+/// is in the same context and batch, and 0 otherwise.
+fn enforce_flag_same_context_and_batch<E: FieldElement>(
+    frame: &EvaluationFrame<E>,
+    result: &mut [E],
+    memory_flag_no_last: E,
+) -> usize {
+    result[0] = memory_flag_no_last
+        * (frame.f_scb_next() - binary_not(frame.n0() + frame.not_n0() * frame.n1()));
+
+    1
+}
+
 /// A constraint evaluation function to enforce that memory is initialized to zero when it is read
 /// before being written and that when existing memory values are read they remain unchanged.
 fn enforce_values<E: FieldElement>(
     frame: &EvaluationFrame<E>,
     result: &mut [E],
-    memory_flag: E,
+    memory_flag_no_last: E,
+    memory_flag_first_row: E,
 ) -> usize {
-    let mut index = 0;
-
-    // initialize memory to zero when reading from new context and address pair.
-    for i in 0..NUM_ELEMENTS_IN_BATCH {
-        result[index] = memory_flag * frame.init_read_flag() * frame.v(i);
-        index += 1;
-    }
-
-    // copy previous values when reading memory that was previously accessed.
-    for i in 0..NUM_ELEMENTS_IN_BATCH {
-        result[index] = memory_flag * frame.copy_read_flag() * (frame.v_next(i) - frame.v(i));
-        index += 1;
-    }
-
-    index
+    // intuition: c_i is set to 1 when `v'[i]` is *not* written to, and 0 otherwise.
+    // in other words, c_i is set to 1 when `v'[i]` needs to be constrained.
+    let (c0, c1, c2, c3) = {
+        // intuition: the i'th `f` flag is set to 1 when `i == 2 * idx1 + idx0`
+        let f0 = binary_not(frame.idx1_next()) * binary_not(frame.idx0_next());
+        let f1 = binary_not(frame.idx1_next()) * frame.idx0_next();
+        let f2 = frame.idx1_next() * binary_not(frame.idx0_next());
+        let f3 = frame.idx1_next() * frame.idx0_next();
+
+        let c_i = |f_i| {
+            frame.read_write_next()
+                + binary_not(frame.read_write_next())
+                    * binary_not(frame.element_or_word_next())
+                    * binary_not(f_i)
+        };
+
+        (c_i(f0), c_i(f1), c_i(f2), c_i(f3))
+    };
+
+    // first row constraints
+    result[0] = memory_flag_first_row * c0 * frame.v_next(0);
+    result[1] = memory_flag_first_row * c1 * frame.v_next(1);
+    result[2] = memory_flag_first_row * c2 * frame.v_next(2);
+    result[3] = memory_flag_first_row * c3 * frame.v_next(3);
+
+    // non-first row, new batch or context constraints: when row' is a new batch/ctx, and v'[i] is
+    // not written to, then v'[i] must be 0.
+    result[4] = memory_flag_no_last * binary_not(frame.f_scb_next()) * c0 * frame.v_next(0);
+    result[5] = memory_flag_no_last * binary_not(frame.f_scb_next()) * c1 * frame.v_next(1);
+    result[6] = memory_flag_no_last * binary_not(frame.f_scb_next()) * c2 * frame.v_next(2);
+    result[7] = memory_flag_no_last * binary_not(frame.f_scb_next()) * c3 * frame.v_next(3);
+
+    // non-first row, same batch and context constraints: when row' is in the same batch/ctx, and
+    // v'[i] is not written to, then v'[i] must be equal to v[i].
+    result[8] = memory_flag_no_last * frame.f_scb_next() * c0 * (frame.v_next(0) - frame.v(0));
+    result[9] = memory_flag_no_last * frame.f_scb_next() * c1 * (frame.v_next(1) - frame.v(1));
+    result[10] = memory_flag_no_last * frame.f_scb_next() * c2 * (frame.v_next(2) - frame.v(2));
+    result[11] = memory_flag_no_last * frame.f_scb_next() * c3 * (frame.v_next(3) - frame.v(3));
+
+    12
 }
 
 // MEMORY FRAME EXTENSION TRAIT
@@ -174,16 +207,28 @@ fn enforce_values<E: FieldElement>(
 trait EvaluationFrameExt<E: FieldElement> {
     // --- Column accessors -----------------------------------------------------------------------
 
-    /// Gets the value of the specified selector column in the current row.
-    fn selector(&self, idx: usize) -> E;
-    /// Gets the value of the specified selector column in the next row.
-    fn selector_next(&self, idx: usize) -> E;
+    /// Gets the value of the read/write column in the current row.
+    fn read_write(&self) -> E;
+    /// Gets the value of the read/write column in the next row.
+    fn read_write_next(&self) -> E;
+    /// Gets the value of the element/word column in the current row.
+    fn element_or_word(&self) -> E;
+    /// Gets the value of the element/word column in the next row.
+    fn element_or_word_next(&self) -> E;
     /// The current context value.
     #[allow(dead_code)]
     fn ctx(&self) -> E;
     /// The current address.
     #[allow(dead_code)]
-    fn addr(&self) -> E;
+    fn batch(&self) -> E;
+    /// The 0'th bit of the index of the memory address in the current batch.
+    fn idx0(&self) -> E;
+    /// The 0'th bit of the index of the memory address within its batch, in the next row.
+    fn idx0_next(&self) -> E;
+    /// The 1st bit of the index of the memory address in the current batch.
+    fn idx1(&self) -> E;
+    /// The 1st bit of the index of the memory address within its batch, in the next row.
+    fn idx1_next(&self) -> E;
     /// The current clock cycle.
     #[allow(dead_code)]
     fn clk(&self) -> E;
@@ -203,6 +248,10 @@ trait EvaluationFrameExt<E: FieldElement> {
     /// The next value of the column tracking the inverse delta used for constraint evaluations.
     fn d_inv_next(&self) -> E;
 
+    /// The flag that indicates whether the next row is in the same batch and context as the
+    /// current row.
+    fn f_scb_next(&self) -> E;
+
     // --- Intermediate variables & helpers -------------------------------------------------------
 
     /// The change between the current value in the specified column and the next value, calculated
@@ -226,33 +275,29 @@ trait EvaluationFrameExt<E: FieldElement> {
     fn clk_change(&self) -> E;
     /// The delta between two consecutive context IDs, addresses, or clock cycles.
     fn delta_next(&self) -> E;
-
-    // --- Flags ----------------------------------------------------------------------------------
-
-    /// A flag to indicate that previously assigned memory is being accessed. In other words, the
-    /// context and address have not changed.
-    fn reaccess_flag(&self) -> E;
-
-    /// A flag to indicate that there is a read in the current row which requires the values to be
-    /// initialized to zero.
-    fn init_read_flag(&self) -> E;
-
-    /// A flag to indicate that the operation in the next row is a read which requires copying the
-    /// values from the current row to the next row.
-    fn copy_read_flag(&self) -> E;
 }
 
 impl<E: FieldElement> EvaluationFrameExt<E> for &EvaluationFrame<E> {
     // --- Column accessors -----------------------------------------------------------------------
 
     #[inline(always)]
-    fn selector(&self, idx: usize) -> E {
-        self.current()[MEMORY_TRACE_OFFSET + idx]
+    fn read_write(&self) -> E {
+        self.current()[MEMORY_READ_WRITE_COL_IDX]
+    }
+
+    #[inline(always)]
+    fn read_write_next(&self) -> E {
+        self.next()[MEMORY_READ_WRITE_COL_IDX]
+    }
+
+    #[inline(always)]
+    fn element_or_word(&self) -> E {
+        self.current()[MEMORY_ELEMENT_OR_WORD_COL_IDX]
     }
 
     #[inline(always)]
-    fn selector_next(&self, idx: usize) -> E {
-        self.next()[MEMORY_TRACE_OFFSET + idx]
+    fn element_or_word_next(&self) -> E {
+        self.next()[MEMORY_ELEMENT_OR_WORD_COL_IDX]
     }
 
     #[inline(always)]
@@ -261,8 +306,28 @@ impl<E: FieldElement> EvaluationFrameExt<E> for &EvaluationFrame<E> {
     }
 
     #[inline(always)]
-    fn addr(&self) -> E {
-        self.next()[MEMORY_ADDR_COL_IDX]
+    fn batch(&self) -> E {
+        self.next()[MEMORY_BATCH_COL_IDX]
+    }
+
+    #[inline(always)]
+    fn idx0(&self) -> E {
+        self.current()[MEMORY_IDX0_COL_IDX]
+    }
+
+    #[inline(always)]
+    fn idx0_next(&self) -> E {
+        self.next()[MEMORY_IDX0_COL_IDX]
+    }
+
+    #[inline(always)]
+    fn idx1(&self) -> E {
+        self.current()[MEMORY_IDX1_COL_IDX]
+    }
+
+    #[inline(always)]
+    fn idx1_next(&self) -> E {
+        self.next()[MEMORY_IDX1_COL_IDX]
     }
 
     #[inline(always)]
@@ -300,6 +365,11 @@ impl<E: FieldElement> EvaluationFrameExt<E> for &EvaluationFrame<E> {
         self.next()[MEMORY_D_INV_COL_IDX]
     }
 
+    #[inline(always)]
+    fn f_scb_next(&self) -> E {
+        self.next()[MEMORY_FLAG_SAME_BATCH_AND_CONTEXT]
+    }
+
     // --- Intermediate variables & helpers -------------------------------------------------------
 
     #[inline(always)]
@@ -319,7 +389,7 @@ impl<E: FieldElement> EvaluationFrameExt<E> for &EvaluationFrame<E> {
 
     #[inline(always)]
     fn n1(&self) -> E {
-        self.change(MEMORY_ADDR_COL_IDX) * self.d_inv_next()
+        self.change(MEMORY_BATCH_COL_IDX) * self.d_inv_next()
     }
 
     #[inline(always)]
@@ -334,7 +404,7 @@ impl<E: FieldElement> EvaluationFrameExt<E> for &EvaluationFrame<E> {
 
     #[inline(always)]
     fn addr_change(&self) -> E {
-        self.change(MEMORY_ADDR_COL_IDX)
+        self.change(MEMORY_BATCH_COL_IDX)
     }
 
     #[inline(always)]
@@ -346,21 +416,4 @@ impl<E: FieldElement> EvaluationFrameExt<E> for &EvaluationFrame<E> {
     fn delta_next(&self) -> E {
         E::from(2_u32.pow(16)) * self.d1_next() + self.d0_next()
     }
-
-    // --- Flags ----------------------------------------------------------------------------------
-
-    #[inline(always)]
-    fn reaccess_flag(&self) -> E {
-        self.not_n0() * self.not_n1()
-    }
-
-    #[inline(always)]
-    fn init_read_flag(&self) -> E {
-        self.selector(0) * binary_not(self.selector(1))
-    }
-
-    #[inline(always)]
-    fn copy_read_flag(&self) -> E {
-        self.selector_next(1)
-    }
 }
diff --git a/air/src/constraints/chiplets/memory/tests.rs b/air/src/constraints/chiplets/memory/tests.rs
index 077a2ef89b..20622cb2d0 100644
--- a/air/src/constraints/chiplets/memory/tests.rs
+++ b/air/src/constraints/chiplets/memory/tests.rs
@@ -1,22 +1,23 @@
 use alloc::vec::Vec;
 
 use rand_utils::rand_value;
+use vm_core::{Felt, FieldElement};
 
 use super::{
-    EvaluationFrame, MEMORY_ADDR_COL_IDX, MEMORY_CLK_COL_IDX, MEMORY_CTX_COL_IDX,
+    EvaluationFrame, MEMORY_BATCH_COL_IDX, MEMORY_CLK_COL_IDX, MEMORY_CTX_COL_IDX,
     MEMORY_D0_COL_IDX, MEMORY_D1_COL_IDX, MEMORY_D_INV_COL_IDX, MEMORY_V_COL_RANGE,
-    NUM_ELEMENTS_IN_BATCH,
 };
 use crate::{
     chiplets::memory,
     trace::{
         chiplets::{
-            memory::{Selectors, MEMORY_COPY_READ, MEMORY_INIT_READ, MEMORY_WRITE_SELECTOR},
-            MEMORY_TRACE_OFFSET,
+            memory::{MEMORY_ACCESS_WORD, MEMORY_READ, MEMORY_WRITE, NUM_ELEMENTS_IN_BATCH},
+            MEMORY_ELEMENT_OR_WORD_COL_IDX, MEMORY_FLAG_SAME_BATCH_AND_CONTEXT,
+            MEMORY_IDX0_COL_IDX, MEMORY_IDX1_COL_IDX, MEMORY_READ_WRITE_COL_IDX,
         },
         TRACE_WIDTH,
     },
-    Felt, FieldElement, ONE, ZERO,
+    ONE, ZERO,
 };
 
 // UNIT TESTS
@@ -24,37 +25,25 @@ use crate::{
 
 #[test]
 fn test_memory_write() {
-    let expected = [ZERO; memory::NUM_CONSTRAINTS];
+    let expected_constraint_evals = [ZERO; memory::NUM_CONSTRAINTS];
 
-    let old_values = vec![0, 0, 0, 0];
-    let new_values = vec![1, 0, 0, 0];
+    let old_word = vec![0, 0, 0, 0];
+    let new_word = vec![1, 0, 0, 0];
 
     // Write to a new context.
-    let result = get_constraint_evaluation(
-        MEMORY_WRITE_SELECTOR,
-        MemoryTestDeltaType::Context,
-        &old_values,
-        &new_values,
-    );
-    assert_eq!(expected, result);
+    let result =
+        get_constraint_evaluation(MEMORY_WRITE, MemoryTestDeltaType::Context, &old_word, &new_word);
+    assert_eq!(expected_constraint_evals, result);
 
     // Write to a new address in the same context.
-    let result = get_constraint_evaluation(
-        MEMORY_WRITE_SELECTOR,
-        MemoryTestDeltaType::Address,
-        &old_values,
-        &new_values,
-    );
-    assert_eq!(expected, result);
+    let result =
+        get_constraint_evaluation(MEMORY_WRITE, MemoryTestDeltaType::Batch, &old_word, &new_word);
+    assert_eq!(expected_constraint_evals, result);
 
     // Write to the same context and address at a new clock cycle.
-    let result = get_constraint_evaluation(
-        MEMORY_WRITE_SELECTOR,
-        MemoryTestDeltaType::Clock,
-        &old_values,
-        &new_values,
-    );
-    assert_eq!(expected, result);
+    let result =
+        get_constraint_evaluation(MEMORY_WRITE, MemoryTestDeltaType::Clock, &old_word, &new_word);
+    assert_eq!(expected_constraint_evals, result);
 }
 
 #[test]
@@ -66,7 +55,7 @@ fn test_memory_read() {
 
     // Read from a new context.
     let result = get_constraint_evaluation(
-        MEMORY_INIT_READ,
+        MEMORY_READ,
         MemoryTestDeltaType::Context,
         &old_values,
         &init_values,
@@ -75,8 +64,8 @@ fn test_memory_read() {
 
     // Read from a new address in the same context.
     let result = get_constraint_evaluation(
-        MEMORY_INIT_READ,
-        MemoryTestDeltaType::Address,
+        MEMORY_READ,
+        MemoryTestDeltaType::Batch,
         &old_values,
         &init_values,
     );
@@ -84,7 +73,7 @@ fn test_memory_read() {
 
     // Read from the same context and address at a new clock cycle.
     let result = get_constraint_evaluation(
-        MEMORY_COPY_READ,
+        MEMORY_READ,
         MemoryTestDeltaType::Clock,
         &old_values,
         &old_values,
@@ -101,7 +90,7 @@ fn test_memory_read() {
 /// - Clock: when the delta occurs in the clock column, context and address must stay fixed.
 enum MemoryTestDeltaType {
     Context,
-    Address,
+    Batch,
     Clock,
 }
 
@@ -114,17 +103,17 @@ enum MemoryTestDeltaType {
 /// - To test a valid read, the `delta_type` must be Clock and the `old_values` and `new_values`
 ///   must be equal.
 fn get_constraint_evaluation(
-    selectors: Selectors,
+    read_write: Felt,
     delta_type: MemoryTestDeltaType,
     old_values: &[u32],
     new_values: &[u32],
 ) -> [Felt; memory::NUM_CONSTRAINTS] {
     let delta_row = get_test_delta_row(&delta_type);
-    let frame = get_test_frame(selectors, &delta_type, &delta_row, old_values, new_values);
+    let frame = get_test_frame(read_write, &delta_type, &delta_row, old_values, new_values);
 
     let mut result = [ZERO; memory::NUM_CONSTRAINTS];
 
-    memory::enforce_constraints(&frame, &mut result, ONE);
+    memory::enforce_constraints(&frame, &mut result, ONE, ONE, ZERO);
 
     result
 }
@@ -142,7 +131,7 @@ fn get_constraint_evaluation(
 ///   row.
 /// - `new_values`: specifies the new values, which are placed in the value columns of the next row.
 fn get_test_frame(
-    selectors: Selectors,
+    read_write: Felt,
     delta_type: &MemoryTestDeltaType,
     delta_row: &[u64],
     old_values: &[u32],
@@ -152,12 +141,12 @@ fn get_test_frame(
     let mut next = vec![ZERO; TRACE_WIDTH];
 
     // Set the operation in the next row.
-    next[MEMORY_TRACE_OFFSET] = selectors[0];
-    next[MEMORY_TRACE_OFFSET + 1] = selectors[1];
+    next[MEMORY_READ_WRITE_COL_IDX] = read_write;
+    next[MEMORY_ELEMENT_OR_WORD_COL_IDX] = MEMORY_ACCESS_WORD;
 
     // Set the context, addr, and clock columns in the next row to the values in the delta row.
     next[MEMORY_CTX_COL_IDX] = Felt::new(delta_row[0]);
-    next[MEMORY_ADDR_COL_IDX] = Felt::new(delta_row[1]);
+    next[MEMORY_BATCH_COL_IDX] = Felt::new(delta_row[1]);
     next[MEMORY_CLK_COL_IDX] = Felt::new(delta_row[2]);
 
     // Set the old and new values.
@@ -178,27 +167,40 @@ fn get_test_frame(
     let delta: u64 = match delta_type {
         MemoryTestDeltaType::Clock => delta_row[MemoryTestDeltaType::Clock as usize] - 1,
         MemoryTestDeltaType::Context => delta_row[MemoryTestDeltaType::Context as usize],
-        MemoryTestDeltaType::Address => delta_row[MemoryTestDeltaType::Address as usize],
+        MemoryTestDeltaType::Batch => delta_row[MemoryTestDeltaType::Batch as usize],
     };
     next[MEMORY_D0_COL_IDX] = Felt::new(delta as u16 as u64);
     next[MEMORY_D1_COL_IDX] = Felt::new(delta >> 16);
     next[MEMORY_D_INV_COL_IDX] = (Felt::new(delta)).inv();
 
+    // we always access a full word (read or write), so the idx0 and idx1 columns should be zero
+    next[MEMORY_IDX0_COL_IDX] = ZERO;
+    next[MEMORY_IDX1_COL_IDX] = ZERO;
+
+    // If the context or batch columns are changed, the same batch and context flag should be zero.
+    if delta_row[MemoryTestDeltaType::Batch as usize] > 0
+        || delta_row[MemoryTestDeltaType::Context as usize] > 0
+    {
+        next[MEMORY_FLAG_SAME_BATCH_AND_CONTEXT] = ZERO;
+    } else {
+        next[MEMORY_FLAG_SAME_BATCH_AND_CONTEXT] = ONE;
+    }
+
     EvaluationFrame::<Felt>::from_rows(current, next)
 }
 
-/// Generates a row of valid test values for the context, address, and clock columns according to
+/// Generates a row of valid test values for the context, batch, and clock columns according to
 /// the specified delta type, which determines the column over which the delta and delta inverse
 /// values of the trace would be calculated.
 ///
 /// - When the delta type is Context, the address and clock columns can be anything.
-/// - When the delta type is Address, the context must remain unchanged but the clock can change.
+/// - When the delta type is Batch, the context must remain unchanged but the clock can change.
 /// - When the delta type is Clock, both the context and address columns must remain unchanged.
 fn get_test_delta_row(delta_type: &MemoryTestDeltaType) -> Vec<u64> {
-    let delta_value = rand_value::<u32>() as u64;
+    let delta_value = word_aligned_rand_value() as u64;
     let mut row = vec![0; 3];
     let ctx_idx = MemoryTestDeltaType::Context as usize;
-    let addr_idx = MemoryTestDeltaType::Address as usize;
+    let batch_idx = MemoryTestDeltaType::Batch as usize;
     let clk_idx = MemoryTestDeltaType::Clock as usize;
 
     // Set the context, addr, and clock columns according to the specified delta type.
@@ -208,13 +210,13 @@ fn get_test_delta_row(delta_type: &MemoryTestDeltaType) -> Vec<u64> {
             row[ctx_idx] = delta_value;
 
             // Set addr and clock in the row column to random values.
-            row[addr_idx] = rand_value::<u32>() as u64;
+            row[batch_idx] = word_aligned_rand_value() as u64;
             row[clk_idx] = rand_value::<u32>() as u64;
         },
-        MemoryTestDeltaType::Address => {
+        MemoryTestDeltaType::Batch => {
             // Keep the context value the same in current and row rows (leave it as ZERO).
             // Set the row value for the address.
-            row[addr_idx] = delta_value;
+            row[batch_idx] = delta_value;
 
             // Set clock in the row column to a random value.
             row[clk_idx] = rand_value::<u32>() as u64;
@@ -228,3 +230,9 @@ fn get_test_delta_row(delta_type: &MemoryTestDeltaType) -> Vec<u64> {
 
     row
 }
+
+/// Returns a random value that is aligned to a word boundary (i.e. divisible by 4).
+fn word_aligned_rand_value() -> u32 {
+    let value = rand_value::<u32>();
+    value - (value % 4)
+}
diff --git a/air/src/constraints/chiplets/mod.rs b/air/src/constraints/chiplets/mod.rs
index ea950755fc..a0ebff2faa 100644
--- a/air/src/constraints/chiplets/mod.rs
+++ b/air/src/constraints/chiplets/mod.rs
@@ -86,8 +86,15 @@ pub fn enforce_constraints<E: FieldElement<BaseField = Felt>>(
     );
     constraint_offset += bitwise::get_transition_constraint_count();
 
+    // TODO(plafer): refactor
     // memory transition constraints
-    memory::enforce_constraints(frame, &mut result[constraint_offset..], frame.memory_flag(false));
+    memory::enforce_constraints(
+        frame,
+        &mut result[constraint_offset..],
+        frame.memory_flag(),
+        frame.memory_flag_no_last(),
+        frame.memory_flag_first_row(),
+    );
 }
 
 // TRANSITION CONSTRAINT HELPERS
@@ -144,11 +151,19 @@ trait EvaluationFrameExt<E: FieldElement> {
     fn bitwise_flag(&self) -> E;
 
     /// Flag to indicate whether the frame is in the memory portion of the Chiplets trace.
-    /// When `include_last_row` is true, the memory flag is true for every row where the memory
-    /// selectors are set. When false, the last row is excluded. When this flag is used for
-    /// transition constraints with `include_last_row = false`, they will not be applied to the
-    /// final row of the memory trace.
-    fn memory_flag(&self, include_last_row: bool) -> E;
+    fn memory_flag(&self) -> E;
+
+    /// Flag to indicate whether the frame is in the memory portion of the Chiplets trace, except
+    /// for the last memory chiplet row.
+    fn memory_flag_no_last(&self) -> E;
+
+    /// Flag to indicate whether the next row in the frame is in the memory portion of the Chiplets
+    /// trace.
+    fn memory_flag_next(&self) -> E;
+
+    /// Flag to indicate whether the frame is in the first row of the memory portion of the Chiplets
+    /// trace.
+    fn memory_flag_first_row(&self) -> E;
 }
 
 impl<E: FieldElement> EvaluationFrameExt<E> for &EvaluationFrame<E> {
@@ -175,12 +190,23 @@ impl<E: FieldElement> EvaluationFrameExt<E> for &EvaluationFrame<E> {
     }
 
     #[inline(always)]
-    fn memory_flag(&self, include_last_row: bool) -> E {
-        if include_last_row {
-            self.s(0) * self.s(1) * binary_not(self.s(2))
-        } else {
-            self.s(0) * self.s(1) * binary_not(self.s_next(2))
-        }
+    fn memory_flag(&self) -> E {
+        self.s(0) * self.s(1) * binary_not(self.s(2))
+    }
+
+    #[inline(always)]
+    fn memory_flag_no_last(&self) -> E {
+        self.s(0) * self.s(1) * binary_not(self.s_next(2))
+    }
+
+    #[inline(always)]
+    fn memory_flag_next(&self) -> E {
+        self.s_next(0) * self.s_next(1) * binary_not(self.s_next(2))
+    }
+
+    #[inline(always)]
+    fn memory_flag_first_row(&self) -> E {
+        self.hasher_flag() * self.memory_flag_next()
     }
 }
 
@@ -196,6 +222,6 @@ pub trait ChipletsFrameExt<E: FieldElement> {
 impl<E: FieldElement> ChipletsFrameExt<E> for &EvaluationFrame<E> {
     #[inline(always)]
     fn chiplets_memory_flag(&self) -> E {
-        self.memory_flag(true)
+        self.memory_flag()
     }
 }
diff --git a/air/src/trace/chiplets/mod.rs b/air/src/trace/chiplets/mod.rs
index 91a82b19b1..b62a7e7d2d 100644
--- a/air/src/trace/chiplets/mod.rs
+++ b/air/src/trace/chiplets/mod.rs
@@ -86,13 +86,23 @@ pub const BITWISE_OUTPUT_COL_IDX: usize = BITWISE_TRACE_OFFSET + bitwise::OUTPUT
 
 // --- GLOBALLY-INDEXED CHIPLET COLUMN ACCESSORS: MEMORY ------------------------------------------
 
+// TODO(plafer): remove unused constants at the end
 /// The index within the main trace of the column containing the first memory selector, which
 /// indicates the operation (read or write).
 pub const MEMORY_SELECTORS_COL_IDX: usize = MEMORY_TRACE_OFFSET;
+/// The index within the main trace of the column containing the memory read/write column.
+pub const MEMORY_READ_WRITE_COL_IDX: usize = MEMORY_TRACE_OFFSET + memory::READ_WRITE_COL_IDX;
+/// The index within the main trace of the column containing the memory element/word column.
+pub const MEMORY_ELEMENT_OR_WORD_COL_IDX: usize =
+    MEMORY_TRACE_OFFSET + memory::ELEMENT_OR_WORD_COL_IDX;
 /// The index within the main trace of the column containing the memory context.
 pub const MEMORY_CTX_COL_IDX: usize = MEMORY_TRACE_OFFSET + memory::CTX_COL_IDX;
 /// The index within the main trace of the column containing the memory address.
-pub const MEMORY_ADDR_COL_IDX: usize = MEMORY_TRACE_OFFSET + memory::BATCH_COL_IDX;
+pub const MEMORY_BATCH_COL_IDX: usize = MEMORY_TRACE_OFFSET + memory::BATCH_COL_IDX;
+/// The index within the main trace of the column holding bit 0 of the index within the batch.
+pub const MEMORY_IDX0_COL_IDX: usize = MEMORY_TRACE_OFFSET + memory::IDX0_COL_IDX;
+/// The index within the main trace of the column holding bit 1 of the index within the batch.
+pub const MEMORY_IDX1_COL_IDX: usize = MEMORY_TRACE_OFFSET + memory::IDX1_COL_IDX;
 /// The index within the main trace of the column containing the clock cycle of the memory
 /// access.
 pub const MEMORY_CLK_COL_IDX: usize = MEMORY_TRACE_OFFSET + memory::CLK_COL_IDX;
@@ -111,3 +121,7 @@ pub const MEMORY_D1_COL_IDX: usize = MEMORY_TRACE_OFFSET + memory::D1_COL_IDX;
 /// memory context IDs, addresses, or clock cycles, used to enforce that changes are correctly
 /// constrained.
 pub const MEMORY_D_INV_COL_IDX: usize = MEMORY_TRACE_OFFSET + memory::D_INV_COL_IDX;
+/// Column to hold the flag indicating whether the current memory operation is in the same batch and
+/// same context as the previous operation.
+pub const MEMORY_FLAG_SAME_BATCH_AND_CONTEXT: usize =
+    MEMORY_TRACE_OFFSET + memory::FLAG_SAME_BATCH_AND_CONTEXT;
diff --git a/air/src/trace/main_trace.rs b/air/src/trace/main_trace.rs
index eebc8e2779..e6674cb482 100644
--- a/air/src/trace/main_trace.rs
+++ b/air/src/trace/main_trace.rs
@@ -9,7 +9,7 @@ use super::{
     chiplets::{
         hasher::{DIGEST_LEN, HASH_CYCLE_LEN, STATE_WIDTH},
         BITWISE_A_COL_IDX, BITWISE_B_COL_IDX, BITWISE_OUTPUT_COL_IDX, HASHER_NODE_INDEX_COL_IDX,
-        HASHER_STATE_COL_RANGE, MEMORY_ADDR_COL_IDX, MEMORY_CLK_COL_IDX, MEMORY_CTX_COL_IDX,
+        HASHER_STATE_COL_RANGE, MEMORY_BATCH_COL_IDX, MEMORY_CLK_COL_IDX, MEMORY_CTX_COL_IDX,
         MEMORY_V_COL_RANGE,
     },
     decoder::{
@@ -372,7 +372,7 @@ impl MainTrace {
 
     /// Returns the i-th row of the chiplet column containing memory address.
     pub fn chiplet_memory_addr(&self, i: RowIndex) -> Felt {
-        self.columns.get_column(MEMORY_ADDR_COL_IDX)[i]
+        self.columns.get_column(MEMORY_BATCH_COL_IDX)[i]
     }
 
     /// Returns the i-th row of the chiplet column containing clock cycle.
diff --git a/processor/src/operations/comb_ops.rs b/processor/src/operations/comb_ops.rs
index 73bece2e4b..3a93a6809f 100644
--- a/processor/src/operations/comb_ops.rs
+++ b/processor/src/operations/comb_ops.rs
@@ -1,4 +1,4 @@
-use vm_core::{Felt, Operation, ONE, ZERO};
+use vm_core::{Felt, Operation, ZERO};
 
 use crate::{ExecutionError, Process, QuadFelt};
 
@@ -35,7 +35,7 @@ impl Process {
     /// Output:
     ///
     /// +------+------+------+------+------+------+------+------+------+------+------+------+------+--------+--------+---+
-    /// |  T0  |  T7  |  T6  |  T5  |  T4  |  T3  |  T2  |  T1  |  p1' |  p0' |  r1' |  r0' |x_addr|z_addr+1|a_addr+1| - |
+    /// |  T0  |  T7  |  T6  |  T5  |  T4  |  T3  |  T2  |  T1  |  p1' |  p0' |  r1' |  r0' |x_addr|z_addr+4|a_addr+4| - |
     /// +------+------+------+------+------+------+------+------+------+------+------+------+------+--------+--------+---+
     ///
     ///
@@ -91,9 +91,10 @@ impl Process {
         self.stack.set(11, r_new.to_base_elements()[0]);
 
         // --- update the memory pointers ---------------------------------------------------------
+        const FOUR: Felt = Felt::new(4);
         self.stack.set(12, self.stack.get(12));
-        self.stack.set(13, self.stack.get(13) + ONE);
-        self.stack.set(14, self.stack.get(14) + ONE);
+        self.stack.set(13, self.stack.get(13) + FOUR);
+        self.stack.set(14, self.stack.get(14) + FOUR);
 
         // --- copy the rest of the stack ---------------------------------------------------------
         self.stack.copy_state(15);
@@ -172,7 +173,7 @@ mod tests {
     use alloc::{borrow::ToOwned, vec::Vec};
 
     use test_utils::{build_test, rand::rand_array, TRUNCATE_STACK_PROC};
-    use vm_core::{Felt, FieldElement, Operation, StackInputs, ONE, ZERO};
+    use vm_core::{Felt, FieldElement, Operation, StackInputs, ZERO};
 
     use crate::{ContextId, DefaultHost, Process, QuadFelt};
 
@@ -271,9 +272,10 @@ mod tests {
         assert_eq!(r_new.to_base_elements()[0], stack_state[11]);
 
         // --- check that memory pointers were updated --------------------------------------------
+        const FOUR: Felt = Felt::new(4);
         assert_eq!(inputs[12], stack_state[12]);
-        assert_eq!(inputs[13] + ONE, stack_state[13]);
-        assert_eq!(inputs[14] + ONE, stack_state[14]);
+        assert_eq!(inputs[13] + FOUR, stack_state[13]);
+        assert_eq!(inputs[14] + FOUR, stack_state[14]);
 
         // --- check that the helper registers were updated correctly -----------------------------
         let helper_reg_expected = [tz0, tz1, tgz0, tgz1, a0, a1];
@@ -309,8 +311,8 @@ mod tests {
                 # 5) Prepare stack
 
                 ## a) Push pointers
-                push.10     # a_ptr
-                push.2      # z_ptr
+                push.40     # a_ptr
+                push.8      # z_ptr
                 push.0      # x_ptr
 
                 ## b) Push accumulators
@@ -363,7 +365,7 @@ mod tests {
         // create the expected operand stack
         let mut expected = Vec::new();
         // updated pointers
-        expected.extend_from_slice(&[ZERO, Felt::from(18_u8), Felt::from(10_u8), Felt::from(2_u8)]);
+        expected.extend_from_slice(&[ZERO, Felt::from(72_u8), Felt::from(40_u8), Felt::from(8_u8)]);
         // updated accumulators
         expected.extend_from_slice(&[
             r.to_base_elements()[0],
diff --git a/processor/src/trace/tests/chiplets/memory.rs b/processor/src/trace/tests/chiplets/memory.rs
index d4617145d7..5c4f5221e8 100644
--- a/processor/src/trace/tests/chiplets/memory.rs
+++ b/processor/src/trace/tests/chiplets/memory.rs
@@ -3,7 +3,7 @@ use miden_air::{
         memory::{
             MEMORY_READ_LABEL, MEMORY_WRITE_LABEL, MEMORY_WRITE_SELECTOR, NUM_ELEMENTS_IN_BATCH,
         },
-        MEMORY_ADDR_COL_IDX, MEMORY_CLK_COL_IDX, MEMORY_CTX_COL_IDX, MEMORY_SELECTORS_COL_IDX,
+        MEMORY_BATCH_COL_IDX, MEMORY_CLK_COL_IDX, MEMORY_CTX_COL_IDX, MEMORY_SELECTORS_COL_IDX,
         MEMORY_V_COL_RANGE,
     },
     RowIndex,
@@ -187,7 +187,7 @@ fn build_expected_memory_from_trace(
 
     // get the memory access data
     let ctx = trace.main_trace.get_column(MEMORY_CTX_COL_IDX)[row];
-    let addr = trace.main_trace.get_column(MEMORY_ADDR_COL_IDX)[row];
+    let addr = trace.main_trace.get_column(MEMORY_BATCH_COL_IDX)[row];
     let clk = trace.main_trace.get_column(MEMORY_CLK_COL_IDX)[row];
 
     // get the memory value

From fdda2f8f6f28145a100f3f26a7868c81c6fb662a Mon Sep 17 00:00:00 2001
From: Philippe Laferriere <plafer@protonmail.com>
Date: Tue, 17 Dec 2024 09:34:22 -0500
Subject: [PATCH 03/19] feat(bus): fix bus for element-addressable memory

---
 air/src/constraints/chiplets/memory/tests.rs |   6 +-
 air/src/trace/chiplets/memory.rs             |  49 ++--
 air/src/trace/main_trace.rs                  |  14 +-
 processor/src/chiplets/aux_trace/mod.rs      | 241 ++++++++++++-------
 processor/src/trace/tests/chiplets/memory.rs | 164 +++++++++----
 stdlib/asm/collections/mmr.masm              |  31 ++-
 stdlib/asm/crypto/hashes/rpo.masm            |  15 +-
 stdlib/tests/crypto/rpo.rs                   |  36 +--
 8 files changed, 354 insertions(+), 202 deletions(-)

diff --git a/air/src/constraints/chiplets/memory/tests.rs b/air/src/constraints/chiplets/memory/tests.rs
index 20622cb2d0..159def8ba5 100644
--- a/air/src/constraints/chiplets/memory/tests.rs
+++ b/air/src/constraints/chiplets/memory/tests.rs
@@ -1,7 +1,7 @@
 use alloc::vec::Vec;
 
 use rand_utils::rand_value;
-use vm_core::{Felt, FieldElement};
+use vm_core::{Felt, FieldElement, WORD_SIZE};
 
 use super::{
     EvaluationFrame, MEMORY_BATCH_COL_IDX, MEMORY_CLK_COL_IDX, MEMORY_CTX_COL_IDX,
@@ -11,7 +11,7 @@ use crate::{
     chiplets::memory,
     trace::{
         chiplets::{
-            memory::{MEMORY_ACCESS_WORD, MEMORY_READ, MEMORY_WRITE, NUM_ELEMENTS_IN_BATCH},
+            memory::{MEMORY_ACCESS_WORD, MEMORY_READ, MEMORY_WRITE},
             MEMORY_ELEMENT_OR_WORD_COL_IDX, MEMORY_FLAG_SAME_BATCH_AND_CONTEXT,
             MEMORY_IDX0_COL_IDX, MEMORY_IDX1_COL_IDX, MEMORY_READ_WRITE_COL_IDX,
         },
@@ -150,7 +150,7 @@ fn get_test_frame(
     next[MEMORY_CLK_COL_IDX] = Felt::new(delta_row[2]);
 
     // Set the old and new values.
-    for idx in 0..NUM_ELEMENTS_IN_BATCH {
+    for idx in 0..WORD_SIZE {
         let old_value = Felt::new(old_values[idx] as u64);
         // Add a write for the old values to the current row.
         current[MEMORY_V_COL_RANGE.start + idx] = old_value;
diff --git a/air/src/trace/chiplets/memory.rs b/air/src/trace/chiplets/memory.rs
index 7f62ea5be3..e17689331d 100644
--- a/air/src/trace/chiplets/memory.rs
+++ b/air/src/trace/chiplets/memory.rs
@@ -1,3 +1,5 @@
+use vm_core::WORD_SIZE;
+
 use super::{create_range, Felt, Range, ONE, ZERO};
 
 // CONSTANTS
@@ -6,25 +8,6 @@ use super::{create_range, Felt, Range, ONE, ZERO};
 /// Number of columns needed to record an execution trace of the memory chiplet.
 pub const TRACE_WIDTH: usize = 15;
 
-// TODO(plafer): get rid of all "selector" constants
-/// Number of selector columns in the trace.
-pub const NUM_SELECTORS: usize = 2;
-
-/// Type for Memory trace selectors.
-///
-/// These selectors are used to define which operation and memory state update (init & read / copy &
-/// read / write) is to be applied at a specific row of the memory execution trace.
-pub type Selectors = [Felt; NUM_SELECTORS];
-
-/// Specifies an operation that initializes new memory and then reads it.
-pub const MEMORY_INIT_READ: Selectors = [ONE, ZERO];
-
-/// Specifies an operation that copies existing memory and then reads it.
-pub const MEMORY_COPY_READ: Selectors = [ONE, ONE];
-
-/// Specifies a memory write operation.
-pub const MEMORY_WRITE_SELECTOR: Selectors = [ZERO, ZERO];
-
 // --- OPERATION SELECTORS ------------------------------------------------------------------------
 
 /// Specifies the value of the `READ_WRITE` column when the operation is a write.
@@ -36,20 +19,26 @@ pub const MEMORY_ACCESS_ELEMENT: Felt = ZERO;
 /// Specifies the value of the `ELEMENT_OR_WORD` column when the operation is over a word.
 pub const MEMORY_ACCESS_WORD: Felt = ONE;
 
-// TODO(plafer): figure out the new labels
+// --- BUS LABELS ------------------------------------------------------------------------
 
-/// Unique label computed as 1 plus the full chiplet selector with the bits reversed.
-/// mem_read selector=[1, 1, 0, 1], rev(selector)=[1, 0, 1, 1], +1=[1, 1, 0, 0]
-pub const MEMORY_READ_LABEL: u8 = 0b1100;
+// All bus labels encode the chiplet selector (1, 1, 0), as well as the read/write and element/word
+// columns. The purpose of the label is to force the chiplet to assign the correct values to the
+// read/write and element/word columns. We also include the chiplet selector as a "namespace" for
+// memory chiplet labels (to really ensure they don't collide with labels from other chiplets).
 
-/// Unique label computed as 1 plus the full chiplet selector with the bits reversed.
-/// mem_write selector=[1, 1, 0, 0] rev(selector)=[0, 0, 1, 1] +1=[0, 1, 0, 0]
-pub const MEMORY_WRITE_LABEL: u8 = 0b0100;
+/// Unique label when r/w=0 and e/w=0.
+pub const MEMORY_WRITE_ELEMENT_LABEL: u8 = 0b11000;
 
-// --- COLUMN ACCESSOR INDICES WITHIN THE CHIPLET -------------------------------------------------
+/// Unique label when r/w=0 and e/w=1.
+pub const MEMORY_WRITE_WORD_LABEL: u8 = 0b11001;
+
+/// Unique label when r/w=1 and e/w=0.
+pub const MEMORY_READ_ELEMENT_LABEL: u8 = 0b11010;
 
-/// The number of elements accessible in one read or write memory access.
-pub const NUM_ELEMENTS_IN_BATCH: usize = 4;
+/// Unique label when r/w=1 and e/w=1.
+pub const MEMORY_READ_WORD_LABEL: u8 = 0b11011;
+
+// --- COLUMN ACCESSOR INDICES WITHIN THE CHIPLET -------------------------------------------------
 
 /// Column to hold the whether the operation is a read or write.
 pub const READ_WRITE_COL_IDX: usize = 0;
@@ -67,7 +56,7 @@ pub const IDX1_COL_IDX: usize = IDX0_COL_IDX + 1;
 pub const CLK_COL_IDX: usize = IDX1_COL_IDX + 1;
 /// Columns to hold the values stored at a given memory context, address, and clock cycle after
 /// the memory operation. When reading from a new address, these are initialized to zero.
-pub const V_COL_RANGE: Range<usize> = create_range(CLK_COL_IDX + 1, NUM_ELEMENTS_IN_BATCH);
+pub const V_COL_RANGE: Range<usize> = create_range(CLK_COL_IDX + 1, WORD_SIZE);
 /// Column for the lower 16-bits of the delta between two consecutive context IDs, addresses, or
 /// clock cycles.
 pub const D0_COL_IDX: usize = V_COL_RANGE.end;
diff --git a/air/src/trace/main_trace.rs b/air/src/trace/main_trace.rs
index e6674cb482..5f6d7981eb 100644
--- a/air/src/trace/main_trace.rs
+++ b/air/src/trace/main_trace.rs
@@ -10,7 +10,7 @@ use super::{
         hasher::{DIGEST_LEN, HASH_CYCLE_LEN, STATE_WIDTH},
         BITWISE_A_COL_IDX, BITWISE_B_COL_IDX, BITWISE_OUTPUT_COL_IDX, HASHER_NODE_INDEX_COL_IDX,
         HASHER_STATE_COL_RANGE, MEMORY_BATCH_COL_IDX, MEMORY_CLK_COL_IDX, MEMORY_CTX_COL_IDX,
-        MEMORY_V_COL_RANGE,
+        MEMORY_IDX0_COL_IDX, MEMORY_IDX1_COL_IDX, MEMORY_V_COL_RANGE,
     },
     decoder::{
         GROUP_COUNT_COL_IDX, HASHER_STATE_OFFSET, IN_SPAN_COL_IDX, IS_CALL_FLAG_COL_IDX,
@@ -371,10 +371,20 @@ impl MainTrace {
     }
 
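-    /// Returns the i-th row of the chiplet column containing memory address.
+    /// Returns the i-th row of the chiplet column containing the batch part of the memory address.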
-    pub fn chiplet_memory_addr(&self, i: RowIndex) -> Felt {
+    pub fn chiplet_memory_batch(&self, i: RowIndex) -> Felt {
         self.columns.get_column(MEMORY_BATCH_COL_IDX)[i]
     }
 
+    /// Returns the i-th row of the chiplet column containing the 0th bit of the in-batch index.
+    pub fn chiplet_memory_idx0(&self, i: RowIndex) -> Felt {
+        self.columns.get_column(MEMORY_IDX0_COL_IDX)[i]
+    }
+
+    /// Returns the i-th row of the chiplet column containing the 1st bit of the in-batch index.
+    pub fn chiplet_memory_idx1(&self, i: RowIndex) -> Felt {
+        self.columns.get_column(MEMORY_IDX1_COL_IDX)[i]
+    }
+
     /// Returns the i-th row of the chiplet column containing clock cycle.
     pub fn chiplet_memory_clk(&self, i: RowIndex) -> Felt {
         self.columns.get_column(MEMORY_CLK_COL_IDX)[i]
diff --git a/processor/src/chiplets/aux_trace/mod.rs b/processor/src/chiplets/aux_trace/mod.rs
index db3e637dc5..92266961ee 100644
--- a/processor/src/chiplets/aux_trace/mod.rs
+++ b/processor/src/chiplets/aux_trace/mod.rs
@@ -10,7 +10,10 @@ use miden_air::{
                 RETURN_STATE_LABEL, STATE_WIDTH,
             },
             kernel_rom::KERNEL_PROC_LABEL,
-            memory::{MEMORY_READ_LABEL, MEMORY_WRITE_LABEL},
+            memory::{
+                MEMORY_ACCESS_ELEMENT, MEMORY_ACCESS_WORD, MEMORY_READ_ELEMENT_LABEL,
+                MEMORY_READ_WORD_LABEL, MEMORY_WRITE_ELEMENT_LABEL, MEMORY_WRITE_WORD_LABEL,
+            },
         },
         main_trace::MainTrace,
     },
@@ -29,6 +32,7 @@ use super::{super::trace::AuxColumnBuilder, Felt, FieldElement};
 // ================================================================================================
 
 const NUM_HEADER_ALPHAS: usize = 4;
+const FOUR: Felt = Felt::new(4);
 
 // CHIPLETS AUXILIARY TRACE BUILDER
 // ================================================================================================
@@ -297,10 +301,18 @@ impl<E: FieldElement<BaseField = Felt>> AuxColumnBuilder<E> for BusColumnBuilder
             OPCODE_END => build_end_block_request(main_trace, alphas, row),
             OPCODE_U32AND => build_bitwise_request(main_trace, ZERO, alphas, row),
             OPCODE_U32XOR => build_bitwise_request(main_trace, ONE, alphas, row),
-            OPCODE_MLOADW => build_mem_request_word(main_trace, MEMORY_READ_LABEL, alphas, row),
-            OPCODE_MSTOREW => build_mem_request_word(main_trace, MEMORY_WRITE_LABEL, alphas, row),
-            OPCODE_MLOAD => build_mem_request_element(main_trace, MEMORY_READ_LABEL, alphas, row),
-            OPCODE_MSTORE => build_mem_request_element(main_trace, MEMORY_WRITE_LABEL, alphas, row),
+            OPCODE_MLOADW => {
+                build_mem_mloadw_mstorew_request(main_trace, MEMORY_READ_WORD_LABEL, alphas, row)
+            },
+            OPCODE_MSTOREW => {
+                build_mem_mloadw_mstorew_request(main_trace, MEMORY_WRITE_WORD_LABEL, alphas, row)
+            },
+            OPCODE_MLOAD => {
+                build_mem_mload_mstore_request(main_trace, MEMORY_READ_ELEMENT_LABEL, alphas, row)
+            },
+            OPCODE_MSTORE => {
+                build_mem_mload_mstore_request(main_trace, MEMORY_WRITE_ELEMENT_LABEL, alphas, row)
+            },
             OPCODE_MSTREAM => build_mstream_request(main_trace, alphas, row),
             OPCODE_RCOMBBASE => build_rcomb_base_request(main_trace, alphas, row),
             OPCODE_HPERM => build_hperm_request(main_trace, alphas, row),
@@ -365,7 +377,14 @@ fn build_dyn_block_request<E: FieldElement<BaseField = Felt>>(
         let mem_addr = main_trace.stack_element(0, row);
         let mem_value = main_trace.decoder_hasher_state_first_half(row);
 
-        compute_memory_request(main_trace, MEMORY_READ_LABEL, alphas, row, mem_addr, mem_value)
+        compute_mem_request_word(
+            main_trace,
+            MEMORY_READ_WORD_LABEL,
+            alphas,
+            row,
+            mem_addr,
+            mem_value,
+        )
     };
 
     control_block_req * memory_req
@@ -474,42 +493,6 @@ fn build_bitwise_request<E: FieldElement<BaseField = Felt>>(
         + alphas[4].mul_base(z)
 }
 
-/// Builds `MLOAD` and `MSTORE` requests made to the memory chiplet.
-fn build_mem_request_element<E: FieldElement<BaseField = Felt>>(
-    main_trace: &MainTrace,
-    op_label: u8,
-    alphas: &[E],
-    row: RowIndex,
-) -> E {
-    let word = [
-        main_trace.stack_element(0, row + 1),
-        main_trace.helper_register(2, row),
-        main_trace.helper_register(1, row),
-        main_trace.helper_register(0, row),
-    ];
-    let addr = main_trace.stack_element(0, row);
-
-    compute_memory_request(main_trace, op_label, alphas, row, addr, word)
-}
-
-/// Builds `MLOADW` and `MSTOREW` requests made to the memory chiplet.
-fn build_mem_request_word<E: FieldElement<BaseField = Felt>>(
-    main_trace: &MainTrace,
-    op_label: u8,
-    alphas: &[E],
-    row: RowIndex,
-) -> E {
-    let word = [
-        main_trace.stack_element(3, row + 1),
-        main_trace.stack_element(2, row + 1),
-        main_trace.stack_element(1, row + 1),
-        main_trace.stack_element(0, row + 1),
-    ];
-    let addr = main_trace.stack_element(0, row);
-
-    compute_memory_request(main_trace, op_label, alphas, row, addr, word)
-}
-
 /// Builds `MSTREAM` requests made to the memory chiplet.
 fn build_mstream_request<E: FieldElement<BaseField = Felt>>(
     main_trace: &MainTrace,
@@ -529,10 +512,10 @@ fn build_mstream_request<E: FieldElement<BaseField = Felt>>(
         main_trace.stack_element(0, row + 1),
     ];
     let addr = main_trace.stack_element(12, row);
-    let op_label = MEMORY_READ_LABEL;
+    let op_label = MEMORY_READ_WORD_LABEL;
 
-    let factor1 = compute_memory_request(main_trace, op_label, alphas, row, addr, word1);
-    let factor2 = compute_memory_request(main_trace, op_label, alphas, row, addr + ONE, word2);
+    let factor1 = compute_mem_request_word(main_trace, op_label, alphas, row, addr, word1);
+    let factor2 = compute_mem_request_word(main_trace, op_label, alphas, row, addr + FOUR, word2);
 
     factor1 * factor2
 }
@@ -556,12 +539,12 @@ fn build_pipe_request<E: FieldElement<BaseField = Felt>>(
         main_trace.stack_element(0, row + 1),
     ];
     let addr = main_trace.stack_element(12, row);
-    let op_label = MEMORY_WRITE_LABEL;
+    let op_label = MEMORY_WRITE_WORD_LABEL;
 
-    let factor1 = compute_memory_request(main_trace, op_label, alphas, row, addr, word1);
-    let factor2 = compute_memory_request(main_trace, op_label, alphas, row, addr + ONE, word2);
+    let req1 = compute_mem_request_word(main_trace, op_label, alphas, row, addr, word1);
+    let req2 = compute_mem_request_word(main_trace, op_label, alphas, row, addr + FOUR, word2);
 
-    factor1 * factor2
+    req1 * req2
 }
 
 /// Builds `RCOMBBASE` requests made to the memory chiplet.
@@ -578,14 +561,14 @@ fn build_rcomb_base_request<E: FieldElement<BaseField = Felt>>(
     let a1 = main_trace.helper_register(5, row);
     let z_ptr = main_trace.stack_element(13, row);
     let a_ptr = main_trace.stack_element(14, row);
-    let op_label = MEMORY_READ_LABEL;
+    let op_label = MEMORY_READ_WORD_LABEL;
 
-    let factor1 =
-        compute_memory_request(main_trace, op_label, alphas, row, z_ptr, [tz0, tz1, tzg0, tzg1]);
-    let factor2 =
-        compute_memory_request(main_trace, op_label, alphas, row, a_ptr, [a0, a1, ZERO, ZERO]);
+    let req1 =
+        compute_mem_request_word(main_trace, op_label, alphas, row, z_ptr, [tz0, tz1, tzg0, tzg1]);
+    let req2 =
+        compute_mem_request_word(main_trace, op_label, alphas, row, a_ptr, [a0, a1, ZERO, ZERO]);
 
-    factor1 * factor2
+    req1 * req2
 }
 
 /// Builds `HPERM` requests made to the hash chiplet.
@@ -919,26 +902,59 @@ fn build_memory_chiplet_responses<E>(main_trace: &MainTrace, row: RowIndex, alph
 where
     E: FieldElement<BaseField = Felt>,
 {
-    let is_read = main_trace.chiplet_selector_3(row);
-    let op_label = get_op_label(ONE, ONE, ZERO, is_read);
+    let element_word = main_trace.chiplet_selector_4(row);
+    let header = {
+        let read_write = main_trace.chiplet_selector_3(row);
+        let op_label = get_memory_op_label(read_write, element_word);
+
+        let ctx = main_trace.chiplet_memory_ctx(row);
+        let clk = main_trace.chiplet_memory_clk(row);
+        let address = {
+            let batch = main_trace.chiplet_memory_batch(row);
+            let idx0 = main_trace.chiplet_memory_idx0(row);
+            let idx1 = main_trace.chiplet_memory_idx1(row);
+
+            batch + idx1.mul_small(2) + idx0
+        };
 
-    let ctx = main_trace.chiplet_memory_ctx(row);
-    let clk = main_trace.chiplet_memory_clk(row);
-    let addr = main_trace.chiplet_memory_addr(row);
-    let value0 = main_trace.chiplet_memory_value_0(row);
-    let value1 = main_trace.chiplet_memory_value_1(row);
-    let value2 = main_trace.chiplet_memory_value_2(row);
-    let value3 = main_trace.chiplet_memory_value_3(row);
+        alphas[0]
+            + alphas[1].mul_base(op_label)
+            + alphas[2].mul_base(ctx)
+            + alphas[3].mul_base(address)
+            + alphas[4].mul_base(clk)
+    };
 
-    alphas[0]
-        + alphas[1].mul_base(op_label)
-        + alphas[2].mul_base(ctx)
-        + alphas[3].mul_base(addr)
-        + alphas[4].mul_base(clk)
-        + alphas[5].mul_base(value0)
-        + alphas[6].mul_base(value1)
-        + alphas[7].mul_base(value2)
-        + alphas[8].mul_base(value3)
+    if element_word == MEMORY_ACCESS_ELEMENT {
+        let idx0 = main_trace.chiplet_memory_idx0(row);
+        let idx1 = main_trace.chiplet_memory_idx1(row);
+
+        let value = if idx1 == ZERO && idx0 == ZERO {
+            main_trace.chiplet_memory_value_0(row)
+        } else if idx1 == ZERO && idx0 == ONE {
+            main_trace.chiplet_memory_value_1(row)
+        } else if idx1 == ONE && idx0 == ZERO {
+            main_trace.chiplet_memory_value_2(row)
+        } else if idx1 == ONE && idx0 == ONE {
+            main_trace.chiplet_memory_value_3(row)
+        } else {
+            panic!("Invalid batch indices. idx0: {idx0}, idx1: {idx1}");
+        };
+
+        header + alphas[5].mul_base(value)
+    } else if element_word == MEMORY_ACCESS_WORD {
+        let value0 = main_trace.chiplet_memory_value_0(row);
+        let value1 = main_trace.chiplet_memory_value_1(row);
+        let value2 = main_trace.chiplet_memory_value_2(row);
+        let value3 = main_trace.chiplet_memory_value_3(row);
+
+        header
+            + alphas[5].mul_base(value0)
+            + alphas[6].mul_base(value1)
+            + alphas[7].mul_base(value2)
+            + alphas[8].mul_base(value3)
+    } else {
+        panic!("Invalid memory element/word column value: {element_word}");
+    }
 }
 
 /// Builds the response from the kernel chiplet at `row`.
@@ -984,16 +1000,79 @@ fn get_op_label(s0: Felt, s1: Felt, s2: Felt, s3: Felt) -> Felt {
     s3.mul_small(1 << 3) + s2.mul_small(1 << 2) + s1.mul_small(2) + s0 + ONE
 }
 
-/// Computes a memory read or write request at `row` given randomness `alphas`, memory address
-/// `addr` and value `value`.
-fn compute_memory_request<E: FieldElement<BaseField = Felt>>(
+/// Returns the unique bus label of a memory operation.
+///
+/// The label is `(0b110 << 2) + 2 * read_write + element_word`, i.e. the memory chiplet selector
+/// followed by the two flag values. Unlike `get_op_label`, it involves no bit reversal and no
+/// `+ 1`; the other chiplets should be refactored to use a similar (simpler) approach.
+fn get_memory_op_label(read_write: Felt, element_word: Felt) -> Felt {
+    const MEMORY_SELECTOR: u8 = 0b110;
+    Felt::from(MEMORY_SELECTOR << 2) + read_write.mul_small(2) + element_word
+}
+
+/// Builds `MLOADW` and `MSTOREW` requests made to the memory chiplet.
+fn build_mem_mloadw_mstorew_request<E: FieldElement<BaseField = Felt>>(
+    main_trace: &MainTrace,
+    op_label: u8,
+    alphas: &[E],
+    row: RowIndex,
+) -> E {
+    let word = [
+        main_trace.stack_element(3, row + 1),
+        main_trace.stack_element(2, row + 1),
+        main_trace.stack_element(1, row + 1),
+        main_trace.stack_element(0, row + 1),
+    ];
+    let addr = main_trace.stack_element(0, row);
+
+    compute_mem_request_word(main_trace, op_label, alphas, row, addr, word)
+}
+
+/// Builds `MLOAD` and `MSTORE` requests made to the memory chiplet.
+fn build_mem_mload_mstore_request<E: FieldElement<BaseField = Felt>>(
+    main_trace: &MainTrace,
+    op_label: u8,
+    alphas: &[E],
+    row: RowIndex,
+) -> E {
+    let element = main_trace.stack_element(0, row + 1);
+    let addr = main_trace.stack_element(0, row);
+
+    compute_mem_request_element(main_trace, op_label, alphas, row, addr, element)
+}
+
+/// Computes a memory request for a read or write of a single element.
+fn compute_mem_request_element<E: FieldElement<BaseField = Felt>>(
+    main_trace: &MainTrace,
+    op_label: u8,
+    alphas: &[E],
+    row: RowIndex,
+    addr: Felt,
+    element: Felt,
+) -> E {
+    debug_assert!(op_label == MEMORY_READ_ELEMENT_LABEL || op_label == MEMORY_WRITE_ELEMENT_LABEL);
+
+    let ctx = main_trace.ctx(row);
+    let clk = main_trace.clk(row);
+
+    alphas[0]
+        + alphas[1].mul_base(Felt::from(op_label))
+        + alphas[2].mul_base(ctx)
+        + alphas[3].mul_base(addr)
+        + alphas[4].mul_base(clk)
+        + alphas[5].mul_base(element)
+}
+
+/// Computes a memory request for a read or write of a word.
+fn compute_mem_request_word<E: FieldElement<BaseField = Felt>>(
     main_trace: &MainTrace,
     op_label: u8,
     alphas: &[E],
     row: RowIndex,
     addr: Felt,
-    value: Word,
+    word: Word,
 ) -> E {
+    debug_assert!(op_label == MEMORY_READ_WORD_LABEL || op_label == MEMORY_WRITE_WORD_LABEL);
     let ctx = main_trace.ctx(row);
     let clk = main_trace.clk(row);
 
@@ -1002,8 +1081,8 @@ fn compute_memory_request<E: FieldElement<BaseField = Felt>>(
         + alphas[2].mul_base(ctx)
         + alphas[3].mul_base(addr)
         + alphas[4].mul_base(clk)
-        + alphas[5].mul_base(value[0])
-        + alphas[6].mul_base(value[1])
-        + alphas[7].mul_base(value[2])
-        + alphas[8].mul_base(value[3])
+        + alphas[5].mul_base(word[0])
+        + alphas[6].mul_base(word[1])
+        + alphas[7].mul_base(word[2])
+        + alphas[8].mul_base(word[3])
 }
diff --git a/processor/src/trace/tests/chiplets/memory.rs b/processor/src/trace/tests/chiplets/memory.rs
index 5c4f5221e8..c870d0edc9 100644
--- a/processor/src/trace/tests/chiplets/memory.rs
+++ b/processor/src/trace/tests/chiplets/memory.rs
@@ -1,13 +1,17 @@
 use miden_air::{
     trace::chiplets::{
         memory::{
-            MEMORY_READ_LABEL, MEMORY_WRITE_LABEL, MEMORY_WRITE_SELECTOR, NUM_ELEMENTS_IN_BATCH,
+            MEMORY_ACCESS_ELEMENT, MEMORY_ACCESS_WORD, MEMORY_READ_ELEMENT_LABEL,
+            MEMORY_READ_WORD_LABEL, MEMORY_WRITE, MEMORY_WRITE_ELEMENT_LABEL,
+            MEMORY_WRITE_WORD_LABEL,
         },
-        MEMORY_BATCH_COL_IDX, MEMORY_CLK_COL_IDX, MEMORY_CTX_COL_IDX, MEMORY_SELECTORS_COL_IDX,
-        MEMORY_V_COL_RANGE,
+        MEMORY_BATCH_COL_IDX, MEMORY_CLK_COL_IDX, MEMORY_CTX_COL_IDX,
+        MEMORY_ELEMENT_OR_WORD_COL_IDX, MEMORY_IDX0_COL_IDX, MEMORY_IDX1_COL_IDX,
+        MEMORY_READ_WRITE_COL_IDX, MEMORY_V_COL_RANGE,
     },
     RowIndex,
 };
+use vm_core::WORD_SIZE;
 
 use super::{
     build_trace_from_ops, rand_array, ExecutionTrace, Felt, FieldElement, Operation, Trace, Word,
@@ -27,6 +31,8 @@ use super::{
 #[test]
 #[allow(clippy::needless_range_loop)]
 fn b_chip_trace_mem() {
+    const FOUR: Felt = Felt::new(4);
+
     let stack = [1, 2, 3, 4, 0];
     let word = [ONE, Felt::new(2), Felt::new(3), Felt::new(4)];
     let operations = vec![
@@ -35,14 +41,14 @@ fn b_chip_trace_mem() {
         Operation::Drop,
         Operation::Drop,
         Operation::Drop,
-        Operation::MLoad,     // read the first value of the word
-        Operation::MovDn5,    // put address 0 and space for a full word at top of stack
-        Operation::MLoadW,    // load word from address 0 to stack
-        Operation::Push(ONE), // push a new value onto the stack
-        Operation::Push(ONE), // push a new address on to the stack
-        Operation::MStore,    // store 1 at address 1
-        Operation::Drop,      // ensure the stack overflow table is empty
-        Operation::MStream,   // read 2 words starting at address 0
+        Operation::MLoad,      // read the first value of the word
+        Operation::MovDn5,     // put address 0 and space for a full word at top of stack
+        Operation::MLoadW,     // load word from address 0 to stack
+        Operation::Push(ONE),  // push a new value onto the stack
+        Operation::Push(FOUR), // push a new address on to the stack
+        Operation::MStore,     // store 1 at address 4
+        Operation::Drop,       // ensure the stack overflow table is empty
+        Operation::MStream,    // read 2 words starting at address 0
     ];
     let trace = build_trace_from_ops(operations, &stack);
 
@@ -59,7 +65,8 @@ fn b_chip_trace_mem() {
 
     // The first memory request from the stack is sent when the `MStoreW` operation is executed, at
     // cycle 1, so the request is included in the next row. (The trace begins by executing `span`).
-    let value = build_expected_memory(&rand_elements, MEMORY_WRITE_LABEL, ZERO, ZERO, ONE, word);
+    let value =
+        build_expected_bus_word_msg(&rand_elements, MEMORY_WRITE_WORD_LABEL, ZERO, ZERO, ONE, word);
     let mut expected = value.inv();
     assert_eq!(expected, b_chip[2]);
 
@@ -70,8 +77,14 @@ fn b_chip_trace_mem() {
 
     // The next memory request from the stack is sent when `MLoad` is executed at cycle 6 and
     // included at row 7
-    let value =
-        build_expected_memory(&rand_elements, MEMORY_READ_LABEL, ZERO, ZERO, Felt::new(6), word);
+    let value = build_expected_bus_element_msg(
+        &rand_elements,
+        MEMORY_READ_ELEMENT_LABEL,
+        ZERO,
+        ZERO,
+        Felt::new(6),
+        word[0],
+    );
     expected *= value.inv();
     assert_eq!(expected, b_chip[7]);
 
@@ -86,52 +99,64 @@ fn b_chip_trace_mem() {
     // to the 5 memory operations (MStream requires 2 rows).
 
     // At cycle 8 `MLoadW` is requested by the stack and `MStoreW` is provided by memory
-    let value =
-        build_expected_memory(&rand_elements, MEMORY_READ_LABEL, ZERO, ZERO, Felt::new(8), word);
+    let value = build_expected_bus_word_msg(
+        &rand_elements,
+        MEMORY_READ_WORD_LABEL,
+        ZERO,
+        ZERO,
+        Felt::new(8),
+        word,
+    );
     expected *= value.inv();
-    expected *= build_expected_memory_from_trace(&trace, &rand_elements, 8.into());
+    expected *= build_expected_bus_msg_from_trace(&trace, &rand_elements, 8.into());
     assert_eq!(expected, b_chip[9]);
 
     // At cycle 9, `MLoad` is provided by memory.
-    expected *= build_expected_memory_from_trace(&trace, &rand_elements, 9.into());
+    expected *= build_expected_bus_msg_from_trace(&trace, &rand_elements, 9.into());
     assert_eq!(expected, b_chip[10]);
 
     // At cycle 10,  `MLoadW` is provided by memory.
-    expected *= build_expected_memory_from_trace(&trace, &rand_elements, 10.into());
+    expected *= build_expected_bus_msg_from_trace(&trace, &rand_elements, 10.into());
     assert_eq!(expected, b_chip[11]);
 
     // At cycle 11, `MStore` is requested by the stack and the first read of `MStream` is provided
     // by the memory.
-    let value = build_expected_memory(
+    let value = build_expected_bus_element_msg(
         &rand_elements,
-        MEMORY_WRITE_LABEL,
+        MEMORY_WRITE_ELEMENT_LABEL,
         ZERO,
-        ONE,
+        FOUR,
         Felt::new(11),
-        [ONE, ZERO, ZERO, ZERO],
+        ONE,
     );
     expected *= value.inv();
-    expected *= build_expected_memory_from_trace(&trace, &rand_elements, 11.into());
+    expected *= build_expected_bus_msg_from_trace(&trace, &rand_elements, 11.into());
     assert_eq!(expected, b_chip[12]);
 
     // At cycle 12, `MStore` is provided by the memory
-    expected *= build_expected_memory_from_trace(&trace, &rand_elements, 12.into());
+    expected *= build_expected_bus_msg_from_trace(&trace, &rand_elements, 12.into());
     assert_eq!(expected, b_chip[13]);
 
     // At cycle 13, `MStream` is requested by the stack, and the second read of `MStream` is
     // provided by the memory.
-    let value1 =
-        build_expected_memory(&rand_elements, MEMORY_READ_LABEL, ZERO, ZERO, Felt::new(13), word);
-    let value2 = build_expected_memory(
+    let value1 = build_expected_bus_word_msg(
         &rand_elements,
-        MEMORY_READ_LABEL,
+        MEMORY_READ_WORD_LABEL,
         ZERO,
-        ONE,
+        ZERO,
+        Felt::new(13),
+        word,
+    );
+    let value2 = build_expected_bus_word_msg(
+        &rand_elements,
+        MEMORY_READ_WORD_LABEL,
+        ZERO,
+        Felt::new(4),
         Felt::new(13),
         [ONE, ZERO, ZERO, ZERO],
     );
     expected *= (value1 * value2).inv();
-    expected *= build_expected_memory_from_trace(&trace, &rand_elements, 13.into());
+    expected *= build_expected_bus_msg_from_trace(&trace, &rand_elements, 13.into());
     assert_eq!(expected, b_chip[14]);
 
     // At cycle 14 the decoder requests the span hash. We set this as the inverse of the previously
@@ -149,7 +174,25 @@ fn b_chip_trace_mem() {
 // TEST HELPERS
 // ================================================================================================
 
-fn build_expected_memory(
+fn build_expected_bus_element_msg(
+    alphas: &[Felt],
+    op_label: u8,
+    ctx: Felt,
+    addr: Felt,
+    clk: Felt,
+    value: Felt,
+) -> Felt {
+    assert!(op_label == MEMORY_READ_ELEMENT_LABEL || op_label == MEMORY_WRITE_ELEMENT_LABEL);
+
+    alphas[0]
+        + alphas[1] * Felt::from(op_label)
+        + alphas[2] * ctx
+        + alphas[3] * addr
+        + alphas[4] * clk
+        + alphas[5] * value
+}
+
+fn build_expected_bus_word_msg(
     alphas: &[Felt],
     op_label: u8,
     ctx: Felt,
@@ -157,44 +200,69 @@ fn build_expected_memory(
     clk: Felt,
     word: Word,
 ) -> Felt {
-    let mut word_value = ZERO;
-    for i in 0..NUM_ELEMENTS_IN_BATCH {
-        word_value += alphas[i + 5] * word[i];
-    }
+    assert!(op_label == MEMORY_READ_WORD_LABEL || op_label == MEMORY_WRITE_WORD_LABEL);
 
     alphas[0]
         + alphas[1] * Felt::from(op_label)
         + alphas[2] * ctx
         + alphas[3] * addr
         + alphas[4] * clk
-        + word_value
+        + alphas[5] * word[0]
+        + alphas[6] * word[1]
+        + alphas[7] * word[2]
+        + alphas[8] * word[3]
 }
 
-fn build_expected_memory_from_trace(
+fn build_expected_bus_msg_from_trace(
     trace: &ExecutionTrace,
     alphas: &[Felt],
     row: RowIndex,
 ) -> Felt {
     // get the memory access operation
-    let s0 = trace.main_trace.get_column(MEMORY_SELECTORS_COL_IDX)[row];
-    let s1 = trace.main_trace.get_column(MEMORY_SELECTORS_COL_IDX + 1)[row];
-    let op_label = if s0 == MEMORY_WRITE_SELECTOR[0] {
-        debug_assert!(s1 == ZERO);
-        MEMORY_WRITE_LABEL
-    } else {
-        MEMORY_READ_LABEL
+    let read_write = trace.main_trace.get_column(MEMORY_READ_WRITE_COL_IDX)[row];
+    let element_or_word = trace.main_trace.get_column(MEMORY_ELEMENT_OR_WORD_COL_IDX)[row];
+    let op_label = if read_write == MEMORY_WRITE {
+        if element_or_word == MEMORY_ACCESS_ELEMENT {
+            MEMORY_WRITE_ELEMENT_LABEL
+        } else {
+            MEMORY_WRITE_WORD_LABEL
+        }
+    } else
+    /* read_write == MEMORY_READ */
+    {
+        if element_or_word == MEMORY_ACCESS_ELEMENT {
+            MEMORY_READ_ELEMENT_LABEL
+        } else {
+            MEMORY_READ_WORD_LABEL
+        }
     };
 
     // get the memory access data
     let ctx = trace.main_trace.get_column(MEMORY_CTX_COL_IDX)[row];
-    let addr = trace.main_trace.get_column(MEMORY_BATCH_COL_IDX)[row];
+    let addr = {
+        let batch = trace.main_trace.get_column(MEMORY_BATCH_COL_IDX)[row];
+        let idx1 = trace.main_trace.get_column(MEMORY_IDX1_COL_IDX)[row];
+        let idx0 = trace.main_trace.get_column(MEMORY_IDX0_COL_IDX)[row];
+
+        batch + idx1.mul_small(2) + idx0
+    };
     let clk = trace.main_trace.get_column(MEMORY_CLK_COL_IDX)[row];
 
     // get the memory value
-    let mut word = [ZERO; NUM_ELEMENTS_IN_BATCH];
+    let mut word = [ZERO; WORD_SIZE];
     for (i, element) in word.iter_mut().enumerate() {
         *element = trace.main_trace.get_column(MEMORY_V_COL_RANGE.start + i)[row];
     }
 
-    build_expected_memory(alphas, op_label, ctx, addr, clk, word)
+    if element_or_word == MEMORY_ACCESS_ELEMENT {
+        let idx1 = trace.main_trace.get_column(MEMORY_IDX1_COL_IDX)[row].as_int();
+        let idx0 = trace.main_trace.get_column(MEMORY_IDX0_COL_IDX)[row].as_int();
+        let idx = idx1 * 2 + idx0;
+
+        build_expected_bus_element_msg(alphas, op_label, ctx, addr, clk, word[idx as usize])
+    } else if element_or_word == MEMORY_ACCESS_WORD {
+        build_expected_bus_word_msg(alphas, op_label, ctx, addr, clk, word)
+    } else {
+        panic!("invalid element_or_word value: {element_or_word}");
+    }
 }
diff --git a/stdlib/asm/collections/mmr.masm b/stdlib/asm/collections/mmr.masm
index 4ed948fc62..da246184cf 100644
--- a/stdlib/asm/collections/mmr.masm
+++ b/stdlib/asm/collections/mmr.masm
@@ -79,6 +79,8 @@ end
 
 #! Given the num_leaves of a MMR returns the num_peaks.
 #!
+#! Implemented as counting the number of "1" bits in `num_leaves`.
+#!
 #! Input: [num_leaves, ...]
 #! Output: [num_peaks, ...]
 #! Cycles: 69
@@ -93,14 +95,16 @@ end
 #!
 #! Input: [num_peaks, ...]
 #! Output: [len, ...]
-#! Cycles: 17
+#! Cycles: 19
 export.num_peaks_to_message_size
   # the peaks are padded to a minimum length of 16 (10 cycles)
   push.16 u32max
   # => [count_min, ...]
 
-  # when the number of peaks is greater than 16, then they are padded to an even number (7 cycles)
-  dup is_odd add
+  # when the number of peaks is greater than 16, they are padded to an even number. the padded
+  # count is then multiplied by four, because each peak is a word and occupies 4 memory addresses
+  # (9 cycles)
+  dup is_odd add mul.4
   # => [even_count_min, ...]
 end
 
@@ -114,10 +118,11 @@ end
 #!    length and to have a minimum size of 16 elements
 #!  - The advice map must contain a key with HASH, and its value is
 #!    `num_leaves || hash_data`, and hash_data is the data used to computed `HASH`
-#!  - mmt_ptr: the memory location where the MMR data will be written to,
-#!    starting with the MMR forest (its total leaves count) followed by its peaks
+#!  - mmr_ptr: the memory location where the MMR data will be written to,
+#!    starting with the MMR forest (its total leaves count) followed by its peaks.
+#!    The address is expected to be word-aligned.
 #!
-#! Cycles: 162 + 9 * extra_peak_pair cycles
+#! Cycles: 164 + 9 * extra_peak_pair cycles
 #!    where `extra_peak` is the number of peak pairs in addition to the first
 #!    16, i.e. `round_up((num_of_peaks - 16) / 2)`
 export.unpack
@@ -141,12 +146,12 @@ export.unpack
   # => [state_size, HASH, mmr_ptr, ...]
 
   # compute the end address including the padding data and forest (3 cycles)
-  dup.5 add add.1
+  dup.5 add add.4
   # => [mmt_ptr_end, HASH, mmr_ptr, ...]
 
   # update the mmr_ptr to account for the size (2 cycles)
-  movup.5 add.1
-  # => [mmr_ptr+1, mmt_ptr_end, HASH, ...]
+  movup.5 add.4
+  # => [mmr_ptr+4, mmt_ptr_end, HASH, ...]
 
   # hash the first 16 words (28 cycles)
   padw padw padw
@@ -158,7 +163,7 @@ export.unpack
   adv_pipe hperm
   adv_pipe hperm
   adv_pipe hperm
-  # => [C, B, A, mmr_ptr+17, mmt_ptr_end, HASH, ...]
+  # => [C, B, A, mmr_ptr+68, mmt_ptr_end, HASH, ...]
 
   # handle MMR with more than 16 elements (10 + 9 * words cycles)
   exec.mem::pipe_double_words_to_memory
@@ -176,7 +181,7 @@ end
 #!
 #! Input: [mmr_ptr, ...]
 #! Output: [HASH, ...]
-#! Cycles: 128 + 3 * num_peaks
+#! Cycles: 130 + 3 * num_peaks
 export.pack
   # load num_leaves (2 cycles)
   dup mem_load
@@ -186,12 +191,12 @@ export.pack
   exec.num_leaves_to_num_peaks
   # => [num_peaks, mmr_ptr, ...]
 
-  # compute the message size (18 cycles)
+  # compute the message size (19 cycles)
   exec.num_peaks_to_message_size
   # => [message_size, mmr_ptr, ...]
 
   # compute peaks_start and peaks_end (6 cycles)
-  dup.1 add.1 swap dup.1 add swap
+  dup.1 add.4 swap dup.1 add swap
   # => [peaks_start, peaks_end, mmr_ptr, ...]
 
   # hash the memory contents (25 + 3 * num_peaks)
diff --git a/stdlib/asm/crypto/hashes/rpo.masm b/stdlib/asm/crypto/hashes/rpo.masm
index 1035fa464d..bb8cc5c414 100644
--- a/stdlib/asm/crypto/hashes/rpo.masm
+++ b/stdlib/asm/crypto/hashes/rpo.masm
@@ -34,7 +34,7 @@ end
 
 #! Hashes the memory `start_addr` to `end_addr` given an RPO state specified by 3 words.
 #!
-#! This requires that `end_addr = start_addr + 2n` where n = {0, 1, 2 ...}, otherwise the procedure 
+#! This requires that `end_addr = start_addr + 8n` where n = {0, 1, 2 ...}, otherwise the procedure 
 #! will enter an infinite loop. 
 #!
 #! Input: [C, B, A, start_addr, end_addr, ...]
@@ -56,24 +56,25 @@ end
 #! Hashes the memory `start_addr` to `end_addr`, handles odd number of elements.
 #!
 #! Requires `start_addr ≤ end_addr`, `end_addr` is not inclusive.
+#! Requires `start_addr` and `end_addr` to be word-aligned.
 #!
 #! Input: [start_addr, end_addr, ...]
 #! Output: [H, ...]
 #!
 #! Cycles:
-#! - even words: 49 cycles + 3 * words
-#! - odd words: 61 cycles + 3 * words
+#! - even words: 53 cycles + 3 * words
+#! - odd words: 65 cycles + 3 * words
 #! where `words` is the `start_addr - end_addr - 1`
 export.hash_memory_words
   # enforce `start_addr ≤ end_addr`
   dup.1 dup.1 u32assert2 u32gte assert
 
-  # figure out if the range is for an odd number of words (9 cycles)
-  dup.1 dup.1 sub is_odd
+  # figure out if the range is for an odd number of words (11 cycles)
+  dup.1 dup.1 sub div.4 is_odd
   # => [is_odd, start_addr, end_addr, ...]
 
+  # make the range span an even number of words by moving end_addr back one word if odd (6 cycles)
-  movup.2 dup.1 sub
+  # make the start/end range even (6 cycles)
+  movup.2 dup.1 mul.4 sub
   # => [end_addr, is_odd, start_addr, ...]
 
   # move start_addr to the right stack position (1 cycles)
diff --git a/stdlib/tests/crypto/rpo.rs b/stdlib/tests/crypto/rpo.rs
index f3879ce22a..fbc22c2385 100644
--- a/stdlib/tests/crypto/rpo.rs
+++ b/stdlib/tests/crypto/rpo.rs
@@ -55,7 +55,7 @@ fn test_hash_empty() {
     use.std::crypto::hashes::rpo
 
     begin
-        push.1002 # end address
+        push.1008 # end address
         push.1000 # start address
 
         exec.rpo::hash_memory_words
@@ -107,7 +107,7 @@ fn test_single_iteration() {
         # insert 1 to memory
         push.1.1000 mem_store
 
-        push.1002 # end address
+        push.1008 # end address
         push.1000 # start address
 
         exec.rpo::hash_memory_words
@@ -138,7 +138,7 @@ fn test_hash_one_word() {
     begin
         push.1.1000 mem_store # push data to memory
 
-        push.1001 # end address
+        push.1004 # end address
         push.1000 # start address
 
         exec.rpo::hash_memory_words
@@ -159,9 +159,9 @@ fn test_hash_even_words() {
 
     begin
         push.1.0.0.0.1000 mem_storew dropw
-        push.0.1.0.0.1001 mem_storew dropw
+        push.0.1.0.0.1004 mem_storew dropw
 
-        push.1002 # end address
+        push.1008 # end address
         push.1000 # start address
 
         exec.rpo::hash_memory_words
@@ -187,10 +187,10 @@ fn test_hash_odd_words() {
 
     begin
         push.1.0.0.0.1000 mem_storew dropw
-        push.0.1.0.0.1001 mem_storew dropw
-        push.0.0.1.0.1002 mem_storew dropw
+        push.0.1.0.0.1004 mem_storew dropw
+        push.0.0.1.0.1008 mem_storew dropw
 
-        push.1003 # end address
+        push.1012 # end address
         push.1000 # start address
 
         exec.rpo::hash_memory_words
@@ -217,9 +217,9 @@ fn test_absorb_double_words_from_memory() {
 
     begin
         push.1.0.0.0.1000 mem_storew dropw
-        push.0.1.0.0.1001 mem_storew dropw
+        push.0.1.0.0.1004 mem_storew dropw
 
-        push.1002      # end address
+        push.1008      # end address
         push.1000      # start address
         padw padw padw # hasher state
         exec.rpo::absorb_double_words_from_memory
@@ -237,8 +237,8 @@ fn test_absorb_double_words_from_memory() {
     ]).into_iter().map(|e| e.as_int()).collect();
 
     // start and end addr
-    even_hash.push(1002);
-    even_hash.push(1002);
+    even_hash.push(1008);
+    even_hash.push(1008);
 
     build_test!(even_words, &[]).expect_stack(&even_hash);
 }
@@ -250,11 +250,11 @@ fn test_squeeze_digest() {
 
     begin
         push.1.0.0.0.1000 mem_storew dropw
-        push.0.1.0.0.1001 mem_storew dropw
-        push.0.0.1.0.1002 mem_storew dropw
-        push.0.0.0.1.1003 mem_storew dropw
+        push.0.1.0.0.1004 mem_storew dropw
+        push.0.0.1.0.1008 mem_storew dropw
+        push.0.0.0.1.1012 mem_storew dropw
 
-        push.1004      # end address
+        push.1016      # end address
         push.1000      # start address
         padw padw padw # hasher state
         exec.rpo::absorb_double_words_from_memory
@@ -275,8 +275,8 @@ fn test_squeeze_digest() {
     ]).into_iter().map(|e| e.as_int()).collect();
 
     // start and end addr
-    even_hash.push(1004);
-    even_hash.push(1004);
+    even_hash.push(1016);
+    even_hash.push(1016);
 
     build_test!(even_words, &[]).expect_stack(&even_hash);
 }

From bd6e746aa7ff82cc62f730c702bbbd152f58dd0a Mon Sep 17 00:00:00 2001
From: Philippe Laferriere <plafer@protonmail.com>
Date: Wed, 18 Dec 2024 06:01:26 -0500
Subject: [PATCH 04/19] fix: fix stdlib after element-addressable memory

---
 docs/src/user_docs/stdlib/collections.md      |  7 +-
 .../tests/integration/air/chiplets/memory.rs  | 14 ++--
 miden/tests/integration/flow_control/mod.rs   | 14 ++--
 .../operations/decorators/advice.rs           | 11 +--
 miden/tests/integration/operations/fri_ops.rs |  2 +-
 .../integration/operations/io_ops/adv_ops.rs  |  4 +-
 .../integration/operations/io_ops/env_ops.rs  | 16 ++--
 .../operations/io_ops/local_ops.rs            |  4 +-
 .../integration/operations/io_ops/mem_ops.rs  | 16 ++--
 processor/src/operations/fri_ops.rs           | 16 ++--
 processor/src/trace/tests/chiplets/memory.rs  | 10 +--
 stdlib/asm/collections/mmr.masm               | 26 +++----
 stdlib/asm/crypto/dsa/rpo_falcon512.masm      | 58 +++++++-------
 stdlib/asm/crypto/fri/frie2f4.masm            | 72 ++++++++---------
 stdlib/asm/crypto/hashes/rpo.masm             |  2 +-
 stdlib/asm/mem.masm                           | 16 ++--
 stdlib/tests/collections/mmr.rs               | 77 +++++++++----------
 stdlib/tests/crypto/falcon.rs                 | 75 +++++++++++++-----
 stdlib/tests/crypto/rpo.rs                    | 10 +--
 stdlib/tests/mem/mod.rs                       | 17 ++--
 20 files changed, 252 insertions(+), 215 deletions(-)

diff --git a/docs/src/user_docs/stdlib/collections.md b/docs/src/user_docs/stdlib/collections.md
index 3b3b26138a..da6cfcd518 100644
--- a/docs/src/user_docs/stdlib/collections.md
+++ b/docs/src/user_docs/stdlib/collections.md
@@ -15,7 +15,12 @@ The following procedures are available to read data from and make updates to a M
 | get         | Loads the leaf at the absolute position `pos` in the MMR onto the stack.<br /><br />Valid range for `pos` is between $0$ and $2^{32} - 1$ (both inclusive).<br /><br />Inputs: `[pos, mmr_ptr, ...]`<br />Output: `[N, ...]`<br /><br />Where `N` is the leaf loaded from the MMR whose memory location starts at `mmr_ptr`. |
 | add         | Adds a new leaf to the MMR.<br /><br />This will update the MMR peaks in the VM's memory and the advice provider with any merged nodes.<br /><br />Inputs: `[N, mmr_ptr, ...]`<br />Outputs: `[...]`<br /><br />Where `N` is the leaf added to the MMR whose memory locations starts at `mmr_ptr`. |
 | pack        | Computes a commitment to the given MMR and copies the MMR to the Advice Map using the commitment as a key.<br /><br />Inputs: `[mmr_ptr, ...]`<br />Outputs: `[HASH, ...]`<br /><br /> |
-| unpack      | Load the MMR peak data based on its hash.<br /><br />Inputs: `[HASH, mmr_ptr, ...]`<br />Outputs: `[...]`<br /><br />Where:<br />- `HASH`: is the MMR peak hash, the hash is expected to be padded to an even length and to have a minimum size of 16 elements.<br />- The advice map must contain a key with `HASH`, and its value is `num_leaves \|\| hash_data`, and hash_data is the data used to computed `HASH`<br />- `mmt_ptr`: the memory location where the MMR data will be written, starting with the MMR forest (the total count of its leaves) followed by its peaks. |
+| unpack      | Writes the MMR whose peaks hash to `HASH` to the memory location pointed to by `mmr_ptr`.<br /><br />Inputs: `[HASH, mmr_ptr, ...]`<br />Outputs: `[...]`<br /><br />Where:<br />- `HASH`: is the MMR peak hash, the hash is expected to be padded to an even length and to have a minimum size of 16 elements.<br />- The advice map must contain a key with `HASH`, and its value is `[num_leaves, 0, 0, 0] \|\| hash_data`, and hash_data is the data used to compute `HASH`<br />- `mmr_ptr`: the memory location where the MMR data will be written, starting with the MMR forest (the total count of its leaves) followed by its peaks. |
+
+`mmr_ptr` is a pointer to the `mmr` data structure, which is defined as:
+1. `mmr_ptr[0]` contains the number of leaves in the MMR
+2. `mmr_ptr[1..4]` are padding and are ignored
+3. `mmr_ptr[4..8], mmr_ptr[8..12], ...` contain the 1st MMR peak, 2nd MMR peak, etc.
 
 ## Sparse Merkle Tree
 
diff --git a/miden/tests/integration/air/chiplets/memory.rs b/miden/tests/integration/air/chiplets/memory.rs
index b339720640..21e3133292 100644
--- a/miden/tests/integration/air/chiplets/memory.rs
+++ b/miden/tests/integration/air/chiplets/memory.rs
@@ -24,9 +24,8 @@ fn helper_mem_store() {
     let pub_inputs = vec![1, 2, 3, 4, 5, 6, 7, 8, 9, 10];
 
     let trace = build_test!(asm_op, &pub_inputs).execute().unwrap();
-    // Since MStore only writes 1 element to memory, the 3 elements in the word at that location
-    // that are not touched are placed in the helper registers.
-    let helper_regs = [10, 9, 8, 0, 0, 0].to_elements();
+    // MStore doesn't use helper registers, so they should be zero.
+    let helper_regs = [0, 0, 0, 0, 0, 0].to_elements();
     // We need to check helper registers state after the MStore operation at clock cycle 8.
     assert_eq!(helper_regs, trace.get_user_op_helpers_at(8));
     // After the second MStoreW call, the helper registers should be zero.
@@ -34,7 +33,7 @@ fn helper_mem_store() {
     assert_eq!(helper_regs, trace.get_user_op_helpers_at(11));
 
     // We need to check helper registers state after the MStore operation at clock cycle 14.
-    let helper_regs = [5, 4, 3, 0, 0, 0].to_elements();
+    let helper_regs = [0, 0, 0, 0, 0, 0].to_elements();
     assert_eq!(helper_regs, trace.get_user_op_helpers_at(14));
 }
 
@@ -69,9 +68,8 @@ fn helper_write_read() {
     let pub_inputs = vec![4, 3, 2, 1];
 
     let trace = build_test!(source, &pub_inputs).execute().unwrap();
-    // When the MLoad operation is called, word elements that were not pushed on the stack
-    // are written to helper registers. So, 3, 2 and 1 will be written after this operation
-    let helper_regs = [1, 2, 3, 0, 0, 0].to_elements();
+    // MLoad doesn't use helper registers, so they should be zero.
+    let helper_regs = [0, 0, 0, 0, 0, 0].to_elements();
     // We need to check helper registers state after first MLoad, which index is 8
     assert_eq!(helper_regs, trace.get_user_op_helpers_at(8));
 }
@@ -92,7 +90,7 @@ fn update() {
 
 #[test]
 fn incr_write_addr() {
-    let source = "begin mem_storew.0 mem_storew.1 end";
+    let source = "begin mem_storew.0 mem_storew.4 end";
     let pub_inputs = vec![4, 3, 2, 1];
 
     build_test!(source, &pub_inputs).prove_and_verify(pub_inputs, false);
diff --git a/miden/tests/integration/flow_control/mod.rs b/miden/tests/integration/flow_control/mod.rs
index ccbbc1c552..d2bbd4ef5b 100644
--- a/miden/tests/integration/flow_control/mod.rs
+++ b/miden/tests/integration/flow_control/mod.rs
@@ -273,8 +273,8 @@ fn simple_dyn_exec() {
             movdn.4
 
             # use dynexec to call foo again via its hash, which is stored at memory location 42
-            mem_storew.42 dropw
-            push.42
+            mem_storew.40 dropw
+            push.40
             dynexec
         end";
 
@@ -320,10 +320,10 @@ fn dynexec_with_procref() {
     end
 
     begin
-        procref.foo mem_storew.42 dropw push.42
+        procref.foo mem_storew.40 dropw push.40
         dynexec
 
-        procref.module::func mem_storew.42 dropw push.42
+        procref.module::func mem_storew.40 dropw push.40
         dynexec
 
         dup
@@ -369,8 +369,8 @@ fn simple_dyncall() {
             movdn.4
 
             # use dyncall to call foo again via its hash, which is on the stack
-            mem_storew.42 dropw
-            push.42
+            mem_storew.40 dropw
+            push.40
             dyncall
 
             swapw dropw
@@ -442,7 +442,7 @@ fn dyncall_with_syscall_and_caller() {
             push.1 push.2 push.3 push.4 padw
 
             # Prepare dyncall
-            procref.bar mem_storew.42 dropw push.42
+            procref.bar mem_storew.40 dropw push.40
             dyncall
 
             # Truncate stack
diff --git a/miden/tests/integration/operations/decorators/advice.rs b/miden/tests/integration/operations/decorators/advice.rs
index b48c9af7b2..727d21712f 100644
--- a/miden/tests/integration/operations/decorators/advice.rs
+++ b/miden/tests/integration/operations/decorators/advice.rs
@@ -176,24 +176,25 @@ fn advice_insert_mem() {
 
     # write to memory and drop first word from stack to use second word as the key for advice map.
     # mem_storew reverses the order of field elements in the word when it's stored in memory.
-    mem_storew.2 dropw mem_storew.3
+    mem_storew.8 dropw mem_storew.12
     # State Transition:
     # stack: [5, 6, 7, 8]
-    # mem[2]: [4, 3, 2, 1]
-    # mem[3]: [8, 7, 6, 5]
+    # mem[8..11]: [4, 3, 2, 1]
+    # mem[12..15]: [8, 7, 6, 5]
 
     # copy from memory to advice map
     # the key used is in the reverse order of the field elements in the word at the top of the
     # stack.
-    push.2.4 movdn.4 movdn.4
+    push.16 movdn.4 push.8 movdn.4
     adv.insert_mem
     # State Transition:
+    # stack: [5, 6, 7, 8, 8, 16]
     # advice_map: k: [8, 7, 6, 5], v: [4, 3, 2, 1, 8, 7, 6, 5]
 
     # copy from advice map to advice stack
     adv.push_mapval dropw
     # State Transition:
-    # stack: [0, 0, 0, 0]
+    # stack: [8, 16, 0, 0]
     # advice_stack: [4, 3, 2, 1, 8, 7, 6, 5]
 
     # copy first word from advice stack to stack
diff --git a/miden/tests/integration/operations/fri_ops.rs b/miden/tests/integration/operations/fri_ops.rs
index 56949c306f..1ae051d160 100644
--- a/miden/tests/integration/operations/fri_ops.rs
+++ b/miden/tests/integration/operations/fri_ops.rs
@@ -40,7 +40,7 @@ fn fri_ext2fold4() {
     // processor tests
     let stack_state = test.get_last_stack_state();
     assert_eq!(stack_state[8], Felt::new(poe).square());
-    assert_eq!(stack_state[10], Felt::new(layer_ptr + 2));
+    assert_eq!(stack_state[10], Felt::new(layer_ptr + 8));
     assert_eq!(stack_state[11], Felt::new(poe).exp(4));
     assert_eq!(stack_state[12], Felt::new(f_pos));
     assert_eq!(stack_state[15], Felt::new(end_ptr));
diff --git a/miden/tests/integration/operations/io_ops/adv_ops.rs b/miden/tests/integration/operations/io_ops/adv_ops.rs
index 5bb3147702..c67ffa7a19 100644
--- a/miden/tests/integration/operations/io_ops/adv_ops.rs
+++ b/miden/tests/integration/operations/io_ops/adv_ops.rs
@@ -92,7 +92,7 @@ fn adv_pipe() {
     // to the end (the address will be 2 since 0 + 2 = 2).
     let mut final_stack = state.iter().map(|&v| v.as_int()).collect::<Vec<u64>>();
     final_stack.reverse();
-    final_stack.push(2);
+    final_stack.push(8);
 
     let test = build_test!(source, &[], &advice_stack);
     test.expect_stack(&final_stack);
@@ -129,7 +129,7 @@ fn adv_pipe_with_hperm() {
     // to the end (the address will be 2 since 0 + 2 = 2).
     let mut final_stack = state.iter().map(|&v| v.as_int()).collect::<Vec<u64>>();
     final_stack.reverse();
-    final_stack.push(2);
+    final_stack.push(8);
 
     let test = build_test!(source, &[], &advice_stack);
     test.expect_stack(&final_stack);
diff --git a/miden/tests/integration/operations/io_ops/env_ops.rs b/miden/tests/integration/operations/io_ops/env_ops.rs
index d8bc13b594..475f58d87b 100644
--- a/miden/tests/integration/operations/io_ops/env_ops.rs
+++ b/miden/tests/integration/operations/io_ops/env_ops.rs
@@ -59,7 +59,7 @@ fn locaddr() {
         end";
 
     let test = build_test!(source, &[10]);
-    test.expect_stack(&[FMP_MIN + 2, FMP_MIN + 1, 10]);
+    test.expect_stack(&[FMP_MIN + 8, FMP_MIN + 4, 10]);
 
     // --- accessing mem via locaddr updates the correct variables --------------------------------
     let source = "
@@ -106,14 +106,14 @@ fn locaddr() {
 
     let test = build_test!(source, &[10]);
     test.expect_stack(&[
-        FMP_MIN + 3,
-        FMP_MIN + 2,
-        FMP_MIN + 1,
-        FMP_MIN + 2,
-        FMP_MIN + 5,
+        FMP_MIN + 12,
+        FMP_MIN + 8,
+        FMP_MIN + 4,
+        FMP_MIN + 8,
+        FMP_MIN + 20,
+        FMP_MIN + 16,
+        FMP_MIN + 12,
         FMP_MIN + 4,
-        FMP_MIN + 3,
-        FMP_MIN + 1,
         10,
     ]);
 
diff --git a/miden/tests/integration/operations/io_ops/local_ops.rs b/miden/tests/integration/operations/io_ops/local_ops.rs
index f1fc61e1b3..b5f753611f 100644
--- a/miden/tests/integration/operations/io_ops/local_ops.rs
+++ b/miden/tests/integration/operations/io_ops/local_ops.rs
@@ -121,11 +121,11 @@ fn storew_local() {
         begin
             mem_storew.0
             dropw
-            mem_storew.1
+            mem_storew.4
             dropw
             exec.foo
         end";
-    let mem_addr = 1;
+    let mem_addr = 4;
 
     let test = build_test!(source, &[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]);
     test.expect_stack_and_memory(&[4, 3, 2, 1], mem_addr, &[5, 6, 7, 8]);
diff --git a/miden/tests/integration/operations/io_ops/mem_ops.rs b/miden/tests/integration/operations/io_ops/mem_ops.rs
index a206e6cd4f..99f9c67982 100644
--- a/miden/tests/integration/operations/io_ops/mem_ops.rs
+++ b/miden/tests/integration/operations/io_ops/mem_ops.rs
@@ -50,7 +50,7 @@ fn mem_store() {
 
 #[test]
 fn mem_loadw() {
-    let addr = 1;
+    let addr = 4;
     let asm_op = "mem_loadw";
 
     // --- read from uninitialized memory - address provided via the stack ------------------------
@@ -101,7 +101,7 @@ fn mem_stream() {
         {TRUNCATE_STACK_PROC}
 
         begin
-            push.1
+            push.4
             mem_storew
             dropw
             push.0
@@ -117,7 +117,7 @@ fn mem_stream() {
     let inputs = [1, 2, 3, 4, 5, 6, 7, 8];
 
     // the state is built by replacing the values on the top of the stack with the values in memory
-    // addresses 0 and 1 (i.e., 1 through 8). Thus, the first 8 elements on the stack will be 1
+    // addresses 0 and 4 (i.e., 1 through 8). Thus, the first 8 elements on the stack will be 1
     // through 8 (in stack order, with 8 at stack[0]), and the remaining 4 are untouched (i.e., 9,
     // 10, 11, 12).
     let state: [Felt; 12] =
@@ -127,7 +127,7 @@ fn mem_stream() {
     // to the end (the address will be 2 since 0 + 2 = 2).
     let mut final_stack = state.iter().map(|&v| v.as_int()).collect::<Vec<u64>>();
     final_stack.reverse();
-    final_stack.push(2);
+    final_stack.push(8);
 
     let test = build_test!(source, &inputs);
     test.expect_stack(&final_stack);
@@ -140,7 +140,7 @@ fn mem_stream_with_hperm() {
         {TRUNCATE_STACK_PROC}
 
         begin
-            push.1
+            push.4
             mem_storew
             dropw
             push.0
@@ -169,7 +169,7 @@ fn mem_stream_with_hperm() {
     // to the end (the address will be 2 since 0 + 2 = 2).
     let mut final_stack = state.iter().map(|&v| v.as_int()).collect::<Vec<u64>>();
     final_stack.reverse();
-    final_stack.push(2);
+    final_stack.push(8);
 
     let test = build_test!(source, &inputs);
     test.expect_stack(&final_stack);
@@ -205,8 +205,8 @@ fn inverse_operations() {
         begin
             push.0
             mem_storew
-            mem_storew.1
-            push.1
+            mem_storew.4
+            push.4
             mem_loadw
             mem_loadw.0
         end";
diff --git a/processor/src/operations/fri_ops.rs b/processor/src/operations/fri_ops.rs
index cf67ef21cc..c656a62e35 100644
--- a/processor/src/operations/fri_ops.rs
+++ b/processor/src/operations/fri_ops.rs
@@ -5,7 +5,7 @@ use super::{super::QuadFelt, ExecutionError, Felt, Operation, Process};
 // CONSTANTS
 // ================================================================================================
 
-const TWO: Felt = Felt::new(2);
+const EIGHT: Felt = Felt::new(8);
 const TWO_INV: Felt = Felt::new(9223372034707292161);
 
 const DOMAIN_OFFSET: Felt = Felt::GENERATOR;
@@ -31,7 +31,7 @@ impl Process {
     /// - Folds 4 query values (v0, v1), (v2, v3), (v4, v5), (v6, v7) into a single value (ne0,
     ///   ne1).
     /// - Computes new value of the domain generator power: poe' = poe^4.
-    /// - Increments layer pointer (cptr) by 2.
+    /// - Increments layer pointer (cptr) by 8.
     /// - Checks that the previous folding was done correctly.
     /// - Shifts the stack to the left to move an item from the overflow table to stack position 15.
     ///
@@ -100,7 +100,7 @@ impl Process {
         self.stack.set(7, ds[0]);
         self.stack.set(8, poe2);
         self.stack.set(9, f_tau);
-        self.stack.set(10, layer_ptr + TWO);
+        self.stack.set(10, layer_ptr + EIGHT);
         self.stack.set(11, poe4);
         self.stack.set(12, f_pos);
         self.stack.set(13, folded_value[1]);
@@ -248,9 +248,9 @@ mod tests {
     use winter_utils::transpose_slice;
 
     use super::{
-        ExtensionOf, Felt, FieldElement, Operation, Process, QuadFelt, StarkField, TWO, TWO_INV,
+        ExtensionOf, Felt, FieldElement, Operation, Process, QuadFelt, StarkField, TWO_INV,
     };
-    use crate::DefaultHost;
+    use crate::{operations::fri_ops::EIGHT, DefaultHost};
 
     #[test]
     fn fold4() {
@@ -295,7 +295,7 @@ mod tests {
         assert_eq!(super::TAU2_INV, tau.square().inv());
         assert_eq!(super::TAU3_INV, tau.cube().inv());
 
-        assert_eq!(TWO.inv(), TWO_INV);
+        assert_eq!(Felt::new(2).inv(), TWO_INV);
     }
 
     #[test]
@@ -304,7 +304,7 @@ mod tests {
         // we need 17 values because we also assume that the pointer to the last FRI layer will
         // be in the first position of the stack overflow table
         let mut inputs = rand_array::<Felt, 17>();
-        inputs[7] = TWO; // domain segment must be < 4
+        inputs[7] = Felt::new(2); // domain segment must be < 4
 
         // when domain segment is 2, the 3rd query value and the previous value must be the same
         inputs[4] = inputs[13];
@@ -362,7 +362,7 @@ mod tests {
         // check poe, f_tau, layer_ptr, f_pos
         assert_eq!(stack_state[8], poe.square());
         assert_eq!(stack_state[9], f_tau);
-        assert_eq!(stack_state[10], layer_ptr + TWO);
+        assert_eq!(stack_state[10], layer_ptr + EIGHT);
         assert_eq!(stack_state[11], poe.exp(4));
         assert_eq!(stack_state[12], f_pos);
 
diff --git a/processor/src/trace/tests/chiplets/memory.rs b/processor/src/trace/tests/chiplets/memory.rs
index c870d0edc9..35c97f0bf6 100644
--- a/processor/src/trace/tests/chiplets/memory.rs
+++ b/processor/src/trace/tests/chiplets/memory.rs
@@ -1,9 +1,7 @@
 use miden_air::{
     trace::chiplets::{
         memory::{
-            MEMORY_ACCESS_ELEMENT, MEMORY_ACCESS_WORD, MEMORY_READ_ELEMENT_LABEL,
-            MEMORY_READ_WORD_LABEL, MEMORY_WRITE, MEMORY_WRITE_ELEMENT_LABEL,
-            MEMORY_WRITE_WORD_LABEL,
+            MEMORY_ACCESS_ELEMENT, MEMORY_ACCESS_WORD, MEMORY_READ, MEMORY_READ_ELEMENT_LABEL, MEMORY_READ_WORD_LABEL, MEMORY_WRITE, MEMORY_WRITE_ELEMENT_LABEL, MEMORY_WRITE_WORD_LABEL
         },
         MEMORY_BATCH_COL_IDX, MEMORY_CLK_COL_IDX, MEMORY_CTX_COL_IDX,
         MEMORY_ELEMENT_OR_WORD_COL_IDX, MEMORY_IDX0_COL_IDX, MEMORY_IDX1_COL_IDX,
@@ -227,14 +225,14 @@ fn build_expected_bus_msg_from_trace(
         } else {
             MEMORY_WRITE_WORD_LABEL
         }
-    } else
-    /* read_write == MEMORY_READ */
-    {
+    } else if read_write == MEMORY_READ {
         if element_or_word == MEMORY_ACCESS_ELEMENT {
             MEMORY_READ_ELEMENT_LABEL
         } else {
             MEMORY_READ_WORD_LABEL
         }
+    } else {
+        panic!("invalid read_write value: {read_write}");
     };
 
     // get the memory access data
diff --git a/stdlib/asm/collections/mmr.masm b/stdlib/asm/collections/mmr.masm
index da246184cf..e18f2f7987 100644
--- a/stdlib/asm/collections/mmr.masm
+++ b/stdlib/asm/collections/mmr.masm
@@ -10,7 +10,7 @@ use.std::math::u64
 #! Input: [pos, mmr_ptr, ...]
 #! Output: [N, ...] where `N` is the leaf and `R` is the MMR peak that owns the leaf.
 #!
-#! Cycles: 115
+#! Cycles: 118
 export.get
   # load the num_leaves of the MMR (2 cycles)
   dup.1 mem_load
@@ -48,9 +48,9 @@ export.get
   swap u32assert u32popcnt
   # stack: [peak_count, relative_pos, depth, mmr_ptr, ...]
 
-  # compute `mmr_ptr + peak_count + 1` the target tree index (3 cycles)
-  movup.3 add add.1
-  # stack: [mmr_ptr, relative_pos, depth, ...]
+  # compute `mmr_ptr + 4*peak_count + 4` the target tree index (6 cycles)
+  mul.4 movup.3 add add.4
+  # stack: [peak_ptr, relative_pos, depth, ...]
 
   # load the target peak (6 cycles)
   padw movup.4 mem_loadw
@@ -108,7 +108,7 @@ export.num_peaks_to_message_size
   # => [even_count_min, ...]
 end
 
-#! Load the MMR peak data based on its hash.
+#! Writes the MMR whose peaks hash to `HASH` to the memory location pointed to by `mmr_ptr`.
 #!
 #! Input: [HASH, mmr_ptr, ...]
 #! Output: [...]
@@ -117,7 +117,7 @@ end
 #!  - HASH: is the MMR peak hash, the hash is expected to be padded to an even
 #!    length and to have a minimum size of 16 elements
 #!  - The advice map must contain a key with HASH, and its value is
-#!    `num_leaves || hash_data`, and hash_data is the data used to computed `HASH`
+#!    `[num_leaves, 0, 0, 0] || hash_data`, and hash_data is the data used to compute `HASH`
 #!  - mmr_ptr: the memory location where the MMR data will be written to,
 #!    starting with the MMR forest (its total leaves count) followed by its peaks.
 #!    The address is expected to be word-aligned.
@@ -224,7 +224,7 @@ end
 #!
 #! Input: [EL, mmr_ptr, ...]
 #! Output: [...]
-#! Cycles: 144 + 39 * peak_merges
+#! Cycles: 147 + 39 * peak_merges
 export.add
   # get num_leaves (2 cycles)
   dup.4 mem_load
@@ -237,8 +237,8 @@ export.add
   dup exec.num_leaves_to_num_peaks
   # [num_peaks, num_leaves, EL, mmr_ptr] (70 cycles)
 
-  # compute peaks_end (3 cycles)
-  movup.6 add add.1
+  # compute peaks_end (6 cycles)
+  mul.4 movup.6 add add.4
   # [mmr_end, num_leaves, EL]
 
   # find how many MMR peaks will be merged (41 cycles)
@@ -263,7 +263,7 @@ export.add
 
   while.true # (39 cycles)
     # load peak (4 cycles)
-    dup.9 sub.1 mem_loadw
+    dup.9 sub.4 mem_loadw
     # => [PEAK, EL, -num_merges, mmr_end]
 
     # merge the nodes (17 cycles)
@@ -275,8 +275,8 @@ export.add
     # => [PAD, EL', -num_merges, mmr_end]
 
     # update control (7 cycles)
-    swapw.2 add.1 swap sub.1 swap swapw.2
-    # => [PAD, EL', -num_merges+1, mmr_end-1]
+    swapw.2 add.1 swap sub.4 swap swapw.2
+    # => [PAD, EL', -num_merges+1, mmr_end-4]
 
     # check loop condition (5 cycles)
     dup.8 neq.0
@@ -285,7 +285,7 @@ export.add
 
   # drop padding (4 cycles)
   dropw
-  # =>: [EL, -num_merges+1, mmr_end-1]
+  # =>: [EL, -num_merges+1, mmr_end-4]
 
   # save the new peak (2 cycles)
   movup.5 mem_storew
diff --git a/stdlib/asm/crypto/dsa/rpo_falcon512.masm b/stdlib/asm/crypto/dsa/rpo_falcon512.masm
index 637f490bf9..c639a1493a 100644
--- a/stdlib/asm/crypto/dsa/rpo_falcon512.masm
+++ b/stdlib/asm/crypto/dsa/rpo_falcon512.masm
@@ -67,7 +67,7 @@ end
 #! Takes as input a message digest, a nonce of size 40 bytes represented as 8 field elements
 #! and a pointer. The procedure absorbs MSG and NONCE into a fresh RPO state and squeezes the
 #! coefficients of a polynomial c representing the hash-to-point of (MSG || NONCE). The coefficients
-#! are then saved in the memory region [c_ptr, c_ptr + 128).
+#! are then saved in the memory region [c_ptr, c_ptr + 512).
 #! This implementation of the `hash_to_point` procedure avoids the rejection-sampling step
 #! required in the per-the-spec algorithm by using the observation on page 31 in
 #! https://falcon-sign.info/falcon.pdf
@@ -94,13 +94,13 @@ export.hash_to_point.2
     repeat.63
         swapw dup.12
         mem_storew
-        swapw dup.12 add.2 swap.13 add.1
+        swapw dup.12 add.8 swap.13 add.4
         mem_storew
         hperm
     end
 
     # Save the last remaining coefficients
-    dup.12 add.1 mem_storew dropw
+    dup.12 add.4 mem_storew dropw
     movup.8 mem_storew dropw
 
     # Clean up the stack
@@ -112,21 +112,21 @@ end
 # =============================================================================================
 
 #! For an element `tau := (tau0, tau1)` in the quadratic extension field, computes all its powers
-#! `tau^i` for `i = 0,..., 512` and stores them in the memory region `[tau_ptr, tau_ptr + 513)`.
-#! The procedure returns `tau_ptr + 513`.
+#! `tau^i` for `i = 0,..., 512` and stores them in the memory region `[tau_ptr, tau_ptr + 513*4)`.
+#! The procedure returns `tau_ptr + 513*4`.
 #!
 #! Input: [tau1, tau0, tau_ptr, ...]
-#! Output: [tau_ptr + 513, ...]
+#! Output: [tau_ptr + 513*4, ...]
 #!
 #! Cycles: 8323
 export.powers_of_tau
 
     # 1) Save tau^0 i.e. (0, 1)
     push.1 push.0.0.0
-    dup.6 add.1 swap.7
+    dup.6 add.4 swap.7
     mem_storew
     drop drop
-    #=> [0, 1, tau1, tau0, tau_ptr+1, ...]
+    #=> [0, 1, tau1, tau0, tau_ptr+4, ...]
 
     # 2) Compute tau^i
     repeat.512
@@ -134,17 +134,17 @@ export.powers_of_tau
 
         movup.3 movup.3
 
-        dup.6 add.1 swap.7 mem_storew
+        dup.6 add.4 swap.7 mem_storew
 
         drop drop
     end
 
     dropw
-    #=> [tau_ptr + 513, ...]
+    #=> [tau_ptr + 513*4, ...]
 end
 
 
-#! Sets the memory region `[ptr, ptr + 512)` to zero. The pointer c_ptr := ptr + 512 is returned
+#! Sets the memory region `[ptr, ptr + 512*4)` to zero. The pointer c_ptr := ptr + 512*4 is returned
 #! to be used to store the hash-to-point polynomial of the message later on.
 #!
 #! Input: [ptr, ...]
@@ -154,7 +154,7 @@ end
 export.set_to_zero
     padw
     repeat.512
-        dup.4 add.1 swap.5
+        dup.4 add.4 swap.5
         mem_storew
     end
     dropw
@@ -173,7 +173,7 @@ end
 #! the incremented pointer.
 #!
 #! Input: [ptr, PK, ...]
-#! Output: [tau1, tau0, ptr + 512 ...]
+#! Output: [tau1, tau0, ptr + 512*4 ...]
 #!
 #! Cycles: 5049
 export.load_h_s2_and_product.1
@@ -228,13 +228,13 @@ export.load_h_s2_and_product.1
     # 6) Return the challenge point and the incremented pointer
     exec.rpo::squeeze_digest
     drop drop
-    #=> [tau1, tau0, ptr + 512]
+    #=> [tau1, tau0, ptr + 512*4]
 end
 
 #! Checks that pi == h * s2 in Z_Q[x] by evaluating both sides at a random point.
 #! The procedure takes as input a pointer h_ptr to h. The other two polynomials
-#! are located at h_ptr + 128, for s2, and h_ptr + 256, for pi. The procedure takes
-#! also a pointer zeros_ptr to a region of memory [zeros_ptr, zeros_ptr + 1024)
+#! are located at h_ptr + 512, for s2, and h_ptr + 1024, for pi. The procedure takes
+#! also a pointer zeros_ptr to a region of memory [zeros_ptr, zeros_ptr + 512*4)
 #! and a pointer tau_ptr to powers of the random point we are evaluating at stored
 #! as [a_i, b_i, x, x] where (a_i, b_i) := tau^i for i in [0, 1023].
 #! The procedure returns () if the check passes, otherwise it raises an exception
@@ -272,7 +272,7 @@ export.probabilistic_product.4
 
     # 3) Compute the evaluation of the s2 polynomial at the random challenge
     loc_loadw.0
-    add.128
+    add.512
     #=> [s2_ptr, zeros_ptr, tau_ptr, 0, X, X, ...]
 
     # Accumulator to compute s2(tau)
@@ -301,7 +301,7 @@ export.probabilistic_product.4
 
     # Setup the pointers
     loc_loadw.0
-    add.256
+    add.1024
     #=> [pi_ptr, zeros_ptr, tau_ptr, 0, X, X, ...]
 
     # Accumulator to compute pi(tau)
@@ -323,7 +323,7 @@ export.probabilistic_product.4
     swapw.2 loc_storew.3
 
     # Setup the pointers
-    swapw.3 loc_loadw.0 add.384
+    swapw.3 loc_loadw.0 add.1536
 
     # Accumulator to compute pi2(tau)
     swapw dropw padw
@@ -467,9 +467,9 @@ end
 #! We can compute s1 in a single pass by delaying the q-modular reduction til the end. This can
 #! be achieved through a careful analysis of the computation of the difference between pi and c.
 #!
-#! The i-th coefficient s1_i of s1 is equal to c_i - (pi_i - pi_{512 + i}) which is equal to
-#! c_i  + pi_{512 + i} - pi_i. Now, we know that the size of the pi_i coefficients is bounded by
-#! J := 512 * q^2 and this means that J + pi_{512 + i} - pi_i does not Q-underflow and since
+#! The i-th coefficient s1_i of s1 is equal to c_i - (pi_i - pi_{512 + i}) which is equal to
+#! c_i  + pi_{512 + i} - pi_i. Now, we know that the size of the pi_i coefficients is bounded by
+#! J := 512 * q^2 and this means that J + pi_{512 + i} - pi_i does not Q-underflow and since
 #! J = 0 modulo q, the addition of J does not affect the final result. It is also important to
 #! note that adding J does not Q-overflow by virtue of q * 2^50 < Q.
 #! All of the above implies that we can compute s1_i with only one modular reduction at the end,
@@ -490,19 +490,19 @@ export.compute_s1_norm_sq
         # 1) Load the next 4 * 3 coefficients
         # load c_i
         padw
-        dup.4 add.1281
+        dup.4 add.5124
         mem_loadw
 
-        # load pi_{i+512}
+        # load pi_{i+512} (stored 512 elements past pi_i)
         padw
-        dup.8 add.128
+        dup.8 add.512
         mem_loadw
 
-        # load pi_i
+        # load pi_i
         padw
         dup.12
         mem_loadw
-        #=> [PI, PI_{i+512}, C, pi_ptr, ...]
+        #=> [PI, PI_{i+512}, C, pi_ptr, ...]
 
         # 2) Compute the squared norm of (i + 0)-th coefficient of s1
         movup.8
@@ -545,7 +545,7 @@ export.compute_s1_norm_sq
         swap
 
         # 6) Increment the pointer
-        add.1
+        add.4
     end
 
     # Sum up the squared norm of all the coefficients of s1
@@ -565,7 +565,7 @@ end
 export.compute_s2_norm_sq
     repeat.128
         padw
-        dup.4 add.1 swap.5
+        dup.4 add.4 swap.5
         mem_loadw
 
         repeat.4
diff --git a/stdlib/asm/crypto/fri/frie2f4.masm b/stdlib/asm/crypto/fri/frie2f4.masm
index d3ae69c261..3cd725b4e8 100644
--- a/stdlib/asm/crypto/fri/frie2f4.masm
+++ b/stdlib/asm/crypto/fri/frie2f4.masm
@@ -15,13 +15,13 @@ export.preprocess.4
     while.true
         adv_loadw                       #[Q, num_queries, ptr, ..]
         dup.5                           #[ptr, Q, num_queries, ptr,..]
-        u32wrapping_add.1               #[ptr+1, Q, num_queries, ptr, ..]
-        swap.6                          #[ptr, Q, num_queries, ptr+1, ..]
-        mem_storew                      #[Q, num_queries, ptr+1, ..]
+        u32wrapping_add.4               #[ptr+4, Q, num_queries, ptr, ..]
+        swap.6                          #[ptr, Q, num_queries, ptr+4, ..]
+        mem_storew                      #[Q, num_queries, ptr+4, ..]
         dup.4
-        sub.1                           #[num_queries-1, Q, num_queries, ptr+1, ..]
-        swap.5                          #[num_queries, Q, num_queries-1, ptr+1, ..]
-        neq.0                           #[?, Q, num_queries-1, ptr+1, ..]
+        sub.1                           #[num_queries-1, Q, num_queries, ptr+4, ..]
+        swap.5                          #[num_queries, Q, num_queries-1, ptr+4, ..]
+        neq.0                           #[?, Q, num_queries-1, ptr+4, ..]
     end
     #=> [X, x, layer_ptr, g]
 
@@ -42,7 +42,7 @@ export.preprocess.4
     while.true
         adv_loadw
         dup.5
-        u32wrapping_add.1
+        u32wrapping_add.4
         swap.6
         mem_storew
         dup.4
@@ -68,7 +68,7 @@ export.preprocess.4
     while.true
         adv_loadw
         dup.5
-        u32wrapping_add.1
+        u32wrapping_add.4
         swap.6
         mem_storew
         dup.4
@@ -85,10 +85,10 @@ export.preprocess.4
 end
 
 #! Checks that, for a query with index p at layer i, the folding procedure to create layer (i + 1)
-#! was performed correctly. This also advances layer_ptr by 2 to point to the next query layer.
+#! was performed correctly. This also advances layer_ptr by 8 to point to the next query layer.
 #!
 #! Input:  [layer_ptr, layer_ptr, poe, p, e1, e0, layer_ptr, rem_ptr, x, x, x, x, x, x, x, x, ...]
-#! Output: [layer_ptr + 2, layer_ptr + 2, poe^4, f_pos, ne1, ne0, layer_ptr + 2, rem_ptr, x, x, x, x, x, x, x, x, ...]
+#! Output: [layer_ptr+8, layer_ptr+8, poe^4, f_pos, ne1, ne0, layer_ptr+8, rem_ptr, x, x, x, x, x, x, x, x, ...]
 #!
 #! Cycles: 76
 export.verify_query_layer.3
@@ -96,8 +96,8 @@ export.verify_query_layer.3
     # load layer commitment C as well as [a0, a1, t_depth, d_size] (7 cycles)
     swapdw
     movup.8
-    add.1
-    mem_loadw   # load [a0, a1, t_depth, d_size] from layer_ptr + 1
+    add.4
+    mem_loadw   # load [a0, a1, t_depth, d_size] from layer_ptr + 4
     swapw
     movup.8
     mem_loadw   # load C from layer_ptr
@@ -148,19 +148,20 @@ export.verify_query_layer.3
 
     # fold by 4 (1 cycle)
     fri_ext2fold4
-    # => [x, x, x, x, x, x, x, x, x, x, layer_ptr + 2, poe^4, f_pos, ne1, ne0, rem_ptr, ...]
+    # => [x, x, x, x, x, x, x, x, x, x, layer_ptr + 8, poe^4, f_pos, ne1, ne0, rem_ptr, ...]
 
     # prepare for next iteration (10 cycles)
     swapdw
-    dup.2
-    movdn.7
-    drop
-    drop
-    dup
-    dup.7
-    dup.1
-    neq
-    # => [?, layer_ptr + 2, layer_ptr + 2, poe^4, f_pos, ne1, ne0, layer_ptr + 2, rem_ptr, x, x, x, x, x, x, x, x, ...]
+    # => [x, x, layer_ptr + 8, poe^4, f_pos, ne1, ne0, rem_ptr, x, x, x, x, x, x, x, x, ...]
+    dup.2     # [layer_ptr+8, x, x, layer_ptr+8, poe^4, f_pos, ne1, ne0, rem_ptr, ...]
+    movdn.7   # [x, x, layer_ptr+8, poe^4, f_pos, ne1, ne0, layer_ptr+8, rem_ptr, ...]
+    drop      
+    drop      # [layer_ptr+8, poe^4, f_pos, ne1, ne0, layer_ptr+8, rem_ptr, ...]
+    dup       # [layer_ptr+8, layer_ptr+8, poe^4, f_pos, ne1, ne0, layer_ptr+8, rem_ptr, ...]
+    dup.7     # [rem_ptr, layer_ptr+8, layer_ptr+8, poe^4, f_pos, ne1, ne0, layer_ptr+8, rem_ptr, ...]
+    dup.1     # [layer_ptr+8, rem_ptr, layer_ptr+8, layer_ptr+8, poe^4, f_pos, ne1, ne0, layer_ptr+8, rem_ptr, ...]
+    neq       
+    # => [?, layer_ptr+8, layer_ptr+8, poe^4, f_pos, ne1, ne0, layer_ptr+8, rem_ptr, x, x, x, x, x, x, x, x, ...]
 end
 
 #! Verifies one FRI query.
@@ -175,7 +176,7 @@ end
 #!   layer.
 #! - rem_ptr is the memory address of the remainder codeword.
 #!
-#! Cycles: 40 + num_layers * 76
+#! Cycles: 42 + num_layers * 76
 export.verify_query
 
     # prepare stack to be in a form that leverages the fri_ext2fold4 instruction output stack state
@@ -197,17 +198,18 @@ export.verify_query
     end
     # => [rem_ptr, rem_ptr, poe^(2^n), f_pos, ne1, ne0, rem_ptr, rem_ptr, x, x, x, x, x, x, x, x, ...]
 
-    # check that remainder[f_pos] == (ne0, ne1)
+    # check that rem_ptr[f_pos] == (ne0, ne1)
 
     # Since each memory address contains two extension field elements, we have to determine which
     # of the two elements we should compare against. (7 cycles)
-    movup.3
-    push.2
-    u32divmod     # f_pos must be a u32 value
-    movdn.4
-    dup.1
+    movup.3     # [f_pos, rem_ptr, rem_ptr, poe^(2^n), ne1, ne0, rem_ptr, rem_ptr, ...]
+    push.2      # [2, f_pos, rem_ptr, rem_ptr, poe^(2^n), ne1, ne0, rem_ptr, rem_ptr, ...]
+    u32divmod   # [f_pos%2, f_pos/2, rem_ptr, rem_ptr, poe^(2^n), ne1, ne0, rem_ptr, rem_ptr, ...]
+    movdn.4     # [f_pos/2, rem_ptr, rem_ptr, poe^(2^n), f_pos%2, ne1, ne0, rem_ptr, rem_ptr, ...]
+    mul.4       # [(f_pos/2)*4, rem_ptr, rem_ptr, poe^(2^n), f_pos%2, ne1, ne0, rem_ptr, rem_ptr, ...]
     dup.1
-    add
+    dup.1       # [(f_pos/2)*4, rem_ptr, (f_pos/2)*4, rem_ptr, rem_ptr, poe^(2^n), f_pos%2, ne1, ne0, rem_ptr, rem_ptr, ...]
+    add         # [rem_ptr + (f_pos/2)*4, (f_pos/2)*4, rem_ptr, rem_ptr, poe^(2^n), f_pos%2, ne1, ne0, rem_ptr, rem_ptr, ...]
     # => [rem_ptr + offset, x, x, x, x, ?, ne1, ne0, rem_ptr, rem_ptr, x, x, x, x, x, x, x, x, ..]
 
     mem_loadw
@@ -239,7 +241,7 @@ end
 #!   to g^p with g being the initial FRI domain generator. p is the query index at the first layer
 #!   and (e0, e1) is an extension field element corresponding to the value of the first layer at index p.
 #! - layer_ptr is a pointer to the first layer commitment denoted throughout the code by C.
-#!   layer_ptr + 1 points to the first [alpha0, alpha1, t_depth, d_size] where d_size is the size
+#!   layer_ptr + 4 points to the first [alpha0, alpha1, t_depth, d_size] where d_size is the size
 #!   of initial domain divided by 4, t_depth is the depth of the Merkle tree commitment to the
 #!   first layer and (alpha0, alpha1) is the first challenge used in folding the first layer.
 #!   Both t_depth and d_size are expected to be smaller than 2^32. Otherwise, the result of
@@ -255,7 +257,7 @@ end
 #! 1. rem_ptr - 1 points to the last (alpha0, alpha1, t_depth, d_size) tuple.
 #! 2. layer_ptr - 1 points to the last (e0, e1, p, poe) tuple.
 #!
-#! Cycles: 7 + 4 + num_queries * (40 + num_layers * 76 + 26)
+#! Cycles: 7 + 4 + num_queries * (42 + num_layers * 76 + 26)
 export.verify.1
 
     # store [query_ptr, layer_ptr, rem_ptr, g] to keep track of all queries
@@ -282,12 +284,12 @@ export.verify.1
         # => [x, x, x, x, x, x, x, x, x, x, g, ...]
         dropw drop drop drop
         loc_loadw.0   # load [query_ptr, layer_ptr, rem_ptr, g]
-        add.1
-        loc_storew.0  # store [query_ptr + 1, layer_ptr, rem_ptr, g]
+        add.4
+        loc_storew.0  # store [query_ptr + 4, layer_ptr, rem_ptr, g]
         dup
         dup.2
         neq
-        #=> [?, query_ptr + 1, layer_ptr, rem_ptr, g, ...]
+        #=> [?, query_ptr + 4, layer_ptr, rem_ptr, g, ...]
     end
     #=> [X, ..]
 
diff --git a/stdlib/asm/crypto/hashes/rpo.masm b/stdlib/asm/crypto/hashes/rpo.masm
index bb8cc5c414..f6a132e27a 100644
--- a/stdlib/asm/crypto/hashes/rpo.masm
+++ b/stdlib/asm/crypto/hashes/rpo.masm
@@ -135,7 +135,7 @@ export.hash_memory
     # => [num_elements/8, num_elements%8, ptr]
 
     # get the end_addr for hash_memory_even procedure (end address for pairs of words)
-    mul.2 dup.2 add movup.2
+    mul.8 dup.2 add movup.2
     # => [ptr, end_addr, num_elements%8]
 
     # get the capacity element which is equal to num_elements%8
diff --git a/stdlib/asm/mem.masm b/stdlib/asm/mem.masm
index b420ab944a..e1941c2f76 100644
--- a/stdlib/asm/mem.masm
+++ b/stdlib/asm/mem.masm
@@ -61,7 +61,7 @@ end
 #! - The words C, B, and A are the RPO hasher state
 #!     - A is the capacity
 #!     - C,B are the rate portion of the state
-#! - The value `words = end_ptr - write_ptr` must be positive and even
+#! - The value `num_elements = end_ptr - write_ptr` must be positive and a multiple of 8
 #!
 #! Cycles: 10 + 9 * word_pairs
 export.pipe_double_words_to_memory.0
@@ -86,19 +86,19 @@ end
 #! Input: [num_words, write_ptr, ...]
 #! Output: [C, B, A, write_ptr', ...]
 #! Cycles:
-#!  even num_words: 41 + 9 * num_words / 2
-#!  odd num_words: 58 + 9 * round_down(num_words / 2)
+#!  even num_words: 43 + 9 * num_words / 2
+#!  odd num_words: 60 + 9 * round_down(num_words / 2)
 export.pipe_words_to_memory.0
   # check if there is an odd number of words (6 cycles)
   dup is_odd
   # => [is_odd, num_words, write_ptr, ...]
 
-  # copy is_odd, it defines if last last word requires padding (2 cycles)
+  # copy is_odd, it defines if last word requires padding (2 cycles)
   dup movdn.3
   # => [is_odd, num_words, write_ptr, needs_padding, ...]
 
-  # compute `end_ptr` with an even number of words (5 cycles)
-  sub dup.1 add swap
+  # compute `end_ptr` with an even number of words (7 cycles)
+  sub mul.4 dup.1 add swap
   # => [write_ptr, end_ptr, needs_padding, ...]
 
   # Prepare the capacity word. We use the padding rule which sets the first capacity
@@ -132,8 +132,8 @@ export.pipe_words_to_memory.0
 
     # - get the memory address that B' should be saved to
     # - update the write_ptr to point to the next address (4 cycles)
-    movup.8 dup.0 add.1 movdn.5
-    # => [write_ptr, B', write_ptr+1, A, ...]
+    movup.8 dup.0 add.4 movdn.5
+    # => [write_ptr, B', write_ptr+4, A, ...]
 
     # save data to memory (1 cycles)
     mem_storew
diff --git a/stdlib/tests/collections/mmr.rs b/stdlib/tests/collections/mmr.rs
index 5d9156c3de..cf9f8e12e9 100644
--- a/stdlib/tests/collections/mmr.rs
+++ b/stdlib/tests/collections/mmr.rs
@@ -42,21 +42,21 @@ fn test_num_peaks_to_message_size() {
     ";
 
     // minimum size is 16
-    build_test!(hash_size, &[1]).expect_stack(&[16]);
-    build_test!(hash_size, &[2]).expect_stack(&[16]);
-    build_test!(hash_size, &[3]).expect_stack(&[16]);
-    build_test!(hash_size, &[4]).expect_stack(&[16]);
-    build_test!(hash_size, &[7]).expect_stack(&[16]);
-    build_test!(hash_size, &[11]).expect_stack(&[16]);
-    build_test!(hash_size, &[16]).expect_stack(&[16]);
+    build_test!(hash_size, &[1]).expect_stack(&[16 * 4]);
+    build_test!(hash_size, &[2]).expect_stack(&[16 * 4]);
+    build_test!(hash_size, &[3]).expect_stack(&[16 * 4]);
+    build_test!(hash_size, &[4]).expect_stack(&[16 * 4]);
+    build_test!(hash_size, &[7]).expect_stack(&[16 * 4]);
+    build_test!(hash_size, &[11]).expect_stack(&[16 * 4]);
+    build_test!(hash_size, &[16]).expect_stack(&[16 * 4]);
 
     // after that, size is round to the next even number
-    build_test!(hash_size, &[17]).expect_stack(&[18]);
-    build_test!(hash_size, &[18]).expect_stack(&[18]);
-    build_test!(hash_size, &[19]).expect_stack(&[20]);
-    build_test!(hash_size, &[20]).expect_stack(&[20]);
-    build_test!(hash_size, &[21]).expect_stack(&[22]);
-    build_test!(hash_size, &[22]).expect_stack(&[22]);
+    build_test!(hash_size, &[17]).expect_stack(&[18 * 4]);
+    build_test!(hash_size, &[18]).expect_stack(&[18 * 4]);
+    build_test!(hash_size, &[19]).expect_stack(&[20 * 4]);
+    build_test!(hash_size, &[20]).expect_stack(&[20 * 4]);
+    build_test!(hash_size, &[21]).expect_stack(&[22 * 4]);
+    build_test!(hash_size, &[22]).expect_stack(&[22 * 4]);
 }
 
 #[test]
@@ -75,7 +75,7 @@ fn test_mmr_get_single_peak() -> Result<(), MerkleError> {
 
             begin
                 push.{num_leaves} push.1000 mem_store # leaves count
-                adv_push.4 push.1001 mem_storew dropw # MMR single peak
+                adv_push.4 push.1004 mem_storew dropw # MMR single peak
 
                 push.1000 push.{pos} exec.mmr::get
 
@@ -135,8 +135,8 @@ fn test_mmr_get_two_peaks() -> Result<(), MerkleError> {
 
             begin
                 push.{num_leaves} push.1000 mem_store # leaves count
-                adv_push.4 push.1001 mem_storew dropw # MMR first peak
-                adv_push.4 push.1002 mem_storew dropw # MMR second peak
+                adv_push.4 push.1004 mem_storew dropw # MMR first peak
+                adv_push.4 push.1008 mem_storew dropw # MMR second peak
 
                 push.1000 push.{pos} exec.mmr::get
 
@@ -187,7 +187,7 @@ fn test_mmr_tree_with_one_element() -> Result<(), MerkleError> {
 
         begin
             push.{num_leaves} push.1000 mem_store # leaves count
-            adv_push.4 push.1001 mem_storew dropw # MMR first peak
+            adv_push.4 push.1004 mem_storew dropw # MMR first peak
 
             push.1000 push.{pos} exec.mmr::get
 
@@ -213,9 +213,9 @@ fn test_mmr_tree_with_one_element() -> Result<(), MerkleError> {
 
         begin
             push.{num_leaves} push.1000 mem_store # leaves count
-            adv_push.4 push.1001 mem_storew dropw # MMR first peak
-            adv_push.4 push.1002 mem_storew dropw # MMR second peak
-            adv_push.4 push.1003 mem_storew dropw # MMR third peak
+            adv_push.4 push.1004 mem_storew dropw # MMR first peak
+            adv_push.4 push.1008 mem_storew dropw # MMR second peak
+            adv_push.4 push.1012 mem_storew dropw # MMR third peak
 
             push.1000 push.{pos} exec.mmr::get
 
@@ -235,13 +235,13 @@ fn test_mmr_unpack() {
     let number_of_leaves: u64 = 0b10101; // 3 peaks, 21 leaves
 
     // The hash data is not the same as the peaks, it is padded to 16 elements
-    let hash_data: [[Felt; 4]; 16] = [
+    let peaks: [[Felt; 4]; 16] = [
         // 3 peaks. These hashes are invalid, we can't produce data for any of these peaks (only
         // for testing)
         [ZERO, ZERO, ZERO, ONE],
         [ZERO, ZERO, ZERO, Felt::new(2)],
         [ZERO, ZERO, ZERO, Felt::new(3)],
-        // Padding, the MMR is padded to a minimum length o 16
+        // Padding, the MMR is padded to a minimum length of 16
         EMPTY_WORD,
         EMPTY_WORD,
         EMPTY_WORD,
@@ -256,10 +256,10 @@ fn test_mmr_unpack() {
         EMPTY_WORD,
         EMPTY_WORD,
     ];
-    let hash = hash_elements(&hash_data.concat());
+    let peaks_hash = hash_elements(&peaks.concat());
 
     // Set up the VM stack with the MMR hash, and its target address
-    let mut stack = felt_slice_to_ints(&*hash);
+    let mut stack = felt_slice_to_ints(&*peaks_hash);
     let mmr_ptr = 1000_u32;
     stack.insert(0, mmr_ptr as u64);
 
@@ -268,13 +268,13 @@ fn test_mmr_unpack() {
     let advice_stack = &[];
     let store = MerkleStore::new();
 
-    let mut map_data: Vec<Felt> = Vec::with_capacity(hash_data.len() + 1);
-    map_data.extend_from_slice(&[number_of_leaves.try_into().unwrap(), ZERO, ZERO, ZERO]);
-    map_data.extend_from_slice(&hash_data.as_slice().concat());
+    let mut mmr_mem_repr: Vec<Felt> = Vec::with_capacity(peaks.len() + 1);
+    mmr_mem_repr.extend_from_slice(&[number_of_leaves.try_into().unwrap(), ZERO, ZERO, ZERO]);
+    mmr_mem_repr.extend_from_slice(&peaks.as_slice().concat());
 
     let advice_map: &[(RpoDigest, Vec<Felt>)] = &[
         // Under the MMR key is the number_of_leaves, followed by the MMR peaks, and any padding
-        (hash, map_data),
+        (peaks_hash, mmr_mem_repr),
     ];
 
     let source = "
@@ -356,8 +356,7 @@ fn test_mmr_unpack_invalid_hash() {
 fn test_mmr_unpack_large_mmr() {
     let number_of_leaves: u64 = 0b11111111111111111; // 17 peaks
 
-    // The hash data is not the same as the peaks, it is padded to 16 elements
-    let hash_data: [[Felt; 4]; 18] = [
+    let peaks: [[Felt; 4]; 18] = [
         // These hashes are invalid, we can't produce data for any of these peaks (only for
         // testing)
         [ZERO, ZERO, ZERO, ONE],
@@ -380,10 +379,10 @@ fn test_mmr_unpack_large_mmr() {
         [ZERO, ZERO, ZERO, Felt::new(17)],
         EMPTY_WORD,
     ];
-    let hash = hash_elements(&hash_data.concat());
+    let peaks_hash = hash_elements(&peaks.concat());
 
     // Set up the VM stack with the MMR hash, and its target address
-    let mut stack = felt_slice_to_ints(&*hash);
+    let mut stack = felt_slice_to_ints(&*peaks_hash);
     let mmr_ptr = 1000_u32;
     stack.insert(0, mmr_ptr as u64);
 
@@ -392,13 +391,13 @@ fn test_mmr_unpack_large_mmr() {
     let advice_stack = &[];
     let store = MerkleStore::new();
 
-    let mut map_data: Vec<Felt> = Vec::with_capacity(hash_data.len() + 1);
-    map_data.extend_from_slice(&[number_of_leaves.try_into().unwrap(), ZERO, ZERO, ZERO]);
-    map_data.extend_from_slice(&hash_data.as_slice().concat());
+    let mut mmr_mem_repr: Vec<Felt> = Vec::with_capacity(peaks.len() + 1);
+    mmr_mem_repr.extend_from_slice(&[number_of_leaves.try_into().unwrap(), ZERO, ZERO, ZERO]);
+    mmr_mem_repr.extend_from_slice(&peaks.as_slice().concat());
 
     let advice_map: &[(RpoDigest, Vec<Felt>)] = &[
         // Under the MMR key is the number_of_leaves, followed by the MMR peaks, and any padding
-        (hash, map_data),
+        (peaks_hash, mmr_mem_repr),
     ];
 
     let source = "
@@ -497,8 +496,8 @@ fn test_mmr_pack() {
 
         begin
             push.3.1000 mem_store  # num_leaves, 2 peaks
-            push.1.1001 mem_store  # peak1
-            push.2.1002 mem_store  # peak2
+            push.1.1004 mem_store  # peak1
+            push.2.1008 mem_store  # peak2
 
             push.1000 exec.mmr::pack
 
@@ -587,7 +586,7 @@ fn test_mmr_two() {
 }
 
 #[test]
-fn test_mmr_large() {
+fn test_add_mmr_large() {
     let mmr_ptr = 1000;
     let source = format!(
         "
diff --git a/stdlib/tests/crypto/falcon.rs b/stdlib/tests/crypto/falcon.rs
index 5c41c90052..57cc538199 100644
--- a/stdlib/tests/crypto/falcon.rs
+++ b/stdlib/tests/crypto/falcon.rs
@@ -24,35 +24,38 @@ const Q: u64 = (M - 1) / 2;
 const N: usize = 512;
 const J: u64 = (N * M as usize * M as usize) as u64;
 
-const PROBABILISTIC_PRODUCT_SOURCE: &str = "
+#[test]
+fn test_set_to_zero() {
+    let source = "
     use.std::crypto::dsa::rpo_falcon512
 
     begin
-        #=> [PK, ...]
-        mem_load.0
-        #=> [h_ptr, PK, ...]
+        # write words to the first and last addresses of the region to be zeroed
+        push.1.2.3.4 mem_storew.1000 dropw
+        push.1.2.3.4 mem_storew.3044 dropw
 
-        exec.rpo_falcon512::load_h_s2_and_product
-        #=> [tau1, tau0, tau_ptr, ...]
-
-        exec.rpo_falcon512::powers_of_tau
-        #=> [zeros_ptr, ...]
+        # This address should be untouched
+        push.1.2.3.4 mem_storew.3048 dropw
 
+        push.1000
         exec.rpo_falcon512::set_to_zero
-        #=> [c_ptr, ...]
 
-        drop
-        #=> [...]
+        # Assert that output pointer is 1000 + 4 * 512 = 3048
+        push.3048 assert_eq
+    end
+    ";
 
-        push.512    # tau_ptr
-        push.1025   # z_ptr
-        push.0      # h ptr
+    let expected_memory = {
+        let mut memory = vec![0_u64; N * 4];
+        // addresses [3048, 3052) (not zeroed)
+        memory.extend_from_slice(&[1, 2, 3, 4]);
 
-        #=> [h_ptr, zeros_ptr, tau_ptr, ...]
+        memory
+    };
 
-        exec.rpo_falcon512::probabilistic_product
-    end
-    ";
+    let test = build_test!(source, &[]);
+    test.expect_stack_and_memory(&[], 1000_u32, &expected_memory);
+}
 
 #[test]
 fn test_falcon512_norm_sq() {
@@ -119,10 +122,40 @@ fn test_falcon512_powers_of_tau() {
     let stack_init = [tau_ptr.into(), tau_0, tau_1];
 
     let test = build_test!(source, &stack_init);
-    let expected_stack = &[<u32 as Into<u64>>::into(tau_ptr) + N as u64 + 1];
+    let expected_stack = &[u64::from(tau_ptr) + (N as u64 + 1_u64) * 4];
     test.expect_stack_and_memory(expected_stack, tau_ptr, &expected_memory);
 }
 
+const PROBABILISTIC_PRODUCT_SOURCE: &str = "
+    use.std::crypto::dsa::rpo_falcon512
+
+    begin
+        #=> [PK, ...]
+        mem_load.0
+        #=> [h_ptr, PK, ...]
+
+        exec.rpo_falcon512::load_h_s2_and_product
+        #=> [tau1, tau0, tau_ptr, ...]
+
+        exec.rpo_falcon512::powers_of_tau
+        #=> [zeros_ptr, ...]
+
+        exec.rpo_falcon512::set_to_zero
+        #=> [c_ptr, ...]
+
+        drop
+        #=> [...]
+
+        push.2048    # tau_ptr
+        push.4100   # zeros_ptr (tau_ptr + 4 * 513)
+        push.0      # h ptr
+
+        #=> [h_ptr, zeros_ptr, tau_ptr, ...]
+
+        exec.rpo_falcon512::probabilistic_product
+    end
+    ";
+
 #[test]
 fn test_falcon512_probabilistic_product() {
     // Create two random polynomials and multiply them.
@@ -174,7 +207,7 @@ fn test_falcon512_probabilistic_product_failure() {
     expect_exec_error_matches!(
         test,
         ExecutionError::FailedAssertion{ clk, err_code, err_msg }
-        if clk == RowIndex::from(17490) && err_code == 0 && err_msg.is_none()
+        if clk == RowIndex::from(18841) && err_code == 0 && err_msg.is_none()
     );
 }
 
diff --git a/stdlib/tests/crypto/rpo.rs b/stdlib/tests/crypto/rpo.rs
index fbc22c2385..391d5e1118 100644
--- a/stdlib/tests/crypto/rpo.rs
+++ b/stdlib/tests/crypto/rpo.rs
@@ -289,7 +289,7 @@ fn test_hash_memory() {
 
     begin
         push.1.2.3.4.1000 mem_storew dropw
-        push.5.0.0.0.1001 mem_storew dropw
+        push.5.0.0.0.1004 mem_storew dropw
         push.11
 
         push.5.1000
@@ -315,7 +315,7 @@ fn test_hash_memory() {
 
     begin
         push.1.2.3.4.1000 mem_storew dropw
-        push.5.6.7.8.1001 mem_storew dropw
+        push.5.6.7.8.1004 mem_storew dropw
         push.11
 
         push.8.1000
@@ -341,9 +341,9 @@ fn test_hash_memory() {
 
     begin
         push.1.2.3.4.1000 mem_storew dropw
-        push.5.6.7.8.1001 mem_storew dropw
-        push.9.10.11.12.1002 mem_storew dropw
-        push.13.14.15.0.1003 mem_storew dropw
+        push.5.6.7.8.1004 mem_storew dropw
+        push.9.10.11.12.1008 mem_storew dropw
+        push.13.14.15.0.1012 mem_storew dropw
         push.11
 
         push.15.1000
diff --git a/stdlib/tests/mem/mod.rs b/stdlib/tests/mem/mod.rs
index 773a0ade11..95930d3c6e 100644
--- a/stdlib/tests/mem/mod.rs
+++ b/stdlib/tests/mem/mod.rs
@@ -93,14 +93,15 @@ fn test_memcopy() {
 
 #[test]
 fn test_pipe_double_words_to_memory() {
-    let mem_addr = 1000;
+    let start_addr = 1000;
+    let end_addr = 1008;
     let source = format!(
         "
         use.std::mem
         use.std::sys
 
         begin
-            push.1002       # end_addr
+            push.{}         # end_addr
             push.{}         # write_addr
             padw padw padw  # hasher state
 
@@ -108,17 +109,17 @@ fn test_pipe_double_words_to_memory() {
 
             exec.sys::truncate_stack
         end",
-        mem_addr
+        end_addr, start_addr,
     );
 
     let operand_stack = &[];
     let data = &[1, 2, 3, 4, 5, 6, 7, 8];
     let mut expected_stack =
         felt_slice_to_ints(&build_expected_perm(&[0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8]));
-    expected_stack.push(1002);
+    expected_stack.push(end_addr);
     build_test!(source, operand_stack, &data).expect_stack_and_memory(
         &expected_stack,
-        mem_addr,
+        start_addr,
         data,
     );
 }
@@ -147,7 +148,7 @@ fn test_pipe_words_to_memory() {
     let operand_stack = &[];
     let data = &[1, 2, 3, 4];
     let mut expected_stack = felt_slice_to_ints(&build_expected_hash(data));
-    expected_stack.push(1001);
+    expected_stack.push(1004);
     build_test!(one_word, operand_stack, &data).expect_stack_and_memory(
         &expected_stack,
         mem_addr,
@@ -175,7 +176,7 @@ fn test_pipe_words_to_memory() {
     let operand_stack = &[];
     let data = &[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12];
     let mut expected_stack = felt_slice_to_ints(&build_expected_hash(data));
-    expected_stack.push(1003);
+    expected_stack.push(1012);
     build_test!(three_words, operand_stack, &data).expect_stack_and_memory(
         &expected_stack,
         mem_addr,
@@ -206,7 +207,7 @@ fn test_pipe_preimage_to_memory() {
     advice_stack.reverse();
     advice_stack.extend(data);
     build_test!(three_words, operand_stack, &advice_stack).expect_stack_and_memory(
-        &[1003],
+        &[1012],
         mem_addr,
         data,
     );

From 06750d4822e2e7a112272ff1173a5b8fb90b004f Mon Sep 17 00:00:00 2001
From: Philippe Laferriere <plafer@protonmail.com>
Date: Wed, 18 Dec 2024 17:04:54 -0500
Subject: [PATCH 05/19] changelog

---
 CHANGELOG.md | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 922fa9d0b0..eb3a28ebca 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,5 +1,8 @@
 # Changelog
 
+#### Highlights
+- [BREAKING] Memory is now element-addressable (#1598)
+
 #### Changes
 - [BREAKING] `Process` no longer takes ownership of the `Host` (#1571).
 - [BREAKING] `ProcessState` was converted from a trait to a struct (#1571).

From f07da49221f4a590855ace9531cc435a56791771 Mon Sep 17 00:00:00 2001
From: Philippe Laferriere <plafer@protonmail.com>
Date: Wed, 18 Dec 2024 17:07:52 -0500
Subject: [PATCH 06/19] more test fixes

---
 docs/src/user_docs/stdlib/mem.md             |   2 +-
 processor/src/trace/tests/chiplets/memory.rs |   4 +-
 stdlib/asm/crypto/fri/ext2fri.masm           |  40 +--
 stdlib/asm/crypto/hashes/keccak256.masm      | 280 +++++++++----------
 stdlib/asm/mem.masm                          |  10 +-
 stdlib/docs/mem.md                           |   2 +-
 stdlib/tests/crypto/fri/remainder.rs         |   6 +-
 stdlib/tests/mem/mod.rs                      |  45 ++-
 8 files changed, 196 insertions(+), 193 deletions(-)

diff --git a/docs/src/user_docs/stdlib/mem.md b/docs/src/user_docs/stdlib/mem.md
index 1c9360116d..2c837cdcab 100644
--- a/docs/src/user_docs/stdlib/mem.md
+++ b/docs/src/user_docs/stdlib/mem.md
@@ -3,7 +3,7 @@ Module `std::mem` contains a set of utility procedures for working with random a
 
 | Procedure   | Description   |
 | ----------- | ------------- |
-| memcopy | Copies `n` words from `read_ptr` to `write_ptr`.<br /><br />Stack transition looks as follows:<br /><br />[n, read_ptr, write_ptr, ...] -> [...]<br /><br />Cycles: 15 + 16n |
+| memcopy_words | Copies `n` words from `read_ptr` to `write_ptr`.<br /><br />Stack transition looks as follows:<br /><br />[n, read_ptr, write_ptr, ...] -> [...]<br /><br />Cycles: 15 + 16n |
 | pipe_double_words_to_memory | Moves an even number of words from the advice stack to memory.<br /><br />Input: [C, B, A, write_ptr, end_ptr, ...]<br />Output: [C, B, A, write_ptr, ...]<br /><br />Where:<br />- The words C, B, and A are the RPO hasher state<br />- A is the capacity<br />- C, B are the rate portion of the state<br />- The value `num_words = end_ptr - write_ptr` must be positive and even<br /><br />Cycles: 10 + 9 * num_words / 2 |
 | pipe_words_to_memory | Moves an arbitrary number of words from the advice stack to memory.<br /><br />Input: [num_words, write_ptr, ...]<br />Output: [HASH, write_ptr', ...]<br /><br />Where `HASH` is the sequential RPO hash of all copied words.<br /><br />Cycles:<br />- Even num_words: 48 + 9 * num_words / 2<br />- Odd num_words: 65 + 9 * round_down(num_words / 2) |
 | pipe_preimage_to_memory | Moves an arbitrary number of words from the advice stack to memory and asserts it matches the commitment.<br /><br />Input: [num_words, write_ptr, COM, ...]<br />Output: [write_ptr', ...]<br /><br />Cycles:<br />- Even num_words: 58 + 9 * num_words / 2<br /> - Odd num_words: 75 + 9 * round_down(num_words / 2) |
diff --git a/processor/src/trace/tests/chiplets/memory.rs b/processor/src/trace/tests/chiplets/memory.rs
index 35c97f0bf6..04acb7205b 100644
--- a/processor/src/trace/tests/chiplets/memory.rs
+++ b/processor/src/trace/tests/chiplets/memory.rs
@@ -1,7 +1,9 @@
 use miden_air::{
     trace::chiplets::{
         memory::{
-            MEMORY_ACCESS_ELEMENT, MEMORY_ACCESS_WORD, MEMORY_READ, MEMORY_READ_ELEMENT_LABEL, MEMORY_READ_WORD_LABEL, MEMORY_WRITE, MEMORY_WRITE_ELEMENT_LABEL, MEMORY_WRITE_WORD_LABEL
+            MEMORY_ACCESS_ELEMENT, MEMORY_ACCESS_WORD, MEMORY_READ, MEMORY_READ_ELEMENT_LABEL,
+            MEMORY_READ_WORD_LABEL, MEMORY_WRITE, MEMORY_WRITE_ELEMENT_LABEL,
+            MEMORY_WRITE_WORD_LABEL,
         },
         MEMORY_BATCH_COL_IDX, MEMORY_CLK_COL_IDX, MEMORY_CTX_COL_IDX,
         MEMORY_ELEMENT_OR_WORD_COL_IDX, MEMORY_IDX0_COL_IDX, MEMORY_IDX1_COL_IDX,
diff --git a/stdlib/asm/crypto/fri/ext2fri.masm b/stdlib/asm/crypto/fri/ext2fri.masm
index dc558d1c44..bc2effaccc 100644
--- a/stdlib/asm/crypto/fri/ext2fri.masm
+++ b/stdlib/asm/crypto/fri/ext2fri.masm
@@ -7,7 +7,7 @@
 #!    remaining part of word i.e. a3, a2 will be consumed during immediate next iteration
 #!    of computing β which involves invocation of accumulate_for_odd_index.
 #!
-#! Input: [ω, ν1, ν0, τ1, τ0, q_ptr - 1, ...]
+#! Input: [ω, ν1, ν0, τ1, τ0, q_ptr - 4, ...]
 #! Output: [a3, a2, ν1', ν0', τ1, τ0, q_ptr, ...]
 #!
 #! Cycles: 54
@@ -24,11 +24,11 @@ proc.accumulate_for_even_index
     movdn.2
     mul
 
-    # load <a0, a1, a2, a3> from q_ptr + 1
+    # load <a0, a1, a2, a3> from q_ptr + 4
     #
     # notice, first increment the memory address and then load from it.
     movup.6
-    add.1
+    add.4
     movdn.6
     push.0.0.0
     dup.9
@@ -102,9 +102,9 @@ end
 #!
 #! Cycles: 2802
 proc.compute_beta_64
-    # decrement starting address by 1, because we first increment and then load onto the stack
+    # decrement starting address by 4, because we first increment and then load onto the stack
     movup.2
-    sub.1
+    sub.4
     movdn.2
 
     push.0.0 # accumulator for β
@@ -393,7 +393,7 @@ end
 proc.compute_beta_32
     # decrement starting address by 1, because we first increment and then load onto the stack
     movup.2
-    sub.1
+    sub.4
     movdn.2
 
     push.0.0 # accumulator for β
@@ -556,37 +556,37 @@ end
 #!
 #! Cycles: 114
 proc.compute_alpha_64
-    padw dup.6 sub.1 swap.7
+    padw dup.6 sub.4 swap.7
     mem_loadw
-    #=> [a11, a10, a01, a00, τ1, τ0, p_ptr-1, ...]
+    #=> [a11, a10, a01, a00, τ1, τ0, p_ptr-4, ...]
     dup.5 dup.5
     ext2mul
     ext2add
-    #=> [acc1, acc0, τ1, τ0, p_ptr-1, ...]
+    #=> [acc1, acc0, τ1, τ0, p_ptr-4, ...]
 
-    movup.4 dup sub.1 movdn.5
+    movup.4 dup sub.4 movdn.5
     padw movup.4
     mem_loadw
-    #=> [a11, a10, a01, a00, acc1, acc0, τ1, τ0, p_ptr-1, ...]
+    #=> [a11, a10, a01, a00, acc1, acc0, τ1, τ0, p_ptr-4, ...]
     movup.5 movup.5
     dup.7 dup.7
-    #=> [τ1, τ0, acc1, acc0, a11, a10, a01, a00, τ1, τ0, p_ptr-1, ...]
+    #=> [τ1, τ0, acc1, acc0, a11, a10, a01, a00, τ1, τ0, p_ptr-4, ...]
     ext2mul
     ext2add
-    #=> [acc1, acc0, a01, a00, τ1, τ0, p_ptr-1, ...]
+    #=> [acc1, acc0, a01, a00, τ1, τ0, p_ptr-4, ...]
 
     dup.5 dup.5
     ext2mul
     ext2add
-    #=> [acc1, acc0, τ1, τ0, p_ptr-1, ...]
+    #=> [acc1, acc0, τ1, τ0, p_ptr-4, ...]
 
-    movup.4 dup sub.1 movdn.5
+    movup.4 dup sub.4 movdn.5
     padw movup.4
     mem_loadw
-    #=> [a11, a10, a01, a00, acc1, acc0, τ1, τ0, p_ptr-1, ...]
+    #=> [a11, a10, a01, a00, acc1, acc0, τ1, τ0, p_ptr-4, ...]
     movup.5 movup.5
     dup.7 dup.7
-    #=> [τ1, τ0, acc1, acc0, a11, a10, a01, a00, τ1, τ0, p_ptr-1, ...]
+    #=> [τ1, τ0, acc1, acc0, a11, a10, a01, a00, τ1, τ0, p_ptr-4, ...]
     ext2mul
     ext2add
     #=> [acc1, acc0, a01, a00, τ1, τ0, p_ptr-1, ...]
@@ -624,7 +624,7 @@ end
 #!
 #! Cycles: 47
 proc.compute_alpha_32
-    padw dup.6 sub.1 swap.7
+    padw dup.6 sub.4 swap.7
     mem_loadw
     dup.5 dup.5
     ext2mul
@@ -667,7 +667,7 @@ export.verify_remainder_64
     #=> [β1, β0, τ1, τ0, q_ptr, ...]
 
     # Pointer to the last word of the remainder polynomial for Horner evaluation.
-    movup.4 add.4
+    movup.4 add.16
     #=> [p_ptr, β1, β0, τ1, τ0, ...]
 
     # We need to multiply τ by the domain offset before evaluation.
@@ -714,7 +714,7 @@ export.verify_remainder_32
     #=> [β1, β0, τ1, τ0, q_ptr, ...]
 
     # Pointer to the last word of the remainder polynomial for Horner evaluation.
-    movup.4 add.2
+    movup.4 add.8
     #=> [p_ptr, β1, β0, τ1, τ0, ...]
 
     # We need to multiply τ by the domain offset before evaluation.
diff --git a/stdlib/asm/crypto/hashes/keccak256.masm b/stdlib/asm/crypto/hashes/keccak256.masm
index a342ee646d..59e407affd 100644
--- a/stdlib/asm/crypto/hashes/keccak256.masm
+++ b/stdlib/asm/crypto/hashes/keccak256.masm
@@ -14,7 +14,7 @@
 #! Whole keccak-p[1600, 24] state can be represented using fifty u32 elements i.e. 13 absolute memory addresses
 #! s.t. last two elements of 12 -th ( when indexed from zero ) memory address are zeroed.
 #!
-#! Consecutive memory addresses can be computed by repeated application of `add.1`.
+#! Consecutive memory addresses can be computed by repeated application of `add.4`.
 proc.theta.3
     dup
     locaddr.0
@@ -34,7 +34,7 @@ proc.theta.3
     drop
 
     movup.2
-    add.2
+    add.8
 
     # bring S[10], S[11]
     dup
@@ -56,7 +56,7 @@ proc.theta.3
     swap
 
     movup.2
-    add.3
+    add.12
 
     # bring S[20], S[21]
     dup
@@ -80,7 +80,7 @@ proc.theta.3
     swap
 
     movup.2
-    add.2
+    add.8
 
     # bring S[30], S[31]
     dup
@@ -102,7 +102,7 @@ proc.theta.3
     swap
 
     movup.2
-    add.3
+    add.12
 
     # bring S[40], S[41]
     push.0.0.0.0
@@ -140,7 +140,7 @@ proc.theta.3
     drop
 
     movup.2
-    add.3
+    add.12
 
     # bring S[12], S[13]
     dup
@@ -164,7 +164,7 @@ proc.theta.3
     swap
 
     movup.2
-    add.2
+    add.8
 
     # bring S[22], S[23]
     dup
@@ -186,7 +186,7 @@ proc.theta.3
     swap
 
     movup.2
-    add.3
+    add.12
 
     # bring S[32], S[33]
     dup
@@ -210,7 +210,7 @@ proc.theta.3
     swap
 
     movup.2
-    add.2
+    add.8
 
     # bring S[42], S[43]
     push.0.0.0.0
@@ -243,7 +243,7 @@ proc.theta.3
 
     locaddr.0
     mem_load
-    add.1
+    add.4
 
     # bring S[4], S[5]
     dup
@@ -257,7 +257,7 @@ proc.theta.3
     drop
 
     movup.2
-    add.2
+    add.8
 
     # bring S[14], S[15]
     dup
@@ -279,7 +279,7 @@ proc.theta.3
     swap
 
     movup.2
-    add.3
+    add.12
 
     # bring S[24], S[25]
     dup
@@ -303,7 +303,7 @@ proc.theta.3
     swap
 
     movup.2
-    add.2
+    add.8
 
     # bring S[34], S[35]
     dup
@@ -325,7 +325,7 @@ proc.theta.3
     swap
 
     movup.2
-    add.3
+    add.12
 
     # bring S[44], S[45]
     push.0.0.0.0
@@ -352,7 +352,7 @@ proc.theta.3
 
     locaddr.0
     mem_load
-    add.1
+    add.4
 
     # bring S[6], S[7]
     dup
@@ -364,7 +364,7 @@ proc.theta.3
     drop
 
     movup.2
-    add.3
+    add.12
 
     # bring S[16], S[17]
     dup
@@ -388,7 +388,7 @@ proc.theta.3
     swap
 
     movup.2
-    add.2
+    add.8
 
     # bring S[26], S[27]
     dup
@@ -410,7 +410,7 @@ proc.theta.3
     swap
 
     movup.2
-    add.3
+    add.12
 
     # bring S[36], S[37]
     dup
@@ -434,7 +434,7 @@ proc.theta.3
     swap
 
     movup.2
-    add.2
+    add.8
 
     # bring S[46], S[47]
     push.0.0.0.0
@@ -467,7 +467,7 @@ proc.theta.3
 
     locaddr.0
     mem_load
-    add.2
+    add.8
 
     # bring S[8], S[9]
     dup
@@ -481,7 +481,7 @@ proc.theta.3
     drop
 
     movup.2
-    add.2
+    add.8
 
     # bring S[18], S[19]
     dup
@@ -503,7 +503,7 @@ proc.theta.3
     swap
 
     movup.2
-    add.3
+    add.12
 
     # bring S[28], S[29]
     dup
@@ -527,7 +527,7 @@ proc.theta.3
     swap
 
     movup.2
-    add.2
+    add.8
 
     # bring S[38], S[39]
     dup
@@ -549,7 +549,7 @@ proc.theta.3
     swap
 
     movup.2
-    add.3
+    add.12
 
     # bring S[48], S[49]
     push.0.0.0.0
@@ -675,7 +675,7 @@ proc.theta.3
     mem_storew
     dropw
 
-    add.1
+    add.4
 
     # compute state[4..8)
 
@@ -706,7 +706,7 @@ proc.theta.3
     mem_storew
     dropw
 
-    add.1
+    add.4
 
     # compute state[8..12)
 
@@ -737,7 +737,7 @@ proc.theta.3
     mem_storew
     dropw
 
-    add.1
+    add.4
 
     # compute state[12..16)
 
@@ -768,7 +768,7 @@ proc.theta.3
     mem_storew
     dropw
 
-    add.1
+    add.4
 
     # compute state[16..20)
 
@@ -799,7 +799,7 @@ proc.theta.3
     mem_storew
     dropw
 
-    add.1
+    add.4
 
     # compute state[20..24)
 
@@ -830,7 +830,7 @@ proc.theta.3
     mem_storew
     dropw
 
-    add.1
+    add.4
 
     # compute state[24..28)
 
@@ -861,7 +861,7 @@ proc.theta.3
     mem_storew
     dropw
 
-    add.1
+    add.4
 
     # compute state[28..32)
 
@@ -892,7 +892,7 @@ proc.theta.3
     mem_storew
     dropw
 
-    add.1
+    add.4
 
     # compute state[32..36)
 
@@ -923,7 +923,7 @@ proc.theta.3
     mem_storew
     dropw
 
-    add.1
+    add.4
 
     # compute state[36..40)
 
@@ -954,7 +954,7 @@ proc.theta.3
     mem_storew
     dropw
 
-    add.1
+    add.4
 
     # compute state[40..44)
 
@@ -985,7 +985,7 @@ proc.theta.3
     mem_storew
     dropw
 
-    add.1
+    add.4
 
     # compute state[44..48)
 
@@ -1016,7 +1016,7 @@ proc.theta.3
     mem_storew
     dropw
 
-    add.1
+    add.4
 
     # compute state[48..50)
 
@@ -1054,7 +1054,7 @@ end
 #! Whole keccak-p[1600, 24] state can be represented using fifty u32 elements i.e. 13 absolute memory addresses
 #! s.t. last two elements of 12 -th ( when indexed from zero ) memory address are zeroed.
 #!
-#! Consecutive memory addresses can be computed by repeated application of `add.1`.
+#! Consecutive memory addresses can be computed by repeated application of `add.4`.
 proc.rho.1
     dup
     locaddr.0
@@ -1071,7 +1071,7 @@ proc.rho.1
 
     movup.4
     dup
-    add.1
+    add.4
     movdn.5
     mem_storew
 
@@ -1093,7 +1093,7 @@ proc.rho.1
 
     movup.4
     dup
-    add.1
+    add.4
     movdn.5
     mem_storew
 
@@ -1114,7 +1114,7 @@ proc.rho.1
 
     movup.4
     dup
-    add.1
+    add.4
     movdn.5
     mem_storew
 
@@ -1136,7 +1136,7 @@ proc.rho.1
 
     movup.4
     dup
-    add.1
+    add.4
     movdn.5
     mem_storew
 
@@ -1157,7 +1157,7 @@ proc.rho.1
 
     movup.4
     dup
-    add.1
+    add.4
     movdn.5
     mem_storew
 
@@ -1178,7 +1178,7 @@ proc.rho.1
 
     movup.4
     dup
-    add.1
+    add.4
     movdn.5
     mem_storew
 
@@ -1199,7 +1199,7 @@ proc.rho.1
 
     movup.4
     dup
-    add.1
+    add.4
     movdn.5
     mem_storew
 
@@ -1220,7 +1220,7 @@ proc.rho.1
 
     movup.4
     dup
-    add.1
+    add.4
     movdn.5
     mem_storew
 
@@ -1241,7 +1241,7 @@ proc.rho.1
 
     movup.4
     dup
-    add.1
+    add.4
     movdn.5
     mem_storew
 
@@ -1262,7 +1262,7 @@ proc.rho.1
 
     movup.4
     dup
-    add.1
+    add.4
     movdn.5
     mem_storew
 
@@ -1284,7 +1284,7 @@ proc.rho.1
 
     movup.4
     dup
-    add.1
+    add.4
     movdn.5
     mem_storew
 
@@ -1305,7 +1305,7 @@ proc.rho.1
 
     movup.4
     dup
-    add.1
+    add.4
     movdn.5
     mem_storew
 
@@ -1339,7 +1339,7 @@ end
 #! Whole keccak-p[1600, 24] state can be represented using fifty u32 elements i.e. 13 absolute memory addresses
 #! s.t. last two elements of 12 -th ( when indexed from zero ) memory address are zeroed.
 #!
-#! Consecutive memory addresses can be computed by repeated application of `add.1`.
+#! Consecutive memory addresses can be computed by repeated application of `add.4`.
 proc.pi.14
     dup
     locaddr.0
@@ -1366,12 +1366,12 @@ proc.pi.14
     movdn.3
 
     dup.5
-    add.5
+    add.20
     mem_storew
 
     # place state[4..8) to desired location(s)
     movup.4
-    add.1
+    add.4
     movdn.4
 
     dup.4
@@ -1382,19 +1382,19 @@ proc.pi.14
     movdn.3
 
     dup.7
-    add.10
+    add.40
     mem_storew
 
     drop
     drop
 
     dup.5
-    add.2
+    add.8
     mem_storew
 
     # place state[8..12) to desired location(s)
     movup.4
-    add.1
+    add.4
     movdn.4
 
     dup.4
@@ -1403,7 +1403,7 @@ proc.pi.14
     push.0.0
 
     dup.7
-    add.7
+    add.28
     mem_storew
 
     movup.2
@@ -1415,12 +1415,12 @@ proc.pi.14
     movdn.3
 
     dup.5
-    add.8
+    add.32
     mem_storew
 
     # place state[12..16) to desired location(s)
     movup.4
-    add.1
+    add.4
     movdn.4
 
     dup.4
@@ -1440,7 +1440,7 @@ proc.pi.14
     mem_storew
 
     dup.7
-    add.5
+    add.20
     mem_loadw
 
     movup.2
@@ -1449,19 +1449,19 @@ proc.pi.14
     drop
 
     dup.5
-    add.5
+    add.20
     mem_storew
 
     # place state[16..20) to desired location(s)
     movup.4
-    add.1
+    add.4
     movdn.4
 
     dup.4
     mem_loadw
 
     dup.5
-    add.10
+    add.40
     push.0.0.0.0
     movup.4
     mem_loadw
@@ -1472,7 +1472,7 @@ proc.pi.14
     drop
 
     dup.7
-    add.10
+    add.40
     mem_storew
 
     dropw
@@ -1482,19 +1482,19 @@ proc.pi.14
     movdn.3
 
     dup.5
-    add.3
+    add.12
     mem_storew
 
     # place state[20..24) to desired location(s)
     movup.4
-    add.1
+    add.4
     movdn.4
 
     dup.4
     mem_loadw
 
     dup.5
-    add.3
+    add.12
     push.0.0.0.0
     movup.4
     mem_loadw
@@ -1505,11 +1505,11 @@ proc.pi.14
     drop
 
     dup.7
-    add.3
+    add.12
     mem_storew
 
     dup.7
-    add.8
+    add.32
     mem_loadw
 
     movup.2
@@ -1518,12 +1518,12 @@ proc.pi.14
     drop
 
     dup.5
-    add.8
+    add.32
     mem_storew
 
     # place state[24..28) to desired location(s)
     movup.4
-    add.1
+    add.4
     movdn.4
 
     dup.4
@@ -1534,7 +1534,7 @@ proc.pi.14
     movdn.3
 
     dup.7
-    add.1
+    add.4
     mem_storew
 
     drop
@@ -1543,24 +1543,24 @@ proc.pi.14
     movdn.3
 
     dup.5
-    add.6
+    add.24
     mem_storew
 
     # place state[28..32) to desired location(s)
     movup.4
-    add.1
+    add.4
     movdn.4
 
     dup.4
     mem_loadw
 
     dup.5
-    add.11
+    add.44
     mem_storew
 
     # place state[32..36) to desired location(s)
     movup.4
-    add.1
+    add.4
     movdn.4
 
     dup.4
@@ -1571,7 +1571,7 @@ proc.pi.14
     movdn.3
 
     dup.7
-    add.4
+    add.16
     mem_storew
 
     drop
@@ -1580,19 +1580,19 @@ proc.pi.14
     movdn.3
 
     dup.5
-    add.9
+    add.36
     mem_storew
 
     # place state[36..40) to desired location(s)
     movup.4
-    add.1
+    add.4
     movdn.4
 
     dup.4
     mem_loadw
 
     dup.5
-    add.1
+    add.4
     push.0.0.0.0
     movup.4
     mem_loadw
@@ -1603,11 +1603,11 @@ proc.pi.14
     drop
 
     dup.7
-    add.1
+    add.4
     mem_storew
 
     dup.7
-    add.6
+    add.24
     mem_loadw
 
     movup.2
@@ -1616,19 +1616,19 @@ proc.pi.14
     drop
 
     dup.5
-    add.6
+    add.24
     mem_storew
 
     # place state[40..44) to desired location(s)
     movup.4
-    add.1
+    add.4
     movdn.4
 
     dup.4
     mem_loadw
 
     dup.5
-    add.7
+    add.28
     push.0.0.0.0
     movup.4
     mem_loadw
@@ -1639,7 +1639,7 @@ proc.pi.14
     movup.3
 
     dup.7
-    add.7
+    add.28
     mem_storew
 
     dropw
@@ -1649,19 +1649,19 @@ proc.pi.14
     movdn.3
 
     dup.5
-    add.12
+    add.48
     mem_storew
 
     # place state[44..48) to desired location(s)
     movup.4
-    add.1
+    add.4
     movdn.4
 
     dup.4
     mem_loadw
 
     dup.5
-    add.4
+    add.16
     push.0.0.0.0
     movup.4
     mem_loadw
@@ -1672,11 +1672,11 @@ proc.pi.14
     drop
 
     dup.7
-    add.4
+    add.16
     mem_storew
 
     dup.7
-    add.9
+    add.36
     mem_loadw
 
     movup.2
@@ -1685,19 +1685,19 @@ proc.pi.14
     drop
 
     dup.5
-    add.9
+    add.36
     mem_storew
 
     # place state[48..50) to desired location(s)
     movup.4
-    add.1
+    add.4
     movdn.4
 
     dup.4
     mem_loadw
 
     dup.5
-    add.2
+    add.8
     push.0.0.0.0
     movup.4
     mem_loadw
@@ -1708,7 +1708,7 @@ proc.pi.14
     movdn.3
 
     dup.7
-    add.2
+    add.8
     mem_storew
 
     drop
@@ -1729,11 +1729,11 @@ proc.pi.14
         mem_storew
 
         movup.4
-        add.1
+        add.4
         movdn.4
 
         movup.5
-        add.1
+        add.4
         movdn.5
     end
 
@@ -1758,7 +1758,7 @@ end
 #! Whole keccak-p[1600, 24] state can be represented using fifty u32 elements i.e. 13 absolute memory addresses
 #! s.t. last two elements of 12 -th ( when indexed from zero ) memory address are zeroed.
 #!
-#! Consecutive memory addresses can be computed by repeated application of `add.1`.
+#! Consecutive memory addresses can be computed by repeated application of `add.4`.
 proc.chi.4
     dup
     locaddr.0
@@ -1779,7 +1779,7 @@ proc.chi.4
     swap
 
     movup.2
-    add.1
+    add.4
     dup
     movdn.3
 
@@ -1830,7 +1830,7 @@ proc.chi.4
     swap
 
     movup.2
-    add.1
+    add.4
     dup
     movdn.3
 
@@ -1857,7 +1857,7 @@ proc.chi.4
     movup.3
 
     movup.4
-    sub.2
+    sub.8
     push.0.0.0.0
     movup.4
     mem_loadw
@@ -1928,7 +1928,7 @@ proc.chi.4
     mem_storew
 
     movup.4
-    add.1
+    add.4
     movdn.4
 
     dup.4
@@ -1961,7 +1961,7 @@ proc.chi.4
     mem_storew
 
     movup.4
-    add.1
+    add.4
     movdn.4
 
     dup.4
@@ -1979,7 +1979,7 @@ proc.chi.4
 
     # process state[10..20)
     movup.4
-    add.1
+    add.4
     movdn.4
 
     dup.4
@@ -2005,7 +2005,7 @@ proc.chi.4
     mem_storew
 
     movup.6
-    add.1
+    add.4
     dup
     movdn.7
 
@@ -2048,7 +2048,7 @@ proc.chi.4
     mem_storew
 
     movup.6
-    sub.2
+    sub.8
     dup
     movdn.7
 
@@ -2077,7 +2077,7 @@ proc.chi.4
     movup.3
 
     movup.4
-    add.1
+    add.4
     push.0.0.0.0
     movup.4
     mem_loadw
@@ -2110,7 +2110,7 @@ proc.chi.4
 
     locaddr.0
     mem_load
-    add.2
+    add.8
     dup
     movdn.5
 
@@ -2141,7 +2141,7 @@ proc.chi.4
     mem_storew
 
     movup.4
-    add.1
+    add.4
     movdn.4
 
     dup.4
@@ -2172,7 +2172,7 @@ proc.chi.4
     mem_storew
 
     movup.4
-    add.1
+    add.4
     movdn.4
 
     dup.4
@@ -2204,7 +2204,7 @@ proc.chi.4
 
     # process state[20..30)
     movup.4
-    add.1
+    add.4
     movdn.4
 
     dup.4
@@ -2219,7 +2219,7 @@ proc.chi.4
     swap
 
     movup.2
-    add.1
+    add.4
     movdn.2
 
     dup.2
@@ -2258,7 +2258,7 @@ proc.chi.4
     loc_storew.1
 
     movup.6
-    add.1
+    add.4
     movdn.6
 
     dup.6
@@ -2288,7 +2288,7 @@ proc.chi.4
     swap
 
     movup.4
-    sub.2
+    sub.8
     movdn.4
 
     dup.4
@@ -2366,7 +2366,7 @@ proc.chi.4
     mem_storew
 
     movup.4
-    add.1
+    add.4
     movdn.4
 
     dup.4
@@ -2397,7 +2397,7 @@ proc.chi.4
     mem_storew
 
     movup.4
-    add.1
+    add.4
     movdn.4
 
     dup.4
@@ -2429,7 +2429,7 @@ proc.chi.4
 
     # process state[30..40)
     movup.4
-    add.1
+    add.4
     movdn.4
 
     dup.4
@@ -2454,7 +2454,7 @@ proc.chi.4
     loc_storew.1
 
     movup.6
-    add.1
+    add.4
     movdn.6
 
     dup.6
@@ -2502,7 +2502,7 @@ proc.chi.4
     loc_storew.2
 
     movup.6
-    sub.2
+    sub.8
     movdn.6
 
     dup.6
@@ -2530,7 +2530,7 @@ proc.chi.4
     swap
 
     movup.4
-    add.1
+    add.4
     movdn.4
 
     dup.4
@@ -2564,7 +2564,7 @@ proc.chi.4
     loc_storew.3
 
     movup.4
-    sub.1
+    sub.4
     movdn.4
 
     dup.4
@@ -2595,7 +2595,7 @@ proc.chi.4
     mem_storew
 
     movup.4
-    add.1
+    add.4
     movdn.4
 
     dup.4
@@ -2626,7 +2626,7 @@ proc.chi.4
     mem_storew
 
     movup.4
-    add.1
+    add.4
     movdn.4
 
     dup.4
@@ -2658,7 +2658,7 @@ proc.chi.4
 
     # process state[40..50)
     movup.4
-    add.1
+    add.4
     movdn.4
 
     dup.4
@@ -2668,7 +2668,7 @@ proc.chi.4
     drop
 
     movup.2
-    add.1
+    add.4
     movdn.2
 
     dup.2
@@ -2718,7 +2718,7 @@ proc.chi.4
     loc_storew.1
 
     movup.6
-    add.1
+    add.4
     movdn.6
 
     dup.6
@@ -2748,7 +2748,7 @@ proc.chi.4
     swap
 
     movup.4
-    sub.2
+    sub.8
     movdn.4
 
     dup.4
@@ -2826,7 +2826,7 @@ proc.chi.4
     mem_storew
 
     movup.4
-    add.1
+    add.4
     movdn.4
 
     dup.4
@@ -2857,7 +2857,7 @@ proc.chi.4
     mem_storew
 
     movup.4
-    add.1
+    add.4
     movdn.4
 
     dup.4
@@ -2947,7 +2947,7 @@ end
 #! Whole keccak-p[1600, 24] state can be represented using fifty u32 elements i.e. 13 absolute memory addresses
 #! s.t. last two elements of 12 -th ( when indexed from zero ) memory address are zeroed.
 #!
-#! Consecutive memory addresses can be computed by repeated application of `add.1`.
+#! Consecutive memory addresses can be computed by repeated application of `add.4`.
 #!
 #! See https://github.com/itzmeanjan/merklize-sha/blob/1d35aae9da7fed20127489f362b4bc93242a516c/include/sha3.hpp#L325-L340
 proc.round
@@ -2977,7 +2977,7 @@ end
 #! Whole keccak-p[1600, 24] state can be represented using fifty u32 elements i.e. 13 absolute memory addresses
 #! s.t. last two elements of 12 -th ( when indexed from zero ) memory address are zeroed.
 #!
-#! Consecutive memory addresses can be computed by repeated application of `add.1`.
+#! Consecutive memory addresses can be computed by repeated application of `add.4`.
 #!
 #! See https://github.com/itzmeanjan/merklize-sha/blob/1d35aae9da7fed20127489f362b4bc93242a516c/include/sha3.hpp#L379-L427
 proc.keccak_p
@@ -3357,7 +3357,7 @@ end
 #! [state_addr, a0, a1, a2, a3, a4, a5, a6, a7, a8, a9, a10, a11, a12, a13, a14, a15, ...]
 #!
 #! Note, state_addr is the starting absolute memory address where keccak-p[1600, 24] state
-#! is kept. Consecutive addresses can be computed by repeated application of `add.1` instruction.
+#! is kept. Consecutive addresses can be computed by repeated application of `add.4` instruction.
 #!
 #! Final stack state :
 #!
@@ -3381,7 +3381,7 @@ proc.to_state_array
         mem_storew
         dropw
 
-        add.1
+        add.4
     end
 
     push.0.0.0.1
@@ -3389,56 +3389,56 @@ proc.to_state_array
     mem_storew
     dropw
 
-    add.1
+    add.4
 
     push.0.0.0.0
     dup.4
     mem_storew
     dropw
 
-    add.1
+    add.4
 
     push.0.0.0.0
     dup.4
     mem_storew
     dropw
 
-    add.1
+    add.4
 
     push.0.0.0.0
     dup.4
     mem_storew
     dropw
 
-    add.1
+    add.4
 
     push.0.0.2147483648.0
     dup.4
     mem_storew
     dropw
 
-    add.1
+    add.4
 
     push.0.0.0.0
     dup.4
     mem_storew
     dropw
 
-    add.1
+    add.4
 
     push.0.0.0.0
     dup.4
     mem_storew
     dropw
 
-    add.1
+    add.4
 
     push.0.0.0.0
     dup.4
     mem_storew
     dropw
 
-    add.1
+    add.4
 
     push.0.0.0.0
     movup.4
diff --git a/stdlib/asm/mem.masm b/stdlib/asm/mem.masm
index e1941c2f76..6cd7495378 100644
--- a/stdlib/asm/mem.masm
+++ b/stdlib/asm/mem.masm
@@ -4,10 +4,12 @@ use.std::crypto::hashes::rpo
 
 #! Copies `n` words from `read_ptr` to `write_ptr`.
 #!
+#! `read_ptr` and `write_ptr` *must be* word-aligned.
+#!
 #! Stack transition looks as follows:
 #! [n, read_ptr, write_ptr, ...] -> [...]
 #! cycles: 15 + 16n
-export.memcopy
+export.memcopy_words
   # The loop variable is changed with an add instead of sub because the former
   # uses one fewer cycle. So here the counter is negated. (1 cycles)
   # stack: [-n, read_ptr, write_ptr, ...]
@@ -37,11 +39,11 @@ export.memcopy
     # stack: [-n, read_ptr, write_ptr, x, 0, 0, 0, 0, ...]
     swapw
 
-    # stack: [-n+1, read_ptr+1, write_ptr+1, x, 0, 0, 0, 0, ...]
+    # stack: [-n+1, read_ptr+4, write_ptr+4, x, 0, 0, 0, 0, ...]
     # update counters (9 cycles)
-    add.1 movup.3 movup.3 add.1 movup.3 add.1 movup.3
+    add.1 movup.3 movup.3 add.4 movup.3 add.4 movup.3
 
-    # stack: [0, 0, 0, 0, -n+1, read_ptr+1, write_ptr+1, x, ...]
+    # stack: [0, 0, 0, 0, -n+1, read_ptr+4, write_ptr+4, x, ...]
     swapw
 
     dup.4 neq.0 # while(n!=0) (3 cycles)
diff --git a/stdlib/docs/mem.md b/stdlib/docs/mem.md
index 84fdb8134e..56c645cd8d 100644
--- a/stdlib/docs/mem.md
+++ b/stdlib/docs/mem.md
@@ -2,7 +2,7 @@
 ## std::mem
 | Procedure | Description |
 | ----------- | ------------- |
-| memcopy | Copies `n` words from `read_ptr` to `write_ptr`.<br /><br />Stack transition looks as follows:<br />[n, read_ptr, write_ptr, ...] -> [...]<br />cycles: 15 + 16n<br /> |
+| memcopy_words | Copies `n` words from `read_ptr` to `write_ptr`.<br /><br />`read_ptr` and `write_ptr` *must be* word-aligned.<br /><br />Stack transition looks as follows:<br />[n, read_ptr, write_ptr, ...] -> [...]<br />cycles: 15 + 16n<br /> |
 | pipe_double_words_to_memory | Copies an even number of words from the advice_stack to memory.<br /><br />Input: [C, B, A, write_ptr, end_ptr, ...]<br />Output: [C, B, A, write_ptr, ...]<br /><br />Where:<br />- The words C, B, and A are the RPO hasher state<br />- A is the capacity<br />- C,B are the rate portion of the state<br />- The value `words = end_ptr - write_ptr` must be positive and even<br /><br />Cycles: 10 + 9 * word_pairs<br /> |
 | pipe_words_to_memory | Copies an arbitrary number of words from the advice stack to memory<br /><br />Input: [num_words, write_ptr, ...]<br />Output: [C, B, A, write_ptr', ...]<br />Cycles:<br />even num_words: 41 + 9 * num_words / 2<br />odd num_words: 58 + 9 * round_down(num_words / 2)<br /> |
 | pipe_preimage_to_memory | Moves an arbitrary number of words from the advice stack to memory and asserts it matches the commitment.<br /><br />Input: [num_words, write_ptr, COM, ...]<br />Output: [write_ptr', ...]<br />Cycles:<br />even num_words: 62 + 9 * num_words / 2<br />odd num_words: 79 + 9 * round_down(num_words / 2)<br /> |
diff --git a/stdlib/tests/crypto/fri/remainder.rs b/stdlib/tests/crypto/fri/remainder.rs
index 33746d2af6..6542ece5f9 100644
--- a/stdlib/tests/crypto/fri/remainder.rs
+++ b/stdlib/tests/crypto/fri/remainder.rs
@@ -43,7 +43,7 @@ fn test_decorator_ext2intt(in_poly_len: usize, blowup: usize) {
             dup.4
             mem_storew
             dropw
-            sub.1
+            sub.4
         end
         drop
 
@@ -105,7 +105,7 @@ fn test_verify_remainder_64() {
             dup.4
             mem_storew
             dropw
-            sub.1
+            sub.4
         end
         drop
 
@@ -148,7 +148,7 @@ fn test_verify_remainder_32() {
             dup.4
             mem_storew
             dropw
-            sub.1
+            sub.4
         end
         drop
 
diff --git a/stdlib/tests/mem/mod.rs b/stdlib/tests/mem/mod.rs
index 95930d3c6e..f5efa22b14 100644
--- a/stdlib/tests/mem/mod.rs
+++ b/stdlib/tests/mem/mod.rs
@@ -5,7 +5,7 @@ use test_utils::{
 };
 
 #[test]
-fn test_memcopy() {
+fn test_memcopy_words() {
     use miden_stdlib::StdLibrary;
 
     let source = "
@@ -13,12 +13,12 @@ fn test_memcopy() {
 
     begin
         push.0.0.0.1.1000 mem_storew dropw
-        push.0.0.1.0.1001 mem_storew dropw
-        push.0.0.1.1.1002 mem_storew dropw
-        push.0.1.0.0.1003 mem_storew dropw
-        push.0.1.0.1.1004 mem_storew dropw
+        push.0.0.1.0.1004 mem_storew dropw
+        push.0.0.1.1.1008 mem_storew dropw
+        push.0.1.0.0.1012 mem_storew dropw
+        push.0.1.0.1.1016 mem_storew dropw
 
-        push.2000.1000.5 exec.mem::memcopy
+        push.2000.1000.5 exec.mem::memcopy_words
     end
     ";
 
@@ -37,31 +37,30 @@ fn test_memcopy() {
         Process::new(program.kernel().clone(), StackInputs::default(), ExecutionOptions::default());
     process.execute(&program, &mut host).unwrap();
 
-    // TODO(plafer): this will fail due to addresses being too close to each other
     assert_eq!(
         process.chiplets.memory().get_word(ContextId::root(), 1000).unwrap(),
         Some([ZERO, ZERO, ZERO, ONE]),
         "Address 1000"
     );
     assert_eq!(
-        process.chiplets.memory().get_word(ContextId::root(), 1001).unwrap(),
+        process.chiplets.memory().get_word(ContextId::root(), 1004).unwrap(),
         Some([ZERO, ZERO, ONE, ZERO]),
-        "Address 1001"
+        "Address 1004"
     );
     assert_eq!(
-        process.chiplets.memory().get_word(ContextId::root(), 1002).unwrap(),
+        process.chiplets.memory().get_word(ContextId::root(), 1008).unwrap(),
         Some([ZERO, ZERO, ONE, ONE]),
-        "Address 1002"
+        "Address 1008"
     );
     assert_eq!(
-        process.chiplets.memory().get_word(ContextId::root(), 1003).unwrap(),
+        process.chiplets.memory().get_word(ContextId::root(), 1012).unwrap(),
         Some([ZERO, ONE, ZERO, ZERO]),
-        "Address 1003"
+        "Address 1012"
     );
     assert_eq!(
-        process.chiplets.memory().get_word(ContextId::root(), 1004).unwrap(),
+        process.chiplets.memory().get_word(ContextId::root(), 1016).unwrap(),
         Some([ZERO, ONE, ZERO, ONE]),
-        "Address 1004"
+        "Address 1016"
     );
 
     assert_eq!(
@@ -70,24 +69,24 @@ fn test_memcopy() {
         "Address 2000"
     );
     assert_eq!(
-        process.chiplets.memory().get_word(ContextId::root(), 2001).unwrap(),
+        process.chiplets.memory().get_word(ContextId::root(), 2004).unwrap(),
         Some([ZERO, ZERO, ONE, ZERO]),
-        "Address 2001"
+        "Address 2004"
     );
     assert_eq!(
-        process.chiplets.memory().get_word(ContextId::root(), 2002).unwrap(),
+        process.chiplets.memory().get_word(ContextId::root(), 2008).unwrap(),
         Some([ZERO, ZERO, ONE, ONE]),
-        "Address 2002"
+        "Address 2008"
     );
     assert_eq!(
-        process.chiplets.memory().get_word(ContextId::root(), 2003).unwrap(),
+        process.chiplets.memory().get_word(ContextId::root(), 2012).unwrap(),
         Some([ZERO, ONE, ZERO, ZERO]),
-        "Address 2003"
+        "Address 2012"
     );
     assert_eq!(
-        process.chiplets.memory().get_word(ContextId::root(), 2004).unwrap(),
+        process.chiplets.memory().get_word(ContextId::root(), 2016).unwrap(),
         Some([ZERO, ONE, ZERO, ONE]),
-        "Address 2004"
+        "Address 2016"
     );
 }
 

From 51f11c33f308e74df663d9d3805aeea59e0f8a2e Mon Sep 17 00:00:00 2001
From: Philippe Laferriere <plafer@protonmail.com>
Date: Sat, 21 Dec 2024 03:48:45 -0500
Subject: [PATCH 07/19] fix: recursive verifier

---
 stdlib/asm/crypto/fri/helper.masm          |  36 ++--
 stdlib/asm/crypto/stark/constants.masm     | 155 ++++++++-------
 stdlib/asm/crypto/stark/deep_queries.masm  |  38 ++--
 stdlib/asm/crypto/stark/ood_frames.masm    |  52 ++---
 stdlib/asm/crypto/stark/public_inputs.masm |   2 +-
 stdlib/asm/crypto/stark/random_coin.masm   | 213 +++++++++++++--------
 stdlib/asm/crypto/stark/verifier.masm      |  10 +-
 7 files changed, 292 insertions(+), 214 deletions(-)

diff --git a/stdlib/asm/crypto/fri/helper.masm b/stdlib/asm/crypto/fri/helper.masm
index bf7b4ffeb5..884f8afa21 100644
--- a/stdlib/asm/crypto/fri/helper.masm
+++ b/stdlib/asm/crypto/fri/helper.masm
@@ -101,24 +101,25 @@ export.load_fri_layer_commitments
     neq
     while.true
         swapw               # [Y, num_layers, ptr_layer, y, y, ...]
-        adv_loadw           # [Com, num_layers, ptr_layer, y, y, ...]
+        adv_loadw           # [COM, num_layers, ptr_layer, y, y, ...]
 
         # Save FRI layer commitment
         dup.5
-        add.1
+        add.4
         swap.6
         mem_storew
-        #=> [Com, num_layers, ptr_layer + 1, y, y, ...]
+        #=> [COM, num_layers, ptr_layer + 4, y, y, ...]
 
         # Reseed
         exec.random_coin::reseed
-        # => [num_layers, ptr_layer + 1, y, y, ...]
+        # => [num_layers, ptr_layer + 4, y, y, ...]
 
         push.0.0.0.0
         exec.random_coin::get_rate_1
+        #=> [R1, ZERO, num_layers, ptr_layer + 4, y, y, ... ]
         push.0.0
         exec.constants::tmp5 mem_loadw
-        # => [lde_size, log2(lde_size), lde_generator, 0, a1, a0, Y, num_layers, ptr_layer + 1, y, y, ...]
+        # => [lde_size, log2(lde_size), lde_generator, 0, a1, a0, Y, num_layers, ptr_layer + 4, y, y, ...]
 
         # Compute and save to memory new lde_size and its new logarithm
         div.4
@@ -132,15 +133,15 @@ export.load_fri_layer_commitments
         movup.2 drop
         swapw
         dropw
-        # => [lde_size, log2(lde_size), a1, a0, num_layers, ptr_layer + 1, y, y, Y, ...]
+        # => [lde_size, log2(lde_size), a1, a0, num_layers, ptr_layer + 4, y, y, Y, ...]
 
         # Save [lde_size, log2(lde_size), a1, a0] in memory next to the layer commitment
         dup.5
-        add.1
+        add.4
         swap.6
         mem_storew
         swapw
-        # => [num_layers, ptr_layer + 2, y, y, lde_size, log2(lde_size), a1, a0, Y]
+        # => [num_layers, ptr_layer + 8, y, y, lde_size, log2(lde_size), a1, a0, Y]
 
         # Decrement the FRI layer counter
         sub.1
@@ -170,17 +171,22 @@ export.load_and_verify_remainder
     push.0.0.0.0
     adv_loadw
     exec.constants::tmp7 mem_storew
+    #=> [COM, ...]
 
     # Reseed with remainder commitment
     exec.random_coin::reseed
+    #=> [...]
 
     # adv_pipe the remainder codeword
     ## Get the length of remainder
+    # TODO(plafer): add a `padw` here
     exec.constants::tmp6 mem_loadw
     ## Compute the correct remainder pointer using length of remainder
     exec.constants::fri_com_ptr
+    #=> [fri_com_ptr, num_fri_layers, remainder_size, lde_size, lde_size]
+    
     swap
-    mul.2
+    mul.8
     add
     ## Store for later use
     exec.constants::tmp8 mem_storew
@@ -205,12 +211,12 @@ export.load_and_verify_remainder
         # coefficients first.
         adv_loadw
         dup.12
-        add.16
+        add.64
         mem_storew
         swapw
         adv_loadw
         dup.12
-        add.17
+        add.68
         mem_storew
         hperm
         # => [Y, Remainder_poly_com, Y, ptr_remainder, remainder_size, y, y]
@@ -246,23 +252,23 @@ export.load_and_verify_remainder
         # coefficients first.
         adv_loadw
         dup.12
-        add.32
+        add.128
         mem_storew
         swapw
         adv_loadw
         dup.12
-        add.33
+        add.132
         mem_storew
         hperm
 
         adv_loadw
         dup.12
-        add.34
+        add.136
         mem_storew
         swapw
         adv_loadw
         dup.12
-        add.35
+        add.140
         mem_storew
         hperm
         # => [Y, Remainder_poly_com, Y, ptr_remainder, remainder_size, y, y]
diff --git a/stdlib/asm/crypto/stark/constants.masm b/stdlib/asm/crypto/stark/constants.masm
index 8529917593..2d5c5bf9e7 100644
--- a/stdlib/asm/crypto/stark/constants.masm
+++ b/stdlib/asm/crypto/stark/constants.masm
@@ -5,7 +5,14 @@
 const.ROOT_UNITY=7277203076849721926
 const.DOMAIN_OFFSET=7
 const.DOMAIN_OFFSET_INV=2635249152773512046
-const.NUM_CONSTRAINT_COMPOSITION_COEF_MULTIPLIED_BY_TWO_ROUNDED_UP_TO_FOUR=224
+
+# TODO(plafer): remove "MULTIPLIED_BY_TWO"
+# Number of coefficients corresponds to the number of boundary + transition constraints 
+# (including auxiliary constraints)
+const.NUM_CONSTRAINT_COMPOSITION_COEF_MULTIPLIED_BY_TWO_ROUNDED_UP_TO_FOUR=232
+
+# Number of coefficients corresponds to "number of main & aux columns" + 8,
+# where "8" is the number of columns needed to store the constraint composition polynomial.
 const.NUM_DEEP_COMPOSITION_COEF_MULTIPLIED_BY_TWO_ROUNDED_UP_TO_FOUR=88
 
 
@@ -19,35 +26,37 @@ const.TRACE_DOMAIN_GENERATOR_PTR=4294799999
 const.PUBLIC_INPUTS_PTR=4294800000
 
 # OOD Frames
-# (70 + 7) * 2 * 2 Felt for current and next trace rows and 8 * 2 Felt for constraint composition
-# polynomials. Total memory slots required: ((70 + 7) * 2 * 2 + 8 * 2) / 4 = 81
+# (71 + 7) * 2 * 2 Felt for current and next trace rows and 8 * 2 Felt for constraint composition
+# polynomials. Memory slots:
+# OOD_TRACE_PTR: (71 + 7) * 2 * 2 = 312
+# OOD_CONSTRAINT_EVALS_PTR: 8 * 4 = 32 (each of the 8 evaluations is stored zero-padded in its own word)
 const.OOD_TRACE_PTR=4294900000
-const.OOD_CONSTRAINT_EVALS_PTR=4294900077
+const.OOD_CONSTRAINT_EVALS_PTR=4294900312
 
 # Current trace row
-# 70 Felt for main portion of trace, 7 * 2 Felt for auxiliary portion of trace and 8 * 2 Felt for
+# 71 Felt for main portion of trace, 7 * 2 Felt for auxiliary portion of trace and 8 * 2 Felt for
 # constraint composition polynomials. Since we store these with the padding to make each of the
-# three portions a multiple of 8, the number of slots required is (72 + 16 + 16) / 4 = 26
-const.CURRENT_TRACE_ROW_PTR=4294900100
+# three portions a multiple of 8, the number of slots required is 72 + 16 + 16 = 104
+const.CURRENT_TRACE_ROW_PTR=4294900400
 
 # Random elements
-# There are are currently 16 ExtFelt for a total of 32 Felt. Thus the number of slots required is 8.
-const.AUX_RAND_ELEM_PTR=4294900150
+# There are currently 16 ExtFelt for a total of 32 Felt. Thus the number of memory slots required is 32.
+const.AUX_RAND_ELEM_PTR=4294900600
 
-# We need 2 Felt for each constraint. We take 2800 slots as an upper bound
-const.COMPOSITION_COEF_PTR=4294900200
+# We need 2 Felt for each constraint. We take 11200 slots as an upper bound
+const.COMPOSITION_COEF_PTR=4294900800
 
 # We need 2 Felt for each trace column and each of the 8 constraint composition columns. We thus need
-# (70 + 7 + 8) * 2 Felt i.e. 43 memory slots.
+# (71 + 7 + 8) * 2 = 172 Felt i.e. 172 memory slots.
 # Note that there is a cap on the number of such coefficients so that the memory region allocated for
 # these coefficients does not overlap with the memory region storing the FRI queries.
-# This cap is of a 100 coefficients which is equivalent to 50 memory slots. This gives 150 memory
+# This cap is of a 100 coefficients which is equivalent to 200 memory slots. This gives 600 memory
 # slots for all of the FRI queries i.e., 150 FRI queries.
-const.DEEP_RAND_CC_PTR=4294903000
+const.DEEP_RAND_CC_PTR=4294912000
 
 # FRI
 #
-#       (FRI_COM_PTR - 150)    ---|
+#       (FRI_COM_PTR - 600)    ---|
 #              .
 #              .                  | <- FRI queries
 #              .
@@ -55,53 +64,53 @@ const.DEEP_RAND_CC_PTR=4294903000
 #              .
 #              .                  | <- FRI layer commitments and folding challenges
 #              .
-#       (FRI_COM_PTR + 32)     ---|
+#       (FRI_COM_PTR + 128)     ---|
 #              .
 #              .                  | <- Remainder codeword and polynomial
 #              .
-#       (FRI_COM_PTR + 66-1)   ---|
+#       (FRI_COM_PTR + 264-1)   ---|
 #
-# For each FRI layer, we need 2 memory slots, one for storing the FRI layer commitment and one for
+# For each FRI layer, we need 8 memory slots, 4 for storing the FRI layer commitment and 4 for
 # storing the word [a0, a1, log2(lde_size), lde_size] where a := (a0, a1) is the folding randomness
 # and lde_size is the size of the LDE domain. Since we are using a folding factor of 4 and the
 # maximal degree of the remainder polynomial that we allow is 7, an upper limit of 16 FRI layers is
-# ample and the number of memory slots we thus allocate for this is 32. Moreover, we allocate
-# an additional 32 slots for the remainder codeword and 2 for the remainder polynomial. These are
+# ample and the number of memory slots we thus allocate for this is 128. Moreover, we allocate
+# an additional 128 slots for the remainder codeword and 8 for the remainder polynomial. These are
 # expected to be laid out right after the FRI commitments.
-# The total number of slots thus becomes 66.
-const.FRI_COM_PTR=4294903200
+# The total number of slots thus becomes 264.
+const.FRI_COM_PTR=4294912800
 
 # Commitment to main, auxiliary and composition polynomials traces
-const.MAIN_TRACE_COM_PTR=4294903300
-const.AUX_TRACE_COM_PTR=4294903301
-const.COMPOSITION_POLY_COM_PTR=4294903302
+const.MAIN_TRACE_COM_PTR=4294913200
+const.AUX_TRACE_COM_PTR=4294913204
+const.COMPOSITION_POLY_COM_PTR=4294913208
 
 # Instant-specific constants
-const.LDE_SIZE_PTR=4294903303
-const.Z_PTR=4294903304
-const.NUM_QUERIES_PTR=4294903305
-const.TRACE_LENGTH_PTR=4294903306
-const.TRACE_LENGTH_LOG_PTR=4294903307
-const.GRINDING_FACTOR_PTR=4294903308
+const.LDE_SIZE_PTR=4294913212
+const.Z_PTR=4294913216
+const.NUM_QUERIES_PTR=4294913220
+const.TRACE_LENGTH_PTR=4294913224
+const.TRACE_LENGTH_LOG_PTR=4294913228
+const.GRINDING_FACTOR_PTR=4294913232
 
 # RPO capacity initialization words
-const.ZERO_WORD_PTR=4294903309
+const.ZERO_WORD_PTR=4294913236
 
 # State of RPO-based random coin
-const.C_PTR=4294903311
-const.R1_PTR=4294903312
-const.R2_PTR=4294903313
+const.C_PTR=4294913244
+const.R1_PTR=4294913248
+const.R2_PTR=4294913252
 
 # Address used for storing temporary values:
-const.TMP1=4294903315
-const.TMP2=4294903316
-const.TMP3=4294903317
-const.TMP4=4294903318
-const.TMP5=4294903319
-const.TMP6=4294903320
-const.TMP7=4294903321
-const.TMP8=4294903322
-const.TMP9=4294903323
+const.TMP1=4294913256
+const.TMP2=4294913260
+const.TMP3=4294913264
+const.TMP4=4294913268
+const.TMP5=4294913272
+const.TMP6=4294913276
+const.TMP7=4294913280
+const.TMP8=4294913284
+const.TMP9=4294913288
 
 
 
@@ -112,35 +121,35 @@ const.TMP9=4294903323
 #   | TRACE_DOMAIN_GENERATOR_PTR               |       4294799999        |
 #   | PUBLIC_INPUTS_PTR                        |       4294800000        |
 #   | OOD_TRACE_PTR                            |       4294900000        |
-#   | OOD_CONSTRAINT_EVALS_PTR                 |       4294900077        |
-#   | CURRENT_TRACE_ROW_PTR                    |       4294900100        |
-#   | AUX_RAND_ELEM_PTR                        |       4294900150        |
-#   | COMPOSITION_COEF_PTR                     |       4294900200        |
-#   | DEEP_RAND_CC_PTR                         |       4294903000        |
-#   | FRI_COM_PTR                              |       4294903200        |
-#   | MAIN_TRACE_COM_PTR                       |       4294903300        |
-#   | AUX_TRACE_COM_PTR                        |       4294903301        |
-#   | COMPOSITION_POLY_COM_PTR                 |       4294903302        |
-#   | LDE_SIZE_PTR                             |       4294903303        |
-#   | Z_PTR                                    |       4294903304        |
-#   | NUM_QUERIES_PTR                          |       4294903305        |
-#   | TRACE_LENGTH_PTR                         |       4294903306        |
-#   | TRACE_LENGTH_LOG_PTR                     |       4294903307        |
-#   | GRINDING_FACTOR_PTR                      |       4294903308        |
-#   | ZERO_WORD_PTR                            |       4294903309        |
-#   | ZERO_ZERO_ZERO_ONE_PTR                   |       4294903310        |
-#   | C_PTR                                    |       4294903311        |
-#   | R1_PTR                                   |       4294903312        |
-#   | R2_PTR                                   |       4294903313        |
-#   | TMP1                                     |       4294903315        |
-#   | TMP2                                     |       4294903316        |
-#   | TMP3                                     |       4294903317        |
-#   | TMP4                                     |       4294903318        |
-#   | TMP5                                     |       4294903319        |
-#   | TMP6                                     |       4294903320        |
-#   | TMP7                                     |       4294903321        |
-#   | TMP8                                     |       4294903322        |
-#   | TMP9                                     |       4294903323        |
+#   | OOD_CONSTRAINT_EVALS_PTR                 |       4294900312        |
+#   | CURRENT_TRACE_ROW_PTR                    |       4294900400        |
+#   | AUX_RAND_ELEM_PTR                        |       4294900600        |
+#   | COMPOSITION_COEF_PTR                     |       4294900800        |
+#   | DEEP_RAND_CC_PTR                         |       4294912000        |
+#   | FRI_COM_PTR                              |       4294912800        |
+#   | MAIN_TRACE_COM_PTR                       |       4294913200        |
+#   | AUX_TRACE_COM_PTR                        |       4294913204        |
+#   | COMPOSITION_POLY_COM_PTR                 |       4294913208        |
+#   | LDE_SIZE_PTR                             |       4294913212        |
+#   | Z_PTR                                    |       4294913216        |
+#   | NUM_QUERIES_PTR                          |       4294913220        |
+#   | TRACE_LENGTH_PTR                         |       4294913224        |
+#   | TRACE_LENGTH_LOG_PTR                     |       4294913228        |
+#   | GRINDING_FACTOR_PTR                      |       4294913232        |
+#   | ZERO_WORD_PTR                            |       4294913236        |
+#   | ZERO_ZERO_ZERO_ONE_PTR                   |       4294913240        |
+#   | C_PTR                                    |       4294913244        |
+#   | R1_PTR                                   |       4294913248        |
+#   | R2_PTR                                   |       4294913252        |
+#   | TMP1                                     |       4294913256        |
+#   | TMP2                                     |       4294913260        |
+#   | TMP3                                     |       4294913264        |
+#   | TMP4                                     |       4294913268        |
+#   | TMP5                                     |       4294913272        |
+#   | TMP6                                     |       4294913276        |
+#   | TMP7                                     |       4294913280        |
+#   | TMP8                                     |       4294913284        |
+#   | TMP9                                     |       4294913288        |
 #   +------------------------------------------+-------------------------+
 
 # ACCESSORS
diff --git a/stdlib/asm/crypto/stark/deep_queries.masm b/stdlib/asm/crypto/stark/deep_queries.masm
index 41c721c066..c36f7a701a 100644
--- a/stdlib/asm/crypto/stark/deep_queries.masm
+++ b/stdlib/asm/crypto/stark/deep_queries.masm
@@ -22,7 +22,7 @@ use.std::crypto::stark::constants
 #!                                                       \/
 #!
 #! +-------+-------+-------+-------+-------+-------+-------+-------+------+------+------+------+------+--------+--------+-----+
-#! |  T31  |  T30  |  T21  |  T20  |  T11  |  T10  |  T01  |  T00  |  p1' |  p0' |  r1' |  r0' |x_addr|z_addr+1|a_addr+b|  -  |
+#! |  T31  |  T30  |  T21  |  T20  |  T11  |  T10  |  T01  |  T00  |  p1' |  p0' |  r1' |  r0' |x_addr|z_addr+4|a_addr+b|  -  |
 #! +-------+-------+-------+-------+-------+-------+-------+-------+------+------+------+------+------+--------+--------------+
 #!
 #!
@@ -47,12 +47,12 @@ export.combine_aux
     # 2) Get a_addr and update it. This is done here before it becomes inaccessible.
 
     # Update a_addr
-    dup.14 add.1 swap.15
+    dup.14 add.4 swap.15
     #=> [a_addr, T01, T00, T31, T30, T21, T20, T11, T10, p1, p0, r1, r0, x_addr, z_addr, a_addr', 0]
 
     # 3) Load i-th OOD frame portion. This assumes that the OOD frame has been serialized with `current` and `next` rows interleaved.
     # This also updates the z_addr pointer.
-    dup.14 add.1 swap.15
+    dup.14 add.4 swap.15
     padw movup.4 mem_loadw
     #=> [Tgz1, Tgz0, Tz1, Tz0, a_addr, T01, T00, T31, T30, T21, T20, T11, T10, p1, p0, r1, r0, x_addr, z_addr', a_addr', 0]
 
@@ -144,6 +144,8 @@ proc.load_query_row
 
     ## Get main trace commitment and use it to get the leaf
     movdn.3 movdn.2
+    #=> [y, y, depth, index, query_ptr]
+
     push.0.0
     exec.constants::main_trace_com_ptr mem_loadw
     #=>[R, depth, index, query_ptr, ...]
@@ -158,12 +160,18 @@ proc.load_query_row
     exec.constants::current_trace_row_ptr
     swapw
     #=>[R, ptr, y, y, y, depth, index, query_ptr, ...]
+
+    # Set the first element of the capacity to the number of main trace columns (modulo 8)
     exec.constants::zero_word mem_loadw
-    add.6
+    add.7
     swap.3
+
+    # Set R1 and R2 to whatever (they will be overwritten by adv_pipe)
     padw
     padw
-    #=> [Y, Y, 0, 0, 0, 1, ptr, y, y, y]
+    #=> [Y, Y, 0, 0, 0, 7, ptr, y, y, y]
+
+    # Read the first 64 main trace columns in - the last 7 will be handled separately
     repeat.8
         adv_pipe hperm
     end
@@ -176,10 +184,10 @@ proc.load_query_row
     dropw
     adv_push.1
     adv_push.1
-    push.0
+    adv_push.1
     push.0
     ## Store the last 2 main segment columns
-    dup.12 add.1 mem_storew
+    dup.12 add.4 mem_storew
 
     ## Final hperm
     hperm
@@ -198,7 +206,7 @@ proc.load_query_row
     #=> [Y, ptr, y, y, y, depth, index, query_ptr, ...]
 
     ## increment ptr to account for the last two words we loaded from the advice tape
-    swapw add.2 swapw
+    swapw add.8 swapw
 
 
     # Aux trace part
@@ -225,7 +233,7 @@ proc.load_query_row
     push.0.0
 
     ## Store the last aux segment column
-    dup.12 add.1 mem_storew
+    dup.12 add.4 mem_storew
 
     ## Final hperm
     hperm
@@ -243,7 +251,7 @@ proc.load_query_row
     #=> [Y, ptr, y, y, y, depth, index, query_ptr, ...]
 
     ##increment ptr to account for column 9 and an additional +1 for the all zero word
-    swapw add.2 swapw
+    swapw add.8 swapw
 
 
     # Constraint composition trace part
@@ -349,7 +357,7 @@ proc.combine_main_trace_columns
     end
 
     mem_stream
-    repeat.6
+    repeat.7
         rcomb_base
     end
 end
@@ -473,6 +481,7 @@ export.compute_deep_composition_polynomial_queries
     exec.constants::fri_com_ptr
     dup.1
     # =>[query_ptr, query_end_ptr, query_ptr...]
+
     # Store the pointers to:
     # 1. random values for computing DEEP polynomial
     # 2. OOD evaluation frame
@@ -505,6 +514,7 @@ export.compute_deep_composition_polynomial_queries
 
     push.1
     while.true
+
         # I)
         #
         # Load the (main, aux, constraint)-traces rows associated with the current query and get
@@ -580,16 +590,16 @@ export.compute_deep_composition_polynomial_queries
         ## b) Store [eval0, eval1, index, poe]
         ##
         ## Cycles: 5
-        dup.4 add.1 swap.5
+        dup.4 add.4 swap.5
         mem_storew
-        #=> [poe, index, eval1, eval0, query_ptr+1, query_end_ptr, query_ptr, ...]
+        #=> [poe, index, eval1, eval0, query_ptr+4, query_end_ptr, query_ptr, ...]
 
         ## c) Prepare stack for next iteration
         ##
         ## Cycles: 4
         dup.5 dup.5
         neq
-        #=> [?, query_ptr+1, query_end_ptr, ...]
+        #=> [?, query_ptr+4, query_end_ptr, ...]
     end
     dropw drop drop
 end
diff --git a/stdlib/asm/crypto/stark/ood_frames.masm b/stdlib/asm/crypto/stark/ood_frames.masm
index b162d71b77..d6e3988550 100644
--- a/stdlib/asm/crypto/stark/ood_frames.masm
+++ b/stdlib/asm/crypto/stark/ood_frames.masm
@@ -9,34 +9,32 @@ use.std::crypto::hashes::rpo
 #! Output: [OOD_FRAME_HASH, ...]
 #! Cycles: 100
 export.load_evaluation_frame
-    # We have 70 main trace columns and 7 aux trace columns for a total of 154 base field elements
+    # We have 71 main trace columns and 7 aux trace columns for a total of 156 base field elements
     # per row. Since we have two rows, i.e. current and next, the total number of field elements
-    # making up the OOD evaluation frame is:
-    # 324 = 38 * 8 + 4
+    # making up the OOD evaluation frame is 156*2 = 312. We will be reading felts in 39 batches of 8 
+    # using `adv_pipe`: 312 = 39 * 8. 
     # The elements are stored from the stack as (a1_1, a1_0, a0_1, a0_0) where a0 is from the
     # current row and a1 from the next row.
 
     exec.constants::ood_trace_ptr
+    #=> [ood_trace_ptr ]
 
-    push.4.0.0.0
+    # Note: the first word is the capacity, where its first element is initialized with the number of elements to hash MODULO 8.
+    push.0.0.0.0
     padw padw
-    repeat.38
+    #=> [ZERO, ZERO, 0, 0, 0, 0, ood_trace_ptr]
+    repeat.39
         adv_pipe
         hperm
     end
-
-    # Load the last remaining word and pad with 1 followed by three 0
-    adv_loadw
-    dup.12 mem_storew
-    swapw
-    exec.constants::zero_word mem_loadw
-    hperm
+    #=> [R1, R2, C, ood_trace_ptr+312]
 
     dropw
     swapw
     dropw
     movup.4
     drop
+    #=> [R2]
 end
 
 #! Loads OOD constraint composition polynomial evaluation columns into memory and reseeds the random
@@ -61,18 +59,18 @@ export.load_constraint_evaluations
     dropw
 
     dup.1 dup.1 push.0.0
-    exec.constants::ood_constraint_evals_ptr add.1
+    exec.constants::ood_constraint_evals_ptr add.4
     mem_storew
 
     # Load value_2 and value_3
     adv_loadw
     dup.3 dup.3 push.0.0
-    exec.constants::ood_constraint_evals_ptr add.2
+    exec.constants::ood_constraint_evals_ptr add.8
     mem_storew
     dropw
 
     dup.1 dup.1 push.0.0
-    exec.constants::ood_constraint_evals_ptr add.3
+    exec.constants::ood_constraint_evals_ptr add.12
     mem_storew
 
     dropw
@@ -81,12 +79,12 @@ export.load_constraint_evaluations
     # Load value_4 and value_5
     adv_loadw
     dup.3 dup.3 push.0.0
-    exec.constants::ood_constraint_evals_ptr add.4
+    exec.constants::ood_constraint_evals_ptr add.16
     mem_storew
     dropw
 
     dup.1 dup.1 push.0.0
-    exec.constants::ood_constraint_evals_ptr add.5
+    exec.constants::ood_constraint_evals_ptr add.20
     mem_storew
     dropw
 
@@ -95,12 +93,12 @@ export.load_constraint_evaluations
     # Load value_6 and value_7
     adv_loadw
     dup.3 dup.3 push.0.0
-    exec.constants::ood_constraint_evals_ptr add.6
+    exec.constants::ood_constraint_evals_ptr add.24
     mem_storew
     dropw
 
     dup.1 dup.1 push.0.0
-    exec.constants::ood_constraint_evals_ptr add.7
+    exec.constants::ood_constraint_evals_ptr add.28
     mem_storew
     dropw
 
@@ -127,36 +125,38 @@ export.compute_Hz
     # => [0, 0, v0_1, v0_0, ptr, ...]
    
     # Load value_1
-    push.0.0 dup.6 add.1 mem_loadw
+    push.0.0 dup.6 add.4 mem_loadw
     # => [0, 0, v1_1, v1_0, v0_1, v0_0, ptr, ...]
 
     # Load value_2
-    push.0.0 dup.8 add.2 mem_loadw
+    push.0.0 dup.8 add.8 mem_loadw
     # => [0, 0, v2_1, v2_0, v1_1, v1_0, v0_1, v0_0, ptr, ...]
 
     # Load value_3
-    push.0.0 dup.10 add.3 mem_loadw
+    push.0.0 dup.10 add.12 mem_loadw
     # => [0, 0, v3_1, v3_0, v2_1, v2_0, v1_1, v1_0, v0_1, v0_0, ptr, ...]
 
     # Load value_4
-    push.0.0 dup.12 add.4 mem_loadw
+    push.0.0 dup.12 add.16 mem_loadw
     # => [0, 0, v4_1, v4_0, v3_1, v3_0, v2_1, v2_0, v1_1, v1_0, v0_1, v0_0, ptr, ...]
 
     # Load value_5
-    push.0.0 movup.14 movdn.4 dup.4 add.5 mem_loadw
+    push.0.0 movup.14 movdn.4 dup.4 add.20 mem_loadw
     # => [0, 0, v5_1, v5_0, ptr, v4_1, v4_0, v3_1, v3_0, v2_1, v2_0, v1_1, v1_0, v0_1, v0_0, ptr, ...]
 
     # Load value_6
-    push.0.0 dup.6 add.6 mem_loadw
+    push.0.0 dup.6 add.24 mem_loadw
     # => [0, 0, v6_1, v6_0, v5_1, v5_0, ptr, v4_1, v4_0, v3_1, v3_0, v2_1, v2_0, v1_1, v1_0, v0_1, v0_0, ptr, ...]
 
     # Load value_7
-    push.0.0 movup.8 add.7 mem_loadw
+    push.0.0 movup.8 add.28 mem_loadw
     # => [0, 0, v7_1, v7_0, v6_1, v6_0, v5_1, v5_0, ptr, v4_1, v4_0, v3_1, v3_0, v2_1, v2_0, v1_1, v1_0, v0_1, v0_0, ptr, ...]
 
     ## Load z^N where N is the length of the execution trace
     push.0.0
     exec.constants::z_ptr mem_loadw
+    # => [(z1, z0)^N, z1, z0, v7_1, v7_0, v6_1, v6_0, v5_1, v5_0, ptr, v4_1, v4_0, v3_1, v3_0, v2_1, v2_0, v1_1, v1_0, v0_1, v0_0, ptr, ...]
+
     movup.2 drop
     movup.2 drop
     # => [z1, z0, value_7, ... ,value_0]
diff --git a/stdlib/asm/crypto/stark/public_inputs.masm b/stdlib/asm/crypto/stark/public_inputs.masm
index a837288d3e..af8151d87b 100644
--- a/stdlib/asm/crypto/stark/public_inputs.masm
+++ b/stdlib/asm/crypto/stark/public_inputs.masm
@@ -5,7 +5,7 @@ use.std::crypto::stark::constants
 #! Load the public inputs in memory starting from the address referenced by `public_inputs_ptr`.
 #! In parallel, compute the hash of the public inputs being loaded. The hashing starts with
 #! capacity registers of the hash function set to `C` resulting from hashing the proof context.
-#! The ouptut D is the digest of the hashing.
+#! The output D is the digest of the hashing.
 #!
 #! Input: [public_inputs_ptr, C]
 #! Output: [D]
diff --git a/stdlib/asm/crypto/stark/random_coin.masm b/stdlib/asm/crypto/stark/random_coin.masm
index 63fe259c9f..3a655bf033 100644
--- a/stdlib/asm/crypto/stark/random_coin.masm
+++ b/stdlib/asm/crypto/stark/random_coin.masm
@@ -172,9 +172,9 @@ export.init_seed
     ## 4. number of auxiliary random values
     ## 5. trace length (this is already on the stack)
 
-    ## main segment width is 70 and there are 1 auxiliary segments
+    ## main segment width is 71 and there are 1 auxiliary segments
     ## of width 7 using 16 random extension field elements
-    push.0x46010710 
+    push.0x47010710 
     ## field modulus bytes (2 field elements)
     push.0x01 # lower half of the modulus
     push.0xffffffff # upper half of the modulus
@@ -233,10 +233,12 @@ end
 # =============================================================================================
 
 #! Generates a `num_tuples` tuples of random field elements and stores them in memory
-#! starting from address `dest_ptr`. Each memory address holds two tuples.
+#! starting from address `dest_ptr`. Each tuple uses 2 memory slots, i.e. each memory word holds two tuples.
 #! TODO: Generalize by keeping track of something similar to the `output` variable in `RpoRandomCoin`
 #! so that we keep track of already used randomness and know when there is a need to apply `hperm`.
 #!
+#! `dest_ptr` must be word-aligned.
+#!
 #! Input: [dest_ptr, num_tuples, ...]
 #! Output: [...]
 #!
@@ -257,14 +259,16 @@ proc.generate_random_coefficients
     dup.5 mem_storew
 
     exec.get_rate_2
-    dup.9 add.1 mem_storew
+    dup.9 add.4 mem_storew
     #=> [R2, R1, loop_ctr, dest_ptr, 0, 0, ..]
 
     exec.get_capacity
     swapdw
+    #=> [R1, loop_ctr, dest_ptr, 0, 0, C, R2 ..]
     swapw
-    swap add.2 swap
-    #=> [loop_ctr, dest_ptr, 0, 0, R1, C, R2, ..]
+    #=> [loop_ctr, dest_ptr, 0, 0, R1, C, R2 ..]
+    swap add.8 swap
+    #=> [loop_ctr, dest_ptr+8, 0, 0, R1, C, R2, ..]
 
     add.1 dup neq.0
 
@@ -273,8 +277,8 @@ proc.generate_random_coefficients
         swapw.3 hperm
         #=> [R2, R1, C, loop_ctr, dest_ptr, x, x, ...]
 
-        # save R2 to mem[dest+1]; we use dup.13 here because it takes only 1 cycle
-        dup.13 add.1 mem_storew
+        # save R2 to mem[dest+4]; we use dup.13 here because it takes only 1 cycle
+        dup.13 add.4 mem_storew
         #=> [R2, R1, C, loop_ctr, dest_ptr, x, x, ...]
 
         # save R1 to mem[dest]
@@ -285,8 +289,8 @@ proc.generate_random_coefficients
         swapw.3
         #=> [loop_ctr, dest_ptr, x, x, R1, C, R2, ...]
 
-        swap add.2 swap
-        #=> [loop_ctr, dest_ptr+2, x, x, R1, C, R2, ...]
+        swap add.8 swap
+        #=> [loop_ctr, dest_ptr+8, x, x, R1, C, R2, ...]
 
         add.1 dup
         #=> [loop_ctr+1, loop_ctr+1, dest_ptr+2, x, x, R1, C, R2, ...]
@@ -306,10 +310,12 @@ proc.generate_random_coefficients
 end
 
 #! Generates a `num_tuples` tuples of random field elements and stores them in memory
-#! starting from address `dest_ptr`. Each memory address holds one tuple.
+#! starting from address `dest_ptr`. Each memory word holds one tuple, e.g. `[0, 0, t0, t1]`.
 #! TODO: Generalize by keeping track of something similar to the `output` variable in `RpoRandomCoin`
 #! so that we keep track of already used randomness and know when there is a need to apply `hperm`.
 #!
+#! `dest_ptr` must be word-aligned.
+#!
 #! Input: [dest_ptr, num_tuples, ...]
 #! Output: [...]
 #!
@@ -333,33 +339,45 @@ proc.generate_random_coefficients_pad
     dup.4
     push.0.0
     dup.4
-    mem_storew
-    #=> [0, 0, a01, a00, dest_ptr, a11, a10, a01, a00, loop_ctr, dest_ptr, x, x, ...]
+    #=> [dest_ptr, 0, 0, a01, a00, dest_ptr, a11, a10, a01, a00, loop_ctr, dest_ptr, x, x, ...]
+
+    mem_storew dropw
+    #=> [dest_ptr, a11, a10, a01, a00, loop_ctr, dest_ptr, x, x, ...]
 
-    dropw
     dup.2 dup.2 push.0.0
-    movup.4 add.1 mem_storew
+    #=> [0, 0, a11, a10, dest_ptr, a11, a10, a01, a00, loop_ctr, dest_ptr, x, x, ...]
+
+    movup.4 add.4 mem_storew
     #=> [0, 0, a11, a10, a11, a10, a01, a00, loop_ctr, dest_ptr, x, x, ...]
 
     exec.constants::r2_ptr mem_loadw
-    dup.9  add.4 swap.10
-    #=> [dest_ptr, a31, a30, a21, a20, a11, a10, a01, a00, loop_ctr, dest_ptr+4, x, x, ...]
+    #=> [a31, a30, a21, a20, a11, a10, a01, a00, loop_ctr, dest_ptr, x, x, ...]
+
+    dup.9  add.16 swap.10
+    #=> [dest_ptr, a31, a30, a21, a20, a11, a10, a01, a00, loop_ctr, dest_ptr+16, x, x, ...]
 
     dup.4
     dup.4
     push.0.0
-    dup.4 add.2
-    mem_storew
-    #=> [0, 0, a21, a20, dest_ptr, a31, a30, a21, a20, a11, a10, a01, a00, loop_ctr, dest_ptr+4, x, x, ...]
-    dropw
+    #=> [0, 0, a21, a20, dest_ptr, a31, a30, a21, a20, a11, a10, a01, a00, loop_ctr, dest_ptr+16, x, x, ...]
+
+    dup.4 add.8
+    mem_storew dropw
+    #=> [dest_ptr, a31, a30, a21, a20, a11, a10, a01, a00, loop_ctr, dest_ptr+16, x, x, ...]
+
     dup.2 dup.2 push.0.0
-    movup.4 add.3 mem_storew
-    #=> [0, 0, a31, a30, a31, a30, a21, a20, a11, a10, a01, a00, loop_ctr, dest_ptr+4, x, x, ...]
+    #=> [0, 0, a31, a30, dest_ptr, a31, a30, a21, a20, a11, a10, a01, a00, loop_ctr, dest_ptr+16, x, x, ...]
+
+    movup.4 add.12 mem_storew
+    #=> [0, 0, a31, a30, a31, a30, a21, a20, a11, a10, a01, a00, loop_ctr, dest_ptr+16, x, x, ...]
 
     exec.constants::c_ptr mem_loadw
+    #=> [C, a31, a30, a21, a20, a11, a10, a01, a00, loop_ctr, dest_ptr+16, x, x, ...]
+
     swapdw
+    #=> [a11, a10, a01, a00, loop_ctr, dest_ptr+16, x, x, C, a31, a30, a21, a20, ...]
     swapw
-    #=> [loop_ctr, dest_ptr, 0, 0, R1, C, R2, ..]
+    #=> [loop_ctr, dest_ptr+16, x, x, R1, C, R2, ..]
 
     add.1 dup neq.0
 
@@ -368,36 +386,44 @@ proc.generate_random_coefficients_pad
         swapw.3 hperm
         #=> [R2, R1, C, loop_ctr, dest_ptr, x, x, ...]
 
-        # save R2 to mem[dest+1]; we use dup.13 here because it takes only 1 cycle
+        # save R2 to mem[dest+8] and mem[dest+12]; we use dup.13 here because it takes only 1 cycle
         dup.13
         dup.4 dup.4 push.0.0
-        dup.4 add.2
-        mem_storew
-        #=> [0, 0, a21, a20, dest_ptr, a31, a30, a21, a20, a11, a10, a01, a00, C, loop_ctr, dest_ptr, x, x, ...]
-        dropw
+        #=> [0, 0, a21, a20, dest_ptr, R2, R1, C, loop_ctr, dest_ptr, x, x, ...]
+
+        dup.4 add.8
+        mem_storew dropw
+        #=> [dest_ptr, a31, a30, a21, a20, a11, a10, a01, a00, C, loop_ctr, dest_ptr, x, x, ...]
+
         dup.2 dup.2 push.0.0
-        movup.4 add.3 mem_storew
-        #=> [0, 0, a31, a30, a31, a30, a21, a20, a11, a10, a01, a00, C, loop_ctr, dest_ptr, x, x, ...]
+        #=> [0, 0, a31, a30, dest_ptr, a31, a30, a21, a20, a11, a10, a01, a00, C, loop_ctr, dest_ptr, x, x, ...]
+
+        movup.4 add.12 mem_storew dropw
+        #=> [a31, a30, a21, a20, a11, a10, a01, a00, C, loop_ctr, dest_ptr, x, x, ...]
 
         # save R1 to mem[dest]
-        dropw
         swapw dup.13
+        #=> [dest_ptr, a11, a10, a01, a00, a31, a30, a21, a20, C, loop_ctr, dest_ptr, x, x, ...]
+
         dup.4 dup.4 push.0.0
-        dup.4
-        mem_storew
         #=> [0, 0, a01, a00, dest_ptr, a11, a10, a01, a00, a31, a30, a21, a20, C, loop_ctr, dest_ptr, x, x, ...]
-        dropw
+
+        dup.4
+        mem_storew dropw
+        #=> [dest_ptr, a11, a10, a01, a00, a31, a30, a21, a20, C, loop_ctr, dest_ptr, x, x, ...]
+
         dup.2 dup.2 push.0.0
-        movup.4 add.1 mem_storew
-        #=> [0, 0, a01, a00, a11, a10, a01, a00, a31, a30, a21, a20, C, loop_ctr, dest_ptr, x, x, ...]
+        #=> [0, 0, a11, a10, dest_ptr, a11, a10, a01, a00, a31, a30, a21, a20, C, loop_ctr, dest_ptr, x, x, ...]
+
+        movup.4 add.4 mem_storew dropw
+        #=> [a11, a10, a01, a00, a31, a30, a21, a20, C, loop_ctr, dest_ptr, x, x, ...]
 
         # reshuffle and update destination pointer and loop counter
-        dropw
         swapw
         swapw.3
         #=> [loop_ctr, dest_ptr, x, x, R1, C, R2, ...]
 
-        swap add.4 swap
+        swap add.16 swap
         #=> [loop_ctr, dest_ptr+2, x, x, R1, C, R2, ...]
 
         add.1 dup
@@ -405,7 +431,7 @@ proc.generate_random_coefficients_pad
 
         neq.0
     end
-
+    
     # Save the new state of the random coin
     dropw
     exec.constants::r1_ptr mem_storew
@@ -418,20 +444,21 @@ proc.generate_random_coefficients_pad
 end
 
 #! Draw a list of random extension field elements related to the auxiliary trace and store the list
-#! in memory from `aux_rand_elem_ptr` to `aux_rand_elem_ptr + 8 - 1`
+#! in memory from `aux_rand_elem_ptr` to `aux_rand_elem_ptr + 32 - 4`
 #!
 #! Input: [aux_rand_elem_ptr, ...]
 #! Output: [...]
 #! Cycles: 159
 export.generate_aux_randomness
 
+    # TODO(plafer): make 16 a constant in constants.masm
     push.16 swap
     exec.generate_random_coefficients
     #=> [...]
 end
 
 #! Draw constraint composition random coefficients and save them into memory in the region from
-#! `compos_coef_ptr` `compos_coef_ptr + 112 - 1` as `(r1_1, r1_0, r0_1, r0_0)`
+#! `compos_coef_ptr` to `compos_coef_ptr + NUM_COEFFS - 1` as `(r1_1, r1_0, r0_1, r0_0)`
 #!
 #! Input: [compos_coef_ptr, ...]
 #! Output: [...]
@@ -447,15 +474,15 @@ end
 #! Draw deep composition polynomial random coefficients and save them into memory in the region from
 #! `deep_rand_coef_ptr` to `deep_rand_coef_ptr + 89 - 1` as `(0, 0, r0_1, r0_0)`
 #! The number of coefficients is equal to:
-#! 1. (70 + 7) * 2 Felt for the main and auxiliary traces.
+#! 1. (71 + 7) * 2 Felt for the main and auxiliary traces.
 #! 2. 8 * 2 Felt for constraint polynomial.
-#! Total: 85 tuples of type (Felt, Felt)
+#! Total: 86 tuples of type (Felt, Felt)
 #!
 #! Input: [deep_rand_coef_ptr, ...]
 #! Output: [...]
 #! Cycles: 1624
 export.generate_deep_composition_random_coefficients
-    # note that 88 is the next number after 85 divisible by 4
+    # note that 88 is the next number after 86 divisible by 4
     exec.constants::num_deep_composition_coef_multiplied_by_two_and_rounded_up_to_4
     swap
     exec.generate_random_coefficients_pad
@@ -468,7 +495,7 @@ end
 
 #! Generate the OOD challenge point `z = (z0, z1)` and compute `z^N` where N is
 #! the trace length. The resulting word `[(z_1, z_0)^N, z1, z0]` is stored in the
-#! global memory address `exec.z_ptr` reservedfor it.
+#! global memory address `exec.z_ptr` reserved for it.
 #!
 #! Input: [X, ...]
 #! Output: [...]
@@ -508,11 +535,11 @@ end
 # =============================================================================================
 
 # Helper function for generating a list of indices that takes a word of random felts and saves
-# to memory region referenced by `ptr` 4 random integers in the range 0..=(mask+1).
+# to memory region (referenced by `ptr`) 4 random integers in the range 0..=(mask+1).
 # `depth` is saved next to each of the 4 integers for use in subsequent steps.
 #
 # Input: [R, ptr, mask, depth, ...]
-# Output:[...]
+# Output:[R, ptr+16, mask, depth, ...]
 #
 # Cycles: 100
 proc.generate_four_integers
@@ -525,22 +552,22 @@ proc.generate_four_integers
     push.0 movdn.3      # [r, depth, r0_hi, 0, R1, ptr, mask, depth, ...]
 
     # Store and update pointer
-    dup.8 add.1 swap.9  # [ptr, r, depth, r0_hi, 0, R1, ptr + 1, mask, depth, ...]
+    dup.8 add.4 swap.9  # [ptr, r, depth, r0_hi, 0, R1, ptr + 4, mask, depth, ...]
     mem_storew
-    dropw               # [R1, ptr + 1, mask, depth, ...]
+    dropw               # [R1, ptr + 4, mask, depth, ...]
 
     # Get the second random felt
-    dup.2               # [r1, R1, ptr, mask, depth, ...]
-    u32split swap       # [r1_lo, r1_hi, R1, ptr, mask, depth, ...]
-    dup.7               # [mask, r1_lo, r1_hi, R1, ptr, mask, depth, ...]
-    u32and              # [r, r1_hi, R1, ptr, mask, depth, ...]
-    dup.8 swap          # [r, depth, r1_hi, R1, ptr, mask, depth, ...]
-    push.0 movdn.3      # [r, depth, r1_hi, 0, R1, ptr, mask, depth, ...]
+    dup.2               # [r1, R1, ptr+4, mask, depth, ...]
+    u32split swap       # [r1_lo, r1_hi, R1, ptr+4, mask, depth, ...]
+    dup.7               # [mask, r1_lo, r1_hi, R1, ptr+4, mask, depth, ...]
+    u32and              # [r, r1_hi, R1, ptr+4, mask, depth, ...]
+    dup.8 swap          # [r, depth, r1_hi, R1, ptr+4, mask, depth, ...]
+    push.0 movdn.3      # [r, depth, r1_hi, 0, R1, ptr+4, mask, depth, ...]
 
     # Store and update pointer
-    dup.8 add.1 swap.9  # [ptr, r, depth, r1_hi, 0, R1, ptr + 1, mask, depth, ...]
+    dup.8 add.4 swap.9  # [ptr+4, r, depth, r1_hi, 0, R1, ptr+8, mask, depth, ...]
     mem_storew
-    dropw               # [R1, ptr + 1, mask, depth, ...]
+    dropw               # [R1, ptr + 8, mask, depth, ...]
 
     # Get the third random felt
     dup.1
@@ -551,7 +578,7 @@ proc.generate_four_integers
     push.0 movdn.3
 
     # Store and update pointer
-    dup.8 add.1 swap.9
+    dup.8 add.4 swap.9
     mem_storew
     dropw
 
@@ -564,7 +591,7 @@ proc.generate_four_integers
     push.0 movdn.3
 
     # Store and update pointer
-    dup.8 add.1 swap.9
+    dup.8 add.4 swap.9
     mem_storew
     dropw
 end
@@ -577,7 +604,7 @@ end
 # `depth` is saved next to each of the 3 integers for use in subsequent steps.
 #
 # Input: [R, ptr, mask, depth, ...]
-# Output:[R, ptr + 3, mask, depth, ...]
+# Output:[R, ptr + 12, mask, depth, ...]
 #
 # Cycles: 75
 proc.generate_three_integers
@@ -590,9 +617,9 @@ proc.generate_three_integers
     push.0 movdn.3      # [r, depth, r0_hi, 0, R1, ptr, mask, depth, ...]
 
     # Store and update pointer
-    dup.8 add.1 swap.9  # [ptr, r, depth, r0_hi, 0, R1, ptr + 1, mask, depth, ...]
+    dup.8 add.4 swap.9  # [ptr, r, depth, r0_hi, 0, R1, ptr + 4, mask, depth, ...]
     mem_storew
-    dropw               # [R1, ptr + 1, mask, depth, ...]
+    dropw               # [R1, ptr + 4, mask, depth, ...]
 
     # Get the second random felt
     dup.1               # [r1, R1, ptr, mask, depth, ...]
@@ -603,9 +630,9 @@ proc.generate_three_integers
     push.0 movdn.3      # [r, depth, r1_hi, 0, R1, ptr, mask, depth, ...]
 
     # Store and update pointer
-    dup.8 add.1 swap.9  # [ptr, r, depth, r1_hi, 0, R1, ptr + 1, mask, depth, ...]
+    dup.8 add.4 swap.9  # [ptr + 4, r, depth, r1_hi, 0, R1, ptr + 8, mask, depth, ...]
     mem_storew
-    dropw               # [R1, ptr + 1, mask, depth, ...]
+    dropw               # [R1, ptr + 8, mask, depth, ...]
 
     # Get the third random felt
     dup.0
@@ -616,7 +643,7 @@ proc.generate_three_integers
     push.0 movdn.3
 
     # Store and update pointer
-    dup.8 add.1 swap.9
+    dup.8 add.4 swap.9
     mem_storew
     dropw
 end
@@ -652,18 +679,22 @@ export.generate_list_indices
     # Load the first half of the rate portion of the state of the random coin. We discard the first
     # element as it is used for PoW and use the remaining the 3.
     exec.get_rate_1
+    #=> [R1, query_ptr, mask, depth, num_queries]
     exec.generate_three_integers
+    #=> [R1, query_ptr+12, mask, depth, num_queries]
 
     # Load the second half of the rate portion of the state of the random coin.
     exec.constants::r2_ptr mem_loadw
+    #=> [R2, query_ptr+12, mask, depth, num_queries]
     exec.generate_four_integers
-    #=> [R2, query_ptr, mask, depth, num_queries, ...]
+    #=> [R2, query_ptr+28, mask, depth, num_queries, ...]
 
     # Squeeze
     exec.constants::c_ptr mem_loadw
     exec.get_rate_1
     exec.get_rate_2
     hperm
+    #=> [R2', R1, C, query_ptr+28, mask, depth, num_queries, ...]
 
     # Save the new state
     exec.constants::r2_ptr mem_storew
@@ -674,7 +705,7 @@ export.generate_list_indices
     # => [C, R1]
     exec.constants::c_ptr mem_storew
     dropw
-    #=> [R1, query_ptr, mask, depth, num_queries, ...]
+    #=> [R1, query_ptr+28, mask, depth, num_queries, ...]
 
 
     # Use `num_queries` to iterate.
@@ -682,50 +713,63 @@ export.generate_list_indices
     ## Subtract the 7 elements we have already generated above.
     movup.7
     push.7 sub
+    #=> [num_queries-7, R1, query_ptr+28, mask, depth, ...]
 
     ## Divide by 8 to get the number of iterations
     u32assert u32divmod.8
-    #=> [remainder, quotient, X, query_ptr, mask, depth, ...]
+    #=> [num_queries_remainder, num_queries_quotient, X, query_ptr+28, mask, depth, ...]
 
     ## Save remainder for later use
     movdn.8
+    #=> [num_queries_quotient, X, query_ptr+28, mask, depth, num_queries_remainder, ...]
 
     ## Use `quotient` to iterate
     dup movdn.8
+    #=> [num_queries_quotient, X, query_ptr+28, mask, depth, num_queries_quotient, num_queries_remainder, ...]
+
     push.0 neq
     while.true
+        #=> [X, query_ptr', mask, depth, num_remaining_iterations, remainder, ...]
+
         exec.generate_four_integers
+        #=> [X, query_ptr'+16, mask, depth, num_remaining_iterations, remainder, ...]
 
         exec.constants::r2_ptr mem_loadw
         exec.generate_four_integers
-        #=> [R2, query_ptr, mask, depth, num_queries, ...]
+        #=> [R2, query_ptr'+32, mask, depth, num_remaining_iterations, remainder, ...]
 
         # Squeeze
         exec.constants::c_ptr mem_loadw
         exec.get_rate_1
         exec.get_rate_2
         hperm
+        #=> [R2, R1, C, query_ptr'+32, mask, depth, num_remaining_iterations, remainder, ...]
 
         # Save the new state
         exec.constants::r2_ptr mem_storew
         dropw
-        # => [R1, C]
+        #=> [R1, C, query_ptr'+32, mask, depth, num_remaining_iterations, remainder, ...]
         exec.constants::r1_ptr mem_storew
         swapw
-        # => [C, R1]
+        #=> [C, R1, query_ptr'+32, mask, depth, num_remaining_iterations, remainder, ...]
         exec.constants::c_ptr mem_storew
         dropw
-        #=> [R1, query_ptr, mask, depth, num_remaining_iterations, remainder, ...]
+        #=> [R1, query_ptr'+32, mask, depth, num_remaining_iterations, remainder, ...]
 
         movup.7 sub.1 dup movdn.8
+        #=> [num_remaining_iterations-1, R1, query_ptr'+32, mask, depth, num_remaining_iterations-1, remainder, ...]
+
         push.0 neq
     end
-
+    #=> [R1, query_ptr', mask, depth, 0, remainder, ...]
 
     ## Use remainder
+    ## Note: we rename the `remainder` variable to `num_queries`, as it now indicates the number of
+    ## queries left.
 
     ### Put the remaining number of queries to generate in the appropriate stack position
     movup.8 movdn.7
+    #=> [R1, query_ptr', mask, depth, num_queries, ...]
 
     ### Load the second half of the rate portion of the state of the random coin.
     padw exec.constants::r2_ptr mem_loadw
@@ -733,23 +777,29 @@ export.generate_list_indices
 
     ### Iterate over remainder
     dup.11 sub.1 swap.12
+    #=> [num_queries, R2, R1, query_ptr, mask, depth, num_queries-1, ...]
+
     neq.0
     while.true
+    #=> [R2, R1, query_ptr, mask, depth, num_queries, ...]
         movup.7
-        u32split swap       # [r0_lo, r0_hi, R2, r3, r2, r1, ptr, mask, depth, ...]
-        dup.10              # [mask, r0_lo, r0_hi, R2, r3, r2, r1, ptr, mask, depth, ...]
-        u32and              # [r, r0_hi, R2, r3, r2, r1, ptr, mask, depth, ...]
-        dup.11 swap         # [r, depth, r0_hi, R2, r3, r2, r1, ptr, mask, depth, ...]
-        push.0 movdn.3      # [r, depth, r0_hi, 0, R2, r3, r2, r1, ptr, mask, depth, ...]
+        u32split swap       # [r0_lo, r0_hi, R2, r3, r2, r1, ptr, mask, depth, num_queries, ...]
+        dup.10              # [mask, r0_lo, r0_hi, R2, r3, r2, r1, ptr, mask, depth, num_queries, ...]
+        u32and              # [r, r0_hi, R2, r3, r2, r1, ptr, mask, depth, num_queries, ...]
+        dup.11 swap         # [r, depth, r0_hi, R2, r3, r2, r1, ptr, mask, depth, num_queries, ...]
+        push.0 movdn.3      # [r, depth, r0_hi, 0, R2, r3, r2, r1, ptr, mask, depth, num_queries, ...]
 
         # Store and update pointer
-        dup.11 add.1 swap.12  # [ptr, r, depth, r0_hi, 0, R2, r3, r2, r1, ptr + 1, mask, depth, ...]
+        dup.11 add.4 swap.12  # [ptr, r, depth, r0_hi, 0, R2, r3, r2, r1, ptr + 4, mask, depth, num_queries, ...]
         mem_storew
-        drop drop drop               # [x, R2, r3, r2, r1, ptr + 1, mask, depth, ...]
+        drop drop drop               # [x, R2, r3, r2, r1, ptr + 4, mask, depth, num_queries, ...]
         dup.11 sub.1 swap.12
+        #=> [num_queries, x, R2, r3, r2, r1, ptr + 4, mask, depth, num_queries-1, ...]
         push.0 neq
     end
+    #=> [R2, R1, query_ptr, mask, depth, 0, ...]
 
+    # TODO(plafer): not sure why the extra `drop` is needed? Don't we have exactly 3 words on stack?
     dropw dropw  dropw drop
 end
 
@@ -772,6 +822,7 @@ export.check_pow
 
     # Load Capacity portion
     exec.get_capacity
+    #=> [C, mask, ...]
 
     # Load first half of rate portion and add pow witness to first element of rate
     exec.get_rate_1
diff --git a/stdlib/asm/crypto/stark/verifier.masm b/stdlib/asm/crypto/stark/verifier.masm
index 0183fe2a78..fb587b38ce 100644
--- a/stdlib/asm/crypto/stark/verifier.masm
+++ b/stdlib/asm/crypto/stark/verifier.masm
@@ -14,9 +14,9 @@ use.std::crypto::stark::utils
 #!   - The maximal allowed degree of the remainder polynomial is 7.
 #!   - The public inputs are composed of the input and output stacks, of fixed size equal to 16.
 #!   - There are two trace segments, main and auxiliary. It is assumed that the main trace segment
-#!   is 70 columns wide while the auxiliary trace segment is 7 columns wide.
+#!   is 71 columns wide while the auxiliary trace segment is 7 columns wide.
 #!   - The OOD evaluation frame is composed of two interleaved rows, current and next, each composed
-#!    of 70 elements representing the main trace portion and 7 elements for the auxiliary trace one.
+#!    of 71 elements representing the main trace portion and 7 elements for the auxiliary trace one.
 #!   - To boost soundness, the protocol is run on a quadratic extension field and this means that
 #!    the OOD evaluation frame is composed of elements in a quadratic extension field i.e. tuples.
 #!    Similarly, elements of the auxiliary trace are quadratic extension field elements. The random
@@ -215,7 +215,9 @@ export.verify
     # the first layer commitment and total number of queries.
     exec.constants::fri_com_ptr
     exec.constants::number_queries_ptr mem_load
-    dup movdn.2
+    dup movdn.2 mul.4
+    #=> [num_queries*4, fri_com_ptr, num_queries, ...]
+
     sub
     #=> [query_ptr, num_queries, ...]
 
@@ -226,7 +228,7 @@ export.verify
     exec.random_coin::generate_list_indices
     #=> [query_ptr, ...]
 
-    # Compute deep compostion polynomial queries
+    # Compute deep composition polynomial queries
     #
     # Cycles: 24 + num_queries * 445
     #=> [query_ptr, ...]

From 6f3c11bc3e5ebfa8440397121503d8c88a47e348 Mon Sep 17 00:00:00 2001
From: Philippe Laferriere <plafer@protonmail.com>
Date: Sat, 21 Dec 2024 13:38:39 -0500
Subject: [PATCH 08/19] fix sha256

---
 stdlib/asm/crypto/hashes/sha256.masm     | 21 ++++++++++++---------
 stdlib/asm/crypto/stark/random_coin.masm |  1 -
 stdlib/tests/crypto/sha256.rs            |  6 +++---
 3 files changed, 15 insertions(+), 13 deletions(-)

diff --git a/stdlib/asm/crypto/hashes/sha256.masm b/stdlib/asm/crypto/hashes/sha256.masm
index 7b627c1114..efbf6e139a 100644
--- a/stdlib/asm/crypto/hashes/sha256.masm
+++ b/stdlib/asm/crypto/hashes/sha256.masm
@@ -1574,8 +1574,8 @@ export.hash_memory.12
     push.55 loc_load.1 u32wrapping_sub push.63 u32and
     loc_load.1 u32assert2 u32overflowing_add assertz u32assert u32overflowing_add.9 assertz loc_store.2
 
-    # loc.3 (last memory address in padding): input_address + padded_length / 16 - 1
-    loc_load.2 u32assert u32div.16 loc_load.0 u32wrapping_add u32wrapping_sub.1 loc_store.3
+    # loc.3 (last word address in padding): input_address + (padded_length / 4) - 4
+    loc_load.2 u32assert u32div.4 loc_load.0 u32wrapping_add u32wrapping_sub.4 loc_store.3
 
     # loc.4 (u32 aligned padding byte): 0x80000000 >> ((input_length % 4) * 8)
     loc_load.1 u32assert u32mod.4 u32assert u32overflowing_mul.8 assertz push.0x80000000 swap u32shr loc_store.4
@@ -1583,15 +1583,18 @@ export.hash_memory.12
     # loc.5 (memory offset of first padding byte): (input_length / 4) % 4
     loc_load.1 u32assert u32div.4 u32mod.4 loc_store.5
 
-    # loc.6 (memory address of first padding byte): input_address + (len / 16)
-    loc_load.0 loc_load.1 u32assert u32div.16 u32assert2 u32overflowing_add assertz loc_store.6
+    # loc.6 (memory address of first padding byte): input_address + (len / 16) * 4
+    # Note: (len / 16) * 4 is *not* the same as (len / 4), because the integer division rounds down (floor)
+    loc_load.0 loc_load.1 u32assert u32div.16 u32assert2 mul.4 u32overflowing_add assertz loc_store.6
+
 
     # loc.7 (number of remaining 512-bit blocks to consume): padded_length / 64
     loc_load.2 u32assert u32div.64 loc_store.7
 
     # Set the first byte after the message to 0x80
     padw loc_load.6 mem_loadw loc_store.8 loc_store.9 loc_store.10 loc_store.11
-    locaddr.8 loc_load.5 u32wrapping_add dup mem_load loc_load.4 u32wrapping_add swap mem_store
+    # Note: We have to `mul.4` here because locals are spread 4 addresses apart.
+    locaddr.8 loc_load.5 mul.4 u32wrapping_add dup mem_load loc_load.4 u32wrapping_add swap mem_store
     loc_load.11 loc_load.10 loc_load.9 loc_load.8 loc_load.6 mem_storew dropw
 
     # Set message length in bits at end of padding
@@ -1606,13 +1609,13 @@ export.hash_memory.12
     # Consume sha256 blocks
     loc_load.7 u32assert neq.0
     while.true
-        padw loc_load.0 u32assert u32overflowing_add.3 assertz mem_loadw movdnw.2
-        padw loc_load.0 u32assert u32overflowing_add.2 assertz mem_loadw movdnw.2
-        padw loc_load.0 u32assert u32overflowing_add.1 assertz mem_loadw movdnw.2
+        padw loc_load.0 u32assert u32overflowing_add.12 assertz mem_loadw movdnw.2
+        padw loc_load.0 u32assert u32overflowing_add.8 assertz mem_loadw movdnw.2
+        padw loc_load.0 u32assert u32overflowing_add.4 assertz mem_loadw movdnw.2
         padw loc_load.0 u32assert u32overflowing_add.0 assertz mem_loadw movdnw.2
         exec.prepare_message_schedule_and_consume
 
-        loc_load.0 u32assert u32overflowing_add.4 assertz loc_store.0
+        loc_load.0 u32assert u32overflowing_add.16 assertz loc_store.0
         loc_load.7 u32assert u32overflowing_sub.1 assertz dup loc_store.7
         u32assert neq.0
     end
diff --git a/stdlib/asm/crypto/stark/random_coin.masm b/stdlib/asm/crypto/stark/random_coin.masm
index 3a655bf033..f52ac05f56 100644
--- a/stdlib/asm/crypto/stark/random_coin.masm
+++ b/stdlib/asm/crypto/stark/random_coin.masm
@@ -799,7 +799,6 @@ export.generate_list_indices
     end
     #=> [R2, R1, query_ptr, mask, depth, 0, ...]
 
-    # TODO(plafer): not sure why the extra `drop` is needed? Don't we have exactly 3 words on stack?
     dropw dropw  dropw drop
 end
 
diff --git a/stdlib/tests/crypto/sha256.rs b/stdlib/tests/crypto/sha256.rs
index e6a371b9c3..e7df08311c 100644
--- a/stdlib/tests/crypto/sha256.rs
+++ b/stdlib/tests/crypto/sha256.rs
@@ -38,15 +38,15 @@ fn sha256_hash_memory() {
         # mem.2 - length in felts
         mem_load.1 u32assert u32overflowing_add.3 assertz u32assert u32div.4 mem_store.2
 
-        # Load input data into memory address 10000, 10001, ...
+        # Load input data into memory address 10000, 10004, ...
         mem_load.2 u32assert neq.0
         while.true
             mem_load.0 mem_storew dropw
-            mem_load.0 u32assert u32overflowing_add.1 assertz mem_store.0
+            mem_load.0 u32assert u32overflowing_add.4 assertz mem_store.0
             mem_load.2 u32assert u32overflowing_sub.1 assertz dup mem_store.2 u32assert neq.0
         end
 
-        # Compute hash of memory address 10000, 10001, ...
+        # Compute hash of memory address 10000, 10004, ...
         mem_load.1
         push.10000
         exec.sha256::hash_memory

From eb33e0ffb395eddfa4c22e7b4859ae8f5bcc89e3 Mon Sep 17 00:00:00 2001
From: Philippe Laferriere <plafer@protonmail.com>
Date: Mon, 6 Jan 2025 08:41:22 -0500
Subject: [PATCH 09/19] PR fixes

---
 CHANGELOG.md                                  |  2 +-
 air/src/constraints/chiplets/memory/mod.rs    | 88 ++++++++++++------
 air/src/constraints/chiplets/memory/tests.rs  | 17 ++--
 air/src/constraints/chiplets/mod.rs           |  2 +-
 air/src/trace/chiplets/memory.rs              | 19 ++--
 air/src/trace/chiplets/mod.rs                 |  6 +-
 air/src/trace/mod.rs                          |  2 +-
 assembly/src/assembler/instruction/mem_ops.rs |  7 +-
 assembly/src/assembler/mod.rs                 | 10 +-
 docs/src/user_docs/stdlib/collections.md      |  2 +-
 docs/src/user_docs/stdlib/mem.md              |  2 +-
 miden/src/cli/debug/executor.rs               | 16 ++--
 .../tests/integration/air/chiplets/memory.rs  | 37 +-------
 miden/tests/integration/flow_control/mod.rs   |  2 +-
 .../integration/operations/io_ops/mem_ops.rs  |  2 +-
 processor/src/chiplets/aux_trace/mod.rs       | 92 ++++++-------------
 processor/src/chiplets/memory/mod.rs          | 25 ++---
 processor/src/chiplets/memory/segment.rs      | 10 ++
 processor/src/chiplets/memory/tests.rs        | 10 +-
 processor/src/trace/tests/chiplets/memory.rs  | 10 +-
 20 files changed, 168 insertions(+), 193 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index eb3a28ebca..3e908eb10d 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,7 +1,7 @@
 # Changelog
 
 #### Highlights
-- [BREAKING] Memory is now memory addressable (#1598)
+- [BREAKING] Memory is now element-addressable (#1598)
 
 #### Changes
 - [BREAKING] `Process` no longer takes ownership of the `Host` (#1571).
diff --git a/air/src/constraints/chiplets/memory/mod.rs b/air/src/constraints/chiplets/memory/mod.rs
index f9318ec615..487e9aa28e 100644
--- a/air/src/constraints/chiplets/memory/mod.rs
+++ b/air/src/constraints/chiplets/memory/mod.rs
@@ -6,9 +6,9 @@ use super::{EvaluationFrame, FieldElement};
 use crate::{
     trace::chiplets::{
         MEMORY_BATCH_COL_IDX, MEMORY_CLK_COL_IDX, MEMORY_CTX_COL_IDX, MEMORY_D0_COL_IDX,
-        MEMORY_D1_COL_IDX, MEMORY_D_INV_COL_IDX, MEMORY_ELEMENT_OR_WORD_COL_IDX,
-        MEMORY_FLAG_SAME_BATCH_AND_CONTEXT, MEMORY_IDX0_COL_IDX, MEMORY_IDX1_COL_IDX,
-        MEMORY_READ_WRITE_COL_IDX, MEMORY_V_COL_RANGE,
+        MEMORY_D1_COL_IDX, MEMORY_D_INV_COL_IDX, MEMORY_FLAG_SAME_BATCH_AND_CONTEXT,
+        MEMORY_IDX0_COL_IDX, MEMORY_IDX1_COL_IDX, MEMORY_IS_READ_COL_IDX,
+        MEMORY_IS_WORD_ACCESS_COL_IDX, MEMORY_V_COL_RANGE,
     },
     utils::{binary_not, is_binary, EvaluationResult},
 };
@@ -81,8 +81,8 @@ fn enforce_binary_columns<E: FieldElement>(
     result: &mut [E],
     memory_flag: E,
 ) -> usize {
-    result[0] = memory_flag * is_binary(frame.read_write());
-    result[1] = memory_flag * is_binary(frame.element_or_word());
+    result[0] = memory_flag * is_binary(frame.is_read());
+    result[1] = memory_flag * is_binary(frame.is_word_access());
     result[2] = memory_flag * is_binary(frame.idx0());
     result[3] = memory_flag * is_binary(frame.idx1());
 
@@ -151,14 +151,34 @@ fn enforce_flag_same_context_and_batch<E: FieldElement>(
 
 /// A constraint evaluation function to enforce that memory is initialized to zero when it is read
 /// before being written and that when existing memory values are read they remain unchanged.
+///
+/// The constraints on the values depend on a few factors:
+/// - When in the first row of a new context or batch, any of the 4 values of the batch that are not
+///   written to must be set to 0.
+///   - This is because the memory is initialized to 0 when a new context or batch is started.
+/// - When we remain in the same context and batch, then this is when we want to enforce the "memory
+///   property" that what was previously written must be read. Therefore, the values that are not
+///   being written need to be equal to the values in the previous row (i.e. previously written, or
+///   initialized to 0).
+///   - The implication is that in a given evaluation frame, we always constrain the "next" value,
+///     since that constraint depends on the "current" value.
 fn enforce_values<E: FieldElement>(
     frame: &EvaluationFrame<E>,
     result: &mut [E],
     memory_flag_no_last: E,
     memory_flag_first_row: E,
 ) -> usize {
-    // intuition: c_i is set to 1 when `v'[i]` is *not* written to, and 0 otherwise.
-    // in other words, c_i is set to 1 when `v'[i]` needs to be constrained.
+    // c_i is set to 1 when `v'[i]` is not written to, and 0 otherwise.
+    //
+    // In other words, c_i is set to 1 when `v'[i]` needs to be constrained (to either 0 or `v[i]`).
+    //
+    // Note that `c_i` only uses values in the "next" row. This is because it must be used to
+    // constrain the first row of the memory chiplet, where that row sits in the "next" position of
+    // the frame, and the "current" row belongs to the previous chiplet (and hence the "current" row
+    // must not be accessed).
+    //
+    // As a result, `c_i` does not include the constraint of being in the memory chiplet, or in the
+    // same context and batch - these must be enforced separately.
     let (c0, c1, c2, c3) = {
         // intuition: the i'th `f` flag is set to 1 when `i == 2 * idx1 + idx0`
         let f0 = binary_not(frame.idx1_next()) * binary_not(frame.idx0_next());
@@ -167,22 +187,24 @@ fn enforce_values<E: FieldElement>(
         let f3 = frame.idx1_next() * frame.idx0_next();
 
         let c_i = |f_i| {
-            frame.read_write_next()
-                + binary_not(frame.read_write_next())
-                    * binary_not(frame.element_or_word_next())
-                    * binary_not(f_i)
+            // z_i is set to 1 when `v'[i]` is not being accessed.
+            let z_i = binary_not(frame.is_word_access_next()) * binary_not(f_i);
+            let is_read_next = frame.is_read_next();
+
+            is_read_next + binary_not(is_read_next) * z_i
         };
 
         (c_i(f0), c_i(f1), c_i(f2), c_i(f3))
     };
 
-    // first row constraints
+    // first row constraints: when row' is the first row, and v'[i] is not written to, then v'[i]
+    // must be 0.
     result[0] = memory_flag_first_row * c0 * frame.v_next(0);
     result[1] = memory_flag_first_row * c1 * frame.v_next(1);
     result[2] = memory_flag_first_row * c2 * frame.v_next(2);
     result[3] = memory_flag_first_row * c3 * frame.v_next(3);
 
-    // non-first row, new batch or context constraints: when  row' is a new batch/ctx, and v'[i] is
+    // non-first row, new batch or context constraints: when row' is a new batch/ctx, and v'[i] is
     // not written to, then v'[i] must be 0.
     result[4] = memory_flag_no_last * binary_not(frame.f_scb_next()) * c0 * frame.v_next(0);
     result[5] = memory_flag_no_last * binary_not(frame.f_scb_next()) * c1 * frame.v_next(1);
@@ -207,14 +229,22 @@ fn enforce_values<E: FieldElement>(
 trait EvaluationFrameExt<E: FieldElement> {
     // --- Column accessors -----------------------------------------------------------------------
 
-    /// Gets the value of the read/write column in the current row.
-    fn read_write(&self) -> E;
-    /// Gets the value of the read/write column in the next row.
-    fn read_write_next(&self) -> E;
-    /// Gets the value of the element/word column in the current row.
-    fn element_or_word(&self) -> E;
-    /// Gets the value of the element/word column in the next row.
-    fn element_or_word_next(&self) -> E;
+    /// The value of the read/write column in the current row.
+    ///
+    /// 0: write, 1: read
+    fn is_read(&self) -> E;
+    /// The value of the read/write column in the next row.
+    ///
+    /// 0: write, 1: read
+    fn is_read_next(&self) -> E;
+    /// The value of the element/word column in the current row.
+    ///
+    /// 0: element, 1: word
+    fn is_word_access(&self) -> E;
+    /// The value of the element/word column in the next row.
+    ///
+    /// 0: element, 1: word
+    fn is_word_access_next(&self) -> E;
     /// The current context value.
     #[allow(dead_code)]
     fn ctx(&self) -> E;
@@ -281,23 +311,23 @@ impl<E: FieldElement> EvaluationFrameExt<E> for &EvaluationFrame<E> {
     // --- Column accessors -----------------------------------------------------------------------
 
     #[inline(always)]
-    fn read_write(&self) -> E {
-        self.current()[MEMORY_READ_WRITE_COL_IDX]
+    fn is_read(&self) -> E {
+        self.current()[MEMORY_IS_READ_COL_IDX]
     }
 
     #[inline(always)]
-    fn read_write_next(&self) -> E {
-        self.next()[MEMORY_READ_WRITE_COL_IDX]
+    fn is_read_next(&self) -> E {
+        self.next()[MEMORY_IS_READ_COL_IDX]
     }
 
     #[inline(always)]
-    fn element_or_word(&self) -> E {
-        self.current()[MEMORY_ELEMENT_OR_WORD_COL_IDX]
+    fn is_word_access(&self) -> E {
+        self.current()[MEMORY_IS_WORD_ACCESS_COL_IDX]
     }
 
     #[inline(always)]
-    fn element_or_word_next(&self) -> E {
-        self.next()[MEMORY_ELEMENT_OR_WORD_COL_IDX]
+    fn is_word_access_next(&self) -> E {
+        self.next()[MEMORY_IS_WORD_ACCESS_COL_IDX]
     }
 
     #[inline(always)]
diff --git a/air/src/constraints/chiplets/memory/tests.rs b/air/src/constraints/chiplets/memory/tests.rs
index 159def8ba5..6b4a673234 100644
--- a/air/src/constraints/chiplets/memory/tests.rs
+++ b/air/src/constraints/chiplets/memory/tests.rs
@@ -12,8 +12,8 @@ use crate::{
     trace::{
         chiplets::{
             memory::{MEMORY_ACCESS_WORD, MEMORY_READ, MEMORY_WRITE},
-            MEMORY_ELEMENT_OR_WORD_COL_IDX, MEMORY_FLAG_SAME_BATCH_AND_CONTEXT,
-            MEMORY_IDX0_COL_IDX, MEMORY_IDX1_COL_IDX, MEMORY_READ_WRITE_COL_IDX,
+            MEMORY_FLAG_SAME_BATCH_AND_CONTEXT, MEMORY_IDX0_COL_IDX, MEMORY_IDX1_COL_IDX,
+            MEMORY_IS_READ_COL_IDX, MEMORY_IS_WORD_ACCESS_COL_IDX,
         },
         TRACE_WIDTH,
     },
@@ -141,8 +141,8 @@ fn get_test_frame(
     let mut next = vec![ZERO; TRACE_WIDTH];
 
     // Set the operation in the next row.
-    next[MEMORY_READ_WRITE_COL_IDX] = read_write;
-    next[MEMORY_ELEMENT_OR_WORD_COL_IDX] = MEMORY_ACCESS_WORD;
+    next[MEMORY_IS_READ_COL_IDX] = read_write;
+    next[MEMORY_IS_WORD_ACCESS_COL_IDX] = MEMORY_ACCESS_WORD;
 
     // Set the context, addr, and clock columns in the next row to the values in the delta row.
     next[MEMORY_CTX_COL_IDX] = Felt::new(delta_row[0]);
@@ -177,7 +177,8 @@ fn get_test_frame(
     next[MEMORY_IDX0_COL_IDX] = ZERO;
     next[MEMORY_IDX1_COL_IDX] = ZERO;
 
-    // If the context or batch columns are changed, the same batch and context flag should be zero.
+    // If the context or batch columns are changed, the "same batch and context" flag should be
+    // zero.
     if delta_row[MemoryTestDeltaType::Batch as usize] > 0
         || delta_row[MemoryTestDeltaType::Context as usize] > 0
     {
@@ -193,9 +194,9 @@ fn get_test_frame(
 /// the specified delta type, which determines the column over which the delta and delta inverse
 /// values of the trace would be calculated.
 ///
-/// - When the delta type is Context, the address and clock columns can be anything.
+/// - When the delta type is Context, the batch and clock columns can be anything.
 /// - When the delta type is Batch, the context must remain unchanged but the clock can change.
-/// - When the delta type is Clock, both the context and address columns must remain unchanged.
+/// - When the delta type is Clock, both the context and batch columns must remain unchanged.
 fn get_test_delta_row(delta_type: &MemoryTestDeltaType) -> Vec<u64> {
     let delta_value = word_aligned_rand_value() as u64;
     let mut row = vec![0; 3];
@@ -231,7 +232,7 @@ fn get_test_delta_row(delta_type: &MemoryTestDeltaType) -> Vec<u64> {
     row
 }
 
-/// Returns a random value that is aligned to a word boundary (i.e. divisible by 4).
+/// Returns a random value that is divisible by 4 (i.e. "word aligned" when treated as an address).
 fn word_aligned_rand_value() -> u32 {
     let value = rand_value::<u32>();
     value - (value % 4)
diff --git a/air/src/constraints/chiplets/mod.rs b/air/src/constraints/chiplets/mod.rs
index a0ebff2faa..4263273855 100644
--- a/air/src/constraints/chiplets/mod.rs
+++ b/air/src/constraints/chiplets/mod.rs
@@ -161,7 +161,7 @@ trait EvaluationFrameExt<E: FieldElement> {
     /// trace.
     fn memory_flag_next(&self) -> E;
 
-    /// Flag to indicate whether the frame is in the first row of the memory portion of the Chiplets
+    /// Flag to indicate whether the next row in the frame is in the memory portion of the Chiplets
     /// trace.
     fn memory_flag_first_row(&self) -> E;
 }
diff --git a/air/src/trace/chiplets/memory.rs b/air/src/trace/chiplets/memory.rs
index e17689331d..759af60f7c 100644
--- a/air/src/trace/chiplets/memory.rs
+++ b/air/src/trace/chiplets/memory.rs
@@ -23,8 +23,8 @@ pub const MEMORY_ACCESS_WORD: Felt = ONE;
 
 // All bus labels encode the chiplet selector (1, 1, 0), as well as the read/write and element/word
 // columns. The purpose of the label is to force the chiplet to assign the correct values to the
-// read/write and element/word columns. We also include the chiplet selector as a "namespace" for
-// memory chiplet labels (to really ensure they don't collide with labels from other chiplets).
+// read/write and element/word columns. We also include the chiplet selector as a unique identifier
+// for memory chiplet labels (to ensure they don't collide with labels from other chiplets).
 
 /// Unique label when r/w=0 and e/w=0.
 pub const MEMORY_WRITE_ELEMENT_LABEL: u8 = 0b11000;
@@ -40,13 +40,14 @@ pub const MEMORY_READ_WORD_LABEL: u8 = 0b11011;
 
 // --- COLUMN ACCESSOR INDICES WITHIN THE CHIPLET -------------------------------------------------
 
-/// Column to hold the whether the operation is a read or write.
-pub const READ_WRITE_COL_IDX: usize = 0;
+/// Column to hold whether the operation is a read or write.
+pub const IS_READ_COL_IDX: usize = 0;
 /// Column to hold the whether the operation was over an element or a word.
-pub const ELEMENT_OR_WORD_COL_IDX: usize = READ_WRITE_COL_IDX + 1;
+pub const IS_WORD_ACCESS_COL_IDX: usize = IS_READ_COL_IDX + 1;
 /// Column to hold the context ID of the current memory context.
-pub const CTX_COL_IDX: usize = ELEMENT_OR_WORD_COL_IDX + 1;
-/// Column to hold the memory address.
+pub const CTX_COL_IDX: usize = IS_WORD_ACCESS_COL_IDX + 1;
+/// Column to hold the batch (i.e. group of 4 memory slots, referred to by the address of the first
+/// slot in the batch).
 pub const BATCH_COL_IDX: usize = CTX_COL_IDX + 1;
 /// Column to hold the first bit of the index of the address in the batch.
 pub const IDX0_COL_IDX: usize = BATCH_COL_IDX + 1;
@@ -54,8 +55,8 @@ pub const IDX0_COL_IDX: usize = BATCH_COL_IDX + 1;
 pub const IDX1_COL_IDX: usize = IDX0_COL_IDX + 1;
 /// Column for the clock cycle in which the memory operation occurred.
 pub const CLK_COL_IDX: usize = IDX1_COL_IDX + 1;
-/// Columns to hold the values stored at a given memory context, address, and clock cycle after
-/// the memory operation. When reading from a new address, these are initialized to zero.
+/// Columns to hold the values stored at a given memory context, batch, and clock cycle after
+/// the memory operation. When reading from a new batch, these are initialized to zero.
 pub const V_COL_RANGE: Range<usize> = create_range(CLK_COL_IDX + 1, WORD_SIZE);
 /// Column for the lower 16-bits of the delta between two consecutive context IDs, addresses, or
 /// clock cycles.
diff --git a/air/src/trace/chiplets/mod.rs b/air/src/trace/chiplets/mod.rs
index b62a7e7d2d..e0bc47b262 100644
--- a/air/src/trace/chiplets/mod.rs
+++ b/air/src/trace/chiplets/mod.rs
@@ -91,10 +91,10 @@ pub const BITWISE_OUTPUT_COL_IDX: usize = BITWISE_TRACE_OFFSET + bitwise::OUTPUT
 /// indicates the operation (read or write).
 pub const MEMORY_SELECTORS_COL_IDX: usize = MEMORY_TRACE_OFFSET;
 /// The index within the main trace of the column containing the memory read/write column.
-pub const MEMORY_READ_WRITE_COL_IDX: usize = MEMORY_TRACE_OFFSET + memory::READ_WRITE_COL_IDX;
+pub const MEMORY_IS_READ_COL_IDX: usize = MEMORY_TRACE_OFFSET + memory::IS_READ_COL_IDX;
 /// The index within the main trace of the column containing the memory element/word column.
-pub const MEMORY_ELEMENT_OR_WORD_COL_IDX: usize =
-    MEMORY_TRACE_OFFSET + memory::ELEMENT_OR_WORD_COL_IDX;
+pub const MEMORY_IS_WORD_ACCESS_COL_IDX: usize =
+    MEMORY_TRACE_OFFSET + memory::IS_WORD_ACCESS_COL_IDX;
 /// The index within the main trace of the column containing the memory context.
 pub const MEMORY_CTX_COL_IDX: usize = MEMORY_TRACE_OFFSET + memory::CTX_COL_IDX;
 /// The index within the main trace of the column containing the memory address.
diff --git a/air/src/trace/mod.rs b/air/src/trace/mod.rs
index c4eef3e810..81f17c8278 100644
--- a/air/src/trace/mod.rs
+++ b/air/src/trace/mod.rs
@@ -19,7 +19,7 @@ pub const MIN_TRACE_LEN: usize = 64;
 // ------------------------------------------------------------------------------------------------
 
 //      system          decoder           stack      range checks       chiplets
-//    (8 columns)     (24 columns)    (19 columns)    (3 columns)     (17 columns)
+//    (8 columns)     (24 columns)    (19 columns)    (3 columns)     (18 columns)
 // ├───────────────┴───────────────┴───────────────┴───────────────┴─────────────────┤
 
 pub const SYS_TRACE_OFFSET: usize = 0;
diff --git a/assembly/src/assembler/instruction/mem_ops.rs b/assembly/src/assembler/instruction/mem_ops.rs
index a20a424988..09ec16d596 100644
--- a/assembly/src/assembler/instruction/mem_ops.rs
+++ b/assembly/src/assembler/instruction/mem_ops.rs
@@ -100,8 +100,8 @@ pub fn mem_write_imm(
 // ================================================================================================
 
 /// Appends a sequence of operations to the span needed for converting procedure local index to
-/// absolute memory address. This consists of putting index onto the stack and then executing
-/// LOCADDR operation.
+/// absolute memory address. This consists of calculating the offset of the local value from the
+/// frame pointer and pushing the result onto the stack.
 ///
 /// This operation takes:
 /// - 3 VM cycles if index == 1
@@ -127,6 +127,9 @@ pub fn local_to_absolute_addr(
     let max = num_proc_locals - 1;
     validate_param(index_of_local, 0..=max)?;
 
+    // Local values are placed under the frame pointer, so we need to calculate the offset of the
+    // local value from the frame pointer. Local values are also indexed by word, so we need to
+    // multiply the index by the word size.
     let fmp_offset_of_local = (max - index_of_local) * WORD_SIZE as u16;
     push_felt(block_builder, -Felt::from(fmp_offset_of_local));
     block_builder.push_op(FmpAdd);
diff --git a/assembly/src/assembler/mod.rs b/assembly/src/assembler/mod.rs
index 01804d83fa..a5ead4f33d 100644
--- a/assembly/src/assembler/mod.rs
+++ b/assembly/src/assembler/mod.rs
@@ -572,12 +572,12 @@ impl Assembler {
         let wrapper_proc = self.module_graph.get_procedure_unsafe(gid);
         let proc = wrapper_proc.unwrap_ast().unwrap_procedure();
         let proc_body_id = if num_locals > 0 {
-            // for procedures with locals, we need to update fmp register before and after the
-            // procedure body is executed. specifically:
+            // For procedures with locals, we need to update fmp register before and after the
+            // procedure body is executed. Specifically:
             // - to allocate procedure locals we need to increment fmp by 4 times the number of
-            //   locals
-            // - to deallocate procedure locals we need to decrement it by the same amount We leave
-            // 4 elements between locals to properly support reading and writing words to locals.
+            //   locals, and
+            // - to deallocate procedure locals we need to decrement it by the same amount. We leave
+            //   4 elements between locals to properly support reading and writing words to locals.
             let locals_frame = Felt::from(num_locals * WORD_SIZE as u16);
             let wrapper = BodyWrapper {
                 prologue: vec![Operation::Push(locals_frame), Operation::FmpUpdate],
diff --git a/docs/src/user_docs/stdlib/collections.md b/docs/src/user_docs/stdlib/collections.md
index da6cfcd518..759ef1f640 100644
--- a/docs/src/user_docs/stdlib/collections.md
+++ b/docs/src/user_docs/stdlib/collections.md
@@ -15,7 +15,7 @@ The following procedures are available to read data from and make updates to a M
 | get         | Loads the leaf at the absolute position `pos` in the MMR onto the stack.<br /><br />Valid range for `pos` is between $0$ and $2^{32} - 1$ (both inclusive).<br /><br />Inputs: `[pos, mmr_ptr, ...]`<br />Output: `[N, ...]`<br /><br />Where `N` is the leaf loaded from the MMR whose memory location starts at `mmr_ptr`. |
 | add         | Adds a new leaf to the MMR.<br /><br />This will update the MMR peaks in the VM's memory and the advice provider with any merged nodes.<br /><br />Inputs: `[N, mmr_ptr, ...]`<br />Outputs: `[...]`<br /><br />Where `N` is the leaf added to the MMR whose memory locations starts at `mmr_ptr`. |
 | pack        | Computes a commitment to the given MMR and copies the MMR to the Advice Map using the commitment as a key.<br /><br />Inputs: `[mmr_ptr, ...]`<br />Outputs: `[HASH, ...]`<br /><br /> |
-| unpack      | Writes the MMR who's peaks hash to `HASH` to the memory location pointed to by `mmr_ptr`.<br /><br />Inputs: `[HASH, mmr_ptr, ...]`<br />Outputs: `[...]`<br /><br />Where:<br />- `HASH`: is the MMR peak hash, the hash is expected to be padded to an even length and to have a minimum size of 16 elements.<br />- The advice map must contain a key with `HASH`, and its value is `[num_leaves, 0, 0, 0] \|\| hash_data`, and hash_data is the data used to computed `HASH`<br />- `mmr_ptr`: the memory location where the MMR data will be written, starting with the MMR forest (the total count of its leaves) followed by its peaks. |
+| unpack      | Writes the MMR whose peaks hash to `HASH` to the memory location pointed to by `mmr_ptr`.<br /><br />Inputs: `[HASH, mmr_ptr, ...]`<br />Outputs: `[...]`<br /><br />Where:<br />- `HASH`: is the MMR peak hash, the hash is expected to be padded to an even length and to have a minimum size of 16 elements.<br />- The advice map must contain a key with `HASH`, and its value is `[num_leaves, 0, 0, 0] \|\| hash_data`, and hash_data is the data used to compute `HASH`<br />- `mmr_ptr`: the memory location where the MMR data will be written, starting with the MMR forest (the total count of its leaves) followed by its peaks. The memory location must be word-aligned. |
 
 `mmr_ptr` is a pointer to the `mmr` data structure, which is defined as:
 1. `mmr_ptr[0]` contains the number of leaves in the MMR
diff --git a/docs/src/user_docs/stdlib/mem.md b/docs/src/user_docs/stdlib/mem.md
index 2c837cdcab..399277c747 100644
--- a/docs/src/user_docs/stdlib/mem.md
+++ b/docs/src/user_docs/stdlib/mem.md
@@ -3,7 +3,7 @@ Module `std::mem` contains a set of utility procedures for working with random a
 
 | Procedure   | Description   |
 | ----------- | ------------- |
-| memcopy_words | Copies `n` words from `read_ptr` to `write_ptr`.<br /><br />Stack transition looks as follows:<br /><br />[n, read_ptr, write_ptr, ...] -> [...]<br /><br />Cycles: 15 + 16n |
+| memcopy_words | Copies `n` words from `read_ptr` to `write_ptr`; both pointers must be word-aligned.<br /><br />Stack transition looks as follows:<br /><br />[n, read_ptr, write_ptr, ...] -> [...]<br /><br />Cycles: 15 + 16n |
 | pipe_double_words_to_memory | Moves an even number of words from the advice stack to memory.<br /><br />Input: [C, B, A, write_ptr, end_ptr, ...]<br />Output: [C, B, A, write_ptr, ...]<br /><br />Where:<br />- The words C, B, and A are the RPO hasher state<br />- A is the capacity<br />- C, B are the rate portion of the state<br />- The value `num_words = end_ptr - write_ptr` must be positive and even<br /><br />Cycles: 10 + 9 * num_words / 2 |
 | pipe_words_to_memory | Moves an arbitrary number of words from the advice stack to memory.<br /><br />Input: [num_words, write_ptr, ...]<br />Output: [HASH, write_ptr', ...]<br /><br />Where `HASH` is the sequential RPO hash of all copied words.<br /><br />Cycles:<br />- Even num_words: 48 + 9 * num_words / 2<br />- Odd num_words: 65 + 9 * round_down(num_words / 2) |
 | pipe_preimage_to_memory | Moves an arbitrary number of words from the advice stack to memory and asserts it matches the commitment.<br /><br />Input: [num_words, write_ptr, COM, ...]<br />Output: [write_ptr', ...]<br /><br />Cycles:<br />- Even num_words: 58 + 9 * num_words / 2<br /> - Odd num_words: 75 + 9 * round_down(num_words / 2) |
diff --git a/miden/src/cli/debug/executor.rs b/miden/src/cli/debug/executor.rs
index c755f3ead0..4f3a209213 100644
--- a/miden/src/cli/debug/executor.rs
+++ b/miden/src/cli/debug/executor.rs
@@ -44,7 +44,7 @@ impl DebugExecutor {
     // MODIFIERS
     // --------------------------------------------------------------------------------------------
 
-    /// executes a debug command against the vm in it's current state.
+    /// Executes a debug command against the VM in its current state.
     pub fn execute(&mut self, command: DebugCommand) -> bool {
         match command {
             DebugCommand::Continue => {
@@ -122,12 +122,12 @@ impl DebugExecutor {
     // ACCESSORS
     // --------------------------------------------------------------------------------------------
 
-    /// print general VM state information.
+    /// Prints general VM state information.
     fn print_vm_state(&self) {
         println!("{}", self.vm_state)
     }
 
-    /// print all stack items.
+    /// Prints all stack items.
     pub fn print_stack(&self) {
         println!(
             "{}",
@@ -141,7 +141,7 @@ impl DebugExecutor {
         )
     }
 
-    /// print specified stack item.
+    /// Prints specified stack item.
     pub fn print_stack_item(&self, index: usize) {
         let len = self.vm_state.stack.len();
         println!("stack len {}", len);
@@ -152,14 +152,14 @@ impl DebugExecutor {
         }
     }
 
-    /// print all memory entries.
+    /// Prints all memory entries.
     pub fn print_memory(&self) {
         for &(address, mem) in self.vm_state.memory.iter() {
             Self::print_memory_data(address, mem)
         }
     }
 
-    /// print specified memory entry.
+    /// Prints specified memory entry.
     pub fn print_memory_entry(&self, address: u64) {
         let entry = self.vm_state.memory.iter().find_map(|(addr, mem)| match address == *addr {
             true => Some(mem),
@@ -175,12 +175,12 @@ impl DebugExecutor {
     // HELPERS
     // --------------------------------------------------------------------------------------------
 
-    /// print memory data.
+    /// Prints memory data.
     fn print_memory_data(address: u64, mem_value: Felt) {
         println!("{address} {mem_value:?}");
     }
 
-    /// print help message
+    /// Prints help message.
     fn print_help() {
         let message = "---------------------------------------------------------------------\n\
             Miden Assembly Debug CLI\n\
diff --git a/miden/tests/integration/air/chiplets/memory.rs b/miden/tests/integration/air/chiplets/memory.rs
index 21e3133292..b558277b02 100644
--- a/miden/tests/integration/air/chiplets/memory.rs
+++ b/miden/tests/integration/air/chiplets/memory.rs
@@ -1,4 +1,4 @@
-use test_utils::{build_op_test, build_test, ToElements};
+use test_utils::{build_op_test, build_test};
 
 #[test]
 fn mem_load() {
@@ -15,28 +15,6 @@ fn mem_store() {
     build_op_test!(asm_op, &pub_inputs).prove_and_verify(pub_inputs, false);
 }
 
-#[test]
-fn helper_mem_store() {
-    // Sequence of operations: [Span, Pad, MStoreW, Drop, Drop, Drop, Drop, Pad, Mstore, Drop, Pad,
-    // MStoreW, Drop, Pad, Mstore, Drop]
-    let asm_op =
-        "begin mem_storew.0 drop drop drop drop mem_store.0 mem_storew.0 drop mem_store.0 end";
-    let pub_inputs = vec![1, 2, 3, 4, 5, 6, 7, 8, 9, 10];
-
-    let trace = build_test!(asm_op, &pub_inputs).execute().unwrap();
-    // MStore doesn't use helper registers, so they should be zero.
-    let helper_regs = [0, 0, 0, 0, 0, 0].to_elements();
-    // We need to check helper registers state after the MStore operation at clock cycle 8.
-    assert_eq!(helper_regs, trace.get_user_op_helpers_at(8));
-    // After the second MStoreW call, the helper registers should be zero.
-    let helper_regs = [0, 0, 0, 0, 0, 0].to_elements();
-    assert_eq!(helper_regs, trace.get_user_op_helpers_at(11));
-
-    // We need to check helper registers state after the MStore operation at clock cycle 14.
-    let helper_regs = [0, 0, 0, 0, 0, 0].to_elements();
-    assert_eq!(helper_regs, trace.get_user_op_helpers_at(14));
-}
-
 #[test]
 fn mem_loadw() {
     let asm_op = "mem_loadw.0";
@@ -61,19 +39,6 @@ fn write_read() {
     build_test!(source, &pub_inputs).prove_and_verify(pub_inputs, false);
 }
 
-#[test]
-fn helper_write_read() {
-    // Sequence of operations: [Span, Pad, MStorew, Drop, Drop, Drop, Drop, Pad, MLoad, ... ]
-    let source = "begin mem_storew.0 dropw mem_load.0 movup.4 drop end";
-    let pub_inputs = vec![4, 3, 2, 1];
-
-    let trace = build_test!(source, &pub_inputs).execute().unwrap();
-    // MLoad doesn't use helper registers, so they should be zero.
-    let helper_regs = [0, 0, 0, 0, 0, 0].to_elements();
-    // We need to check helper registers state after first MLoad, which index is 8
-    assert_eq!(helper_regs, trace.get_user_op_helpers_at(8));
-}
-
 #[test]
 fn update() {
     let source = "
diff --git a/miden/tests/integration/flow_control/mod.rs b/miden/tests/integration/flow_control/mod.rs
index d2bbd4ef5b..819741b7af 100644
--- a/miden/tests/integration/flow_control/mod.rs
+++ b/miden/tests/integration/flow_control/mod.rs
@@ -272,7 +272,7 @@ fn simple_dyn_exec() {
             # move the first result of foo out of the way
             movdn.4
 
-            # use dynexec to call foo again via its hash, which is stored at memory location 42
+            # use dynexec to call foo again via its hash, which is stored at memory location 40
             mem_storew.40 dropw
             push.40
             dynexec
diff --git a/miden/tests/integration/operations/io_ops/mem_ops.rs b/miden/tests/integration/operations/io_ops/mem_ops.rs
index 99f9c67982..0b0b5ab2d1 100644
--- a/miden/tests/integration/operations/io_ops/mem_ops.rs
+++ b/miden/tests/integration/operations/io_ops/mem_ops.rs
@@ -117,7 +117,7 @@ fn mem_stream() {
     let inputs = [1, 2, 3, 4, 5, 6, 7, 8];
 
     // the state is built by replacing the values on the top of the stack with the values in memory
-    // addresses 0 and 4 (i.e., 1 through 8). Thus, the first 8 elements on the stack will be 1
+    // addresses `[0..8)`. Thus, the first 8 elements on the stack will be 1
     // through 8 (in stack order, with 8 at stack[0]), and the remaining 4 are untouched (i.e., 9,
     // 10, 11, 12).
     let state: [Felt; 12] =
diff --git a/processor/src/chiplets/aux_trace/mod.rs b/processor/src/chiplets/aux_trace/mod.rs
index 92266961ee..f99ec978bd 100644
--- a/processor/src/chiplets/aux_trace/mod.rs
+++ b/processor/src/chiplets/aux_trace/mod.rs
@@ -486,11 +486,7 @@ fn build_bitwise_request<E: FieldElement<BaseField = Felt>>(
     let b = main_trace.stack_element(0, row);
     let z = main_trace.stack_element(0, row + 1);
 
-    alphas[0]
-        + alphas[1].mul_base(op_label)
-        + alphas[2].mul_base(a)
-        + alphas[3].mul_base(b)
-        + alphas[4].mul_base(z)
+    alphas[0] + build_value(&alphas[1..5], &[op_label, a, b, z])
 }
 
 /// Builds `MSTREAM` requests made to the memory chiplet.
@@ -801,9 +797,7 @@ where
         // v_all = v_h + v_a + v_b + v_c
         if selector1 == ONE && selector2 == ZERO && selector3 == ZERO {
             let header = alphas[0]
-                + alphas[1].mul_base(transition_label)
-                + alphas[2].mul_base(Felt::from(row + 1))
-                + alphas[3].mul_base(node_index);
+                + build_value(&alphas[1..4], &[transition_label, Felt::from(row + 1), node_index]);
 
             multiplicand = header + build_value(alphas_state, &state);
         }
@@ -812,9 +806,7 @@ where
         // v_leaf = v_h + (1 - b) * v_b + b * v_d
         if selector1 == ONE && !(selector2 == ZERO && selector3 == ZERO) {
             let header = alphas[0]
-                + alphas[1].mul_base(transition_label)
-                + alphas[2].mul_base(Felt::from(row + 1))
-                + alphas[3].mul_base(node_index);
+                + build_value(&alphas[1..4], &[transition_label, Felt::from(row + 1), node_index]);
 
             let bit = (node_index.as_int() & 1) as u8;
             let left_word = build_value(&alphas_state[DIGEST_RANGE], &state[DIGEST_RANGE]);
@@ -835,9 +827,7 @@ where
         // v_res = v_h + v_b;
         if selector1 == ZERO && selector2 == ZERO && selector3 == ZERO {
             let header = alphas[0]
-                + alphas[1].mul_base(transition_label)
-                + alphas[2].mul_base(Felt::from(row + 1))
-                + alphas[3].mul_base(node_index);
+                + build_value(&alphas[1..4], &[transition_label, Felt::from(row + 1), node_index]);
 
             multiplicand = header + build_value(&alphas_state[DIGEST_RANGE], &state[DIGEST_RANGE]);
         }
@@ -846,9 +836,7 @@ where
         // v_all = v_h + v_a + v_b + v_c
         if selector1 == ZERO && selector2 == ZERO && selector3 == ONE {
             let header = alphas[0]
-                + alphas[1].mul_base(transition_label)
-                + alphas[2].mul_base(Felt::from(row + 1))
-                + alphas[3].mul_base(node_index);
+                + build_value(&alphas[1..4], &[transition_label, Felt::from(row + 1), node_index]);
 
             multiplicand = header + build_value(alphas_state, &state);
         }
@@ -857,9 +845,7 @@ where
         // v_abp = v_h + v_b' + v_c' - v_b - v_c
         if selector1 == ONE && selector2 == ZERO && selector3 == ZERO {
             let header = alphas[0]
-                + alphas[1].mul_base(transition_label)
-                + alphas[2].mul_base(Felt::from(row + 1))
-                + alphas[3].mul_base(node_index);
+                + build_value(&alphas[1..4], &[transition_label, Felt::from(row + 1), node_index]);
 
             let state_nxt = main_trace.chiplet_hasher_state(row + 1);
 
@@ -887,11 +873,7 @@ where
         let b = main_trace.chiplet_bitwise_b(row);
         let z = main_trace.chiplet_bitwise_z(row);
 
-        alphas[0]
-            + alphas[1].mul_base(op_label)
-            + alphas[2].mul_base(a)
-            + alphas[3].mul_base(b)
-            + alphas[4].mul_base(z)
+        alphas[0] + build_value(&alphas[1..5], &[op_label, a, b, z])
     } else {
         E::ONE
     }
@@ -902,10 +884,10 @@ fn build_memory_chiplet_responses<E>(main_trace: &MainTrace, row: RowIndex, alph
 where
     E: FieldElement<BaseField = Felt>,
 {
-    let element_word = main_trace.chiplet_selector_4(row);
+    let is_word_access = main_trace.chiplet_selector_4(row);
     let header = {
-        let read_write = main_trace.chiplet_selector_3(row);
-        let op_label = get_memory_op_label(read_write, element_word);
+        let is_read = main_trace.chiplet_selector_3(row);
+        let op_label = get_memory_op_label(is_read, is_word_access);
 
         let ctx = main_trace.chiplet_memory_ctx(row);
         let clk = main_trace.chiplet_memory_clk(row);
@@ -917,14 +899,10 @@ where
             batch + idx1.mul_small(2) + idx0
         };
 
-        alphas[0]
-            + alphas[1].mul_base(op_label)
-            + alphas[2].mul_base(ctx)
-            + alphas[3].mul_base(address)
-            + alphas[4].mul_base(clk)
+        alphas[0] + build_value(&alphas[1..5], &[op_label, ctx, address, clk])
     };
 
-    if element_word == MEMORY_ACCESS_ELEMENT {
+    if is_word_access == MEMORY_ACCESS_ELEMENT {
         let idx0 = main_trace.chiplet_memory_idx0(row);
         let idx1 = main_trace.chiplet_memory_idx1(row);
 
@@ -941,19 +919,15 @@ where
         };
 
         header + alphas[5].mul_base(value)
-    } else if element_word == MEMORY_ACCESS_WORD {
+    } else if is_word_access == MEMORY_ACCESS_WORD {
         let value0 = main_trace.chiplet_memory_value_0(row);
         let value1 = main_trace.chiplet_memory_value_1(row);
         let value2 = main_trace.chiplet_memory_value_2(row);
         let value3 = main_trace.chiplet_memory_value_3(row);
 
-        header
-            + alphas[5].mul_base(value0)
-            + alphas[6].mul_base(value1)
-            + alphas[7].mul_base(value2)
-            + alphas[8].mul_base(value3)
+        header + build_value(&alphas[5..9], &[value0, value1, value2, value3])
     } else {
-        panic!("Invalid memory element/word column value: {element_word}");
+        panic!("Invalid memory element/word column value: {is_word_access}");
     }
 }
 
@@ -969,12 +943,8 @@ where
     let root2 = main_trace.chiplet_kernel_root_2(row);
     let root3 = main_trace.chiplet_kernel_root_3(row);
 
-    let v = alphas[0]
-        + alphas[1].mul_base(op_label)
-        + alphas[2].mul_base(root0)
-        + alphas[3].mul_base(root1)
-        + alphas[4].mul_base(root2)
-        + alphas[5].mul_base(root3);
+    let v =
+        alphas[0] + build_value(&alphas[1..6], &[Felt::from(op_label), root0, root1, root2, root3]);
 
     let kernel_chiplet_selector = main_trace.chiplet_selector_4(row);
     v.mul_base(kernel_chiplet_selector) + E::from(ONE - kernel_chiplet_selector)
@@ -987,7 +957,7 @@ where
 /// of alphas of matching length. This can be used to build the value for a single word or for an
 /// entire [HasherState].
 fn build_value<E: FieldElement<BaseField = Felt>>(alphas: &[E], elements: &[Felt]) -> E {
-    assert_eq!(alphas.len(), elements.len());
+    debug_assert_eq!(alphas.len(), elements.len());
     let mut value = E::ZERO;
     for (&alpha, &element) in alphas.iter().zip(elements.iter()) {
         value += alpha.mul_base(element);
@@ -1005,9 +975,12 @@ fn get_op_label(s0: Felt, s1: Felt, s2: Felt, s3: Felt) -> Felt {
 /// The memory operation label is currently the only label that is built differently (or *simpler*)
 /// from the other chiplets. We should refactor the other chiplets to use a similar (simpler)
 /// approach.
-fn get_memory_op_label(read_write: Felt, element_word: Felt) -> Felt {
+fn get_memory_op_label(is_read: Felt, is_word_access: Felt) -> Felt {
     const MEMORY_SELECTOR: u8 = 0b110;
-    Felt::from(MEMORY_SELECTOR << 2) + read_write.mul_small(2) + element_word
+    // Equivalent to `is_read << 1`
+    let is_read_left_shift_1 = is_read + is_read;
+
+    Felt::from(MEMORY_SELECTOR << 2) + is_read_left_shift_1 + is_word_access
 }
 
 /// Builds `MLOADW` and `MSTOREW` requests made to the memory chiplet.
@@ -1055,12 +1028,7 @@ fn compute_mem_request_element<E: FieldElement<BaseField = Felt>>(
     let ctx = main_trace.ctx(row);
     let clk = main_trace.clk(row);
 
-    alphas[0]
-        + alphas[1].mul_base(Felt::from(op_label))
-        + alphas[2].mul_base(ctx)
-        + alphas[3].mul_base(addr)
-        + alphas[4].mul_base(clk)
-        + alphas[5].mul_base(element)
+    alphas[0] + build_value(&alphas[1..6], &[Felt::from(op_label), ctx, addr, clk, element])
 }
 
 /// Computes a memory request for a read or write of a word.
@@ -1077,12 +1045,8 @@ fn compute_mem_request_word<E: FieldElement<BaseField = Felt>>(
     let clk = main_trace.clk(row);
 
     alphas[0]
-        + alphas[1].mul_base(Felt::from(op_label))
-        + alphas[2].mul_base(ctx)
-        + alphas[3].mul_base(addr)
-        + alphas[4].mul_base(clk)
-        + alphas[5].mul_base(word[0])
-        + alphas[6].mul_base(word[1])
-        + alphas[7].mul_base(word[2])
-        + alphas[8].mul_base(word[3])
+        + build_value(
+            &alphas[1..9],
+            &[Felt::from(op_label), ctx, addr, clk, word[0], word[1], word[2], word[3]],
+        )
 }
diff --git a/processor/src/chiplets/memory/mod.rs b/processor/src/chiplets/memory/mod.rs
index 0b03df31fb..3eb642dc79 100644
--- a/processor/src/chiplets/memory/mod.rs
+++ b/processor/src/chiplets/memory/mod.rs
@@ -3,9 +3,9 @@ use alloc::{collections::BTreeMap, vec::Vec};
 use miden_air::{
     trace::chiplets::memory::{
         BATCH_COL_IDX, CLK_COL_IDX, CTX_COL_IDX, D0_COL_IDX, D1_COL_IDX, D_INV_COL_IDX,
-        ELEMENT_OR_WORD_COL_IDX, FLAG_SAME_BATCH_AND_CONTEXT, IDX0_COL_IDX, IDX1_COL_IDX,
-        MEMORY_ACCESS_ELEMENT, MEMORY_ACCESS_WORD, MEMORY_READ, MEMORY_WRITE, READ_WRITE_COL_IDX,
-        V_COL_RANGE,
+        FLAG_SAME_BATCH_AND_CONTEXT, IDX0_COL_IDX, IDX1_COL_IDX, IS_READ_COL_IDX,
+        IS_WORD_ACCESS_COL_IDX, MEMORY_ACCESS_ELEMENT, MEMORY_ACCESS_WORD, MEMORY_READ,
+        MEMORY_WRITE, V_COL_RANGE,
     },
     RowIndex,
 };
@@ -54,9 +54,10 @@ const INIT_MEM_VALUE: Word = EMPTY_WORD;
 /// ├────┴────┴────┴───────┴──────┴──────┴────┴────┴────┴────┴────┴────┴────┴───────┴───────┤
 ///
 /// In the above, the meaning of the columns is as follows:
-/// - `rw` is a selector column used to identify whether the memory operation is a read or a write.
+/// - `rw` is a selector column used to identify whether the memory operation is a read or a write
+///   (1 indicates a read).
 /// - `ew` is a selector column used to identify whether the memory operation is over an element or
-///   a word.
+///   a word (1 indicates a word).
 /// - `ctx` contains execution context ID. Values in this column must increase monotonically but
 ///   there can be gaps between two consecutive context IDs of up to 2^32. Also, two consecutive
 ///   values can be the same.
@@ -78,9 +79,9 @@ const INIT_MEM_VALUE: Word = EMPTY_WORD;
 ///   - When both the context and the batch remain the same, these columns contain (`new_clk` -
 ///     `old_clk` - 1).
 /// - `d_inv` contains the inverse of the delta between two consecutive context IDs, batches, or
-///   clock cycles computed as described above.
-/// - `f_scb` is a flag indicating whether the context and the batch are the same as in the next
-///   row.
+///   clock cycles computed as described above. It is the field inverse of `(d1 * 2^16) + d0`.
+/// - `f_scb` is a flag indicating whether the context and the batch of the current row are the same
+///   as in the next row.
 ///
 /// For the first row of the trace, values in `d0`, `d1`, and `d_inv` are set to zeros.
 #[derive(Debug, Default)]
@@ -321,14 +322,14 @@ impl Memory {
                     let value = memory_access.batch();
 
                     match memory_access.operation() {
-                        MemoryOperation::Read => trace.set(row, READ_WRITE_COL_IDX, MEMORY_READ),
-                        MemoryOperation::Write => trace.set(row, READ_WRITE_COL_IDX, MEMORY_WRITE),
+                        MemoryOperation::Read => trace.set(row, IS_READ_COL_IDX, MEMORY_READ),
+                        MemoryOperation::Write => trace.set(row, IS_READ_COL_IDX, MEMORY_WRITE),
                     }
                     let (idx1, idx0) = match memory_access.access_type() {
                         segment::MemoryAccessType::Element {
                             addr_idx_in_batch: addr_idx_in_word,
                         } => {
-                            trace.set(row, ELEMENT_OR_WORD_COL_IDX, MEMORY_ACCESS_ELEMENT);
+                            trace.set(row, IS_WORD_ACCESS_COL_IDX, MEMORY_ACCESS_ELEMENT);
 
                             match addr_idx_in_word {
                                 0 => (ZERO, ZERO),
@@ -339,7 +340,7 @@ impl Memory {
                             }
                         },
                         segment::MemoryAccessType::Word => {
-                            trace.set(row, ELEMENT_OR_WORD_COL_IDX, MEMORY_ACCESS_WORD);
+                            trace.set(row, IS_WORD_ACCESS_COL_IDX, MEMORY_ACCESS_WORD);
                             (ZERO, ZERO)
                         },
                     };
diff --git a/processor/src/chiplets/memory/segment.rs b/processor/src/chiplets/memory/segment.rs
index dd9b014217..9792066ba4 100644
--- a/processor/src/chiplets/memory/segment.rs
+++ b/processor/src/chiplets/memory/segment.rs
@@ -132,6 +132,9 @@ impl MemorySegmentTrace {
     ///
     /// If the word starting at the specified address hasn't been previously written to, four ZERO
     /// elements are returned. This effectively implies that memory is initialized to ZERO.
+    ///
+    /// # Preconditions
+    /// - Assumes that the address is word aligned.
     ///
     /// # Errors
     /// - Returns an error if the same address is accessed more than once in the same clock cycle.
@@ -213,6 +216,10 @@ impl MemorySegmentTrace {
 
     /// Writes the provided word starting at the specified address. The memory access is assumed to
     /// happen at the provided clock cycle.
+    ///
+    /// # Preconditions
+    ///
+    /// - Assumes that the address is word aligned.
     ///
     /// # Errors
     /// - Returns an error if the same address is accessed more than once in the same clock cycle.
@@ -268,6 +275,9 @@ impl MemorySegmentTrace {
     ///
     /// The access type either specifies the element in batch that was read, or that the entire word
     /// was read.
+    ///
+    /// # Errors
+    /// - Returns an error if the same address is accessed more than once in the same clock cycle.
     fn read_batch(
         &mut self,
         ctx: ContextId,
diff --git a/processor/src/chiplets/memory/tests.rs b/processor/src/chiplets/memory/tests.rs
index 56cbce5850..3180730132 100644
--- a/processor/src/chiplets/memory/tests.rs
+++ b/processor/src/chiplets/memory/tests.rs
@@ -2,9 +2,9 @@ use alloc::vec::Vec;
 
 use miden_air::{
     trace::chiplets::memory::{
-        ELEMENT_OR_WORD_COL_IDX, FLAG_SAME_BATCH_AND_CONTEXT, IDX0_COL_IDX, IDX1_COL_IDX,
-        MEMORY_ACCESS_ELEMENT, MEMORY_ACCESS_WORD, MEMORY_READ, MEMORY_WRITE, READ_WRITE_COL_IDX,
-        TRACE_WIDTH as MEMORY_TRACE_WIDTH,
+        FLAG_SAME_BATCH_AND_CONTEXT, IDX0_COL_IDX, IDX1_COL_IDX, IS_READ_COL_IDX,
+        IS_WORD_ACCESS_COL_IDX, MEMORY_ACCESS_ELEMENT, MEMORY_ACCESS_WORD, MEMORY_READ,
+        MEMORY_WRITE, TRACE_WIDTH as MEMORY_TRACE_WIDTH,
     },
     RowIndex,
 };
@@ -539,11 +539,11 @@ fn build_trace_row(
 
     let mut row = [ZERO; MEMORY_TRACE_WIDTH];
 
-    row[READ_WRITE_COL_IDX] = match operation {
+    row[IS_READ_COL_IDX] = match operation {
         MemoryOperation::Read => MEMORY_READ,
         MemoryOperation::Write => MEMORY_WRITE,
     };
-    row[ELEMENT_OR_WORD_COL_IDX] = match access_type {
+    row[IS_WORD_ACCESS_COL_IDX] = match access_type {
         MemoryAccessType::Element { .. } => MEMORY_ACCESS_ELEMENT,
         MemoryAccessType::Word => MEMORY_ACCESS_WORD,
     };
diff --git a/processor/src/trace/tests/chiplets/memory.rs b/processor/src/trace/tests/chiplets/memory.rs
index 04acb7205b..df7697dd9f 100644
--- a/processor/src/trace/tests/chiplets/memory.rs
+++ b/processor/src/trace/tests/chiplets/memory.rs
@@ -5,9 +5,9 @@ use miden_air::{
             MEMORY_READ_WORD_LABEL, MEMORY_WRITE, MEMORY_WRITE_ELEMENT_LABEL,
             MEMORY_WRITE_WORD_LABEL,
         },
-        MEMORY_BATCH_COL_IDX, MEMORY_CLK_COL_IDX, MEMORY_CTX_COL_IDX,
-        MEMORY_ELEMENT_OR_WORD_COL_IDX, MEMORY_IDX0_COL_IDX, MEMORY_IDX1_COL_IDX,
-        MEMORY_READ_WRITE_COL_IDX, MEMORY_V_COL_RANGE,
+        MEMORY_BATCH_COL_IDX, MEMORY_CLK_COL_IDX, MEMORY_CTX_COL_IDX, MEMORY_IDX0_COL_IDX,
+        MEMORY_IDX1_COL_IDX, MEMORY_IS_READ_COL_IDX, MEMORY_IS_WORD_ACCESS_COL_IDX,
+        MEMORY_V_COL_RANGE,
     },
     RowIndex,
 };
@@ -219,8 +219,8 @@ fn build_expected_bus_msg_from_trace(
     row: RowIndex,
 ) -> Felt {
     // get the memory access operation
-    let read_write = trace.main_trace.get_column(MEMORY_READ_WRITE_COL_IDX)[row];
-    let element_or_word = trace.main_trace.get_column(MEMORY_ELEMENT_OR_WORD_COL_IDX)[row];
+    let read_write = trace.main_trace.get_column(MEMORY_IS_READ_COL_IDX)[row];
+    let element_or_word = trace.main_trace.get_column(MEMORY_IS_WORD_ACCESS_COL_IDX)[row];
     let op_label = if read_write == MEMORY_WRITE {
         if element_or_word == MEMORY_ACCESS_ELEMENT {
             MEMORY_WRITE_ELEMENT_LABEL

From 06782a6c264253324c608fa702b18c72ff12c022 Mon Sep 17 00:00:00 2001
From: Philippe Laferriere <plafer@protonmail.com>
Date: Mon, 6 Jan 2025 15:04:42 -0500
Subject: [PATCH 10/19] nomenclature: change `batch` -> `word`

---
 air/src/constraints/chiplets/memory/mod.rs   |  80 +++++------
 air/src/constraints/chiplets/memory/tests.rs |  38 ++---
 air/src/trace/chiplets/memory.rs             |  20 +--
 air/src/trace/chiplets/mod.rs                |  10 +-
 air/src/trace/main_trace.rs                  |  10 +-
 processor/src/chiplets/aux_trace/mod.rs      |   6 +-
 processor/src/chiplets/memory/mod.rs         |  56 ++++----
 processor/src/chiplets/memory/segment.rs     | 142 +++++++++----------
 processor/src/chiplets/memory/tests.rs       | 100 ++++++-------
 processor/src/operations/io_ops.rs           |  24 ++--
 processor/src/trace/tests/chiplets/memory.rs |   6 +-
 11 files changed, 246 insertions(+), 246 deletions(-)

diff --git a/air/src/constraints/chiplets/memory/mod.rs b/air/src/constraints/chiplets/memory/mod.rs
index 487e9aa28e..4b4918750e 100644
--- a/air/src/constraints/chiplets/memory/mod.rs
+++ b/air/src/constraints/chiplets/memory/mod.rs
@@ -5,8 +5,8 @@ use winter_air::TransitionConstraintDegree;
 use super::{EvaluationFrame, FieldElement};
 use crate::{
     trace::chiplets::{
-        MEMORY_BATCH_COL_IDX, MEMORY_CLK_COL_IDX, MEMORY_CTX_COL_IDX, MEMORY_D0_COL_IDX,
-        MEMORY_D1_COL_IDX, MEMORY_D_INV_COL_IDX, MEMORY_FLAG_SAME_BATCH_AND_CONTEXT,
+        MEMORY_WORD_COL_IDX, MEMORY_CLK_COL_IDX, MEMORY_CTX_COL_IDX, MEMORY_D0_COL_IDX,
+        MEMORY_D1_COL_IDX, MEMORY_D_INV_COL_IDX, MEMORY_FLAG_SAME_CONTEXT_AND_WORD,
         MEMORY_IDX0_COL_IDX, MEMORY_IDX1_COL_IDX, MEMORY_IS_READ_COL_IDX,
         MEMORY_IS_WORD_ACCESS_COL_IDX, MEMORY_V_COL_RANGE,
     },
@@ -26,11 +26,11 @@ pub const NUM_CONSTRAINTS: usize = 22;
 pub const CONSTRAINT_DEGREES: [usize; NUM_CONSTRAINTS] = [
     5, 5, 5, 5, // Enforce that rw, ew, idx0 and idx1 are binary.
     7, 6, 9, 8, // Constrain the values in the d inverse column.
-    8, // Enforce values in ctx, batch, clk transition correctly.
-    7, // Enforce the correct value for the f_scb flag.
+    8, // Enforce values in ctx, word, clk transition correctly.
+    7, // Enforce the correct value for the f_scw flag.
     9, 9, 9, 9, // Constrain the values in the first row of the chiplet.
-    9, 9, 9, 9, // Constrain the values in non-first rows, new batch or context is started.
-    9, 9, 9, 9, // Constrain the values in non-first rows, same batch or context.
+    9, 9, 9, 9, // Constrain the values in non-first rows, new word or context is started.
+    9, 9, 9, 9, // Constrain the values in non-first rows, same word and context.
 ];
 
 // MEMORY TRANSITION CONSTRAINTS
@@ -66,8 +66,8 @@ pub fn enforce_constraints<E: FieldElement>(
     // Enforce values in ctx, addr, clk transition correctly.
     index += enforce_delta(frame, &mut result[index..], memory_flag_no_last);
 
-    // Enforce the correct value for the f_scb flag.
-    index += enforce_flag_same_context_and_batch(frame, &mut result[index..], memory_flag_no_last);
+    // Enforce the correct value for the f_scw flag.
+    index += enforce_flag_same_context_and_word(frame, &mut result[index..], memory_flag_no_last);
 
     // Constrain the memory values.
     enforce_values(frame, &mut result[index..], memory_flag_no_last, memory_flag_first_row);
@@ -123,7 +123,7 @@ fn enforce_delta<E: FieldElement>(
 
     // If the context changed, include the difference.
     result[0] = memory_flag_no_last * frame.n0() * frame.ctx_change();
-    // If the context is the same, include the batch difference if it changed or else include the
+    // If the context is the same, include the word difference if it changed or else include the
     // clock change.
     result.agg_constraint(
         0,
@@ -136,15 +136,15 @@ fn enforce_delta<E: FieldElement>(
     constraint_count
 }
 
-/// A constraint evaluation function to enforce that the `f_scb` flag is set to 1 when the next row
-/// is in the same context and batch, and 0 otherwise.
-fn enforce_flag_same_context_and_batch<E: FieldElement>(
+/// A constraint evaluation function to enforce that the `f_scw` flag is set to 1 when the next row
+/// is in the same context and word, and 0 otherwise.
+fn enforce_flag_same_context_and_word<E: FieldElement>(
     frame: &EvaluationFrame<E>,
     result: &mut [E],
     memory_flag_no_last: E,
 ) -> usize {
     result[0] = memory_flag_no_last
-        * (frame.f_scb_next() - binary_not(frame.n0() + frame.not_n0() * frame.n1()));
+        * (frame.f_scw_next() - binary_not(frame.n0() + frame.not_n0() * frame.n1()));
 
     1
 }
@@ -153,10 +153,10 @@ fn enforce_flag_same_context_and_batch<E: FieldElement>(
 /// before being written and that when existing memory values are read they remain unchanged.
 ///
 /// The constraints on the values depend on a few factors:
-/// - When in the first row of a new context or batch, any of the 4 values of the batch that are not
+/// - When in the first row of a new context or word, any of the 4 values of the word that are not
 ///   written to must be set to 0.
-///   - This is because the memory is initialized to 0 when a new context or batch is started.
-/// - When we remain in the same context and batch, then this is when we want to enforce the "memory
+///   - This is because the memory is initialized to 0 when a new context or word is started.
+/// - When we remain in the same context and word, then this is when we want to enforce the "memory
 ///   property" that what was previously written must be read. Therefore, the values that are not
 ///   being written need to be equal to the values in the previous row (i.e. previously written, or
 ///   initialized to 0).
@@ -178,7 +178,7 @@ fn enforce_values<E: FieldElement>(
     // must not be accessed).
     //
     // As a result, `c_i` does not include the constraint of being in the memory chiplet, or in the
-    // same context and batch - these must be enforced separately.
+    // same context and word - these must be enforced separately.
     let (c0, c1, c2, c3) = {
         // intuition: the i'th `f` flag is set to 1 when `i == 2 * idx1 + idx0`
         let f0 = binary_not(frame.idx1_next()) * binary_not(frame.idx0_next());
@@ -204,19 +204,19 @@ fn enforce_values<E: FieldElement>(
     result[2] = memory_flag_first_row * c2 * frame.v_next(2);
     result[3] = memory_flag_first_row * c3 * frame.v_next(3);
 
-    // non-first row, new batch or context constraints: when row' is a new batch/ctx, and v'[i] is
+    // non-first row, new word or context constraints: when row' is a new word/ctx, and v'[i] is
     // not written to, then v'[i] must be 0.
-    result[4] = memory_flag_no_last * binary_not(frame.f_scb_next()) * c0 * frame.v_next(0);
-    result[5] = memory_flag_no_last * binary_not(frame.f_scb_next()) * c1 * frame.v_next(1);
-    result[6] = memory_flag_no_last * binary_not(frame.f_scb_next()) * c2 * frame.v_next(2);
-    result[7] = memory_flag_no_last * binary_not(frame.f_scb_next()) * c3 * frame.v_next(3);
+    result[4] = memory_flag_no_last * binary_not(frame.f_scw_next()) * c0 * frame.v_next(0);
+    result[5] = memory_flag_no_last * binary_not(frame.f_scw_next()) * c1 * frame.v_next(1);
+    result[6] = memory_flag_no_last * binary_not(frame.f_scw_next()) * c2 * frame.v_next(2);
+    result[7] = memory_flag_no_last * binary_not(frame.f_scw_next()) * c3 * frame.v_next(3);
 
-    // non-first row, same batch or context constraints: when row' is in the same batch/ctx, and
+    // non-first row, same word and context constraints: when row' is in the same word/ctx, and
     // v'[i] is not written to, then v'[i] must be equal to v[i].
-    result[8] = memory_flag_no_last * frame.f_scb_next() * c0 * (frame.v_next(0) - frame.v(0));
-    result[9] = memory_flag_no_last * frame.f_scb_next() * c1 * (frame.v_next(1) - frame.v(1));
-    result[10] = memory_flag_no_last * frame.f_scb_next() * c2 * (frame.v_next(2) - frame.v(2));
-    result[11] = memory_flag_no_last * frame.f_scb_next() * c3 * (frame.v_next(3) - frame.v(3));
+    result[8] = memory_flag_no_last * frame.f_scw_next() * c0 * (frame.v_next(0) - frame.v(0));
+    result[9] = memory_flag_no_last * frame.f_scw_next() * c1 * (frame.v_next(1) - frame.v(1));
+    result[10] = memory_flag_no_last * frame.f_scw_next() * c2 * (frame.v_next(2) - frame.v(2));
+    result[11] = memory_flag_no_last * frame.f_scw_next() * c3 * (frame.v_next(3) - frame.v(3));
 
     12
 }
@@ -250,14 +250,14 @@ trait EvaluationFrameExt<E: FieldElement> {
     fn ctx(&self) -> E;
     /// The current address.
     #[allow(dead_code)]
-    fn batch(&self) -> E;
-    /// The 0'th bit of the index of the memory address in the current batch.
+    fn word_next(&self) -> E;
+    /// The 0'th bit of the index of the memory address in the current word.
     fn idx0(&self) -> E;
-    /// The 0'th bit of the index of the memory address in the next batch.
+    /// The 0'th bit of the index of the memory address in the next word.
     fn idx0_next(&self) -> E;
-    /// The 1st bit of the index of the memory address in the current batch.
+    /// The 1st bit of the index of the memory address in the current word.
     fn idx1(&self) -> E;
-    /// The 1st bit of the index of the memory address in the next batch.
+    /// The 1st bit of the index of the memory address in the next word.
     fn idx1_next(&self) -> E;
     /// The current clock cycle.
     #[allow(dead_code)]
@@ -278,9 +278,9 @@ trait EvaluationFrameExt<E: FieldElement> {
     /// The next value of the column tracking the inverse delta used for constraint evaluations.
     fn d_inv_next(&self) -> E;
 
-    // The flag that indicates whether the next row is in the same batch and context as the current
+    // The flag that indicates whether the next row is in the same word and context as the current
     // row.
-    fn f_scb_next(&self) -> E;
+    fn f_scw_next(&self) -> E;
 
     // --- Intermediate variables & helpers -------------------------------------------------------
 
@@ -336,8 +336,8 @@ impl<E: FieldElement> EvaluationFrameExt<E> for &EvaluationFrame<E> {
     }
 
     #[inline(always)]
-    fn batch(&self) -> E {
-        self.next()[MEMORY_BATCH_COL_IDX]
+    fn word_next(&self) -> E {
+        self.next()[MEMORY_WORD_COL_IDX]
     }
 
     #[inline(always)]
@@ -396,8 +396,8 @@ impl<E: FieldElement> EvaluationFrameExt<E> for &EvaluationFrame<E> {
     }
 
     #[inline(always)]
-    fn f_scb_next(&self) -> E {
-        self.next()[MEMORY_FLAG_SAME_BATCH_AND_CONTEXT]
+    fn f_scw_next(&self) -> E {
+        self.next()[MEMORY_FLAG_SAME_CONTEXT_AND_WORD]
     }
 
     // --- Intermediate variables & helpers -------------------------------------------------------
@@ -419,7 +419,7 @@ impl<E: FieldElement> EvaluationFrameExt<E> for &EvaluationFrame<E> {
 
     #[inline(always)]
     fn n1(&self) -> E {
-        self.change(MEMORY_BATCH_COL_IDX) * self.d_inv_next()
+        self.change(MEMORY_WORD_COL_IDX) * self.d_inv_next()
     }
 
     #[inline(always)]
@@ -434,7 +434,7 @@ impl<E: FieldElement> EvaluationFrameExt<E> for &EvaluationFrame<E> {
 
     #[inline(always)]
     fn addr_change(&self) -> E {
-        self.change(MEMORY_BATCH_COL_IDX)
+        self.change(MEMORY_WORD_COL_IDX)
     }
 
     #[inline(always)]
diff --git a/air/src/constraints/chiplets/memory/tests.rs b/air/src/constraints/chiplets/memory/tests.rs
index 6b4a673234..0869c85174 100644
--- a/air/src/constraints/chiplets/memory/tests.rs
+++ b/air/src/constraints/chiplets/memory/tests.rs
@@ -4,7 +4,7 @@ use rand_utils::rand_value;
 use vm_core::{Felt, FieldElement, WORD_SIZE};
 
 use super::{
-    EvaluationFrame, MEMORY_BATCH_COL_IDX, MEMORY_CLK_COL_IDX, MEMORY_CTX_COL_IDX,
+    EvaluationFrame, MEMORY_WORD_COL_IDX, MEMORY_CLK_COL_IDX, MEMORY_CTX_COL_IDX,
     MEMORY_D0_COL_IDX, MEMORY_D1_COL_IDX, MEMORY_D_INV_COL_IDX, MEMORY_V_COL_RANGE,
 };
 use crate::{
@@ -12,7 +12,7 @@ use crate::{
     trace::{
         chiplets::{
             memory::{MEMORY_ACCESS_WORD, MEMORY_READ, MEMORY_WRITE},
-            MEMORY_FLAG_SAME_BATCH_AND_CONTEXT, MEMORY_IDX0_COL_IDX, MEMORY_IDX1_COL_IDX,
+            MEMORY_FLAG_SAME_CONTEXT_AND_WORD, MEMORY_IDX0_COL_IDX, MEMORY_IDX1_COL_IDX,
             MEMORY_IS_READ_COL_IDX, MEMORY_IS_WORD_ACCESS_COL_IDX,
         },
         TRACE_WIDTH,
@@ -37,7 +37,7 @@ fn test_memory_write() {
 
     // Write to a new address in the same context.
     let result =
-        get_constraint_evaluation(MEMORY_WRITE, MemoryTestDeltaType::Batch, &old_word, &new_word);
+        get_constraint_evaluation(MEMORY_WRITE, MemoryTestDeltaType::Word, &old_word, &new_word);
     assert_eq!(expected_constraint_evals, result);
 
     // Write to the same context and address at a new clock cycle.
@@ -65,7 +65,7 @@ fn test_memory_read() {
     // Read from a new address in the same context.
     let result = get_constraint_evaluation(
         MEMORY_READ,
-        MemoryTestDeltaType::Batch,
+        MemoryTestDeltaType::Word,
         &old_values,
         &init_values,
     );
@@ -90,7 +90,7 @@ fn test_memory_read() {
 /// - Clock: when the delta occurs in the clock column, context and address must stay fixed.
 enum MemoryTestDeltaType {
     Context,
-    Batch,
+    Word,
     Clock,
 }
 
@@ -146,7 +146,7 @@ fn get_test_frame(
 
     // Set the context, addr, and clock columns in the next row to the values in the delta row.
     next[MEMORY_CTX_COL_IDX] = Felt::new(delta_row[0]);
-    next[MEMORY_BATCH_COL_IDX] = Felt::new(delta_row[1]);
+    next[MEMORY_WORD_COL_IDX] = Felt::new(delta_row[1]);
     next[MEMORY_CLK_COL_IDX] = Felt::new(delta_row[2]);
 
     // Set the old and new values.
@@ -167,7 +167,7 @@ fn get_test_frame(
     let delta: u64 = match delta_type {
         MemoryTestDeltaType::Clock => delta_row[MemoryTestDeltaType::Clock as usize] - 1,
         MemoryTestDeltaType::Context => delta_row[MemoryTestDeltaType::Context as usize],
-        MemoryTestDeltaType::Batch => delta_row[MemoryTestDeltaType::Batch as usize],
+        MemoryTestDeltaType::Word => delta_row[MemoryTestDeltaType::Word as usize],
     };
     next[MEMORY_D0_COL_IDX] = Felt::new(delta as u16 as u64);
     next[MEMORY_D1_COL_IDX] = Felt::new(delta >> 16);
@@ -177,31 +177,31 @@ fn get_test_frame(
     next[MEMORY_IDX0_COL_IDX] = ZERO;
     next[MEMORY_IDX1_COL_IDX] = ZERO;
 
-    // If the context or batch columns are changed, the "same batch and context" flag should be
+    // If the context or word columns are changed, the "same context and word" flag should be
     // zero.
-    if delta_row[MemoryTestDeltaType::Batch as usize] > 0
+    if delta_row[MemoryTestDeltaType::Word as usize] > 0
         || delta_row[MemoryTestDeltaType::Context as usize] > 0
     {
-        next[MEMORY_FLAG_SAME_BATCH_AND_CONTEXT] = ZERO;
+        next[MEMORY_FLAG_SAME_CONTEXT_AND_WORD] = ZERO;
     } else {
-        next[MEMORY_FLAG_SAME_BATCH_AND_CONTEXT] = ONE;
+        next[MEMORY_FLAG_SAME_CONTEXT_AND_WORD] = ONE;
     }
 
     EvaluationFrame::<Felt>::from_rows(current, next)
 }
 
-/// Generates a row of valid test values for the context, batch, and clock columns according to
+/// Generates a row of valid test values for the context, word, and clock columns according to
 /// the specified delta type, which determines the column over which the delta and delta inverse
 /// values of the trace would be calculated.
 ///
-/// - When the delta type is Context, the batch and clock columns can be anything.
-/// - When the delta type is Batch, the context must remain unchanged but the clock can change.
-/// - When the delta type is Clock, both the context and batch columns must remain unchanged.
+/// - When the delta type is Context, the word and clock columns can be anything.
+/// - When the delta type is Word, the context must remain unchanged but the clock can change.
+/// - When the delta type is Clock, both the context and word columns must remain unchanged.
 fn get_test_delta_row(delta_type: &MemoryTestDeltaType) -> Vec<u64> {
     let delta_value = word_aligned_rand_value() as u64;
     let mut row = vec![0; 3];
     let ctx_idx = MemoryTestDeltaType::Context as usize;
-    let batch_idx = MemoryTestDeltaType::Batch as usize;
+    let word_addr_idx = MemoryTestDeltaType::Word as usize;
     let clk_idx = MemoryTestDeltaType::Clock as usize;
 
     // Set the context, addr, and clock columns according to the specified delta type.
@@ -211,13 +211,13 @@ fn get_test_delta_row(delta_type: &MemoryTestDeltaType) -> Vec<u64> {
             row[ctx_idx] = delta_value;
 
             // Set addr and clock in the row column to random values.
-            row[batch_idx] = word_aligned_rand_value() as u64;
+            row[word_addr_idx] = word_aligned_rand_value() as u64;
             row[clk_idx] = rand_value::<u32>() as u64;
         },
-        MemoryTestDeltaType::Batch => {
+        MemoryTestDeltaType::Word => {
             // Keep the context value the same in current and row rows (leave it as ZERO).
             // Set the row value for the address.
-            row[batch_idx] = delta_value;
+            row[word_addr_idx] = delta_value;
 
             // Set clock in the row column to a random value.
             row[clk_idx] = rand_value::<u32>() as u64;
diff --git a/air/src/trace/chiplets/memory.rs b/air/src/trace/chiplets/memory.rs
index 759af60f7c..f7e1af37ac 100644
--- a/air/src/trace/chiplets/memory.rs
+++ b/air/src/trace/chiplets/memory.rs
@@ -46,17 +46,17 @@ pub const IS_READ_COL_IDX: usize = 0;
 pub const IS_WORD_ACCESS_COL_IDX: usize = IS_READ_COL_IDX + 1;
 /// Column to hold the context ID of the current memory context.
 pub const CTX_COL_IDX: usize = IS_WORD_ACCESS_COL_IDX + 1;
-/// Column to hold the batch (i.e. group of 4 memory slots, referred to by the address of the first
-/// slot in the batch).
-pub const BATCH_COL_IDX: usize = CTX_COL_IDX + 1;
-/// Column to hold the first bit of the index of the address in the batch.
-pub const IDX0_COL_IDX: usize = BATCH_COL_IDX + 1;
-/// Column to hold the second bit of the index of the address in the batch.
+/// Column to hold the address of the word (i.e. group of 4 memory slots), which is the address of
+/// the first slot in the word.
+pub const WORD_COL_IDX: usize = CTX_COL_IDX + 1;
+/// Column to hold the first bit of the index of the address in the word.
+pub const IDX0_COL_IDX: usize = WORD_COL_IDX + 1;
+/// Column to hold the second bit of the index of the address in the word.
 pub const IDX1_COL_IDX: usize = IDX0_COL_IDX + 1;
 /// Column for the clock cycle in which the memory operation occurred.
 pub const CLK_COL_IDX: usize = IDX1_COL_IDX + 1;
-/// Columns to hold the values stored at a given memory context, batch, and clock cycle after
-/// the memory operation. When reading from a new batch, these are initialized to zero.
+/// Columns to hold the values stored at a given memory context, word, and clock cycle after
+/// the memory operation. When reading from a new word, these are initialized to zero.
 pub const V_COL_RANGE: Range<usize> = create_range(CLK_COL_IDX + 1, WORD_SIZE);
 /// Column for the lower 16-bits of the delta between two consecutive context IDs, addresses, or
 /// clock cycles.
@@ -67,6 +67,6 @@ pub const D1_COL_IDX: usize = D0_COL_IDX + 1;
 /// Column for the inverse of the delta between two consecutive context IDs, addresses, or clock
 /// cycles, used to enforce that changes are correctly constrained.
 pub const D_INV_COL_IDX: usize = D1_COL_IDX + 1;
-/// Column to hold the flag indicating whether the current memory operation is in the same batch and
+/// Column to hold the flag indicating whether the current memory operation is in the same word and
 /// same context as the previous operation.
-pub const FLAG_SAME_BATCH_AND_CONTEXT: usize = D_INV_COL_IDX + 1;
+pub const FLAG_SAME_CONTEXT_AND_WORD: usize = D_INV_COL_IDX + 1;
diff --git a/air/src/trace/chiplets/mod.rs b/air/src/trace/chiplets/mod.rs
index e0bc47b262..9ceeef90fb 100644
--- a/air/src/trace/chiplets/mod.rs
+++ b/air/src/trace/chiplets/mod.rs
@@ -98,7 +98,7 @@ pub const MEMORY_IS_WORD_ACCESS_COL_IDX: usize =
 /// The index within the main trace of the column containing the memory context.
 pub const MEMORY_CTX_COL_IDX: usize = MEMORY_TRACE_OFFSET + memory::CTX_COL_IDX;
 /// The index within the main trace of the column containing the memory address.
-pub const MEMORY_BATCH_COL_IDX: usize = MEMORY_TRACE_OFFSET + memory::BATCH_COL_IDX;
+pub const MEMORY_WORD_COL_IDX: usize = MEMORY_TRACE_OFFSET + memory::WORD_COL_IDX;
 /// The index within the main trace of the column containing the 0'th memory index.
 pub const MEMORY_IDX0_COL_IDX: usize = MEMORY_TRACE_OFFSET + memory::IDX0_COL_IDX;
 /// The index within the main trace of the column containing the 1st memory index.
@@ -121,7 +121,7 @@ pub const MEMORY_D1_COL_IDX: usize = MEMORY_TRACE_OFFSET + memory::D1_COL_IDX;
 /// memory context IDs, addresses, or clock cycles, used to enforce that changes are correctly
 /// constrained.
 pub const MEMORY_D_INV_COL_IDX: usize = MEMORY_TRACE_OFFSET + memory::D_INV_COL_IDX;
-/// Column to hold the flag indicating whether the current memory operation is in the same batch and
-/// same context as the previous operation.
-pub const MEMORY_FLAG_SAME_BATCH_AND_CONTEXT: usize =
-    MEMORY_TRACE_OFFSET + memory::FLAG_SAME_BATCH_AND_CONTEXT;
+/// Column to hold the flag indicating whether the current memory operation is in the same context
+/// and same word as the previous operation.
+pub const MEMORY_FLAG_SAME_CONTEXT_AND_WORD: usize =
+    MEMORY_TRACE_OFFSET + memory::FLAG_SAME_CONTEXT_AND_WORD;
diff --git a/air/src/trace/main_trace.rs b/air/src/trace/main_trace.rs
index 5f6d7981eb..ad3ef61970 100644
--- a/air/src/trace/main_trace.rs
+++ b/air/src/trace/main_trace.rs
@@ -9,7 +9,7 @@ use super::{
     chiplets::{
         hasher::{DIGEST_LEN, HASH_CYCLE_LEN, STATE_WIDTH},
         BITWISE_A_COL_IDX, BITWISE_B_COL_IDX, BITWISE_OUTPUT_COL_IDX, HASHER_NODE_INDEX_COL_IDX,
-        HASHER_STATE_COL_RANGE, MEMORY_BATCH_COL_IDX, MEMORY_CLK_COL_IDX, MEMORY_CTX_COL_IDX,
+        HASHER_STATE_COL_RANGE, MEMORY_WORD_COL_IDX, MEMORY_CLK_COL_IDX, MEMORY_CTX_COL_IDX,
         MEMORY_IDX0_COL_IDX, MEMORY_IDX1_COL_IDX, MEMORY_V_COL_RANGE,
     },
     decoder::{
@@ -371,16 +371,16 @@ impl MainTrace {
     }
 
     /// Returns the i-th row of the chiplet column containing memory address.
-    pub fn chiplet_memory_batch(&self, i: RowIndex) -> Felt {
-        self.columns.get_column(MEMORY_BATCH_COL_IDX)[i]
+    pub fn chiplet_memory_word(&self, i: RowIndex) -> Felt {
+        self.columns.get_column(MEMORY_WORD_COL_IDX)[i]
     }
 
-    /// Returns the i-th row of the chiplet column containing 0th bit of the batch index.
+    /// Returns the i-th row of the chiplet column containing 0th bit of the word index.
     pub fn chiplet_memory_idx0(&self, i: RowIndex) -> Felt {
         self.columns.get_column(MEMORY_IDX0_COL_IDX)[i]
     }
 
-    /// Returns the i-th row of the chiplet column containing 1st bit of the batch index.
+    /// Returns the i-th row of the chiplet column containing 1st bit of the word index.
     pub fn chiplet_memory_idx1(&self, i: RowIndex) -> Felt {
         self.columns.get_column(MEMORY_IDX1_COL_IDX)[i]
     }
diff --git a/processor/src/chiplets/aux_trace/mod.rs b/processor/src/chiplets/aux_trace/mod.rs
index f99ec978bd..b03586e0d8 100644
--- a/processor/src/chiplets/aux_trace/mod.rs
+++ b/processor/src/chiplets/aux_trace/mod.rs
@@ -892,11 +892,11 @@ where
         let ctx = main_trace.chiplet_memory_ctx(row);
         let clk = main_trace.chiplet_memory_clk(row);
         let address = {
-            let batch = main_trace.chiplet_memory_batch(row);
+            let word = main_trace.chiplet_memory_word(row);
             let idx0 = main_trace.chiplet_memory_idx0(row);
             let idx1 = main_trace.chiplet_memory_idx1(row);
 
-            batch + idx1.mul_small(2) + idx0
+            word + idx1.mul_small(2) + idx0
         };
 
         alphas[0] + build_value(&alphas[1..5], &[op_label, ctx, address, clk])
@@ -915,7 +915,7 @@ where
         } else if idx1 == ONE && idx0 == ONE {
             main_trace.chiplet_memory_value_3(row)
         } else {
-            panic!("Invalid batch indices. idx0: {idx0}, idx1: {idx1}");
+            panic!("Invalid word indices. idx0: {idx0}, idx1: {idx1}");
         };
 
         header + alphas[5].mul_base(value)
diff --git a/processor/src/chiplets/memory/mod.rs b/processor/src/chiplets/memory/mod.rs
index 3eb642dc79..02539264f5 100644
--- a/processor/src/chiplets/memory/mod.rs
+++ b/processor/src/chiplets/memory/mod.rs
@@ -2,8 +2,8 @@ use alloc::{collections::BTreeMap, vec::Vec};
 
 use miden_air::{
     trace::chiplets::memory::{
-        BATCH_COL_IDX, CLK_COL_IDX, CTX_COL_IDX, D0_COL_IDX, D1_COL_IDX, D_INV_COL_IDX,
-        FLAG_SAME_BATCH_AND_CONTEXT, IDX0_COL_IDX, IDX1_COL_IDX, IS_READ_COL_IDX,
+        WORD_COL_IDX, CLK_COL_IDX, CTX_COL_IDX, D0_COL_IDX, D1_COL_IDX, D_INV_COL_IDX,
+        FLAG_SAME_CONTEXT_AND_WORD, IDX0_COL_IDX, IDX1_COL_IDX, IS_READ_COL_IDX,
         IS_WORD_ACCESS_COL_IDX, MEMORY_ACCESS_ELEMENT, MEMORY_ACCESS_WORD, MEMORY_READ,
         MEMORY_WRITE, V_COL_RANGE,
     },
@@ -40,9 +40,9 @@ const INIT_MEM_VALUE: Word = EMPTY_WORD;
 /// The memory is comprised of one or more segments, each segment accessible from a specific
 /// execution context. The root (kernel) context has context ID 0, and all additional contexts have
 /// increasing IDs. Within each segment, the memory is element-addressable, even though the trace
-/// tracks batches of four elements for optimization purposes. That is, a single field element is
-/// located at each memory address, and we can read and write elements to/from memory either
-/// individually or in batches of four.
+/// tracks words for optimization purposes. That is, a single field element is located at each
+/// memory address, and we can read and write elements to/from memory either individually or in
+/// groups of four.
 ///
 /// Memory for a given address is always initialized to zero. That is, reading from an address
 /// before writing to it will return ZERO.
@@ -50,8 +50,8 @@ const INIT_MEM_VALUE: Word = EMPTY_WORD;
 /// ## Execution trace
 /// The layout of the memory access trace is shown below.
 ///
-///   rw   ew   ctx  batch   idx0   idx1  clk   v0   v1   v2   v3   d0   d1   d_inv   f_scb
-/// ├────┴────┴────┴───────┴──────┴──────┴────┴────┴────┴────┴────┴────┴────┴───────┴───────┤
+///   rw   ew   ctx  word_addr   idx0   idx1  clk   v0   v1   v2   v3   d0   d1   d_inv   f_scw
+/// ├────┴────┴────┴───────────┴──────┴──────┴────┴────┴────┴────┴────┴────┴────┴───────┴───────┤
 ///
 /// In the above, the meaning of the columns is as follows:
 /// - `rw` is a selector column used to identify whether the memory operation is a read or a write
@@ -61,26 +61,26 @@ const INIT_MEM_VALUE: Word = EMPTY_WORD;
 /// - `ctx` contains execution context ID. Values in this column must increase monotonically but
 ///   there can be gaps between two consecutive context IDs of up to 2^32. Also, two consecutive
 ///   values can be the same.
-/// - `batch` contains the the index of the batch of addresses, which is the address of the first
-///   element in the batch. For example, the value of `batch` for the batch of addresses 40, 41, 42,
-///   and 43 is 40. Note then that the first address of a batch *must* be divisible by 4. Values in
-///   this column must increase monotonically for a given context but there can be gaps between two
-///   consecutive values of up to 2^32. Also, two consecutive values can be the same.
+/// - `word_addr` contains the address of the first element in the word. For example, the value of
+///   `word_addr` for the group of addresses 40, 41, 42, 43 is 40. Note then that `word_addr` *must*
+///   be divisible by 4. Values in this column must increase monotonically for a given context but
+///   there can be gaps between two consecutive values of up to 2^32. Also, two consecutive values
+///   can be the same.
 /// - `clk` contains the clock cycle at which a memory operation happened. Values in this column
-///   must increase monotonically for a given context and batch but there can be gaps between two
+///   must increase monotonically for a given context and word but there can be gaps between two
 ///   consecutive values of up to 2^32.
-/// - Columns `v0`, `v1`, `v2`, `v3` contain field elements stored at a given context/batch/clock
+/// - Columns `v0`, `v1`, `v2`, `v3` contain field elements stored at a given context/word/clock
 ///   cycle after the memory operation.
 /// - Columns `d0` and `d1` contain lower and upper 16 bits of the delta between two consecutive
-///   context IDs, batches, or clock cycles. Specifically:
+///   context IDs, words, or clock cycles. Specifically:
 ///   - When the context changes, these columns contain (`new_ctx` - `old_ctx`).
-///   - When the context remains the same but the batch changes, these columns contain (`new_batch`
-///     - `old_batch`).
-///   - When both the context and the batch remain the same, these columns contain (`new_clk` -
+///   - When the context remains the same but the word changes, these columns contain
+///     (`new_word` - `old_word`).
+///   - When both the context and the word remain the same, these columns contain (`new_clk` -
 ///     `old_clk` - 1).
-/// - `d_inv` contains the inverse of the delta between two consecutive context IDs, batches, or
+/// - `d_inv` contains the inverse of the delta between two consecutive context IDs, words, or
 ///   clock cycles computed as described above. It is the field inverse of `(d_1 * 2^16) + d_0`
-/// - `f_scb` is a flag indicating whether the context and the batch of the current row are the same
+/// - `f_scw` is a flag indicating whether the context and the word of the current row are the same
 ///   as in the next row.
 ///
 /// For the first row of the trace, values in `d0`, `d1`, and `d_inv` are set to zeros.
@@ -319,7 +319,7 @@ impl Memory {
                 let felt_addr = Felt::from(addr);
                 for memory_access in addr_trace {
                     let clk = memory_access.clk();
-                    let value = memory_access.batch();
+                    let value = memory_access.word();
 
                     match memory_access.operation() {
                         MemoryOperation::Read => trace.set(row, IS_READ_COL_IDX, MEMORY_READ),
@@ -327,7 +327,7 @@ impl Memory {
                     }
                     let (idx1, idx0) = match memory_access.access_type() {
                         segment::MemoryAccessType::Element {
-                            addr_idx_in_batch: addr_idx_in_word,
+                            addr_idx_in_word,
                         } => {
                             trace.set(row, IS_WORD_ACCESS_COL_IDX, MEMORY_ACCESS_ELEMENT);
 
@@ -345,7 +345,7 @@ impl Memory {
                         },
                     };
                     trace.set(row, CTX_COL_IDX, ctx);
-                    trace.set(row, BATCH_COL_IDX, felt_addr);
+                    trace.set(row, WORD_COL_IDX, felt_addr);
                     trace.set(row, IDX0_COL_IDX, idx0);
                     trace.set(row, IDX1_COL_IDX, idx1);
                     trace.set(row, CLK_COL_IDX, clk);
@@ -369,9 +369,9 @@ impl Memory {
                     trace.set(row, D_INV_COL_IDX, delta.inv());
 
                     if prev_ctx == ctx && prev_addr == felt_addr {
-                        trace.set(row, FLAG_SAME_BATCH_AND_CONTEXT, ONE);
+                        trace.set(row, FLAG_SAME_CONTEXT_AND_WORD, ONE);
                     } else {
-                        trace.set(row, FLAG_SAME_BATCH_AND_CONTEXT, ZERO);
+                        trace.set(row, FLAG_SAME_CONTEXT_AND_WORD, ZERO);
                     };
 
                     // update values for the next iteration of the loop
@@ -403,9 +403,9 @@ impl Memory {
     // TEST HELPERS
     // --------------------------------------------------------------------------------------------
 
-    /// Returns the number of batches that were accessed at least once across all contexts.
+    /// Returns the number of words that were accessed at least once across all contexts.
     #[cfg(test)]
-    pub fn num_accessed_batches(&self) -> usize {
-        self.trace.iter().fold(0, |acc, (_, s)| acc + s.num_accessed_batches())
+    pub fn num_accessed_words(&self) -> usize {
+        self.trace.iter().fold(0, |acc, (_, s)| acc + s.num_accessed_words())
     }
 }
diff --git a/processor/src/chiplets/memory/segment.rs b/processor/src/chiplets/memory/segment.rs
index 9792066ba4..e2dac655d0 100644
--- a/processor/src/chiplets/memory/segment.rs
+++ b/processor/src/chiplets/memory/segment.rs
@@ -16,7 +16,7 @@ use crate::{ContextId, ExecutionError};
 ///
 /// A memory segment is an isolated address space accessible from a specific execution context.
 /// Within each segment, the memory is word-addressable. That is, four field elements are located
-/// at each memory address, and we can read and write elements to/from memory in batches of four.
+/// at each memory address, and we can read and write elements to/from memory in groups of four.
 #[derive(Debug, Default)]
 pub struct MemorySegmentTrace(BTreeMap<u32, Vec<MemorySegmentAccess>>);
 
@@ -30,11 +30,11 @@ impl MemorySegmentTrace {
     /// Unlike read() which modifies the memory access trace, this method returns the value at the
     /// specified address (if one exists) without altering the memory access trace.
     pub fn get_value(&self, addr: u32) -> Option<Felt> {
-        let (batch, addr_idx_in_word) = addr_to_batch_and_idx(addr);
+        let (word_addr, addr_idx_in_word) = addr_to_word_addr_and_idx(addr);
 
-        match self.0.get(&batch) {
+        match self.0.get(&word_addr) {
             Some(addr_trace) => {
-                addr_trace.last().map(|access| access.batch()[addr_idx_in_word as usize])
+                addr_trace.last().map(|access| access.word()[addr_idx_in_word as usize])
             },
             None => None,
         }
@@ -50,10 +50,10 @@ impl MemorySegmentTrace {
             return Err(());
         }
 
-        let (batch, _) = addr_to_batch_and_idx(addr);
+        let (word_addr, _) = addr_to_word_addr_and_idx(addr);
 
-        match self.0.get(&batch) {
-            Some(addr_trace) => Ok(addr_trace.last().map(|access| access.batch())),
+        match self.0.get(&word_addr) {
+            Some(addr_trace) => Ok(addr_trace.last().map(|access| access.word())),
             None => Ok(None),
         }
     }
@@ -74,13 +74,13 @@ impl MemorySegmentTrace {
         for (&addr, addr_trace) in self.0.iter() {
             match addr_trace.binary_search_by(|access| access.clk().as_int().cmp(&search_clk)) {
                 Ok(i) => {
-                    let batch = addr_trace[i].batch();
+                    let word_addr = addr_trace[i].word();
                     let addr: u64 = addr.into();
                     result.extend([
-                        (addr, batch[0]),
-                        (addr + 1, batch[1]),
-                        (addr + 2, batch[2]),
-                        (addr + 3, batch[3]),
+                        (addr, word_addr[0]),
+                        (addr + 1, word_addr[1]),
+                        (addr + 2, word_addr[2]),
+                        (addr + 3, word_addr[3]),
                     ]);
                 },
                 Err(i) => {
@@ -88,13 +88,13 @@ impl MemorySegmentTrace {
                     // Decrement the index to get the trace from the previously accessed clock
                     // cycle to insert into the results.
                     if i > 0 {
-                        let batch = addr_trace[i - 1].batch();
+                        let word_addr = addr_trace[i - 1].word();
                         let addr: u64 = addr.into();
                         result.extend([
-                            (addr, batch[0]),
-                            (addr + 1, batch[1]),
-                            (addr + 2, batch[2]),
-                            (addr + 3, batch[3]),
+                            (addr, word_addr[0]),
+                            (addr + 1, word_addr[1]),
+                            (addr + 2, word_addr[2]),
+                            (addr + 3, word_addr[3]),
                         ]);
                     }
                 },
@@ -115,16 +115,16 @@ impl MemorySegmentTrace {
     /// # Errors
     /// - Returns an error if the same address is accessed more than once in the same clock cycle.
     pub fn read(&mut self, ctx: ContextId, addr: u32, clk: Felt) -> Result<Felt, ExecutionError> {
-        let (batch, addr_idx_in_word) = addr_to_batch_and_idx(addr);
+        let (word_addr, addr_idx_in_word) = addr_to_word_addr_and_idx(addr);
 
-        let batch_values = self.read_batch(
+        let word = self.read_word_helper(
             ctx,
-            batch,
+            word_addr,
             clk,
-            MemoryAccessType::Element { addr_idx_in_batch: addr_idx_in_word },
+            MemoryAccessType::Element { addr_idx_in_word },
         )?;
 
-        Ok(batch_values[addr_idx_in_word as usize])
+        Ok(word[addr_idx_in_word as usize])
     }
 
     /// Returns a word located in memory starting at the specified address, which must be word
@@ -141,13 +141,13 @@ impl MemorySegmentTrace {
     pub fn read_word(
         &mut self,
         ctx: ContextId,
-        addr: u32,
+        word_addr: u32,
         clk: Felt,
     ) -> Result<Word, ExecutionError> {
-        debug_assert!(addr % 4 == 0, "unaligned word access: {addr}");
+        debug_assert!(word_addr % 4 == 0, "unaligned word access: {word_addr}");
 
-        let (batch, _) = addr_to_batch_and_idx(addr);
-        self.read_batch(ctx, batch, clk, MemoryAccessType::Word)
+        let (word_addr, _) = addr_to_word_addr_and_idx(word_addr);
+        self.read_word_helper(ctx, word_addr, clk, MemoryAccessType::Word)
     }
 
     /// Writes the element located at the specified address. The memory access is assumed to happen
@@ -164,47 +164,47 @@ impl MemorySegmentTrace {
         clk: Felt,
         value: Felt,
     ) -> Result<(), ExecutionError> {
-        let (batch, addr_idx_in_word) = addr_to_batch_and_idx(addr);
+        let (word_addr, addr_idx_in_word) = addr_to_word_addr_and_idx(addr);
 
-        match self.0.entry(batch) {
+        match self.0.entry(word_addr) {
             Entry::Vacant(vacant_entry) => {
-                // If this is the first access to the ctx/batch pair, then all values in the batch
+                // If this is the first access to the ctx/word pair, then all values in the word
                 // are initialized to 0, except for the address being written.
-                let batch = {
-                    let mut batch = Word::default();
-                    batch[addr_idx_in_word as usize] = value;
-                    batch
+                let word = {
+                    let mut word = Word::default();
+                    word[addr_idx_in_word as usize] = value;
+                    word
                 };
 
                 let access = MemorySegmentAccess::new(
                     clk,
                     MemoryOperation::Write,
-                    MemoryAccessType::Element { addr_idx_in_batch: addr_idx_in_word },
-                    batch,
+                    MemoryAccessType::Element { addr_idx_in_word },
+                    word,
                 );
                 vacant_entry.insert(vec![access]);
                 Ok(())
             },
             Entry::Occupied(mut occupied_entry) => {
-                // If the ctx/batch pair has been accessed before, then the values in the batch are
+                // If the ctx/word pair has been accessed before, then the values in the word are
                 // the same as the previous access, except for the address being written.
                 let addr_trace = occupied_entry.get_mut();
                 if addr_trace.last().expect("empty address trace").clk() == clk {
                     Err(ExecutionError::DuplicateMemoryAccess { ctx, addr, clk })
                 } else {
-                    let batch = {
-                        let mut last_batch =
-                            addr_trace.last().expect("empty address trace").batch();
-                        last_batch[addr_idx_in_word as usize] = value;
+                    let word = {
+                        let mut last_word =
+                            addr_trace.last().expect("empty address trace").word();
+                        last_word[addr_idx_in_word as usize] = value;
 
-                        last_batch
+                        last_word
                     };
 
                     let access = MemorySegmentAccess::new(
                         clk,
                         MemoryOperation::Write,
-                        MemoryAccessType::Element { addr_idx_in_batch: addr_idx_in_word },
-                        batch,
+                        MemoryAccessType::Element { addr_idx_in_word },
+                        word,
                     );
                     addr_trace.push(access);
 
@@ -232,18 +232,18 @@ impl MemorySegmentTrace {
     ) -> Result<(), ExecutionError> {
         debug_assert!(addr % 4 == 0, "unaligned memory access: {addr}");
 
-        let (batch, _) = addr_to_batch_and_idx(addr);
+        let (word_addr, _) = addr_to_word_addr_and_idx(addr);
 
         let access =
             MemorySegmentAccess::new(clk, MemoryOperation::Write, MemoryAccessType::Word, word);
-        match self.0.entry(batch) {
+        match self.0.entry(word_addr) {
             Entry::Vacant(vacant_entry) => {
-                // All values in the batch are set to the word being written.
+                // All values in the word are set to the word being written.
                 vacant_entry.insert(vec![access]);
                 Ok(())
             },
             Entry::Occupied(mut occupied_entry) => {
-                // All values in the batch are set to the word being written.
+                // All values in the word are set to the word being written.
                 let addr_trace = occupied_entry.get_mut();
                 if addr_trace.last().expect("empty address trace").clk() == clk {
                     Err(ExecutionError::DuplicateMemoryAccess { ctx, addr, clk })
@@ -271,23 +271,23 @@ impl MemorySegmentTrace {
     // HELPER FUNCTIONS
     // --------------------------------------------------------------------------------------------
 
-    /// Records a read operation on the specified batch at the specified clock cycle.
+    /// Records a read operation on the specified word at the specified clock cycle.
     ///
-    /// The access type either specifies the element in batch that was read, or that the entire word
+    /// The access type either specifies the element in word that was read, or that the entire word
     /// was read.
     /// 
     /// # Errors
     /// - Returns an error if the same address is accessed more than once in the same clock cycle.
-    fn read_batch(
+    fn read_word_helper(
         &mut self,
         ctx: ContextId,
-        batch: u32,
+        word_addr: u32,
         clk: Felt,
         access_type: MemoryAccessType,
     ) -> Result<Word, ExecutionError> {
-        match self.0.entry(batch) {
+        match self.0.entry(word_addr) {
             Entry::Vacant(vacant_entry) => {
-                // If this is the first access to the ctx/batch pair, then all values in the batch
+                // If this is the first access to the ctx/word pair, then all values in the word
                 // are initialized to 0.
                 let access = MemorySegmentAccess::new(
                     clk,
@@ -299,30 +299,30 @@ impl MemorySegmentTrace {
                 Ok(INIT_MEM_VALUE)
             },
             Entry::Occupied(mut occupied_entry) => {
-                // If the ctx/batch pair has been accessed before, then the values in the batch are
+                // If the ctx/word pair has been accessed before, then the values in the word are
                 // the same as the previous access.
                 let addr_trace = occupied_entry.get_mut();
                 if addr_trace.last().expect("empty address trace").clk() == clk {
-                    Err(ExecutionError::DuplicateMemoryAccess { ctx, addr: batch, clk })
+                    Err(ExecutionError::DuplicateMemoryAccess { ctx, addr: word_addr, clk })
                 } else {
-                    let last_batch = addr_trace.last().expect("empty address trace").batch();
+                    let last_word = addr_trace.last().expect("empty address trace").word();
                     let access = MemorySegmentAccess::new(
                         clk,
                         MemoryOperation::Read,
                         access_type,
-                        last_batch,
+                        last_word,
                     );
                     addr_trace.push(access);
 
-                    Ok(last_batch)
+                    Ok(last_word)
                 }
             },
         }
     }
 
-    /// Returns the number of batches that were accessed at least once.
+    /// Returns the number of words that were accessed at least once.
     #[cfg(test)]
-    pub fn num_accessed_batches(&self) -> usize {
+    pub fn num_accessed_words(&self) -> usize {
         self.0.len()
     }
 }
@@ -338,7 +338,7 @@ pub enum MemoryOperation {
 
 #[derive(Debug, Copy, Clone, PartialEq, Eq)]
 pub enum MemoryAccessType {
-    Element { addr_idx_in_batch: u8 },
+    Element { addr_idx_in_word: u8 },
     Word,
 }
 
@@ -349,12 +349,12 @@ pub struct MemorySegmentAccess {
     clk: Felt,
     operation: MemoryOperation,
     access_type: MemoryAccessType,
-    batch: Word,
+    word: Word,
 }
 
 impl MemorySegmentAccess {
-    fn new(clk: Felt, op: MemoryOperation, access_type: MemoryAccessType, batch: Word) -> Self {
-        Self { clk, operation: op, access_type, batch }
+    fn new(clk: Felt, op: MemoryOperation, access_type: MemoryAccessType, word: Word) -> Self {
+        Self { clk, operation: op, access_type, word }
     }
 
     /// Returns the clock cycle at which this memory access happened.
@@ -372,12 +372,12 @@ impl MemorySegmentAccess {
         self.access_type
     }
 
-    /// Returns the batch associated with this memory access.
+    /// Returns the word associated with this memory access.
     ///
-    /// For example, if the memory access is an element read of address 42, the batch will contain
+    /// For example, if the memory access is an element read of address 42, the word will contain
     /// the values of addresses 40, 41, 42, and 43.
-    pub(super) fn batch(&self) -> Word {
-        self.batch
+    pub(super) fn word(&self) -> Word {
+        self.word
     }
 }
 
@@ -385,9 +385,9 @@ impl MemorySegmentAccess {
 // ================================================================================================
 
 /// Splits an address into two components:
-/// 1. a batch, which is the closest value to `addr` that is both smaller and word aligned,  and
-/// 2. the index within the batch which `addr` represents.
-pub fn addr_to_batch_and_idx(addr: u32) -> (u32, u8) {
+/// 1. a word address, which is the largest word-aligned value not exceeding `addr`, and
+/// 2. the index within the word which `addr` represents.
+pub fn addr_to_word_addr_and_idx(addr: u32) -> (u32, u8) {
     let idx = addr % WORD_SIZE as u32;
     (addr - idx, idx as u8)
 }
diff --git a/processor/src/chiplets/memory/tests.rs b/processor/src/chiplets/memory/tests.rs
index 3180730132..a6e4aaab61 100644
--- a/processor/src/chiplets/memory/tests.rs
+++ b/processor/src/chiplets/memory/tests.rs
@@ -2,7 +2,7 @@ use alloc::vec::Vec;
 
 use miden_air::{
     trace::chiplets::memory::{
-        FLAG_SAME_BATCH_AND_CONTEXT, IDX0_COL_IDX, IDX1_COL_IDX, IS_READ_COL_IDX,
+        FLAG_SAME_CONTEXT_AND_WORD, IDX0_COL_IDX, IDX1_COL_IDX, IS_READ_COL_IDX,
         IS_WORD_ACCESS_COL_IDX, MEMORY_ACCESS_ELEMENT, MEMORY_ACCESS_WORD, MEMORY_READ,
         MEMORY_WRITE, TRACE_WIDTH as MEMORY_TRACE_WIDTH,
     },
@@ -13,7 +13,7 @@ use vm_core::{assert_matches, Word, WORD_SIZE};
 use super::{
     super::ZERO,
     segment::{MemoryAccessType, MemoryOperation},
-    Felt, FieldElement, Memory, TraceFragment, BATCH_COL_IDX, CLK_COL_IDX, CTX_COL_IDX, D0_COL_IDX,
+    Felt, FieldElement, Memory, TraceFragment, WORD_COL_IDX, CLK_COL_IDX, CTX_COL_IDX, D0_COL_IDX,
     D1_COL_IDX, D_INV_COL_IDX, EMPTY_WORD, ONE, V_COL_RANGE,
 };
 use crate::{ContextId, ExecutionError};
@@ -21,7 +21,7 @@ use crate::{ContextId, ExecutionError};
 #[test]
 fn mem_init() {
     let mem = Memory::default();
-    assert_eq!(0, mem.num_accessed_batches());
+    assert_eq!(0, mem.num_accessed_words());
     assert_eq!(0, mem.trace_len());
 }
 
@@ -33,38 +33,38 @@ fn mem_read() {
     let addr0 = ZERO;
     let value = mem.read(ContextId::root(), addr0, 1.into()).unwrap();
     assert_eq!(ZERO, value);
-    assert_eq!(1, mem.num_accessed_batches());
+    assert_eq!(1, mem.num_accessed_words());
     assert_eq!(1, mem.trace_len());
 
     // read a value from address 3; clk = 2
     let addr3 = Felt::from(3_u32);
     let value = mem.read(ContextId::root(), addr3, 2.into()).unwrap();
     assert_eq!(ZERO, value);
-    assert_eq!(1, mem.num_accessed_batches());
+    assert_eq!(1, mem.num_accessed_words());
     assert_eq!(2, mem.trace_len());
 
     // read a value from address 0 again; clk = 3
     let value = mem.read(ContextId::root(), addr0, 3.into()).unwrap();
     assert_eq!(ZERO, value);
-    assert_eq!(1, mem.num_accessed_batches());
+    assert_eq!(1, mem.num_accessed_words());
     assert_eq!(3, mem.trace_len());
 
     // read a value from address 2; clk = 4
     let addr2 = Felt::from(2_u32);
     let value = mem.read(ContextId::root(), addr2, 4.into()).unwrap();
     assert_eq!(ZERO, value);
-    assert_eq!(1, mem.num_accessed_batches());
+    assert_eq!(1, mem.num_accessed_words());
     assert_eq!(4, mem.trace_len());
 
     // check generated trace and memory data provided to the ChipletsBus; rows should be sorted only
-    // by clock cycle, since they all access the same batch
+    // by clock cycle, since they all access the same word
     let trace = build_trace(mem, 4);
 
     // clk 1
     let mut prev_row = [ZERO; MEMORY_TRACE_WIDTH];
     let memory_access = MemoryAccess::new(
         MemoryOperation::Read,
-        MemoryAccessType::Element { addr_idx_in_batch: 0 },
+        MemoryAccessType::Element { addr_idx_in_word: 0 },
         ContextId::root(),
         addr0,
         1.into(),
@@ -75,7 +75,7 @@ fn mem_read() {
     // clk 2
     let memory_access = MemoryAccess::new(
         MemoryOperation::Read,
-        MemoryAccessType::Element { addr_idx_in_batch: 3 },
+        MemoryAccessType::Element { addr_idx_in_word: 3 },
         ContextId::root(),
         addr3,
         2.into(),
@@ -86,7 +86,7 @@ fn mem_read() {
     // clk 3
     let memory_access = MemoryAccess::new(
         MemoryOperation::Read,
-        MemoryAccessType::Element { addr_idx_in_batch: 0 },
+        MemoryAccessType::Element { addr_idx_in_word: 0 },
         ContextId::root(),
         addr0,
         3.into(),
@@ -97,7 +97,7 @@ fn mem_read() {
     // clk 4
     let memory_access = MemoryAccess::new(
         MemoryOperation::Read,
-        MemoryAccessType::Element { addr_idx_in_batch: 2 },
+        MemoryAccessType::Element { addr_idx_in_word: 2 },
         ContextId::root(),
         addr2,
         4.into(),
@@ -133,7 +133,7 @@ fn mem_write() {
     let word1 = [ONE, ZERO, ZERO, ZERO];
     mem.write_word(ContextId::root(), addr0.into(), 1.into(), word1).unwrap();
     assert_eq!(word1, mem.get_word(ContextId::root(), addr0).unwrap().unwrap());
-    assert_eq!(1, mem.num_accessed_batches());
+    assert_eq!(1, mem.num_accessed_words());
     assert_eq!(1, mem.trace_len());
 
     // write a value into address 2; clk = 2
@@ -141,7 +141,7 @@ fn mem_write() {
     let value5 = Felt::new(5);
     mem.write(ContextId::root(), addr2.into(), 2.into(), value5).unwrap();
     assert_eq!(value5, mem.get_value(ContextId::root(), addr2).unwrap());
-    assert_eq!(1, mem.num_accessed_batches());
+    assert_eq!(1, mem.num_accessed_words());
     assert_eq!(2, mem.trace_len());
 
     // write a value into address 1; clk = 3
@@ -149,7 +149,7 @@ fn mem_write() {
     let value7 = Felt::new(7);
     mem.write(ContextId::root(), addr1.into(), 3.into(), value7).unwrap();
     assert_eq!(value7, mem.get_value(ContextId::root(), addr1).unwrap());
-    assert_eq!(1, mem.num_accessed_batches());
+    assert_eq!(1, mem.num_accessed_words());
     assert_eq!(3, mem.trace_len());
 
     // write a value into address 3; clk = 4
@@ -157,7 +157,7 @@ fn mem_write() {
     let value9 = Felt::new(9);
     mem.write(ContextId::root(), addr3.into(), 4.into(), value9).unwrap();
     assert_eq!(value9, mem.get_value(ContextId::root(), addr3).unwrap());
-    assert_eq!(1, mem.num_accessed_batches());
+    assert_eq!(1, mem.num_accessed_words());
     assert_eq!(4, mem.trace_len());
 
     // write a word into address 4; clk = 5
@@ -165,21 +165,21 @@ fn mem_write() {
     let word1234 = [ONE, 2_u32.into(), 3_u32.into(), 4_u32.into()];
     mem.write_word(ContextId::root(), addr4.into(), 5.into(), word1234).unwrap();
     assert_eq!(word1234, mem.get_word(ContextId::root(), addr4).unwrap().unwrap());
-    assert_eq!(2, mem.num_accessed_batches());
+    assert_eq!(2, mem.num_accessed_words());
     assert_eq!(5, mem.trace_len());
 
     // write a word into address 0; clk = 6
     let word5678: [Felt; 4] = [5_u32.into(), 6_u32.into(), 7_u32.into(), 8_u32.into()];
     mem.write_word(ContextId::root(), addr0.into(), 6.into(), word5678).unwrap();
     assert_eq!(word5678, mem.get_word(ContextId::root(), addr0).unwrap().unwrap());
-    assert_eq!(2, mem.num_accessed_batches());
+    assert_eq!(2, mem.num_accessed_words());
     assert_eq!(6, mem.trace_len());
 
     // check generated trace and memory data provided to the ChipletsBus; rows should be sorted by
     // address and then clock cycle
     let trace = build_trace(mem, 6);
 
-    // batch 0
+    // word 0
     let mut prev_row = [ZERO; MEMORY_TRACE_WIDTH];
     let memory_access = MemoryAccess::new(
         MemoryOperation::Write,
@@ -193,7 +193,7 @@ fn mem_write() {
 
     let memory_access = MemoryAccess::new(
         MemoryOperation::Write,
-        MemoryAccessType::Element { addr_idx_in_batch: 2 },
+        MemoryAccessType::Element { addr_idx_in_word: 2 },
         ContextId::root(),
         addr2.into(),
         2.into(),
@@ -203,7 +203,7 @@ fn mem_write() {
 
     let memory_access = MemoryAccess::new(
         MemoryOperation::Write,
-        MemoryAccessType::Element { addr_idx_in_batch: 1 },
+        MemoryAccessType::Element { addr_idx_in_word: 1 },
         ContextId::root(),
         addr1.into(),
         3.into(),
@@ -213,7 +213,7 @@ fn mem_write() {
 
     let memory_access = MemoryAccess::new(
         MemoryOperation::Write,
-        MemoryAccessType::Element { addr_idx_in_batch: 3 },
+        MemoryAccessType::Element { addr_idx_in_word: 3 },
         ContextId::root(),
         addr3.into(),
         4.into(),
@@ -231,7 +231,7 @@ fn mem_write() {
     );
     prev_row = verify_memory_access(&trace, 4, memory_access, prev_row);
 
-    // batch 1
+    // word 1
     let memory_access = MemoryAccess::new(
         MemoryOperation::Write,
         MemoryAccessType::Word,
@@ -326,7 +326,7 @@ fn mem_write_read() {
 
     let memory_access = MemoryAccess::new(
         MemoryOperation::Read,
-        MemoryAccessType::Element { addr_idx_in_batch: 3 },
+        MemoryAccessType::Element { addr_idx_in_word: 3 },
         ContextId::root(),
         3_u32.into(),
         clk,
@@ -337,7 +337,7 @@ fn mem_write_read() {
 
     let memory_access = MemoryAccess::new(
         MemoryOperation::Read,
-        MemoryAccessType::Element { addr_idx_in_batch: 2 },
+        MemoryAccessType::Element { addr_idx_in_word: 2 },
         ContextId::root(),
         2_u32.into(),
         clk,
@@ -348,7 +348,7 @@ fn mem_write_read() {
 
     let memory_access = MemoryAccess::new(
         MemoryOperation::Read,
-        MemoryAccessType::Element { addr_idx_in_batch: 1 },
+        MemoryAccessType::Element { addr_idx_in_word: 1 },
         ContextId::root(),
         1_u32.into(),
         clk,
@@ -359,7 +359,7 @@ fn mem_write_read() {
 
     let memory_access = MemoryAccess::new(
         MemoryOperation::Read,
-        MemoryAccessType::Element { addr_idx_in_batch: 0 },
+        MemoryAccessType::Element { addr_idx_in_word: 0 },
         ContextId::root(),
         ZERO,
         clk,
@@ -381,7 +381,7 @@ fn mem_write_read() {
 
     let memory_access = MemoryAccess::new(
         MemoryOperation::Write,
-        MemoryAccessType::Element { addr_idx_in_batch: 2 },
+        MemoryAccessType::Element { addr_idx_in_word: 2 },
         ContextId::root(),
         2_u32.into(),
         clk,
@@ -392,7 +392,7 @@ fn mem_write_read() {
 
     let memory_access = MemoryAccess::new(
         MemoryOperation::Read,
-        MemoryAccessType::Element { addr_idx_in_batch: 2 },
+        MemoryAccessType::Element { addr_idx_in_word: 2 },
         ContextId::root(),
         2_u32.into(),
         clk,
@@ -465,7 +465,7 @@ pub struct MemoryAccess {
     ctx: ContextId,
     addr: Felt,
     clk: Felt,
-    batch_values: [Felt; 4],
+    word_values: [Felt; 4],
 }
 
 impl MemoryAccess {
@@ -475,9 +475,9 @@ impl MemoryAccess {
         ctx: ContextId,
         addr: Felt,
         clk: RowIndex,
-        batch_values: Word,
+        word_values: Word,
     ) -> Self {
-        if let MemoryAccessType::Element { addr_idx_in_batch: addr_idx_in_word } = access_type {
+        if let MemoryAccessType::Element { addr_idx_in_word } = access_type {
             let addr: u32 = addr.try_into().unwrap();
             assert_eq!(addr_idx_in_word as u32, addr % WORD_SIZE as u32);
         }
@@ -488,7 +488,7 @@ impl MemoryAccess {
             ctx,
             addr,
             clk: Felt::from(clk),
-            batch_values,
+            word_values,
         }
     }
 }
@@ -520,19 +520,19 @@ fn build_trace_row(
         ctx,
         addr,
         clk,
-        batch_values,
+        word_values,
     } = memory_access;
 
-    let (batch, idx1, idx0) = {
+    let (word, idx1, idx0) = {
         let addr: u32 = addr.try_into().unwrap();
         let remainder = addr % WORD_SIZE as u32;
-        let batch = Felt::from(addr - remainder);
+        let word = Felt::from(addr - remainder);
 
         match remainder {
-            0 => (batch, ZERO, ZERO),
-            1 => (batch, ZERO, ONE),
-            2 => (batch, ONE, ZERO),
-            3 => (batch, ONE, ONE),
+            0 => (word, ZERO, ZERO),
+            1 => (word, ZERO, ONE),
+            2 => (word, ONE, ZERO),
+            3 => (word, ONE, ONE),
             _ => unreachable!(),
         }
     };
@@ -548,20 +548,20 @@ fn build_trace_row(
         MemoryAccessType::Word => MEMORY_ACCESS_WORD,
     };
     row[CTX_COL_IDX] = ctx.into();
-    row[BATCH_COL_IDX] = batch;
+    row[WORD_COL_IDX] = word;
     row[IDX0_COL_IDX] = idx0;
     row[IDX1_COL_IDX] = idx1;
     row[CLK_COL_IDX] = clk;
-    row[V_COL_RANGE.start] = batch_values[0];
-    row[V_COL_RANGE.start + 1] = batch_values[1];
-    row[V_COL_RANGE.start + 2] = batch_values[2];
-    row[V_COL_RANGE.start + 3] = batch_values[3];
+    row[V_COL_RANGE.start] = word_values[0];
+    row[V_COL_RANGE.start + 1] = word_values[1];
+    row[V_COL_RANGE.start + 2] = word_values[2];
+    row[V_COL_RANGE.start + 3] = word_values[3];
 
     if prev_row != [ZERO; MEMORY_TRACE_WIDTH] {
         let delta = if row[CTX_COL_IDX] != prev_row[CTX_COL_IDX] {
             row[CTX_COL_IDX] - prev_row[CTX_COL_IDX]
-        } else if row[BATCH_COL_IDX] != prev_row[BATCH_COL_IDX] {
-            row[BATCH_COL_IDX] - prev_row[BATCH_COL_IDX]
+        } else if row[WORD_COL_IDX] != prev_row[WORD_COL_IDX] {
+            row[WORD_COL_IDX] - prev_row[WORD_COL_IDX]
         } else {
             row[CLK_COL_IDX] - prev_row[CLK_COL_IDX] - ONE
         };
@@ -572,10 +572,10 @@ fn build_trace_row(
         row[D_INV_COL_IDX] = delta.inv();
     }
 
-    if row[BATCH_COL_IDX] == prev_row[BATCH_COL_IDX] && row[CTX_COL_IDX] == prev_row[CTX_COL_IDX] {
-        row[FLAG_SAME_BATCH_AND_CONTEXT] = ONE;
+    if row[WORD_COL_IDX] == prev_row[WORD_COL_IDX] && row[CTX_COL_IDX] == prev_row[CTX_COL_IDX] {
+        row[FLAG_SAME_CONTEXT_AND_WORD] = ONE;
     } else {
-        row[FLAG_SAME_BATCH_AND_CONTEXT] = ZERO;
+        row[FLAG_SAME_CONTEXT_AND_WORD] = ZERO;
     }
 
     row
diff --git a/processor/src/operations/io_ops.rs b/processor/src/operations/io_ops.rs
index b1ded47a2c..ad67c13316 100644
--- a/processor/src/operations/io_ops.rs
+++ b/processor/src/operations/io_ops.rs
@@ -296,7 +296,7 @@ mod tests {
     fn op_mloadw() {
         let mut host = DefaultHost::default();
         let mut process = Process::new_dummy_with_decoder_helpers_and_empty_stack();
-        assert_eq!(0, process.chiplets.memory().num_accessed_batches());
+        assert_eq!(0, process.chiplets.memory().num_accessed_words());
 
         // push a word onto the stack and save it at address 4
         let word = [1, 3, 5, 7].to_elements().try_into().unwrap();
@@ -315,7 +315,7 @@ mod tests {
         assert_eq!(expected_stack, process.stack.trace_state());
 
         // check memory state
-        assert_eq!(1, process.chiplets.memory().num_accessed_batches());
+        assert_eq!(1, process.chiplets.memory().num_accessed_words());
         assert_eq!(
             word,
             process.chiplets.memory().get_word(ContextId::root(), 4).unwrap().unwrap()
@@ -334,7 +334,7 @@ mod tests {
     fn op_mload() {
         let mut host = DefaultHost::default();
         let mut process = Process::new_dummy_with_decoder_helpers_and_empty_stack();
-        assert_eq!(0, process.chiplets.memory().num_accessed_batches());
+        assert_eq!(0, process.chiplets.memory().num_accessed_words());
 
         // push a word onto the stack and save it at address 4
         let word = [1, 3, 5, 7].to_elements().try_into().unwrap();
@@ -348,7 +348,7 @@ mod tests {
         assert_eq!(expected_stack, process.stack.trace_state());
 
         // check memory state
-        assert_eq!(1, process.chiplets.memory().num_accessed_batches());
+        assert_eq!(1, process.chiplets.memory().num_accessed_words());
         assert_eq!(
             word,
             process.chiplets.memory().get_word(ContextId::root(), 4).unwrap().unwrap()
@@ -377,7 +377,7 @@ mod tests {
         store_value(&mut process, 8, word2_felts, &mut host);
 
         // check memory state
-        assert_eq!(2, process.chiplets.memory().num_accessed_batches());
+        assert_eq!(2, process.chiplets.memory().num_accessed_words());
         assert_eq!(
             word1_felts,
             process.chiplets.memory().get_word(ContextId::root(), 4).unwrap().unwrap()
@@ -432,7 +432,7 @@ mod tests {
     fn op_mstorew() {
         let mut host = DefaultHost::default();
         let mut process = Process::new_dummy_with_decoder_helpers_and_empty_stack();
-        assert_eq!(0, process.chiplets.memory().num_accessed_batches());
+        assert_eq!(0, process.chiplets.memory().num_accessed_words());
 
         // push the first word onto the stack and save it at address 0
         let word1 = [1, 3, 5, 7].to_elements().try_into().unwrap();
@@ -443,7 +443,7 @@ mod tests {
         assert_eq!(expected_stack, process.stack.trace_state());
 
         // check memory state
-        assert_eq!(1, process.chiplets.memory().num_accessed_batches());
+        assert_eq!(1, process.chiplets.memory().num_accessed_words());
         assert_eq!(
             word1,
             process.chiplets.memory().get_word(ContextId::root(), 0).unwrap().unwrap()
@@ -458,7 +458,7 @@ mod tests {
         assert_eq!(expected_stack, process.stack.trace_state());
 
         // check memory state
-        assert_eq!(2, process.chiplets.memory().num_accessed_batches());
+        assert_eq!(2, process.chiplets.memory().num_accessed_words());
         assert_eq!(
             word1,
             process.chiplets.memory().get_word(ContextId::root(), 0).unwrap().unwrap()
@@ -481,7 +481,7 @@ mod tests {
     fn op_mstore() {
         let mut host = DefaultHost::default();
         let mut process = Process::new_dummy_with_decoder_helpers_and_empty_stack();
-        assert_eq!(0, process.chiplets.memory().num_accessed_batches());
+        assert_eq!(0, process.chiplets.memory().num_accessed_words());
 
         // push new element onto the stack and save it as first element of the word on
         // uninitialized memory at address 0
@@ -494,7 +494,7 @@ mod tests {
 
         // check memory state
         let mem_0 = [element, ZERO, ZERO, ZERO];
-        assert_eq!(1, process.chiplets.memory().num_accessed_batches());
+        assert_eq!(1, process.chiplets.memory().num_accessed_words());
         assert_eq!(
             mem_0,
             process.chiplets.memory().get_word(ContextId::root(), 0).unwrap().unwrap()
@@ -514,7 +514,7 @@ mod tests {
 
         // check memory state to make sure the other 3 elements were not affected
         let mem_2 = [element, Felt::new(3), Felt::new(5), Felt::new(7)];
-        assert_eq!(2, process.chiplets.memory().num_accessed_batches());
+        assert_eq!(2, process.chiplets.memory().num_accessed_words());
         assert_eq!(
             mem_2,
             process.chiplets.memory().get_word(ContextId::root(), 4).unwrap().unwrap()
@@ -560,7 +560,7 @@ mod tests {
         process.execute_op(Operation::Pipe, &mut host).unwrap();
 
         // check memory state contains the words from the advice stack
-        assert_eq!(2, process.chiplets.memory().num_accessed_batches());
+        assert_eq!(2, process.chiplets.memory().num_accessed_words());
         assert_eq!(
             word1_felts,
             process.chiplets.memory().get_word(ContextId::root(), 4).unwrap().unwrap()
diff --git a/processor/src/trace/tests/chiplets/memory.rs b/processor/src/trace/tests/chiplets/memory.rs
index df7697dd9f..9519682fab 100644
--- a/processor/src/trace/tests/chiplets/memory.rs
+++ b/processor/src/trace/tests/chiplets/memory.rs
@@ -5,7 +5,7 @@ use miden_air::{
             MEMORY_READ_WORD_LABEL, MEMORY_WRITE, MEMORY_WRITE_ELEMENT_LABEL,
             MEMORY_WRITE_WORD_LABEL,
         },
-        MEMORY_BATCH_COL_IDX, MEMORY_CLK_COL_IDX, MEMORY_CTX_COL_IDX, MEMORY_IDX0_COL_IDX,
+        MEMORY_WORD_COL_IDX, MEMORY_CLK_COL_IDX, MEMORY_CTX_COL_IDX, MEMORY_IDX0_COL_IDX,
         MEMORY_IDX1_COL_IDX, MEMORY_IS_READ_COL_IDX, MEMORY_IS_WORD_ACCESS_COL_IDX,
         MEMORY_V_COL_RANGE,
     },
@@ -240,11 +240,11 @@ fn build_expected_bus_msg_from_trace(
     // get the memory access data
     let ctx = trace.main_trace.get_column(MEMORY_CTX_COL_IDX)[row];
     let addr = {
-        let batch = trace.main_trace.get_column(MEMORY_BATCH_COL_IDX)[row];
+        let word = trace.main_trace.get_column(MEMORY_WORD_COL_IDX)[row];
         let idx1 = trace.main_trace.get_column(MEMORY_IDX1_COL_IDX)[row];
         let idx0 = trace.main_trace.get_column(MEMORY_IDX0_COL_IDX)[row];
 
-        batch + idx1.mul_small(2) + idx0
+        word + idx1.mul_small(2) + idx0
     };
     let clk = trace.main_trace.get_column(MEMORY_CLK_COL_IDX)[row];
 

From 19c3a9ca71708be4946ea022e358d7969f0ac215 Mon Sep 17 00:00:00 2001
From: Philippe Laferriere <plafer@protonmail.com>
Date: Mon, 6 Jan 2025 15:16:05 -0500
Subject: [PATCH 11/19] more PR fixes

---
 air/src/constraints/chiplets/memory/mod.rs     |  8 ++++----
 air/src/constraints/chiplets/memory/tests.rs   |  4 ++--
 air/src/trace/chiplets/mod.rs                  |  4 ++--
 air/src/trace/main_trace.rs                    |  4 ++--
 miden/src/cli/debug/executor.rs                | 14 ++++----------
 miden/src/helpers.rs                           |  6 ++++++
 miden/src/main.rs                              |  2 ++
 miden/src/repl/mod.rs                          |  8 ++------
 processor/src/chiplets/aux_trace/mod.rs        |  3 +--
 processor/src/chiplets/memory/mod.rs           | 12 +++++-------
 processor/src/chiplets/memory/segment.rs       | 11 +++++------
 processor/src/chiplets/memory/tests.rs         |  4 ++--
 processor/src/host/debug.rs                    | 18 +++++++++---------
 processor/src/operations/io_ops.rs             | 12 ++++++++++++
 .../operations/sys_ops/sys_event_handlers.rs   |  2 +-
 processor/src/trace/tests/chiplets/memory.rs   |  6 +++---
 stdlib/asm/crypto/dsa/rpo_falcon512.masm       | 10 +++++-----
 stdlib/asm/crypto/stark/random_coin.masm       |  2 ++
 stdlib/docs/mem.md                             |  2 +-
 19 files changed, 70 insertions(+), 62 deletions(-)
 create mode 100644 miden/src/helpers.rs

diff --git a/air/src/constraints/chiplets/memory/mod.rs b/air/src/constraints/chiplets/memory/mod.rs
index 4b4918750e..79af15c825 100644
--- a/air/src/constraints/chiplets/memory/mod.rs
+++ b/air/src/constraints/chiplets/memory/mod.rs
@@ -5,10 +5,10 @@ use winter_air::TransitionConstraintDegree;
 use super::{EvaluationFrame, FieldElement};
 use crate::{
     trace::chiplets::{
-        MEMORY_WORD_COL_IDX, MEMORY_CLK_COL_IDX, MEMORY_CTX_COL_IDX, MEMORY_D0_COL_IDX,
-        MEMORY_D1_COL_IDX, MEMORY_D_INV_COL_IDX, MEMORY_FLAG_SAME_CONTEXT_AND_WORD,
-        MEMORY_IDX0_COL_IDX, MEMORY_IDX1_COL_IDX, MEMORY_IS_READ_COL_IDX,
-        MEMORY_IS_WORD_ACCESS_COL_IDX, MEMORY_V_COL_RANGE,
+        MEMORY_CLK_COL_IDX, MEMORY_CTX_COL_IDX, MEMORY_D0_COL_IDX, MEMORY_D1_COL_IDX,
+        MEMORY_D_INV_COL_IDX, MEMORY_FLAG_SAME_CONTEXT_AND_WORD, MEMORY_IDX0_COL_IDX,
+        MEMORY_IDX1_COL_IDX, MEMORY_IS_READ_COL_IDX, MEMORY_IS_WORD_ACCESS_COL_IDX,
+        MEMORY_V_COL_RANGE, MEMORY_WORD_COL_IDX,
     },
     utils::{binary_not, is_binary, EvaluationResult},
 };
diff --git a/air/src/constraints/chiplets/memory/tests.rs b/air/src/constraints/chiplets/memory/tests.rs
index 0869c85174..fb0cacf934 100644
--- a/air/src/constraints/chiplets/memory/tests.rs
+++ b/air/src/constraints/chiplets/memory/tests.rs
@@ -4,8 +4,8 @@ use rand_utils::rand_value;
 use vm_core::{Felt, FieldElement, WORD_SIZE};
 
 use super::{
-    EvaluationFrame, MEMORY_WORD_COL_IDX, MEMORY_CLK_COL_IDX, MEMORY_CTX_COL_IDX,
-    MEMORY_D0_COL_IDX, MEMORY_D1_COL_IDX, MEMORY_D_INV_COL_IDX, MEMORY_V_COL_RANGE,
+    EvaluationFrame, MEMORY_CLK_COL_IDX, MEMORY_CTX_COL_IDX, MEMORY_D0_COL_IDX, MEMORY_D1_COL_IDX,
+    MEMORY_D_INV_COL_IDX, MEMORY_V_COL_RANGE, MEMORY_WORD_COL_IDX,
 };
 use crate::{
     chiplets::memory,
diff --git a/air/src/trace/chiplets/mod.rs b/air/src/trace/chiplets/mod.rs
index 9ceeef90fb..64991a8855 100644
--- a/air/src/trace/chiplets/mod.rs
+++ b/air/src/trace/chiplets/mod.rs
@@ -121,7 +121,7 @@ pub const MEMORY_D1_COL_IDX: usize = MEMORY_TRACE_OFFSET + memory::D1_COL_IDX;
 /// memory context IDs, addresses, or clock cycles, used to enforce that changes are correctly
 /// constrained.
 pub const MEMORY_D_INV_COL_IDX: usize = MEMORY_TRACE_OFFSET + memory::D_INV_COL_IDX;
-/// Column to hold the flag indicating whether the current memory operation is in the same context and
-/// same word as the previous operation.
+/// Column to hold the flag indicating whether the current memory operation is in the same context
+/// and same word as the previous operation.
 pub const MEMORY_FLAG_SAME_CONTEXT_AND_WORD: usize =
     MEMORY_TRACE_OFFSET + memory::FLAG_SAME_CONTEXT_AND_WORD;
diff --git a/air/src/trace/main_trace.rs b/air/src/trace/main_trace.rs
index ad3ef61970..9c0d101ff0 100644
--- a/air/src/trace/main_trace.rs
+++ b/air/src/trace/main_trace.rs
@@ -9,8 +9,8 @@ use super::{
     chiplets::{
         hasher::{DIGEST_LEN, HASH_CYCLE_LEN, STATE_WIDTH},
         BITWISE_A_COL_IDX, BITWISE_B_COL_IDX, BITWISE_OUTPUT_COL_IDX, HASHER_NODE_INDEX_COL_IDX,
-        HASHER_STATE_COL_RANGE, MEMORY_WORD_COL_IDX, MEMORY_CLK_COL_IDX, MEMORY_CTX_COL_IDX,
-        MEMORY_IDX0_COL_IDX, MEMORY_IDX1_COL_IDX, MEMORY_V_COL_RANGE,
+        HASHER_STATE_COL_RANGE, MEMORY_CLK_COL_IDX, MEMORY_CTX_COL_IDX, MEMORY_IDX0_COL_IDX,
+        MEMORY_IDX1_COL_IDX, MEMORY_V_COL_RANGE, MEMORY_WORD_COL_IDX,
     },
     decoder::{
         GROUP_COUNT_COL_IDX, HASHER_STATE_OFFSET, IN_SPAN_COL_IDX, IS_CALL_FLAG_COL_IDX,
diff --git a/miden/src/cli/debug/executor.rs b/miden/src/cli/debug/executor.rs
index 4f3a209213..7c3012d4b3 100644
--- a/miden/src/cli/debug/executor.rs
+++ b/miden/src/cli/debug/executor.rs
@@ -1,10 +1,9 @@
 use std::sync::Arc;
 
-use miden_vm::{
-    math::Felt, DefaultHost, MemAdviceProvider, Program, StackInputs, VmState, VmStateIterator,
-};
+use miden_vm::{DefaultHost, MemAdviceProvider, Program, StackInputs, VmState, VmStateIterator};
 
 use super::DebugCommand;
+use crate::helpers::print_mem_address;
 
 /// Holds debugger state and iterator used for debugging.
 pub struct DebugExecutor {
@@ -155,7 +154,7 @@ impl DebugExecutor {
     /// Prints all memory entries.
     pub fn print_memory(&self) {
         for &(address, mem) in self.vm_state.memory.iter() {
-            Self::print_memory_data(address, mem)
+            print_mem_address(address, mem)
         }
     }
 
@@ -167,7 +166,7 @@ impl DebugExecutor {
         });
 
         match entry {
-            Some(&mem) => Self::print_memory_data(address, mem),
+            Some(&mem) => print_mem_address(address, mem),
             None => println!("memory at address '{address}' not found"),
         }
     }
@@ -175,11 +174,6 @@ impl DebugExecutor {
     // HELPERS
     // --------------------------------------------------------------------------------------------
 
-    /// Prints memory data.
-    fn print_memory_data(address: u64, mem_value: Felt) {
-        println!("{address} {mem_value:?}");
-    }
-
     /// Prints help message
     fn print_help() {
         let message = "---------------------------------------------------------------------\n\
diff --git a/miden/src/helpers.rs b/miden/src/helpers.rs
new file mode 100644
index 0000000000..a83c65e753
--- /dev/null
+++ b/miden/src/helpers.rs
@@ -0,0 +1,6 @@
+use vm_core::Felt;
+
+/// Prints `addr` and the value stored there to stdout as a single `<addr> <value>` line.
+pub fn print_mem_address(addr: u64, mem_value: Felt) {
+    println!("{addr} {mem_value}")
+}
diff --git a/miden/src/main.rs b/miden/src/main.rs
index 85d15d73a7..b03d3c9202 100644
--- a/miden/src/main.rs
+++ b/miden/src/main.rs
@@ -10,6 +10,8 @@ mod cli;
 mod repl;
 mod tools;
 
+pub(crate) mod helpers;
+
 /// Root CLI struct
 #[derive(Parser, Debug)]
 #[clap(name = "Miden", about = "Miden CLI", version, rename_all = "kebab-case")]
diff --git a/miden/src/repl/mod.rs b/miden/src/repl/mod.rs
index 53cf3cecdb..ed3335c688 100644
--- a/miden/src/repl/mod.rs
+++ b/miden/src/repl/mod.rs
@@ -6,6 +6,8 @@ use processor::ContextId;
 use rustyline::{error::ReadlineError, DefaultEditor};
 use stdlib::StdLibrary;
 
+use crate::helpers::print_mem_address;
+
 // This work is in continuation to the amazing work done by team `Scribe`
 // [here](https://github.com/ControlCplusControlV/Scribe/blob/main/transpiler/src/repl.rs#L8)
 //
@@ -401,9 +403,3 @@ fn print_stack(stack: Vec<Felt>) {
     // converts the stack which is a vector of felt into string and prints it.
     println!("{}", stack.iter().map(|f| format!("{}", f)).collect::<Vec<_>>().join(" "),)
 }
-
-/// Accepts and returns a memory at an address by converting its register into integer
-/// from Felt.
-fn print_mem_address(addr: u64, mem_value: Felt) {
-    println!("{addr} {mem_value}")
-}
diff --git a/processor/src/chiplets/aux_trace/mod.rs b/processor/src/chiplets/aux_trace/mod.rs
index b03586e0d8..483d04f78a 100644
--- a/processor/src/chiplets/aux_trace/mod.rs
+++ b/processor/src/chiplets/aux_trace/mod.rs
@@ -943,8 +943,7 @@ where
     let root2 = main_trace.chiplet_kernel_root_2(row);
     let root3 = main_trace.chiplet_kernel_root_3(row);
 
-    let v =
-        alphas[0] + build_value(&alphas[1..6], &[Felt::from(op_label), root0, root1, root2, root3]);
+    let v = alphas[0] + build_value(&alphas[1..6], &[op_label, root0, root1, root2, root3]);
 
     let kernel_chiplet_selector = main_trace.chiplet_selector_4(row);
     v.mul_base(kernel_chiplet_selector) + E::from(ONE - kernel_chiplet_selector)
diff --git a/processor/src/chiplets/memory/mod.rs b/processor/src/chiplets/memory/mod.rs
index 02539264f5..f24b5b592f 100644
--- a/processor/src/chiplets/memory/mod.rs
+++ b/processor/src/chiplets/memory/mod.rs
@@ -2,10 +2,10 @@ use alloc::{collections::BTreeMap, vec::Vec};
 
 use miden_air::{
     trace::chiplets::memory::{
-        WORD_COL_IDX, CLK_COL_IDX, CTX_COL_IDX, D0_COL_IDX, D1_COL_IDX, D_INV_COL_IDX,
+        CLK_COL_IDX, CTX_COL_IDX, D0_COL_IDX, D1_COL_IDX, D_INV_COL_IDX,
         FLAG_SAME_CONTEXT_AND_WORD, IDX0_COL_IDX, IDX1_COL_IDX, IS_READ_COL_IDX,
         IS_WORD_ACCESS_COL_IDX, MEMORY_ACCESS_ELEMENT, MEMORY_ACCESS_WORD, MEMORY_READ,
-        MEMORY_WRITE, V_COL_RANGE,
+        MEMORY_WRITE, V_COL_RANGE, WORD_COL_IDX,
     },
     RowIndex,
 };
@@ -78,8 +78,8 @@ const INIT_MEM_VALUE: Word = EMPTY_WORD;
 ///     - `old_word`).
 ///   - When both the context and the word remain the same, these columns contain (`new_clk` -
 ///     `old_clk` - 1).
-/// - `d_inv` contains the inverse of the delta between two consecutive context IDs, words, or
-///   clock cycles computed as described above. It is the field inverse of `(d_1 * 2^16) + d_0`
+/// - `d_inv` contains the inverse of the delta between two consecutive context IDs, words, or clock
+///   cycles computed as described above. It is the field inverse of `(d_1 * 2^16) + d_0`
 /// - `f_scw` is a flag indicating whether the context and the word of the current row are the same
 ///   as in the next row.
 ///
@@ -326,9 +326,7 @@ impl Memory {
                         MemoryOperation::Write => trace.set(row, IS_READ_COL_IDX, MEMORY_WRITE),
                     }
                     let (idx1, idx0) = match memory_access.access_type() {
-                        segment::MemoryAccessType::Element {
-                            addr_idx_in_word,
-                        } => {
+                        segment::MemoryAccessType::Element { addr_idx_in_word } => {
                             trace.set(row, IS_WORD_ACCESS_COL_IDX, MEMORY_ACCESS_ELEMENT);
 
                             match addr_idx_in_word {
diff --git a/processor/src/chiplets/memory/segment.rs b/processor/src/chiplets/memory/segment.rs
index e2dac655d0..20ea1bb4f0 100644
--- a/processor/src/chiplets/memory/segment.rs
+++ b/processor/src/chiplets/memory/segment.rs
@@ -132,7 +132,7 @@ impl MemorySegmentTrace {
     ///
     /// If the word starting at the specified address hasn't been previously written to, four ZERO
     /// elements are returned. This effectively implies that memory is initialized to ZERO.
-    /// 
+    ///
     /// # Preconditions
     /// - Assumes that the address is word aligned.
     ///
@@ -193,8 +193,7 @@ impl MemorySegmentTrace {
                     Err(ExecutionError::DuplicateMemoryAccess { ctx, addr, clk })
                 } else {
                     let word = {
-                        let mut last_word =
-                            addr_trace.last().expect("empty address trace").word();
+                        let mut last_word = addr_trace.last().expect("empty address trace").word();
                         last_word[addr_idx_in_word as usize] = value;
 
                         last_word
@@ -216,9 +215,9 @@ impl MemorySegmentTrace {
 
     /// Writes the provided word starting at the specified address. The memory access is assumed to
     /// happen at the provided clock cycle.
-    /// 
+    ///
     /// # Preconditions
-    /// 
+    ///
     /// - Assumes that the address is word aligned.
     ///
     /// # Errors
@@ -275,7 +274,7 @@ impl MemorySegmentTrace {
     ///
     /// The access type either specifies the element in word that was read, or that the entire word
     /// was read.
-    /// 
+    ///
     /// # Errors
     /// - Returns an error if the same address is accessed more than once in the same clock cycle.
     fn read_word_helper(
diff --git a/processor/src/chiplets/memory/tests.rs b/processor/src/chiplets/memory/tests.rs
index a6e4aaab61..74123efa68 100644
--- a/processor/src/chiplets/memory/tests.rs
+++ b/processor/src/chiplets/memory/tests.rs
@@ -13,8 +13,8 @@ use vm_core::{assert_matches, Word, WORD_SIZE};
 use super::{
     super::ZERO,
     segment::{MemoryAccessType, MemoryOperation},
-    Felt, FieldElement, Memory, TraceFragment, WORD_COL_IDX, CLK_COL_IDX, CTX_COL_IDX, D0_COL_IDX,
-    D1_COL_IDX, D_INV_COL_IDX, EMPTY_WORD, ONE, V_COL_RANGE,
+    Felt, FieldElement, Memory, TraceFragment, CLK_COL_IDX, CTX_COL_IDX, D0_COL_IDX, D1_COL_IDX,
+    D_INV_COL_IDX, EMPTY_WORD, ONE, V_COL_RANGE, WORD_COL_IDX,
 };
 use crate::{ContextId, ExecutionError};
 
diff --git a/processor/src/host/debug.rs b/processor/src/host/debug.rs
index bb4c7ad46d..876e593662 100644
--- a/processor/src/host/debug.rs
+++ b/processor/src/host/debug.rs
@@ -74,7 +74,7 @@ impl Printer {
     /// Prints the whole memory state at the cycle `clk` in context `ctx`.
     fn print_mem_all(&self, process: ProcessState) {
         let mem = process.get_mem_state(self.ctx);
-        let ele_width = mem
+        let element_width = mem
             .iter()
             .map(|(_addr, value)| element_printed_width(Some(*value)))
             .max()
@@ -84,12 +84,12 @@ impl Printer {
 
         // print the main part of the memory (wihtout the last value)
         for (addr, value) in mem.iter().take(mem.len() - 1) {
-            print_mem_address(*addr as u32, Some(*value), false, false, ele_width);
+            print_mem_address(*addr as u32, Some(*value), false, false, element_width);
         }
 
         // print the last memory value
         if let Some((addr, value)) = mem.last() {
-            print_mem_address(*addr as u32, Some(*value), true, false, ele_width);
+            print_mem_address(*addr as u32, Some(*value), true, false, element_width);
         }
     }
 
@@ -154,7 +154,7 @@ impl Printer {
 /// If `is_local` is true, the output addresses are formatted as decimal values, otherwise as hex
 /// strings.
 fn print_interval(mem_interval: Vec<(u32, Option<Felt>)>, is_local: bool) {
-    let ele_width = mem_interval
+    let element_width = mem_interval
         .iter()
         .map(|(_addr, value)| element_printed_width(*value))
         .max()
@@ -162,12 +162,12 @@ fn print_interval(mem_interval: Vec<(u32, Option<Felt>)>, is_local: bool) {
 
     // print the main part of the memory (wihtout the last value)
     for (addr, mem_value) in mem_interval.iter().take(mem_interval.len() - 1) {
-        print_mem_address(*addr, *mem_value, false, is_local, ele_width)
+        print_mem_address(*addr, *mem_value, false, is_local, element_width)
     }
 
     // print the last memory value
     if let Some((addr, value)) = mem_interval.last() {
-        print_mem_address(*addr, *value, true, is_local, ele_width);
+        print_mem_address(*addr, *value, true, is_local, element_width);
     }
 }
 
@@ -180,7 +180,7 @@ fn print_mem_address(
     mem_value: Option<Felt>,
     is_last: bool,
     is_local: bool,
-    ele_width: usize,
+    element_width: usize,
 ) {
     if let Some(value) = mem_value {
         if is_last {
@@ -189,14 +189,14 @@ fn print_mem_address(
             } else {
                 print!("└── {addr:#010x}: ");
             }
-            println!("{:>width$}\n", value.as_int(), width = ele_width);
+            println!("{:>width$}\n", value.as_int(), width = element_width);
         } else {
             if is_local {
                 print!("├── {addr:>5}: ");
             } else {
                 print!("├── {addr:#010x}: ");
             }
-            println!("{:>width$}", value.as_int(), width = ele_width);
+            println!("{:>width$}", value.as_int(), width = element_width);
         }
     } else if is_last {
         if is_local {
diff --git a/processor/src/operations/io_ops.rs b/processor/src/operations/io_ops.rs
index ad67c13316..9c6bc8782d 100644
--- a/processor/src/operations/io_ops.rs
+++ b/processor/src/operations/io_ops.rs
@@ -32,6 +32,9 @@ impl Process {
     /// - The top four elements of the stack are overwritten with values retrieved from memory.
     ///
     /// Thus, the net result of the operation is that the stack is shifted left by one item.
+    ///
+    /// # Errors
+    /// - Returns an error if the address is not aligned to a word boundary.
     pub(super) fn op_mloadw(&mut self) -> Result<(), ExecutionError> {
         // get the address from the stack and read the word from current memory context
         let mut word = self.chiplets.memory_mut().read_word(
@@ -79,6 +82,9 @@ impl Process {
     ///   aligned on a word boundary. The items are not removed from the stack.
     ///
     /// Thus, the net result of the operation is that the stack is shifted left by one item.
+    ///
+    /// # Errors
+    /// - Returns an error if the address is not aligned to a word boundary.
     pub(super) fn op_mstorew(&mut self) -> Result<(), ExecutionError> {
         // get the address from the stack and build the word to be saved from the stack values
         let addr = self.stack.get(0);
@@ -133,6 +139,9 @@ impl Process {
     ///   stack order).
     /// - Memory address (in position 12) is incremented by 8.
     /// - All other stack elements remain the same.
+    ///
+    /// # Errors
+    /// - Returns an error if the address is not aligned to a word boundary.
     pub(super) fn op_mstream(&mut self) -> Result<(), ExecutionError> {
         const MEM_ADDR_STACK_IDX: usize = 12;
 
@@ -178,6 +187,9 @@ impl Process {
     /// - These words replace the top 8 elements of the stack (element-wise, in stack order).
     /// - Memory address (in position 12) is incremented by 8.
     /// - All other stack elements remain the same.
+    ///
+    /// # Errors
+    /// - Returns an error if the address is not aligned to a word boundary.
     pub(super) fn op_pipe(&mut self, host: &mut impl Host) -> Result<(), ExecutionError> {
         const MEM_ADDR_STACK_IDX: usize = 12;
 
diff --git a/processor/src/operations/sys_ops/sys_event_handlers.rs b/processor/src/operations/sys_ops/sys_event_handlers.rs
index 7ff647945b..295ed7990d 100644
--- a/processor/src/operations/sys_ops/sys_event_handlers.rs
+++ b/processor/src/operations/sys_ops/sys_event_handlers.rs
@@ -64,7 +64,7 @@ impl Process {
     }
 }
 
-/// Reads words from memory at the specified range and inserts them into the advice map under
+/// Reads elements from memory at the specified range and inserts them into the advice map under
 /// the key `KEY` located at the top of the stack.
 ///
 /// Inputs:
diff --git a/processor/src/trace/tests/chiplets/memory.rs b/processor/src/trace/tests/chiplets/memory.rs
index 9519682fab..79d77ece73 100644
--- a/processor/src/trace/tests/chiplets/memory.rs
+++ b/processor/src/trace/tests/chiplets/memory.rs
@@ -5,9 +5,9 @@ use miden_air::{
             MEMORY_READ_WORD_LABEL, MEMORY_WRITE, MEMORY_WRITE_ELEMENT_LABEL,
             MEMORY_WRITE_WORD_LABEL,
         },
-        MEMORY_WORD_COL_IDX, MEMORY_CLK_COL_IDX, MEMORY_CTX_COL_IDX, MEMORY_IDX0_COL_IDX,
-        MEMORY_IDX1_COL_IDX, MEMORY_IS_READ_COL_IDX, MEMORY_IS_WORD_ACCESS_COL_IDX,
-        MEMORY_V_COL_RANGE,
+        MEMORY_CLK_COL_IDX, MEMORY_CTX_COL_IDX, MEMORY_IDX0_COL_IDX, MEMORY_IDX1_COL_IDX,
+        MEMORY_IS_READ_COL_IDX, MEMORY_IS_WORD_ACCESS_COL_IDX, MEMORY_V_COL_RANGE,
+        MEMORY_WORD_COL_IDX,
     },
     RowIndex,
 };
diff --git a/stdlib/asm/crypto/dsa/rpo_falcon512.masm b/stdlib/asm/crypto/dsa/rpo_falcon512.masm
index c639a1493a..037ceb3d50 100644
--- a/stdlib/asm/crypto/dsa/rpo_falcon512.masm
+++ b/stdlib/asm/crypto/dsa/rpo_falcon512.masm
@@ -467,9 +467,9 @@ end
 #! We can compute s1 in a single pass by delaying the q-modular reduction til the end. This can
 #! be achieved through a careful analysis of the computation of the difference between pi and c.
 #!
-#! The i-th coefficient s1_i of s1 is equal to c_i - (pi_i - pi_{2048 + i}) which is equal to
-#! c_i  + pi_{2048 + i} - pi_i. Now, we know that the size of the pi_i coefficients is bounded by
-#! J := 512 * q^2 and this means that J + pi_{2048 + i} - pi_i does not Q-underflow and since
+#! The i-th coefficient s1_i of s1 is equal to c_i - (pi_i - pi_{512 + i}) which is equal to
+#! c_i  + pi_{512 + i} - pi_i. Now, we know that the size of the pi_i coefficients is bounded by
+#! J := 512 * q^2 and this means that J + pi_{512 + i} - pi_i does not Q-underflow and since
 #! J = 0 modulo q, the addition of J does not affect the final result. It is also important to
 #! note that adding J does not Q-overflow by virtue of q * 2^50 < Q.
 #! All of the above implies that we can compute s1_i with only one modular reduction at the end,
@@ -493,7 +493,7 @@ export.compute_s1_norm_sq
         dup.4 add.5124
         mem_loadw
 
-        # load pi_{i+2048}
+        # load pi_{i+512}
         padw
         dup.8 add.512
         mem_loadw
@@ -502,7 +502,7 @@ export.compute_s1_norm_sq
         padw
         dup.12
         mem_loadw
-        #=> [PI, PI_{i+2048}, C, pi_ptr, ...]
+        #=> [PI, PI_{i+512}, C, pi_ptr, ...]
 
         # 2) Compute the squared norm of (i + 0)-th coefficient of s1
         movup.8
diff --git a/stdlib/asm/crypto/stark/random_coin.masm b/stdlib/asm/crypto/stark/random_coin.masm
index f52ac05f56..d271c38fe6 100644
--- a/stdlib/asm/crypto/stark/random_coin.masm
+++ b/stdlib/asm/crypto/stark/random_coin.masm
@@ -1,3 +1,5 @@
+#! Disclaimer: most of the procedures in this file assume that the input pointers are word-aligned.
+
 use.std::crypto::stark::constants
 use.std::crypto::stark::utils
 use.std::crypto::hashes::rpo
diff --git a/stdlib/docs/mem.md b/stdlib/docs/mem.md
index 56c645cd8d..f88f914f00 100644
--- a/stdlib/docs/mem.md
+++ b/stdlib/docs/mem.md
@@ -2,7 +2,7 @@
 ## std::mem
 | Procedure | Description |
 | ----------- | ------------- |
-| memcopy_words | Copies `n` words from `read_ptr` to `write_ptr`.<br /><br />Stack transition looks as follows:<br />[n, read_ptr, write_ptr, ...] -> [...]<br />cycles: 15 + 16n<br /> |
+| memcopy_words | Copies `n` words from `read_ptr` to `write_ptr`, both of which must be word-aligned.<br /><br />Stack transition looks as follows:<br />[n, read_ptr, write_ptr, ...] -> [...]<br />cycles: 15 + 16n<br /> |
 | pipe_double_words_to_memory | Copies an even number of words from the advice_stack to memory.<br /><br />Input: [C, B, A, write_ptr, end_ptr, ...]<br />Output: [C, B, A, write_ptr, ...]<br /><br />Where:<br />- The words C, B, and A are the RPO hasher state<br />- A is the capacity<br />- C,B are the rate portion of the state<br />- The value `words = end_ptr - write_ptr` must be positive and even<br /><br />Cycles: 10 + 9 * word_pairs<br /> |
 | pipe_words_to_memory | Copies an arbitrary number of words from the advice stack to memory<br /><br />Input: [num_words, write_ptr, ...]<br />Output: [C, B, A, write_ptr', ...]<br />Cycles:<br />even num_words: 41 + 9 * num_words / 2<br />odd num_words: 58 + 9 * round_down(num_words / 2)<br /> |
 | pipe_preimage_to_memory | Moves an arbitrary number of words from the advice stack to memory and asserts it matches the commitment.<br /><br />Input: [num_words, write_ptr, COM, ...]<br />Output: [write_ptr', ...]<br />Cycles:<br />even num_words: 62 + 9 * num_words / 2<br />odd num_words: 79 + 9 * round_down(num_words / 2)<br /> |

From d9581d5f7b2ef1b97df1b0243340b1882b730aa5 Mon Sep 17 00:00:00 2001
From: Philippe Laferriere <plafer@protonmail.com>
Date: Mon, 6 Jan 2025 16:50:03 -0500
Subject: [PATCH 12/19] clippy

---
 core/src/debuginfo/source_file.rs |  2 +-
 stdlib/tests/crypto/keccak256.rs  | 14 +++++++-------
 2 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/core/src/debuginfo/source_file.rs b/core/src/debuginfo/source_file.rs
index 2f7fc69f76..9b750a6e0d 100644
--- a/core/src/debuginfo/source_file.rs
+++ b/core/src/debuginfo/source_file.rs
@@ -515,7 +515,7 @@ impl SourceContent {
     ///
     /// Returns `None` if the given index is out of bounds
     pub fn line_start(&self, line_index: LineIndex) -> Option<ByteIndex> {
-        self.line_starts.get(line_index.to_usize()).copied().map(ByteIndex::from)
+        self.line_starts.get(line_index.to_usize()).copied()
     }
 
     /// Returns the index of the last line in this file
diff --git a/stdlib/tests/crypto/keccak256.rs b/stdlib/tests/crypto/keccak256.rs
index 8991c87def..3c179bfb73 100644
--- a/stdlib/tests/crypto/keccak256.rs
+++ b/stdlib/tests/crypto/keccak256.rs
@@ -80,13 +80,13 @@ fn keccak256_2_to_1_hash() {
 fn to_stack(i_digest: &[u8], stack: &mut [u64]) {
     for i in 0..(i_digest.len() >> 3) {
         // byte array ( = 8 -bytes ) to little endian 64 -bit unsigned integer
-        let word = (i_digest[(i << 3) + 7] as u64) << 56
-            | (i_digest[(i << 3) + 6] as u64) << 48
-            | (i_digest[(i << 3) + 5] as u64) << 40
-            | (i_digest[(i << 3) + 4] as u64) << 32
-            | (i_digest[(i << 3) + 3] as u64) << 24
-            | (i_digest[(i << 3) + 2] as u64) << 16
-            | (i_digest[(i << 3) + 1] as u64) << 8
+        let word = ((i_digest[(i << 3) + 7] as u64) << 56)
+            | ((i_digest[(i << 3) + 6] as u64) << 48)
+            | ((i_digest[(i << 3) + 5] as u64) << 40)
+            | ((i_digest[(i << 3) + 4] as u64) << 32)
+            | ((i_digest[(i << 3) + 3] as u64) << 24)
+            | ((i_digest[(i << 3) + 2] as u64) << 16)
+            | ((i_digest[(i << 3) + 1] as u64) << 8)
             | (i_digest[i << 3] as u64);
 
         // split into higher/ lower bits of u64

From 9a81fe770f0c16297f4f57dd6699b44045b4c33f Mon Sep 17 00:00:00 2001
From: Philippe Laferriere <plafer@protonmail.com>
Date: Tue, 7 Jan 2025 10:29:18 -0500
Subject: [PATCH 13/19] cleanup

---
 air/src/constraints/chiplets/memory/mod.rs | 54 +++++++++++++---------
 air/src/constraints/chiplets/mod.rs        | 14 +++---
 air/src/trace/chiplets/mod.rs              |  4 --
 stdlib/asm/crypto/fri/helper.masm          |  3 +-
 stdlib/asm/crypto/stark/constants.masm     | 11 +++--
 stdlib/asm/crypto/stark/random_coin.masm   |  4 +-
 6 files changed, 48 insertions(+), 42 deletions(-)

diff --git a/air/src/constraints/chiplets/memory/mod.rs b/air/src/constraints/chiplets/memory/mod.rs
index 79af15c825..a5144e19b0 100644
--- a/air/src/constraints/chiplets/memory/mod.rs
+++ b/air/src/constraints/chiplets/memory/mod.rs
@@ -50,27 +50,36 @@ pub fn get_transition_constraint_count() -> usize {
 }
 
 /// Enforces constraints for the memory chiplet.
+///
+/// The flags are:
+/// - `memory_flag_all_rows`: a flag that is set to 1 when the current row is part of the memory
+///   chiplet,
+/// - `memory_flag_no_last_row`: a flag that is set to 1 when the current row is part of the memory
+///   chiplet, except when it is the last row of the chiplet,
+/// - `memory_flag_first_row`: a flag that is set to 1 when the *next* row is the first row of the
+///   memory chiplet.
 pub fn enforce_constraints<E: FieldElement>(
     frame: &EvaluationFrame<E>,
     result: &mut [E],
-    memory_flag: E,
-    memory_flag_no_last: E,
+    memory_flag_all_rows: E,
+    memory_flag_no_last_row: E,
     memory_flag_first_row: E,
 ) {
     // Constrain the binary columns.
-    let mut index = enforce_binary_columns(frame, result, memory_flag);
+    let mut index = enforce_binary_columns(frame, result, memory_flag_all_rows);
 
     // Constrain the values in the d inverse column.
-    index += enforce_d_inv(frame, &mut result[index..], memory_flag_no_last);
+    index += enforce_d_inv(frame, &mut result[index..], memory_flag_no_last_row);
 
     // Enforce values in ctx, addr, clk transition correctly.
-    index += enforce_delta(frame, &mut result[index..], memory_flag_no_last);
+    index += enforce_delta(frame, &mut result[index..], memory_flag_no_last_row);
 
     // Enforce the correct value for the f_scw flag.
-    index += enforce_flag_same_context_and_word(frame, &mut result[index..], memory_flag_no_last);
+    index +=
+        enforce_flag_same_context_and_word(frame, &mut result[index..], memory_flag_no_last_row);
 
     // Constrain the memory values.
-    enforce_values(frame, &mut result[index..], memory_flag_no_last, memory_flag_first_row);
+    enforce_values(frame, &mut result[index..], memory_flag_no_last_row, memory_flag_first_row);
 }
 
 // TRANSITION CONSTRAINT HELPERS
@@ -89,25 +98,24 @@ fn enforce_binary_columns<E: FieldElement>(
     4
 }
 
-// TODO(plafer): review these constraints
 /// A constraint evaluation function to enforce that the `d_inv` "delta inverse" column used to
 /// constrain the delta between two consecutive contexts, addresses, or clock cycles is updated
 /// correctly.
 fn enforce_d_inv<E: FieldElement>(
     frame: &EvaluationFrame<E>,
     result: &mut [E],
-    memory_flag_no_last: E,
+    memory_flag_no_last_row: E,
 ) -> usize {
     let constraint_count = 4;
 
     // n0 is binary
-    result[0] = memory_flag_no_last * is_binary(frame.n0());
+    result[0] = memory_flag_no_last_row * is_binary(frame.n0());
     // when the context changes, n0 should be set to 1.
-    result[1] = memory_flag_no_last * frame.not_n0() * frame.ctx_change();
+    result[1] = memory_flag_no_last_row * frame.not_n0() * frame.ctx_change();
     // when n0 is 0, n1 is binary.
-    result[2] = memory_flag_no_last * frame.not_n0() * is_binary(frame.n1());
-    // TODO(plafer)
-    result[3] = memory_flag_no_last * frame.not_n0() * frame.not_n1() * frame.addr_change();
+    result[2] = memory_flag_no_last_row * frame.not_n0() * is_binary(frame.n1());
+    // when n0 and n1 are 0, then `word_addr` doesn't change.
+    result[3] = memory_flag_no_last_row * frame.not_n0() * frame.not_n1() * frame.word_addr_change();
 
     constraint_count
 }
@@ -117,21 +125,21 @@ fn enforce_d_inv<E: FieldElement>(
 fn enforce_delta<E: FieldElement>(
     frame: &EvaluationFrame<E>,
     result: &mut [E],
-    memory_flag_no_last: E,
+    memory_flag_no_last_row: E,
 ) -> usize {
     let constraint_count = 1;
 
     // If the context changed, include the difference.
-    result[0] = memory_flag_no_last * frame.n0() * frame.ctx_change();
+    result[0] = memory_flag_no_last_row * frame.n0() * frame.ctx_change();
     // If the context is the same, include the word difference if it changed or else include the
     // clock change.
     result.agg_constraint(
         0,
-        memory_flag_no_last * frame.not_n0(),
-        frame.n1() * frame.addr_change() + frame.not_n1() * frame.clk_change(),
+        memory_flag_no_last_row * frame.not_n0(),
+        frame.n1() * frame.word_addr_change() + frame.not_n1() * frame.clk_change(),
     );
     // Always subtract the delta. It should offset the other changes.
-    result[0] -= memory_flag_no_last * frame.delta_next();
+    result[0] -= memory_flag_no_last_row * frame.delta_next();
 
     constraint_count
 }
@@ -141,9 +149,9 @@ fn enforce_delta<E: FieldElement>(
 fn enforce_flag_same_context_and_word<E: FieldElement>(
     frame: &EvaluationFrame<E>,
     result: &mut [E],
-    memory_flag_no_last: E,
+    memory_flag_no_last_row: E,
 ) -> usize {
-    result[0] = memory_flag_no_last
+    result[0] = memory_flag_no_last_row
         * (frame.f_scw_next() - binary_not(frame.n0() + frame.not_n0() * frame.n1()));
 
     1
@@ -300,7 +308,7 @@ trait EvaluationFrameExt<E: FieldElement> {
     /// The difference between the next context and the current context.
     fn ctx_change(&self) -> E;
     /// The difference between the next address and the current address.
-    fn addr_change(&self) -> E;
+    fn word_addr_change(&self) -> E;
     /// The difference between the next clock value and the current one, minus 1.
     fn clk_change(&self) -> E;
     /// The delta between two consecutive context IDs, addresses, or clock cycles.
@@ -433,7 +441,7 @@ impl<E: FieldElement> EvaluationFrameExt<E> for &EvaluationFrame<E> {
     }
 
     #[inline(always)]
-    fn addr_change(&self) -> E {
+    fn word_addr_change(&self) -> E {
         self.change(MEMORY_WORD_COL_IDX)
     }
 
diff --git a/air/src/constraints/chiplets/mod.rs b/air/src/constraints/chiplets/mod.rs
index 4263273855..d29b8000e4 100644
--- a/air/src/constraints/chiplets/mod.rs
+++ b/air/src/constraints/chiplets/mod.rs
@@ -86,7 +86,6 @@ pub fn enforce_constraints<E: FieldElement<BaseField = Felt>>(
     );
     constraint_offset += bitwise::get_transition_constraint_count();
 
-    // TODO(plafer): refactor
     // memory transition constraints
     memory::enforce_constraints(
         frame,
@@ -150,19 +149,20 @@ trait EvaluationFrameExt<E: FieldElement> {
     /// Flag to indicate whether the frame is in the bitwise portion of the Chiplets trace.
     fn bitwise_flag(&self) -> E;
 
-    /// Flag to indicate whether the frame is in the memory portion of the Chiplets trace.
+    /// Flag to indicate whether the current row of the frame is in the memory portion of the
+    /// Chiplets trace.
     fn memory_flag(&self) -> E;
 
-    /// Flag to indicate whether the frame is in the memory portion of the Chiplets trace, except
-    /// for the last memory chiplet row.
+    /// Flag to indicate whether the current row of the frame is in the memory portion of the
+    /// Chiplets trace, except for the last memory chiplet row.
     fn memory_flag_no_last(&self) -> E;
 
-    /// Flag to indicate whether the next row in the frame is in the memory portion of the Chiplets
+    /// Flag to indicate whether the next row of the frame is in the memory portion of the Chiplets
     /// trace.
     fn memory_flag_next(&self) -> E;
 
-    /// Flag to indicate whether the next row in the frame is in the memory portion of the Chiplets
-    /// trace.
+    /// Flag to indicate whether the next row of the frame is the first row of the memory portion of
+    /// the Chiplets trace.
     fn memory_flag_first_row(&self) -> E;
 }
 
diff --git a/air/src/trace/chiplets/mod.rs b/air/src/trace/chiplets/mod.rs
index 64991a8855..5186254c0a 100644
--- a/air/src/trace/chiplets/mod.rs
+++ b/air/src/trace/chiplets/mod.rs
@@ -86,10 +86,6 @@ pub const BITWISE_OUTPUT_COL_IDX: usize = BITWISE_TRACE_OFFSET + bitwise::OUTPUT
 
 // --- GLOBALLY-INDEXED CHIPLET COLUMN ACCESSORS: MEMORY ------------------------------------------
 
-// TODO(plafer): remove unused constants at the end
-/// The index within the main trace of the column containing the first memory selector, which
-/// indicates the operation (read or write).
-pub const MEMORY_SELECTORS_COL_IDX: usize = MEMORY_TRACE_OFFSET;
 /// The index within the main trace of the column containing the memory read/write column.
 pub const MEMORY_IS_READ_COL_IDX: usize = MEMORY_TRACE_OFFSET + memory::IS_READ_COL_IDX;
 /// The index within the main trace of the column containing the memory element/word column.
diff --git a/stdlib/asm/crypto/fri/helper.masm b/stdlib/asm/crypto/fri/helper.masm
index 884f8afa21..c97dfa1033 100644
--- a/stdlib/asm/crypto/fri/helper.masm
+++ b/stdlib/asm/crypto/fri/helper.masm
@@ -179,8 +179,7 @@ export.load_and_verify_remainder
 
     # adv_pipe the remainder codeword
     ## Get the length of remainder
-    # TODO(plafer): add a `padw` here
-    exec.constants::tmp6 mem_loadw
+    padw exec.constants::tmp6 mem_loadw
     ## Compute the correct remainder pointer using length of remainder
     exec.constants::fri_com_ptr
     #=> [fri_com_ptr, num_fri_layers, remainder_size, lde_size, lde_size]
diff --git a/stdlib/asm/crypto/stark/constants.masm b/stdlib/asm/crypto/stark/constants.masm
index 2d5c5bf9e7..a77b487884 100644
--- a/stdlib/asm/crypto/stark/constants.masm
+++ b/stdlib/asm/crypto/stark/constants.masm
@@ -6,15 +6,16 @@ const.ROOT_UNITY=7277203076849721926
 const.DOMAIN_OFFSET=7
 const.DOMAIN_OFFSET_INV=2635249152773512046
 
-# TODO(plafer): remove "MULTIPLIED_BY_TWO"
 # Number of coefficients corresponds to the number of boundary + transition constraints 
 # (including auxiliary constraints)
-const.NUM_CONSTRAINT_COMPOSITION_COEF_MULTIPLIED_BY_TWO_ROUNDED_UP_TO_FOUR=232
+const.NUM_CONSTRAINT_COMPOSITION_COEF_ROUNDED_UP_TO_FOUR=232
 
 # Number of coefficients corresponds to "number of main & aux columns" + 8,
 # where "8" is the number of columns needed to store the constraint composition polynomial.
 const.NUM_DEEP_COMPOSITION_COEF_MULTIPLIED_BY_TWO_ROUNDED_UP_TO_FOUR=88
 
+# Number of random extension field coefficients related to the auxiliary trace (i.e. the alphas)
+const.NUM_AUX_TRACE_COEFS=16
 
 # MEMORY POINTERS
 # =================================================================================================
@@ -159,6 +160,10 @@ export.root_unity
     push.ROOT_UNITY
 end
 
+export.num_aux_trace_coefs
+    push.NUM_AUX_TRACE_COEFS
+end
+
 # Procedure to push the trace domain generator address to the stack.
 #
 # Input: [...]
@@ -177,7 +182,7 @@ export.domain_offset_inv
 end
 
 export.num_constraint_composition_coef_multiplied_by_two_and_rounded_up_to_4
-    push.NUM_CONSTRAINT_COMPOSITION_COEF_MULTIPLIED_BY_TWO_ROUNDED_UP_TO_FOUR
+    push.NUM_CONSTRAINT_COMPOSITION_COEF_ROUNDED_UP_TO_FOUR
 end
 
 export.num_deep_composition_coef_multiplied_by_two_and_rounded_up_to_4
diff --git a/stdlib/asm/crypto/stark/random_coin.masm b/stdlib/asm/crypto/stark/random_coin.masm
index d271c38fe6..3e94abb234 100644
--- a/stdlib/asm/crypto/stark/random_coin.masm
+++ b/stdlib/asm/crypto/stark/random_coin.masm
@@ -452,9 +452,7 @@ end
 #! Output: [...]
 #! Cycles: 159
 export.generate_aux_randomness
-
-    # TODO(plafer): make 16 a constant in constants.masm
-    push.16 swap
+    exec.constants::num_aux_trace_coefs swap
     exec.generate_random_coefficients
     #=> [...]
 end

From 6e4fd25b22a8d2429e8af79bbcc5e579efcd34c9 Mon Sep 17 00:00:00 2001
From: Philippe Laferriere <plafer@protonmail.com>
Date: Tue, 7 Jan 2025 13:38:01 -0500
Subject: [PATCH 14/19] feat: make the number of locals element-addressable

---
 air/src/constraints/chiplets/memory/mod.rs    |  3 +-
 assembly/src/assembler/instruction/mem_ops.rs |  8 ++--
 assembly/src/assembler/instruction/mod.rs     | 45 ++++++++++++-------
 assembly/src/assembler/mod.rs                 | 18 +++++---
 assembly/src/errors.rs                        | 18 ++++++++
 assembly/src/tests.rs                         | 26 +++++------
 test-utils/src/lib.rs                         |  2 +-
 7 files changed, 81 insertions(+), 39 deletions(-)

diff --git a/air/src/constraints/chiplets/memory/mod.rs b/air/src/constraints/chiplets/memory/mod.rs
index a5144e19b0..d91a1fa805 100644
--- a/air/src/constraints/chiplets/memory/mod.rs
+++ b/air/src/constraints/chiplets/memory/mod.rs
@@ -115,7 +115,8 @@ fn enforce_d_inv<E: FieldElement>(
     // when n0 is 0, n1 is binary.
     result[2] = memory_flag_no_last_row * frame.not_n0() * is_binary(frame.n1());
     // when n0 and n1 are 0, then `word_addr` doesn't change.
-    result[3] = memory_flag_no_last_row * frame.not_n0() * frame.not_n1() * frame.word_addr_change();
+    result[3] =
+        memory_flag_no_last_row * frame.not_n0() * frame.not_n1() * frame.word_addr_change();
 
     constraint_count
 }
diff --git a/assembly/src/assembler/instruction/mem_ops.rs b/assembly/src/assembler/instruction/mem_ops.rs
index 09ec16d596..98b5c5c9f1 100644
--- a/assembly/src/assembler/instruction/mem_ops.rs
+++ b/assembly/src/assembler/instruction/mem_ops.rs
@@ -1,6 +1,6 @@
 use alloc::string::ToString;
 
-use vm_core::{Felt, Operation::*, WORD_SIZE};
+use vm_core::{Felt, Operation::*};
 
 use super::{push_felt, push_u32_value, validate_param, BasicBlockBuilder};
 use crate::{assembler::ProcedureContext, diagnostics::Report, AssemblyError};
@@ -128,9 +128,9 @@ pub fn local_to_absolute_addr(
     validate_param(index_of_local, 0..=max)?;
 
     // Local values are placed under the frame pointer, so we need to calculate the offset of the
-    // local value from the frame pointer. Local values are also indexed by word, so we need to
-    // multiply the index by the word size.
-    let fmp_offset_of_local = (max - index_of_local) * WORD_SIZE as u16;
+    // local value from the frame pointer.
+    // The offset is in the range [1, num_proc_locals], which is then subtracted from `fmp`.
+    let fmp_offset_of_local = num_proc_locals - index_of_local;
     push_felt(block_builder, -Felt::from(fmp_offset_of_local));
     block_builder.push_op(FmpAdd);
 
diff --git a/assembly/src/assembler/instruction/mod.rs b/assembly/src/assembler/instruction/mod.rs
index 34b45bc156..889e7c3685 100644
--- a/assembly/src/assembler/instruction/mod.rs
+++ b/assembly/src/assembler/instruction/mod.rs
@@ -1,7 +1,7 @@
 use core::ops::RangeBounds;
 
 use miette::miette;
-use vm_core::{mast::MastNodeId, Decorator, ONE, ZERO};
+use vm_core::{debuginfo::Spanned, mast::MastNodeId, Decorator, ONE, WORD_SIZE, ZERO};
 
 use super::{ast::InvokeKind, Assembler, BasicBlockBuilder, Felt, Operation, ProcedureContext};
 use crate::{ast::Instruction, utils::bound_into_included_u64, AssemblyError, Span};
@@ -331,13 +331,21 @@ impl Assembler {
                 true,
                 true,
             )?,
-            Instruction::LocLoadW(v) => mem_ops::mem_read(
-                block_builder,
-                proc_ctx,
-                Some(v.expect_value() as u32),
-                true,
-                false,
-            )?,
+            Instruction::LocLoadW(v) => {
+                let local_addr = v.expect_value();
+                if local_addr % WORD_SIZE as u16 != 0 {
+                    return Err(AssemblyError::InvalidLocalWordIndex {
+                        span: instruction.span(),
+                        source_file: proc_ctx
+                            .source_manager()
+                            .get(proc_ctx.span().source_id())
+                            .ok(),
+                        local_addr,
+                    });
+                }
+
+                mem_ops::mem_read(block_builder, proc_ctx, Some(local_addr as u32), true, false)?
+            },
             Instruction::MemStore => block_builder.push_ops([MStore, Drop]),
             Instruction::MemStoreW => block_builder.push_ops([MStoreW]),
             Instruction::MemStoreImm(v) => {
@@ -353,14 +361,21 @@ impl Assembler {
                 true,
                 true,
             )?,
-            Instruction::LocStoreW(v) => mem_ops::mem_write_imm(
-                block_builder,
-                proc_ctx,
-                v.expect_value() as u32,
-                true,
-                false,
-            )?,
+            Instruction::LocStoreW(v) => {
+                let local_addr = v.expect_value();
+                if local_addr % WORD_SIZE as u16 != 0 {
+                    return Err(AssemblyError::InvalidLocalWordIndex {
+                        span: instruction.span(),
+                        source_file: proc_ctx
+                            .source_manager()
+                            .get(proc_ctx.span().source_id())
+                            .ok(),
+                        local_addr,
+                    });
+                }
 
+                mem_ops::mem_write_imm(block_builder, proc_ctx, local_addr as u32, true, false)?
+            },
             Instruction::SysEvent(system_event) => {
                 block_builder.push_system_event(system_event.into())
             },
diff --git a/assembly/src/assembler/mod.rs b/assembly/src/assembler/mod.rs
index a5ead4f33d..1985241864 100644
--- a/assembly/src/assembler/mod.rs
+++ b/assembly/src/assembler/mod.rs
@@ -569,16 +569,24 @@ impl Assembler {
         let gid = proc_ctx.id();
         let num_locals = proc_ctx.num_locals();
 
+        // The number of locals must be a multiple of the word size to properly support reading
+        // and writing words.
+        if num_locals % WORD_SIZE as u16 != 0 {
+            return Err(AssemblyError::InvalidNumLocals {
+                span: proc_ctx.span(),
+                source_file: proc_ctx.source_manager().get(proc_ctx.span().source_id()).ok(),
+                num_locals,
+            })?;
+        }
+
         let wrapper_proc = self.module_graph.get_procedure_unsafe(gid);
         let proc = wrapper_proc.unwrap_ast().unwrap_procedure();
         let proc_body_id = if num_locals > 0 {
             // For procedures with locals, we need to update fmp register before and after the
             // procedure body is executed. Specifically:
-            // - to allocate procedure locals we need to increment fmp by 4 times the number of
-            //   locals, and
-            // - to deallocate procedure locals we need to decrement it by the same amount. We leave
-            //   4 elements between locals to properly support reading and writing words to locals.
-            let locals_frame = Felt::from(num_locals * WORD_SIZE as u16);
+            // - to allocate procedure locals we need to increment fmp by the number of locals, and
+            // - to deallocate procedure locals we need to decrement it by the same amount.
+            let locals_frame = Felt::from(num_locals);
             let wrapper = BodyWrapper {
                 prologue: vec![Operation::Push(locals_frame), Operation::FmpUpdate],
                 epilogue: vec![Operation::Push(-locals_frame), Operation::FmpUpdate],
diff --git a/assembly/src/errors.rs b/assembly/src/errors.rs
index bee90c05f1..ddda42c160 100644
--- a/assembly/src/errors.rs
+++ b/assembly/src/errors.rs
@@ -61,6 +61,24 @@ pub enum AssemblyError {
         source_file: Option<Arc<SourceFile>>,
         callee: QualifiedProcedureName,
     },
+    #[error("invalid number of declared local variables for procedure: {num_locals}")]
+    #[diagnostic(help("the number of local variables must be a multiple of 4"))]
+    InvalidNumLocals {
+        #[label]
+        span: SourceSpan,
+        #[source_code]
+        source_file: Option<Arc<SourceFile>>,
+        num_locals: u16,
+    },
+    #[error("invalid local word index: {local_addr}")]
+    #[diagnostic(help("the index to a local word must be a multiple of 4"))]
+    InvalidLocalWordIndex {
+        #[label]
+        span: SourceSpan,
+        #[source_code]
+        source_file: Option<Arc<SourceFile>>,
+        local_addr: u16,
+    },
     #[error("invalid use of 'caller' instruction outside of kernel")]
     #[diagnostic(help(
         "the 'caller' instruction is only allowed in procedures defined in a kernel"
diff --git a/assembly/src/tests.rs b/assembly/src/tests.rs
index f9d578e5bd..20383751ff 100644
--- a/assembly/src/tests.rs
+++ b/assembly/src/tests.rs
@@ -822,12 +822,12 @@ fn mem_operations_with_constants() -> TestResult {
     // Define constant values
     const PROC_LOC_STORE_PTR: u64 = 0;
     const PROC_LOC_LOAD_PTR: u64 = 1;
-    const PROC_LOC_STOREW_PTR: u64 = 2;
-    const PROC_LOC_LOADW_PTR: u64 = 3;
-    const GLOBAL_STORE_PTR: u64 = 4;
-    const GLOBAL_LOAD_PTR: u64 = 5;
-    const GLOBAL_STOREW_PTR: u64 = 6;
-    const GLOBAL_LOADW_PTR: u64 = 7;
+    const PROC_LOC_STOREW_PTR: u64 = 4;
+    const PROC_LOC_LOADW_PTR: u64 = 8;
+    const GLOBAL_STORE_PTR: u64 = 12;
+    const GLOBAL_LOAD_PTR: u64 = 13;
+    const GLOBAL_STOREW_PTR: u64 = 16;
+    const GLOBAL_LOADW_PTR: u64 = 20;
 
     let source = source_file!(
         &context,
@@ -842,7 +842,7 @@ fn mem_operations_with_constants() -> TestResult {
     const.GLOBAL_STOREW_PTR={GLOBAL_STOREW_PTR}
     const.GLOBAL_LOADW_PTR={GLOBAL_LOADW_PTR}
 
-    proc.test_const_loc.4
+    proc.test_const_loc.24
         # constant should resolve using locaddr operation
         locaddr.PROC_LOC_STORE_PTR
 
@@ -885,7 +885,7 @@ fn mem_operations_with_constants() -> TestResult {
         &context,
         format!(
             "\
-    proc.test_const_loc.4
+    proc.test_const_loc.24
         # constant should resolve using locaddr operation
         locaddr.{PROC_LOC_STORE_PTR}
 
@@ -1793,7 +1793,7 @@ fn program_with_proc_locals() -> TestResult {
     let source = source_file!(
         &context,
         "\
-        proc.foo.1 \
+        proc.foo.4 \
             loc_store.0 \
             add \
             loc_load.0 \
@@ -1814,12 +1814,12 @@ begin
         push(8)
         push(4)
         fmpupdate
-        pad
+        push(18446744069414584317)
         fmpadd
         mstore
         drop
         add
-        pad
+        push(18446744069414584317)
         fmpadd
         mload
         mul
@@ -2925,7 +2925,7 @@ fn test_reexported_proc_with_same_name_as_local_proc_diff_locals() {
     let mod1 = {
         let source = source_file!(
             &context,
-            "export.foo.2
+            "export.foo.8
                 push.1
                 drop
             end
@@ -2962,7 +2962,7 @@ fn test_reexported_proc_with_same_name_as_local_proc_diff_locals() {
     use.test::mod1
     use.test::mod2
 
-    proc.foo.1
+    proc.foo.4
         exec.mod1::foo
         exec.mod2::foo
     end
diff --git a/test-utils/src/lib.rs b/test-utils/src/lib.rs
index ae5b934873..d232e7b0cb 100644
--- a/test-utils/src/lib.rs
+++ b/test-utils/src/lib.rs
@@ -71,7 +71,7 @@ pub const U32_BOUND: u64 = u32::MAX as u64 + 1;
 
 /// A source code of the `truncate_stack` procedure.
 pub const TRUNCATE_STACK_PROC: &str = "
-proc.truncate_stack.1
+proc.truncate_stack.4
     loc_storew.0 dropw movupw.3
     sdepth neq.16
     while.true

From a21e20f1be306555ad2c6c87065f77bd07679192 Mon Sep 17 00:00:00 2001
From: Philippe Laferriere <plafer@protonmail.com>
Date: Tue, 7 Jan 2025 16:45:42 -0500
Subject: [PATCH 15/19] fix: fix stdlib after locals change

---
 assembly/src/ast/tests.rs                     |   2 +-
 miden/masm-examples/debug/debug.masm          |   4 +-
 .../integration/operations/io_ops/env_ops.rs  |  42 +-
 .../operations/io_ops/local_ops.rs            |  48 +-
 stdlib/asm/crypto/dsa/ecdsa/secp256k1.masm    | 148 ++--
 stdlib/asm/crypto/dsa/rpo_falcon512.masm      |  32 +-
 stdlib/asm/crypto/elgamal_ecgfp5.masm         |  10 +-
 stdlib/asm/crypto/fri/frie2f4.masm            |  18 +-
 stdlib/asm/crypto/hashes/blake3.masm          |  62 +-
 stdlib/asm/crypto/hashes/keccak256.masm       |  78 +-
 stdlib/asm/crypto/hashes/sha256.masm          |  96 +--
 stdlib/asm/math/ecgfp5/group.masm             | 272 +++----
 stdlib/asm/math/ecgfp5/scalar_field.masm      | 692 +++++++++---------
 stdlib/asm/math/secp256k1/base_field.masm     |  40 +-
 stdlib/asm/math/secp256k1/group.masm          | 524 ++++++-------
 stdlib/asm/math/secp256k1/scalar_field.masm   |  40 +-
 stdlib/asm/math/u256.masm                     |  84 +--
 stdlib/asm/sys.masm                           |   2 +-
 stdlib/tests/crypto/falcon.rs                 |   2 +-
 stdlib/tests/crypto/fri/remainder.rs          |  12 +-
 stdlib/tests/math/secp256k1/group.rs          | 130 ++--
 test-utils/src/test_builders.rs               |   4 +-
 22 files changed, 1171 insertions(+), 1171 deletions(-)

diff --git a/assembly/src/ast/tests.rs b/assembly/src/ast/tests.rs
index dd9e910297..a708143e2e 100644
--- a/assembly/src/ast/tests.rs
+++ b/assembly/src/ast/tests.rs
@@ -1120,7 +1120,7 @@ fn assert_parsing_line_invalid_op() {
         while.true
             push.5.7
             u32wrapping_add
-            loc_store.1
+            loc_store.4
             push.0
         end
 
diff --git a/miden/masm-examples/debug/debug.masm b/miden/masm-examples/debug/debug.masm
index 0c4c27f237..54649b27c6 100644
--- a/miden/masm-examples/debug/debug.masm
+++ b/miden/masm-examples/debug/debug.masm
@@ -2,7 +2,7 @@ proc.foo.3
     push.11
     loc_store.0
     push.101
-    loc_store.1
+    loc_store.4
 
     debug.local
     debug.local.1
@@ -16,7 +16,7 @@ proc.bar.4
     push.21
     loc_store.0
     push.121
-    loc_store.1
+    loc_store.4
     debug.local
     debug.local.2
 end
diff --git a/miden/tests/integration/operations/io_ops/env_ops.rs b/miden/tests/integration/operations/io_ops/env_ops.rs
index 475f58d87b..bd3552ca37 100644
--- a/miden/tests/integration/operations/io_ops/env_ops.rs
+++ b/miden/tests/integration/operations/io_ops/env_ops.rs
@@ -49,9 +49,9 @@ fn sdepth() {
 fn locaddr() {
     // --- locaddr returns expected address -------------------------------------------------------
     let source = "
-        proc.foo.2
+        proc.foo.8
             locaddr.0
-            locaddr.1
+            locaddr.4
         end
         begin
             exec.foo
@@ -59,19 +59,19 @@ fn locaddr() {
         end";
 
     let test = build_test!(source, &[10]);
-    test.expect_stack(&[FMP_MIN + 8, FMP_MIN + 4, 10]);
+    test.expect_stack(&[FMP_MIN + 4, FMP_MIN, 10]);
 
     // --- accessing mem via locaddr updates the correct variables --------------------------------
     let source = "
-        proc.foo.2
+        proc.foo.8
             locaddr.0
             mem_store
-            locaddr.1
+            locaddr.4
             mem_storew
             dropw
             loc_load.0
             push.0.0.0.0
-            loc_loadw.1
+            loc_loadw.4
         end
         begin
             exec.foo
@@ -86,15 +86,15 @@ fn locaddr() {
         "
         {TRUNCATE_STACK_PROC}
 
-        proc.foo.3
+        proc.foo.12
             locaddr.0
-            locaddr.1
-            locaddr.2
+            locaddr.4
+            locaddr.8
         end
-        proc.bar.2
+        proc.bar.8
             locaddr.0
             exec.foo
-            locaddr.1
+            locaddr.4
         end
         begin
             exec.bar
@@ -106,35 +106,35 @@ fn locaddr() {
 
     let test = build_test!(source, &[10]);
     test.expect_stack(&[
-        FMP_MIN + 12,
         FMP_MIN + 8,
         FMP_MIN + 4,
-        FMP_MIN + 8,
-        FMP_MIN + 20,
+        FMP_MIN,
+        FMP_MIN + 4,
         FMP_MIN + 16,
         FMP_MIN + 12,
-        FMP_MIN + 4,
+        FMP_MIN + 8,
+        FMP_MIN,
         10,
     ]);
 
     // --- accessing mem via locaddr in nested procedures updates the correct variables -----------
     let source = "
-        proc.foo.2
+        proc.foo.8
             locaddr.0
             mem_store
-            locaddr.1
+            locaddr.4
             mem_storew
             dropw
             push.0.0.0.0
-            loc_loadw.1
+            loc_loadw.4
             loc_load.0
         end
-        proc.bar.2
+        proc.bar.8
             locaddr.0
             mem_store
-            loc_store.1
+            loc_store.4
             exec.foo
-            locaddr.1
+            locaddr.4
             mem_load
             loc_load.0
         end
diff --git a/miden/tests/integration/operations/io_ops/local_ops.rs b/miden/tests/integration/operations/io_ops/local_ops.rs
index b5f753611f..25c021c4e3 100644
--- a/miden/tests/integration/operations/io_ops/local_ops.rs
+++ b/miden/tests/integration/operations/io_ops/local_ops.rs
@@ -6,7 +6,7 @@ use super::{build_test, TRUNCATE_STACK_PROC};
 #[test]
 fn push_local() {
     let source = "
-        proc.foo.1
+        proc.foo.4
             loc_load.0
         end
 
@@ -31,11 +31,11 @@ fn push_local() {
 fn pop_local() {
     // --- test write to local memory -------------------------------------------------------------
     let source = "
-        proc.foo.2
+        proc.foo.8
             loc_store.0
-            loc_store.1
+            loc_store.4
             loc_load.0
-            loc_load.1
+            loc_load.4
         end
         begin
             exec.foo
@@ -47,7 +47,7 @@ fn pop_local() {
 
     // --- test existing memory is not affected ---------------------------------------------------
     let source = "
-        proc.foo.1
+        proc.foo.4
             loc_store.0
         end
         begin
@@ -67,7 +67,7 @@ fn pop_local() {
 #[test]
 fn loadw_local() {
     let source = "
-        proc.foo.1
+        proc.foo.4
             loc_loadw.0
         end
         begin
@@ -93,15 +93,15 @@ fn storew_local() {
         "
         {TRUNCATE_STACK_PROC}
 
-        proc.foo.2
+        proc.foo.8
             loc_storew.0
             swapw
-            loc_storew.1
+            loc_storew.4
             swapw
             push.0.0.0.0
             loc_loadw.0
             push.0.0.0.0
-            loc_loadw.1
+            loc_loadw.4
         end
         begin
             exec.foo
@@ -115,7 +115,7 @@ fn storew_local() {
 
     // --- test existing memory is not affected ---------------------------------------------------
     let source = "
-        proc.foo.1
+        proc.foo.8
             loc_storew.0
         end
         begin
@@ -138,7 +138,7 @@ fn storew_local() {
 fn inverse_operations() {
     // --- pop and push are inverse operations, so the stack should be left unchanged -------------
     let source = "
-        proc.foo.1
+        proc.foo.4
             loc_store.0
             loc_load.0
         end
@@ -156,7 +156,7 @@ fn inverse_operations() {
 
     // --- popw and pushw are inverse operations, so the stack should be left unchanged -----------
     let source = "
-        proc.foo.1
+        proc.foo.4
             loc_storew.0
             dropw
             push.0.0.0.0
@@ -176,7 +176,7 @@ fn inverse_operations() {
 
     // --- storew and loadw are inverse operations, so the stack should be left unchanged ---------
     let source = "
-        proc.foo.1
+        proc.foo.4
             loc_storew.0
             loc_loadw.0
         end
@@ -196,7 +196,7 @@ fn inverse_operations() {
 fn read_after_write() {
     // --- write to memory first, then test read with push --------------------------------------
     let source = "
-        proc.foo.1
+        proc.foo.4
             loc_storew.0
             loc_load.0
         end
@@ -210,7 +210,7 @@ fn read_after_write() {
 
     // --- write to memory first, then test read with pushw --------------------------------------
     let source = "
-        proc.foo.1
+        proc.foo.4
             loc_storew.0
             push.0.0.0.0
             loc_loadw.0
@@ -225,7 +225,7 @@ fn read_after_write() {
 
     // --- write to memory first, then test read with loadw --------------------------------------
     let source = "
-        proc.foo.1
+        proc.foo.4
             loc_storew.0
             dropw
             loc_loadw.0
@@ -242,11 +242,11 @@ fn read_after_write() {
 fn nested_procedures() {
     // --- test nested procedures - pop/push ------------------------------------------------------
     let source = "
-        proc.foo.1
+        proc.foo.4
             loc_store.0
         end
 
-        proc.bar.1
+        proc.bar.4
             loc_store.0
             exec.foo
             loc_load.0
@@ -263,11 +263,11 @@ fn nested_procedures() {
 
     // --- test nested procedures - popw/pushw ----------------------------------------------------
     let source = "
-        proc.foo.1
+        proc.foo.4
             loc_storew.0
             dropw
         end
-        proc.bar.1
+        proc.bar.4
             loc_storew.0
             dropw
             exec.foo
@@ -285,11 +285,11 @@ fn nested_procedures() {
 
     // --- test nested procedures - storew/loadw --------------------------------------------------
     let source = "
-        proc.foo.1
+        proc.foo.4
             push.0 push.0
             loc_storew.0
         end
-        proc.bar.1
+        proc.bar.4
             loc_storew.0
             exec.foo
             loc_loadw.0
@@ -308,9 +308,9 @@ fn nested_procedures() {
 fn free_memory_pointer() {
     // ensure local procedure memory doesn't overwrite memory from outer scope
     let source = "
-        proc.bar.2
+        proc.bar.8
             loc_store.0
-            loc_store.1
+            loc_store.4
         end
         begin
             mem_store.0
diff --git a/stdlib/asm/crypto/dsa/ecdsa/secp256k1.masm b/stdlib/asm/crypto/dsa/ecdsa/secp256k1.masm
index 20b7837316..7fba59328d 100644
--- a/stdlib/asm/crypto/dsa/ecdsa/secp256k1.masm
+++ b/stdlib/asm/crypto/dsa/ecdsa/secp256k1.masm
@@ -25,31 +25,31 @@ use.std::math::secp256k1::group
 #! If verification fails, program execution will be aborted.
 #!
 #! See https://github.com/itzmeanjan/secp256k1/blob/37b339db3e03d24c2977399eb8896ef515ebb09b/ecdsa/verify.py#L11-L45
-export.verify.24
+export.verify.96
     # cache pub_key
     loc_storew.0
     dropw
-    loc_storew.1
+    loc_storew.4
     dropw
-    loc_storew.2
+    loc_storew.8
     dropw
-    loc_storew.3
+    loc_storew.12
     dropw
-    loc_storew.4
+    loc_storew.16
     dropw
-    loc_storew.5
+    loc_storew.20
     dropw
 
     # cache h
-    loc_storew.6
+    loc_storew.24
     dropw
-    loc_storew.7
+    loc_storew.28
     dropw
 
     # cache r
-    loc_storew.8
+    loc_storew.32
     dropw
-    loc_storew.9
+    loc_storew.36
     dropw
 
     # Only s lives on stack
@@ -64,45 +64,45 @@ export.verify.24
     push.0.0.0.0.0.0.0.0
 
     # load h
-    loc_loadw.7
+    loc_loadw.28
     swapw
-    loc_loadw.6
+    loc_loadw.24
 
     # compute h * s^-1
     exec.scalar_field::mul
     exec.scalar_field::from_mont
 
     # cache h * s^-1
-    loc_storew.6
+    loc_storew.24
     swapw
-    loc_storew.7
+    loc_storew.28
 
     # load r
-    loc_loadw.9
+    loc_loadw.36
     swapw
-    loc_loadw.8
+    loc_loadw.32
 
     # compute r * s^-1
     exec.scalar_field::mul
     exec.scalar_field::from_mont
 
     # cache r * s^-1
-    loc_storew.10
+    loc_storew.40
     dropw
-    loc_storew.11
+    loc_storew.44
     dropw
 
-    locaddr.17
-    locaddr.16
-    locaddr.15
-    locaddr.14
-    locaddr.13
-    locaddr.12
+    locaddr.68
+    locaddr.64
+    locaddr.60
+    locaddr.56
+    locaddr.52
+    locaddr.48
 
     push.0.0.0.0.0.0.0.0
-    loc_loadw.7
+    loc_loadw.28
     swapw
-    loc_loadw.6
+    loc_loadw.24
 
     # compute G * ((h * s^-1) mod N) = P0
     exec.group::gen_mul
@@ -111,47 +111,47 @@ export.verify.24
 
     movup.4
     mem_loadw
-    loc_storew.12
+    loc_storew.48
 
     movup.4
     mem_loadw
-    loc_storew.13
+    loc_storew.52
 
     movup.4
     mem_loadw
-    loc_storew.14
+    loc_storew.56
 
     movup.4
     mem_loadw
-    loc_storew.15
+    loc_storew.60
 
     movup.4
     mem_loadw
-    loc_storew.16
+    loc_storew.64
 
     movup.4
     mem_loadw
-    loc_storew.17
+    loc_storew.68
 
     dropw
 
-    locaddr.23
-    locaddr.22
-    locaddr.21
-    locaddr.20
-    locaddr.19
-    locaddr.18
+    locaddr.92
+    locaddr.88
+    locaddr.84
+    locaddr.80
+    locaddr.76
+    locaddr.72
 
     push.0.0.0.0.0.0.0.0
-    loc_loadw.11
+    loc_loadw.44
     swapw
-    loc_loadw.10
+    loc_loadw.40
 
-    locaddr.5
+    locaddr.20
+    locaddr.16
+    locaddr.12
+    locaddr.8
     locaddr.4
-    locaddr.3
-    locaddr.2
-    locaddr.1
     locaddr.0
 
     # compute pkey * ((r * s^-1) mod N) = P1
@@ -165,45 +165,45 @@ export.verify.24
 
     movup.4
     mem_loadw
-    loc_storew.1
+    loc_storew.4
 
     movup.4
     mem_loadw
-    loc_storew.2
+    loc_storew.8
 
     movup.4
     mem_loadw
-    loc_storew.3
+    loc_storew.12
 
     movup.4
     mem_loadw
-    loc_storew.4
+    loc_storew.16
 
     movup.4
     mem_loadw
-    loc_storew.5
+    loc_storew.20
 
     dropw
 
-    locaddr.23
-    locaddr.22
-    locaddr.21
-    locaddr.20
-    locaddr.19
-    locaddr.18
+    locaddr.92
+    locaddr.88
+    locaddr.84
+    locaddr.80
+    locaddr.76
+    locaddr.72
+
+    locaddr.68
+    locaddr.64
+    locaddr.60
+    locaddr.56
+    locaddr.52
+    locaddr.48
 
-    locaddr.17
+    locaddr.20
     locaddr.16
-    locaddr.15
-    locaddr.14
-    locaddr.13
     locaddr.12
-
-    locaddr.5
+    locaddr.8
     locaddr.4
-    locaddr.3
-    locaddr.2
-    locaddr.1
     locaddr.0
 
     # compute P0 + P1 = P2
@@ -217,33 +217,33 @@ export.verify.24
 
     movup.4
     mem_loadw
-    loc_storew.1
+    loc_storew.4
 
     movup.4
     mem_loadw
-    loc_storew.2
+    loc_storew.8
 
     movup.4
     mem_loadw
-    loc_storew.3
+    loc_storew.12
 
     movup.4
     mem_loadw
-    loc_storew.4
+    loc_storew.16
 
     movup.4
     mem_loadw
-    loc_storew.5
+    loc_storew.20
 
     push.0.0.0.0
-    loc_loadw.5
+    loc_loadw.20
     swapw
-    loc_loadw.4
+    loc_loadw.16
 
     exec.base_field::inv
 
     push.0.0.0.0.0.0.0.0
-    loc_loadw.1
+    loc_loadw.4
     swapw
     loc_loadw.0
 
@@ -252,9 +252,9 @@ export.verify.24
     exec.base_field::from_mont
 
     push.0.0.0.0.0.0.0.0
-    loc_loadw.9
+    loc_loadw.36
     swapw
-    loc_loadw.8
+    loc_loadw.32
 
     # compute r ( in radix-2^32 form )
     exec.scalar_field::from_mont
diff --git a/stdlib/asm/crypto/dsa/rpo_falcon512.masm b/stdlib/asm/crypto/dsa/rpo_falcon512.masm
index 037ceb3d50..a5fa681363 100644
--- a/stdlib/asm/crypto/dsa/rpo_falcon512.masm
+++ b/stdlib/asm/crypto/dsa/rpo_falcon512.masm
@@ -76,19 +76,19 @@ end
 #! Output: [...]
 #!
 #! Cycles: 1327
-export.hash_to_point.2
+export.hash_to_point.8
     # Move pointer out of the way
     movdn.12
 
     # Store MSG for later absorption
-    loc_storew.1 dropw
+    loc_storew.4 dropw
 
     # Absorb the nonce
     padw movdnw.2
     hperm
 
     # Absorb the message
-    swapw loc_loadw.1 swapw hperm
+    swapw loc_loadw.4 swapw hperm
 
     # Squeeze the coefficients and save them
     repeat.63
@@ -176,7 +176,7 @@ end
 #! Output: [tau1, tau0, ptr + 512*4 ...]
 #!
 #! Cycles: 5049
-export.load_h_s2_and_product.1
+export.load_h_s2_and_product.4
 
     # 1) Store PK for later comparison
     movdn.4
@@ -244,7 +244,7 @@ end
 #! Output: [...]
 #!
 #! Cycles: 2504
-export.probabilistic_product.4
+export.probabilistic_product.16
     # 1) Save the pointers
     push.0 movdn.3
     loc_storew.0
@@ -266,7 +266,7 @@ export.probabilistic_product.4
     end
 
     # Save the evaluation h(tau)
-    swapdw loc_storew.1
+    swapdw loc_storew.4
     dropw
     #=> [X, X, X, ...]
 
@@ -289,7 +289,7 @@ export.probabilistic_product.4
     end
 
     # Save the evaluation of s2(tau)
-    swapdw loc_storew.2
+    swapdw loc_storew.8
     dropw
     #=> [X, X, X, ...]
 
@@ -320,7 +320,7 @@ export.probabilistic_product.4
     # where (ev0, ev1) := pi1(tau)
 
     # Save pi_1(tau)
-    swapw.2 loc_storew.3
+    swapw.2 loc_storew.12
 
     # Setup the pointers
     swapw.3 loc_loadw.0 add.1536
@@ -349,7 +349,7 @@ export.probabilistic_product.4
     #=> [a1, a0, X, x, x, X, ...]
 
     # Compute (res0, res1) := pi(tau)
-    swapw loc_loadw.3 drop drop
+    swapw loc_loadw.12 drop drop
     ext2add
     #=> [res1, res0, x, x, X, ...]
 
@@ -358,11 +358,11 @@ export.probabilistic_product.4
 
     ## a) Load h(tau)
     swapw
-    loc_loadw.1
+    loc_loadw.4
 
     ## b) Load s2(tau)
     push.0.0
-    loc_loadw.2
+    loc_loadw.8
 
     ## c) compute the product
     drop drop
@@ -593,7 +593,7 @@ end
 #! Output: [...]
 #!
 #! Cycles: ~ 92029
-export.verify.1665
+export.verify.6660
 
     # 1) Generate a Falcon signature using the secret key associated to PK on message MSG.
     adv.push_sig.rpo_falcon512
@@ -631,8 +631,8 @@ export.verify.1665
     # 5) Check that we indeed have pi := h * s2 in Z_Q[x] by checking that pi(tau) = h(tau) * s2(tau)
     # where tau is a random (Fiat-Shamir) challenge resulting from hashing h, s2 and pi.
 
-    locaddr.512     # tau_ptr
-    locaddr.1025    # z_ptr
+    locaddr.2048     # tau_ptr
+    locaddr.4100    # z_ptr
     locaddr.0       # h ptr
     #=> [h_ptr, zeros_ptr, tau_ptr, ...]
 
@@ -641,7 +641,7 @@ export.verify.1665
 
     # 6) Compute the squared norm of s1 := c - h * s2 (in Z_q[x]/(phi))
 
-    locaddr.256
+    locaddr.1024
     #=> [pi_ptr, ...]
 
     exec.compute_s1_norm_sq
@@ -649,7 +649,7 @@ export.verify.1665
 
     # 7) Compute the squared norm of s2
 
-    locaddr.128
+    locaddr.512
     #=> [s2_ptr, norm_sq(s1), ...]
 
     exec.compute_s2_norm_sq
diff --git a/stdlib/asm/crypto/elgamal_ecgfp5.masm b/stdlib/asm/crypto/elgamal_ecgfp5.masm
index d3b51ba2bc..0a2b6f0c83 100644
--- a/stdlib/asm/crypto/elgamal_ecgfp5.masm
+++ b/stdlib/asm/crypto/elgamal_ecgfp5.masm
@@ -3,7 +3,7 @@ use.std::math::ecgfp5::group
 #! Generates the public key, point H
 #! the private key is expected as input and is a 319-bit random
 #! number of 10 32-bit limbs.
-export.gen_privatekey.8
+export.gen_privatekey.32
     exec.group::gen_mul
 end
 
@@ -14,7 +14,7 @@ end
 #! [r0, r1, ..., r9]
 #! Final stack state
 #! [Ca_x0, ..., C_x4, Ca_y0, ..., Ca_y4, Ca_inf]
-export.encrypt_ca.8
+export.encrypt_ca.32
     exec.group::gen_mul
 end
 
@@ -28,7 +28,7 @@ end
 #! [H_x0, ..., H_x4, H_y0, ..., H_y4, H_inf, r0, r1, ..., M_x0, ..., M_x4, M_y0, ..., M_y4, M_inf,]
 #! Final stack state
 #! [Cb_x0, ..., Cb_x4, Cb_y0, ..., Cb_y4, Cb_inf]
-export.encrypt_cb.20
+export.encrypt_cb.80
     exec.group::mul
     exec.group::add
 end
@@ -40,7 +40,7 @@ end
 #!
 #! Final stack state
 #! [C'a_x0, ..., C'a_x4, C'a_y0, ..., C'a_y4, C'a_inf]
-export.remask_ca.5
+export.remask_ca.20
     exec.group::gen_mul
     exec.group::add
 end
@@ -52,7 +52,7 @@ end
 #!
 #! Final stack state
 #! [C'b_x0, ..., C'b_x4, C'b_y0, ..., C'b_y4, C'b_inf]
-export.remask_cb.14
+export.remask_cb.56
     exec.group::mul
     exec.group::add
 end
diff --git a/stdlib/asm/crypto/fri/frie2f4.masm b/stdlib/asm/crypto/fri/frie2f4.masm
index 3cd725b4e8..da054ed154 100644
--- a/stdlib/asm/crypto/fri/frie2f4.masm
+++ b/stdlib/asm/crypto/fri/frie2f4.masm
@@ -6,8 +6,8 @@
 #!  g being the initial domain generator.
 #! TODO: This pre-processing function should in fact compute d_size and t_depth for each C
 #! starting from the original domain size.
-export.preprocess.4
-    locaddr.3
+export.preprocess.16
+    locaddr.12
     adv_push.1                  #[num_queries, query_ptr, g, ..]
     sub.1
     push.0.0.0.0
@@ -80,7 +80,7 @@ export.preprocess.4
     dropw drop drop
 
     swap
-    locaddr.3
+    locaddr.12
     #=> [query_ptr, layer_ptr, remainder_ptr, g]
 end
 
@@ -91,7 +91,7 @@ end
 #! Output: [layer_ptr+8, layer_ptr+8, poe^4, f_pos, ne1, ne0, layer_ptr+8, rem_ptr, x, x, x, x, x, x, x, x, ...]
 #!
 #! Cycles: 76
-export.verify_query_layer.3
+export.verify_query_layer.12
 
     # load layer commitment C as well as [a0, a1, t_depth, d_size] (7 cycles)
     swapdw
@@ -118,9 +118,9 @@ export.verify_query_layer.3
     # => [V, C, f_pos, d_seg, poe, e1, e0, a1, a0, layer_ptr, rem_ptr, ...]
     # where f_pos = p % d_size and d_seg = p / 4
 
-    # unhash V and save the pre-image in locaddr.0 and locaddr.1; we don't clear values of C
+    # unhash V and save the pre-image in locaddr.4 and locaddr.8; we don't clear values of C
     # because adv_pipe overwrites the first 8 elements of the stack (15 cycles)
-    locaddr.1
+    locaddr.4
     movdn.4
     push.0.0.0.0
     swapw
@@ -141,9 +141,9 @@ export.verify_query_layer.3
     assert_eq
 
     # load (v7, ..v0) from memory (8 cycles)
-    loc_loadw.1
+    loc_loadw.4
     swapw
-    loc_loadw.2
+    loc_loadw.8
     # => [v7, ..., v0, f_pos, d_seg, poe, e1, e0, a1, a0, layer_ptr, rem_ptr, ...]
 
     # fold by 4 (1 cycle)
@@ -258,7 +258,7 @@ end
 #! 2. layer_ptr - 1 points to the last (e0, e1, p, poe) tuple.
 #!
 #! Cycles: 7 + 4 + num_queries * (42 + num_layers * 76 + 26)
-export.verify.1
+export.verify.4
 
     # store [query_ptr, layer_ptr, rem_ptr, g] to keep track of all queries
     # (3 cycles)
diff --git a/stdlib/asm/crypto/hashes/blake3.masm b/stdlib/asm/crypto/hashes/blake3.masm
index 326023d1c6..1290b68df6 100644
--- a/stdlib/asm/crypto/hashes/blake3.masm
+++ b/stdlib/asm/crypto/hashes/blake3.masm
@@ -177,7 +177,7 @@ end
 #! [state0, state1, state2, state3, state4, state5, state6, state7, state8, state9, state10, state11, state12, state13, state14, state15]
 #!
 #! i.e. whole blake3 state is placed on stack ( in order ).
-proc.columnar_mixing.1
+proc.columnar_mixing.4
     swapw.2
     swapw
 
@@ -416,7 +416,7 @@ end
 #! [state0, state1, state2, state3, state4, state5, state6, state7, state8, state9, state10, state11, state12, state13, state14, state15]
 #!
 #! i.e. whole blake3 state is placed on stack ( in order ).
-proc.diagonal_mixing.1
+proc.diagonal_mixing.4
     swapw.2
     swapw
 
@@ -658,24 +658,24 @@ end
 #!
 #! i.e. mixed state matrix lives in memory addresses {state0_3_addr, state4_7_addr, state8_11_addr, state12_15_addr},
 #! which were provided, on stack top, while invoking this routine.
-proc.round.5
+proc.round.20
     loc_storew.0
 
     exec.columnar_mixing
 
-    loc_storew.1
+    loc_storew.4
     dropw
-    loc_storew.2
+    loc_storew.8
     dropw
-    loc_storew.3
+    loc_storew.12
     dropw
-    loc_storew.4
+    loc_storew.16
     dropw
 
+    locaddr.16
+    locaddr.12
+    locaddr.8
     locaddr.4
-    locaddr.3
-    locaddr.2
-    locaddr.1
 
     exec.diagonal_mixing
 
@@ -721,7 +721,7 @@ end
 #! i.e. 7 -round mixed state matrix lives in memory addresses {state0_3_addr, state4_7_addr, state8_11_addr, state12_15_addr},
 #! which were provided, on stack top, while invoking this routine. So updated state matrix can be read by caller routine, by reading
 #! the content of memory addresses where state was provided as routine input.
-proc.compress.1
+proc.compress.4
     loc_storew.0
     dropw
 
@@ -757,10 +757,10 @@ end
 #! [dig0, dig1, dig2, dig3, dig4, dig5, dig6, dig7, ...]
 #!
 #! dig`i` -> 32 -bit digest word | i ∈ [0, 8)
-export.hash_2to1.4
-    locaddr.3
-    locaddr.2
-    locaddr.1
+export.hash_2to1.16
+    locaddr.12
+    locaddr.8
+    locaddr.4
     locaddr.0
 
     exec.initialize_2to1
@@ -769,19 +769,19 @@ export.hash_2to1.4
     # block ( = 64 -bytes ) because what we're doing here is 2-to-1 hashing i.e. 64 -bytes
     # input being converted to 32 -bytes output
 
-    locaddr.3
-    locaddr.2
-    locaddr.1
+    locaddr.12
+    locaddr.8
+    locaddr.4
     locaddr.0
 
     exec.compress
 
     push.0.0.0.0
-    loc_loadw.3
+    loc_loadw.12
     push.0.0.0.0
-    loc_loadw.2
+    loc_loadw.8
     push.0.0.0.0
-    loc_loadw.1
+    loc_loadw.4
     push.0.0.0.0
     loc_loadw.0
 
@@ -801,15 +801,15 @@ end
 #! [dig0, dig1, dig2, dig3, dig4, dig5, dig6, dig7, ...]
 #!
 #! dig`i` -> 32 -bit digest word | i ∈ [0, 8)
-export.hash_1to1.4
+export.hash_1to1.16
     # Pad 32 -bytes input message with zero bytes to make
     # 64 -bytes, which is processed same as 2-to-1 hashing
     push.0.0.0.0.0.0.0.0
     swapdw
 
-    locaddr.3
-    locaddr.2
-    locaddr.1
+    locaddr.12
+    locaddr.8
+    locaddr.4
     locaddr.0
 
     exec.initialize_1to1
@@ -817,19 +817,19 @@ export.hash_1to1.4
     # Note, chunk compression routine needs to compress only one chunk with one message
     # block ( = 64 -bytes ), which is obtained by padding 32 -bytes input.
 
-    locaddr.3
-    locaddr.2
-    locaddr.1
+    locaddr.12
+    locaddr.8
+    locaddr.4
     locaddr.0
 
     exec.compress
 
     push.0.0.0.0
-    loc_loadw.3
+    loc_loadw.12
     push.0.0.0.0
-    loc_loadw.2
+    loc_loadw.8
     push.0.0.0.0
-    loc_loadw.1
+    loc_loadw.4
     push.0.0.0.0
     loc_loadw.0
 
diff --git a/stdlib/asm/crypto/hashes/keccak256.masm b/stdlib/asm/crypto/hashes/keccak256.masm
index 59e407affd..c9547b7d24 100644
--- a/stdlib/asm/crypto/hashes/keccak256.masm
+++ b/stdlib/asm/crypto/hashes/keccak256.masm
@@ -15,7 +15,7 @@
 #! s.t. last two elements of 12 -th ( when indexed from zero ) memory address are zeroed.
 #!
 #! Consecutive memory addresses can be computed by repeated application of `add.4`.
-proc.theta.3
+proc.theta.12
     dup
     locaddr.0
     mem_store
@@ -235,7 +235,7 @@ proc.theta.3
 
     # stack = [c0, c1, c2, c3]
 
-    locaddr.1
+    locaddr.4
     mem_storew
     dropw
 
@@ -459,7 +459,7 @@ proc.theta.3
 
     # stack = [c4, c5, c6, c7]
 
-    locaddr.2
+    locaddr.8
     mem_storew
     dropw
 
@@ -573,11 +573,11 @@ proc.theta.3
 
     # stack = [c8, c9]
 
-    locaddr.2
+    locaddr.8
     push.0.0.0.0
     movup.4
     mem_loadw
-    locaddr.1
+    locaddr.4
     push.0.0.0.0
     movup.4
     mem_loadw
@@ -1055,7 +1055,7 @@ end
 #! s.t. last two elements of 12 -th ( when indexed from zero ) memory address are zeroed.
 #!
 #! Consecutive memory addresses can be computed by repeated application of `add.4`.
-proc.rho.1
+proc.rho.4
     dup
     locaddr.0
     mem_store
@@ -1340,12 +1340,12 @@ end
 #! s.t. last two elements of 12 -th ( when indexed from zero ) memory address are zeroed.
 #!
 #! Consecutive memory addresses can be computed by repeated application of `add.4`.
-proc.pi.14
+proc.pi.56
     dup
     locaddr.0
     mem_store
 
-    locaddr.1
+    locaddr.4
     swap
     push.0.0.0.0
 
@@ -1759,7 +1759,7 @@ end
 #! s.t. last two elements of 12 -th ( when indexed from zero ) memory address are zeroed.
 #!
 #! Consecutive memory addresses can be computed by repeated application of `add.4`.
-proc.chi.4
+proc.chi.16
     dup
     locaddr.0
     mem_store
@@ -1815,7 +1815,7 @@ proc.chi.4
     movup.3
     movup.3
 
-    locaddr.1
+    locaddr.4
     mem_storew
 
     dup.4
@@ -1877,7 +1877,7 @@ proc.chi.4
     movup.7
     movup.7
 
-    locaddr.2
+    locaddr.8
     mem_storew
     dropw
 
@@ -1901,7 +1901,7 @@ proc.chi.4
     dup.4
     mem_loadw
 
-    locaddr.1
+    locaddr.4
     push.0.0.0.0
     movup.4
     mem_loadw
@@ -1934,7 +1934,7 @@ proc.chi.4
     dup.4
     mem_loadw
 
-    locaddr.2
+    locaddr.8
     push.0.0.0.0
     movup.4
     mem_loadw
@@ -2001,7 +2001,7 @@ proc.chi.4
     swap
 
     push.0.0
-    locaddr.1
+    locaddr.4
     mem_storew
 
     movup.6
@@ -2044,7 +2044,7 @@ proc.chi.4
     movup.3
     movup.3
 
-    locaddr.2
+    locaddr.8
     mem_storew
 
     movup.6
@@ -2105,7 +2105,7 @@ proc.chi.4
     movup.3
     movup.3
 
-    locaddr.3
+    locaddr.12
     mem_storew
 
     locaddr.0
@@ -2117,7 +2117,7 @@ proc.chi.4
     mem_loadw
 
     push.0.0.0.0
-    loc_loadw.1
+    loc_loadw.4
 
     movup.4
     u32xor
@@ -2148,7 +2148,7 @@ proc.chi.4
     mem_loadw
 
     push.0.0.0.0
-    loc_loadw.2
+    loc_loadw.8
 
     movup.4
     u32xor
@@ -2179,7 +2179,7 @@ proc.chi.4
     mem_loadw
 
     push.0.0.0.0
-    loc_loadw.3
+    loc_loadw.12
 
     movup.4
     u32xor
@@ -2255,7 +2255,7 @@ proc.chi.4
     movup.3
     movup.3
 
-    loc_storew.1
+    loc_storew.4
 
     movup.6
     add.4
@@ -2317,7 +2317,7 @@ proc.chi.4
     movup.7
     movup.7
 
-    loc_storew.2
+    loc_storew.8
     dropw
 
     u32not
@@ -2336,13 +2336,13 @@ proc.chi.4
     movdn.3
     movdn.3
 
-    loc_storew.3
+    loc_storew.12
 
     dup.4
     mem_loadw
 
     push.0.0.0.0
-    loc_loadw.1
+    loc_loadw.4
 
     movup.4
     u32xor
@@ -2373,7 +2373,7 @@ proc.chi.4
     mem_loadw
 
     push.0.0.0.0
-    loc_loadw.2
+    loc_loadw.8
 
     movup.4
     u32xor
@@ -2404,7 +2404,7 @@ proc.chi.4
     mem_loadw
 
     push.0.0.0.0
-    loc_loadw.3
+    loc_loadw.12
 
     movup.4
     u32xor
@@ -2451,7 +2451,7 @@ proc.chi.4
     swap
 
     push.0.0
-    loc_storew.1
+    loc_storew.4
 
     movup.6
     add.4
@@ -2499,7 +2499,7 @@ proc.chi.4
     movup.3
     movup.3
 
-    loc_storew.2
+    loc_storew.8
 
     movup.6
     sub.8
@@ -2561,7 +2561,7 @@ proc.chi.4
     movup.3
     movup.3
 
-    loc_storew.3
+    loc_storew.12
 
     movup.4
     sub.4
@@ -2571,7 +2571,7 @@ proc.chi.4
     mem_loadw
 
     push.0.0.0.0
-    loc_loadw.1
+    loc_loadw.4
 
     movup.4
     u32xor
@@ -2602,7 +2602,7 @@ proc.chi.4
     mem_loadw
 
     push.0.0.0.0
-    loc_loadw.2
+    loc_loadw.8
 
     movup.4
     u32xor
@@ -2633,7 +2633,7 @@ proc.chi.4
     mem_loadw
 
     push.0.0.0.0
-    loc_loadw.3
+    loc_loadw.12
 
     movup.4
     u32xor
@@ -2715,7 +2715,7 @@ proc.chi.4
     movup.3
     movup.3
 
-    loc_storew.1
+    loc_storew.4
 
     movup.6
     add.4
@@ -2777,7 +2777,7 @@ proc.chi.4
     movup.7
     movup.7
 
-    loc_storew.2
+    loc_storew.8
     dropw
 
     u32not
@@ -2796,13 +2796,13 @@ proc.chi.4
     movdn.3
     movdn.3
 
-    loc_storew.3
+    loc_storew.12
 
     dup.4
     mem_loadw
 
     push.0.0.0.0
-    loc_loadw.1
+    loc_loadw.4
 
     movup.4
     u32xor
@@ -2833,7 +2833,7 @@ proc.chi.4
     mem_loadw
 
     push.0.0.0.0
-    loc_loadw.2
+    loc_loadw.8
 
     movup.4
     u32xor
@@ -2864,7 +2864,7 @@ proc.chi.4
     mem_loadw
 
     push.0.0.0.0
-    loc_loadw.3
+    loc_loadw.12
 
     movup.4
     u32xor
@@ -3485,7 +3485,7 @@ end
 #! [oword0, oword1, oword2, oword3, oword4, oword5, oword6, oword7, ... ]
 #!
 #! See https://github.com/itzmeanjan/merklize-sha/blob/1d35aae9da7fed20127489f362b4bc93242a516c/include/keccak_256.hpp#L232-L257
-export.hash.13
+export.hash.52
     # prapare keccak256 state from input message
     locaddr.0
     exec.to_state_array
@@ -3496,7 +3496,7 @@ export.hash.13
 
     # prapare keccak256 digest from state
     push.0.0.0.0
-    loc_loadw.1
+    loc_loadw.4
     push.0.0.0.0
     loc_loadw.0
     exec.to_digest
diff --git a/stdlib/asm/crypto/hashes/sha256.masm b/stdlib/asm/crypto/hashes/sha256.masm
index efbf6e139a..6cce695ddd 100644
--- a/stdlib/asm/crypto/hashes/sha256.masm
+++ b/stdlib/asm/crypto/hashes/sha256.masm
@@ -233,12 +233,12 @@ end
 #! - msg0 through msg15 are the 64 -bytes input message (in terms of 16 SHA256 words)
 #! See https://github.com/itzmeanjan/merklize-sha/blob/8a2c006/include/sha2.hpp#L89-L113
 #! & https://github.com/itzmeanjan/merklize-sha/blob/8a2c006/include/sha2_256.hpp#L148-L187 ( loop body execution )
-proc.prepare_message_schedule_and_consume.4
+proc.prepare_message_schedule_and_consume.16
     loc_storew.0
-    loc_storew.2
+    loc_storew.8
     dropw
-    loc_storew.1
-    loc_storew.3
+    loc_storew.4
+    loc_storew.12
     dropw
 
     dup.15
@@ -282,7 +282,7 @@ proc.prepare_message_schedule_and_consume.4
 
     push.0x428a2f98
     push.0.0.0.0
-    loc_loadw.1
+    loc_loadw.4
     push.0.0.0.0
     loc_loadw.0
     exec.consume_message_word # consume msg[0]
@@ -301,7 +301,7 @@ proc.prepare_message_schedule_and_consume.4
 
     loc_storew.0
     dropw
-    loc_storew.1
+    loc_storew.4
     dropw
 
     dup.15
@@ -343,7 +343,7 @@ proc.prepare_message_schedule_and_consume.4
 
     push.0x3956c25b
     push.0.0.0.0
-    loc_loadw.1
+    loc_loadw.4
     push.0.0.0.0
     loc_loadw.0
     exec.consume_message_word # consume msg[4]
@@ -362,7 +362,7 @@ proc.prepare_message_schedule_and_consume.4
 
     loc_storew.0
     dropw
-    loc_storew.1
+    loc_storew.4
     dropw
 
     dup.6
@@ -400,7 +400,7 @@ proc.prepare_message_schedule_and_consume.4
 
     push.0xd807aa98
     push.0.0.0.0
-    loc_loadw.1
+    loc_loadw.4
     push.0.0.0.0
     loc_loadw.0
     exec.consume_message_word # consume msg[8]
@@ -419,7 +419,7 @@ proc.prepare_message_schedule_and_consume.4
 
     loc_storew.0
     dropw
-    loc_storew.1
+    loc_storew.4
     dropw
 
     movupw.3
@@ -461,7 +461,7 @@ proc.prepare_message_schedule_and_consume.4
 
     push.0x72be5d74
     push.0.0.0.0
-    loc_loadw.1
+    loc_loadw.4
     push.0.0.0.0
     loc_loadw.0
     exec.consume_message_word # consume msg[12]
@@ -480,7 +480,7 @@ proc.prepare_message_schedule_and_consume.4
 
     loc_storew.0
     dropw
-    loc_storew.1
+    loc_storew.4
     dropw
 
     movupw.3
@@ -522,7 +522,7 @@ proc.prepare_message_schedule_and_consume.4
 
     push.0xe49b69c1
     push.0.0.0.0
-    loc_loadw.1
+    loc_loadw.4
     push.0.0.0.0
     loc_loadw.0
     exec.consume_message_word # consume msg[16]
@@ -541,7 +541,7 @@ proc.prepare_message_schedule_and_consume.4
 
     loc_storew.0
     dropw
-    loc_storew.1
+    loc_storew.4
     dropw
 
     movupw.3
@@ -583,7 +583,7 @@ proc.prepare_message_schedule_and_consume.4
 
     push.0x2de92c6f
     push.0.0.0.0
-    loc_loadw.1
+    loc_loadw.4
     push.0.0.0.0
     loc_loadw.0
     exec.consume_message_word # consume msg[20]
@@ -602,7 +602,7 @@ proc.prepare_message_schedule_and_consume.4
 
     loc_storew.0
     dropw
-    loc_storew.1
+    loc_storew.4
     dropw
 
     movupw.3
@@ -644,7 +644,7 @@ proc.prepare_message_schedule_and_consume.4
 
     push.0x983e5152
     push.0.0.0.0
-    loc_loadw.1
+    loc_loadw.4
     push.0.0.0.0
     loc_loadw.0
     exec.consume_message_word # consume msg[24]
@@ -663,7 +663,7 @@ proc.prepare_message_schedule_and_consume.4
 
     loc_storew.0
     dropw
-    loc_storew.1
+    loc_storew.4
     dropw
 
     movupw.3
@@ -705,7 +705,7 @@ proc.prepare_message_schedule_and_consume.4
 
     push.0xc6e00bf3
     push.0.0.0.0
-    loc_loadw.1
+    loc_loadw.4
     push.0.0.0.0
     loc_loadw.0
     exec.consume_message_word # consume msg[28]
@@ -724,7 +724,7 @@ proc.prepare_message_schedule_and_consume.4
 
     loc_storew.0
     dropw
-    loc_storew.1
+    loc_storew.4
     dropw
 
     movupw.3
@@ -766,7 +766,7 @@ proc.prepare_message_schedule_and_consume.4
 
     push.0x27b70a85
     push.0.0.0.0
-    loc_loadw.1
+    loc_loadw.4
     push.0.0.0.0
     loc_loadw.0
     exec.consume_message_word # consume msg[32]
@@ -785,7 +785,7 @@ proc.prepare_message_schedule_and_consume.4
 
     loc_storew.0
     dropw
-    loc_storew.1
+    loc_storew.4
     dropw
 
     movupw.3
@@ -827,7 +827,7 @@ proc.prepare_message_schedule_and_consume.4
 
     push.0x650a7354
     push.0.0.0.0
-    loc_loadw.1
+    loc_loadw.4
     push.0.0.0.0
     loc_loadw.0
     exec.consume_message_word # consume msg[36]
@@ -846,7 +846,7 @@ proc.prepare_message_schedule_and_consume.4
 
     loc_storew.0
     dropw
-    loc_storew.1
+    loc_storew.4
     dropw
 
     movupw.3
@@ -888,7 +888,7 @@ proc.prepare_message_schedule_and_consume.4
 
     push.0xa2bfe8a1
     push.0.0.0.0
-    loc_loadw.1
+    loc_loadw.4
     push.0.0.0.0
     loc_loadw.0
     exec.consume_message_word # consume msg[40]
@@ -907,7 +907,7 @@ proc.prepare_message_schedule_and_consume.4
 
     loc_storew.0
     dropw
-    loc_storew.1
+    loc_storew.4
     dropw
 
     movupw.3
@@ -949,7 +949,7 @@ proc.prepare_message_schedule_and_consume.4
 
     push.0xd192e819
     push.0.0.0.0
-    loc_loadw.1
+    loc_loadw.4
     push.0.0.0.0
     loc_loadw.0
     exec.consume_message_word # consume msg[44]
@@ -968,7 +968,7 @@ proc.prepare_message_schedule_and_consume.4
 
     loc_storew.0
     dropw
-    loc_storew.1
+    loc_storew.4
     dropw
 
     movupw.2
@@ -979,7 +979,7 @@ proc.prepare_message_schedule_and_consume.4
 
     push.0x19a4c116
     push.0.0.0.0
-    loc_loadw.1
+    loc_loadw.4
     push.0.0.0.0
     loc_loadw.0
     exec.consume_message_word # consume msg[48]
@@ -1057,10 +1057,10 @@ proc.prepare_message_schedule_and_consume.4
     exec.consume_message_word # consume msg[63]
 
     push.0.0.0.0
-    loc_loadw.3
+    loc_loadw.12
 
     push.0.0.0.0
-    loc_loadw.2
+    loc_loadw.8
 
     repeat.8
         movup.8
@@ -1563,51 +1563,51 @@ end
 #!
 #! Input:  [addr, len, ...]
 #! Output: [dig0, dig1, dig2, dig3, dig4, dig5, dig6, dig7, ...]
-export.hash_memory.12
+export.hash_memory.48
     # loc.0 (input address)
     loc_store.0
 
     # loc.1 (input length)
-    loc_store.1
+    loc_store.4
 
     # loc.2 (padded length): input_length + (55 - input_length) % 64 + 9
-    push.55 loc_load.1 u32wrapping_sub push.63 u32and
-    loc_load.1 u32assert2 u32overflowing_add assertz u32assert u32overflowing_add.9 assertz loc_store.2
+    push.55 loc_load.4 u32wrapping_sub push.63 u32and
+    loc_load.4 u32assert2 u32overflowing_add assertz u32assert u32overflowing_add.9 assertz loc_store.8
 
     # loc.3 (last word address in padding): input_address + (padded_length / 4) - 4
-    loc_load.2 u32assert u32div.4 loc_load.0 u32wrapping_add u32wrapping_sub.4 loc_store.3
+    loc_load.8 u32assert u32div.4 loc_load.0 u32wrapping_add u32wrapping_sub.4 loc_store.12
 
     # loc.4 (u32 aligned padding byte): 0x80000000 >> ((input_length % 4) * 8)
-    loc_load.1 u32assert u32mod.4 u32assert u32overflowing_mul.8 assertz push.0x80000000 swap u32shr loc_store.4
+    loc_load.4 u32assert u32mod.4 u32assert u32overflowing_mul.8 assertz push.0x80000000 swap u32shr loc_store.16
 
     # loc.5 (memory offset of first padding byte): (input_length / 4) % 4
-    loc_load.1 u32assert u32div.4 u32mod.4 loc_store.5
+    loc_load.4 u32assert u32div.4 u32mod.4 loc_store.20
 
     # loc.6 (memory address of first padding byte): input_address + (len / 16) * 4
     # Note: (len /16) * 4 is *not* the same as (len / 4), due to the division being a division + floor operation
-    loc_load.0 loc_load.1 u32assert u32div.16 u32assert2 mul.4 u32overflowing_add assertz loc_store.6
+    loc_load.0 loc_load.4 u32assert u32div.16 u32assert2 mul.4 u32overflowing_add assertz loc_store.24
 
 
     # loc.7 (number of remaining 512-bit blocks to consume): padded_length / 64
-    loc_load.2 u32assert u32div.64 loc_store.7
+    loc_load.8 u32assert u32div.64 loc_store.28
 
     # Set the first byte after the message to 0x80
-    padw loc_load.6 mem_loadw loc_store.8 loc_store.9 loc_store.10 loc_store.11
+    padw loc_load.24 mem_loadw loc_store.32 loc_store.36 loc_store.40 loc_store.44
     # Note: We have to `mul.4` here because locals are spread 4 addresses apart.
-    locaddr.8 loc_load.5 mul.4 u32wrapping_add dup mem_load loc_load.4 u32wrapping_add swap mem_store
-    loc_load.11 loc_load.10 loc_load.9 loc_load.8 loc_load.6 mem_storew dropw
+    locaddr.32 loc_load.20 mul.4 u32wrapping_add dup mem_load loc_load.16 u32wrapping_add swap mem_store
+    loc_load.44 loc_load.40 loc_load.36 loc_load.32 loc_load.24 mem_storew dropw
 
     # Set message length in bits at end of padding
-    padw loc_load.3 mem_loadw
-    movup.3 drop loc_load.1 u32assert u32overflowing_mul.8 assertz movdn.3
-    loc_load.3 mem_storew dropw
+    padw loc_load.12 mem_loadw
+    movup.3 drop loc_load.4 u32assert u32overflowing_mul.8 assertz movdn.3
+    loc_load.12 mem_storew dropw
 
     # Sha256 init
     push.0x5be0cd19.0x1f83d9ab.0x9b05688c.0x510e527f
     push.0xa54ff53a.0x3c6ef372.0xbb67ae85.0x6a09e667
 
     # Consume sha256 blocks
-    loc_load.7 u32assert neq.0
+    loc_load.28 u32assert neq.0
     while.true
         padw loc_load.0 u32assert u32overflowing_add.12 assertz mem_loadw movdnw.2
         padw loc_load.0 u32assert u32overflowing_add.8 assertz mem_loadw movdnw.2
@@ -1616,7 +1616,7 @@ export.hash_memory.12
         exec.prepare_message_schedule_and_consume
 
         loc_load.0 u32assert u32overflowing_add.16 assertz loc_store.0
-        loc_load.7 u32assert u32overflowing_sub.1 assertz dup loc_store.7
+        loc_load.28 u32assert u32overflowing_sub.1 assertz dup loc_store.28
         u32assert neq.0
     end
 end
diff --git a/stdlib/asm/math/ecgfp5/group.masm b/stdlib/asm/math/ecgfp5/group.masm
index ddecc813cd..209a1d6b50 100644
--- a/stdlib/asm/math/ecgfp5/group.masm
+++ b/stdlib/asm/math/ecgfp5/group.masm
@@ -229,32 +229,32 @@ end
 #!
 #! Read point addition section ( on page 8 ) of https://ia.cr/2022/274
 #! For reference implementation see https://github.com/pornin/ecgfp5/blob/ce059c6/python/ecGFp5.py#L1228-L1255
-export.add.10
+export.add.40
     loc_storew.0
     dropw
-    loc_store.1 # cached x1
+    loc_store.4 # cached x1
 
-    loc_storew.2
+    loc_storew.8
     dropw
-    loc_store.3 # cached y1
+    loc_store.12 # cached y1
 
-    loc_store.4 # cached inf1
+    loc_store.16 # cached inf1
 
-    loc_storew.5
+    loc_storew.20
     dropw
-    loc_store.6 # cached x2
+    loc_store.24 # cached x2
 
-    loc_storew.7
+    loc_storew.28
     dropw
-    loc_store.8 # cached y2
+    loc_store.32 # cached y2
 
-    loc_store.9 # cached inf2
+    loc_store.36 # cached inf2
 
-    loc_load.6
+    loc_load.24
     push.0.0.0.0
-    loc_loadw.5 # bring x2
+    loc_loadw.20 # bring x2
 
-    loc_load.1
+    loc_load.4
     push.0.0.0.0
     loc_loadw.0 # bring x1
 
@@ -262,7 +262,7 @@ export.add.10
     dup
 
     if.true
-        loc_load.1
+        loc_load.4
         push.0.0.0.0
         loc_loadw.0 # bring x1
 
@@ -278,13 +278,13 @@ export.add.10
         add.263
         swap
     else
-        loc_load.3
+        loc_load.12
         push.0.0.0.0
-        loc_loadw.2 # bring y1
+        loc_loadw.8 # bring y1
 
-        loc_load.8
+        loc_load.32
         push.0.0.0.0
-        loc_loadw.7 # bring y2
+        loc_loadw.28 # bring y2
 
         exec.base_field::sub
     end # = λ0
@@ -292,22 +292,22 @@ export.add.10
     dup.5
 
     if.true
-        loc_load.3
+        loc_load.12
         push.0.0.0.0
-        loc_loadw.2 # bring y1
+        loc_loadw.8 # bring y1
 
         repeat.5
             movup.4
             mul.2
         end
     else
-        loc_load.1
+        loc_load.4
         push.0.0.0.0
         loc_loadw.0 # bring x1
 
-        loc_load.6
+        loc_load.24
         push.0.0.0.0
-        loc_loadw.5 # bring x2
+        loc_loadw.20 # bring x2
 
         exec.base_field::sub
     end # = λ1
@@ -324,11 +324,11 @@ export.add.10
 
     exec.base_field::square # = λ^2
 
-    loc_load.6
+    loc_load.24
     push.0.0.0.0
-    loc_loadw.5 # bring x2
+    loc_loadw.20 # bring x2
 
-    loc_load.1
+    loc_load.4
     push.0.0.0.0
     loc_loadw.0 # bring x1
 
@@ -344,7 +344,7 @@ export.add.10
         dup.4
     end
 
-    loc_load.1
+    loc_load.4
     push.0.0.0.0
     loc_loadw.0 # bring x1
 
@@ -356,9 +356,9 @@ export.add.10
 
     exec.base_field::mul
 
-    loc_load.3
+    loc_load.12
     push.0.0.0.0
-    loc_loadw.2 # bring y1
+    loc_loadw.8 # bring y1
 
     repeat.5
         movup.9
@@ -368,13 +368,13 @@ export.add.10
 
     movup.10
 
-    loc_load.3
+    loc_load.12
     push.0.0.0.0
-    loc_loadw.2 # bring y1
+    loc_loadw.8 # bring y1
 
-    loc_load.8
+    loc_load.32
     push.0.0.0.0
-    loc_loadw.7 # bring y2
+    loc_loadw.28 # bring y2
 
     exec.base_field::neq
 
@@ -384,11 +384,11 @@ export.add.10
 
     # finalize selection of y3
 
-    loc_load.8
+    loc_load.32
     push.0.0.0.0
-    loc_loadw.7 # bring y2
+    loc_loadw.28 # bring y2
 
-    loc_load.4 # bring inf1
+    loc_load.16 # bring inf1
 
     if.true
         repeat.5
@@ -401,11 +401,11 @@ export.add.10
         end
     end
 
-    loc_load.3
+    loc_load.12
     push.0.0.0.0
-    loc_loadw.2 # bring y1
+    loc_loadw.8 # bring y1
 
-    loc_load.9 # bring inf2
+    loc_load.36 # bring inf2
 
     if.true
         repeat.5
@@ -424,11 +424,11 @@ export.add.10
         movup.10
     end
 
-    loc_load.6
+    loc_load.24
     push.0.0.0.0
-    loc_loadw.5 # bring x2
+    loc_loadw.20 # bring x2
 
-    loc_load.4 # bring inf1
+    loc_load.16 # bring inf1
 
     if.true
         repeat.5
@@ -441,11 +441,11 @@ export.add.10
         end
     end
 
-    loc_load.1
+    loc_load.4
     push.0.0.0.0
     loc_loadw.0 # bring x1
 
-    loc_load.9 # bring inf2
+    loc_load.36 # bring inf2
 
     if.true
         repeat.5
@@ -461,12 +461,12 @@ export.add.10
     # finalize selection of inf3
 
     movup.10
-    loc_load.9 # bring inf2
-    loc_load.4 # bring inf1
+    loc_load.36 # bring inf2
+    loc_load.16 # bring inf1
     cdrop
 
-    loc_load.4 # bring inf1
-    loc_load.9 # bring inf2
+    loc_load.16 # bring inf1
+    loc_load.36 # bring inf2
     cdrop
 
     movdn.10
@@ -492,27 +492,27 @@ end
 #!
 #! Read point addition section ( on page 8 ) of https://ia.cr/2022/274
 #! For reference implementation see https://github.com/pornin/ecgfp5/blob/ce059c6/python/ecGFp5.py#L1270-L1280
-export.double.5
+export.double.20
     loc_storew.0
     dropw
-    loc_store.1 # cached x
+    loc_store.4 # cached x
 
-    loc_storew.2
+    loc_storew.8
     dropw
-    loc_store.3 # cached y
+    loc_store.12 # cached y
 
-    loc_store.4 # cached inf
+    loc_store.16 # cached inf
 
-    loc_load.3
+    loc_load.12
     push.0.0.0.0
-    loc_loadw.2 # bring y
+    loc_loadw.8 # bring y
 
     repeat.5
         movup.4
         mul.2
     end # compute λ1
 
-    loc_load.1
+    loc_load.4
     push.0.0.0.0
     loc_loadw.0 # bring x
 
@@ -530,7 +530,7 @@ export.double.5
 
     exec.base_field::div # compute λ
 
-    loc_load.1
+    loc_load.4
     push.0.0.0.0
     loc_loadw.0 # bring x
 
@@ -550,7 +550,7 @@ export.double.5
         dup.4
     end
 
-    loc_load.1
+    loc_load.4
     push.0.0.0.0
     loc_loadw.0 # bring x
 
@@ -562,9 +562,9 @@ export.double.5
 
     exec.base_field::mul
 
-    loc_load.3
+    loc_load.12
     push.0.0.0.0
-    loc_loadw.2 # bring y
+    loc_loadw.8 # bring y
 
     repeat.5
         movup.9
@@ -576,7 +576,7 @@ export.double.5
         movup.9
     end
 
-    loc_load.4
+    loc_load.16
     movdn.10
 end
 
@@ -604,31 +604,31 @@ end
 #! Point b = (x', y' inf') | b = e * a
 #!
 #! See https://github.com/itzmeanjan/secp256k1/blob/cbbe199/point.py#L174-L186 for source of inpiration.
-export.mul.10
+export.mul.40
     loc_storew.0
     dropw
-    loc_store.1 # cached base_x
+    loc_store.4 # cached base_x
 
-    loc_storew.2
+    loc_storew.8
     dropw
-    loc_store.3 # cached base_y
+    loc_store.12 # cached base_y
 
-    loc_store.4 # cached base_inf
+    loc_store.16 # cached base_inf
 
     push.0.0.0.0
-    loc_storew.5
+    loc_storew.20
     dropw
     push.0
-    loc_store.6 # initialize and cache res_x
+    loc_store.24 # initialize and cache res_x
 
     push.0.0.0.0
-    loc_storew.7
+    loc_storew.28
     dropw
     push.0
-    loc_store.8 # initialize and cache res_y
+    loc_store.32 # initialize and cache res_y
 
     push.1
-    loc_store.9 # initialize and cache res_inf
+    loc_store.36 # initialize and cache res_inf
 
     repeat.10
         repeat.32
@@ -638,49 +638,49 @@ export.mul.10
 
             if.true
                 # bring base
-                loc_load.4
+                loc_load.16
 
-                loc_load.3
+                loc_load.12
                 push.0.0.0.0
-                loc_loadw.2
+                loc_loadw.8
 
-                loc_load.1
+                loc_load.4
                 push.0.0.0.0
                 loc_loadw.0
 
                 # bring res
-                loc_load.9
+                loc_load.36
 
-                loc_load.8
+                loc_load.32
                 push.0.0.0.0
-                loc_loadw.7
+                loc_loadw.28
 
-                loc_load.6
+                loc_load.24
                 push.0.0.0.0
-                loc_loadw.5
+                loc_loadw.20
 
                 exec.add
 
                 # write back res
-                loc_storew.5
+                loc_storew.20
                 dropw
-                loc_store.6
+                loc_store.24
 
-                loc_storew.7
+                loc_storew.28
                 dropw
-                loc_store.8
+                loc_store.32
 
-                loc_store.9
+                loc_store.36
             end
 
             # bring base
-            loc_load.4
+            loc_load.16
 
-            loc_load.3
+            loc_load.12
             push.0.0.0.0
-            loc_loadw.2
+            loc_loadw.8
 
-            loc_load.1
+            loc_load.4
             push.0.0.0.0
             loc_loadw.0
 
@@ -689,13 +689,13 @@ export.mul.10
             # write back base
             loc_storew.0
             dropw
-            loc_store.1
+            loc_store.4
 
-            loc_storew.2
+            loc_storew.8
             dropw
-            loc_store.3
+            loc_store.12
 
-            loc_store.4
+            loc_store.16
 
             u32shr.1
         end
@@ -704,15 +704,15 @@ export.mul.10
     end
 
     # bring res
-    loc_load.9
+    loc_load.36
 
-    loc_load.8
+    loc_load.32
     push.0.0.0.0
-    loc_loadw.7
+    loc_loadw.28
 
-    loc_load.6
+    loc_load.24
     push.0.0.0.0
-    loc_loadw.5
+    loc_loadw.20
 end
 
 #! Given a 319 -bit scalar ( say e ) on stack, this routine computes elliptic curve point
@@ -741,29 +741,29 @@ end
 #! Point b = (x, y, inf) | b = e * G
 #!
 #! See https://github.com/itzmeanjan/secp256k1/blob/cbbe199/point.py#L174-L186 for source of inpiration.
-export.gen_mul.8
+export.gen_mul.32
     loc_storew.0
     dropw
-    loc_storew.1
+    loc_storew.4
     dropw
     push.0.0
     movup.3
     movup.3
-    loc_storew.2
+    loc_storew.8
     dropw # cache 319 -bit scalar, having 10 limbs, each of 32 -bit width
 
     push.0.0.0.0.0
-    loc_storew.3
+    loc_storew.12
     dropw
-    loc_store.4 # initialize and cache res_x
+    loc_store.16 # initialize and cache res_x
 
     push.0.0.0.0.0
-    loc_storew.5
+    loc_storew.20
     dropw
-    loc_store.6 # initialize and cache res_y
+    loc_store.24 # initialize and cache res_y
 
     push.1
-    loc_store.7 # initialize and cache res_inf
+    loc_store.28 # initialize and cache res_inf
 
     push.0.8623373087021137817.12280941316336150722.16498106075800537106.17561351883262171669.14191612778916076371.18286383079848041790.5863339128417000744.11451759428629996034.14179124258460441178.4768624678334228939
     push.0.16248535820734672314.12843058077554438509.14604603115652880441.1861370543582612103.12413761366396265525.4176022593458990041.3883454194479152564.14208568149880154517.30496475895846659.10372352552633189742
@@ -1105,28 +1105,28 @@ export.gen_mul.8
                 if.true
                     # base already on stack top
                     # bring res from memory
-                    loc_load.7
+                    loc_load.28
 
-                    loc_load.6
+                    loc_load.24
                     push.0.0.0.0
-                    loc_loadw.5
+                    loc_loadw.20
 
-                    loc_load.4
+                    loc_load.16
                     push.0.0.0.0
-                    loc_loadw.3
+                    loc_loadw.12
 
                     exec.add
 
                     # write back res
-                    loc_storew.3
+                    loc_storew.12
                     dropw
-                    loc_store.4
+                    loc_store.16
 
-                    loc_storew.5
+                    loc_storew.20
                     dropw
-                    loc_store.6
+                    loc_store.24
 
-                    loc_store.7
+                    loc_store.28
                 else
                     dropw
                     dropw
@@ -1147,7 +1147,7 @@ export.gen_mul.8
         end
 
         push.0.0.0.0
-        loc_loadw.1
+        loc_loadw.4
         loc_storew.0
         dropw
     end
@@ -1155,7 +1155,7 @@ export.gen_mul.8
     repeat.2 # process last 2 limbs of scalar
         repeat.32 # process each of last two 32 -bit limbs
             push.0.0.0.0
-            loc_loadw.2
+            loc_loadw.8
 
             dup
             push.1
@@ -1164,34 +1164,34 @@ export.gen_mul.8
 
             u32shr.1
 
-            loc_storew.2
+            loc_storew.8
             dropw
 
             if.true
                 # base already on stack top
                 # bring res from memory
-                loc_load.7
+                loc_load.28
 
-                loc_load.6
+                loc_load.24
                 push.0.0.0.0
-                loc_loadw.5
+                loc_loadw.20
 
-                loc_load.4
+                loc_load.16
                 push.0.0.0.0
-                loc_loadw.3
+                loc_loadw.12
 
                 exec.add
 
                 # write back res
-                loc_storew.3
+                loc_storew.12
                 dropw
-                loc_store.4
+                loc_store.16
 
-                loc_storew.5
+                loc_storew.20
                 dropw
-                loc_store.6
+                loc_store.24
 
-                loc_store.7
+                loc_store.28
             else
                 dropw
                 dropw
@@ -1201,24 +1201,24 @@ export.gen_mul.8
         end
 
         push.0.0.0.0
-        loc_loadw.2
+        loc_loadw.8
 
         drop
         push.0
         movdn.3
 
-        loc_storew.2
+        loc_storew.8
         dropw
     end
 
     # bring res back to stack
-    loc_load.7
+    loc_load.28
 
-    loc_load.6
+    loc_load.24
     push.0.0.0.0
-    loc_loadw.5
+    loc_loadw.20
 
-    loc_load.4
+    loc_load.16
     push.0.0.0.0
-    loc_loadw.3
+    loc_loadw.12
 end
diff --git a/stdlib/asm/math/ecgfp5/scalar_field.masm b/stdlib/asm/math/ecgfp5/scalar_field.masm
index 2572fb3cdd..01c40872fd 100644
--- a/stdlib/asm/math/ecgfp5/scalar_field.masm
+++ b/stdlib/asm/math/ecgfp5/scalar_field.masm
@@ -251,19 +251,19 @@ end
 #! [r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, ...]
 #!
 #! Adapted from equivalent Rust implementation https://github.com/itzmeanjan/miden/blob/6a611e693601577864da3e43e745525b83c0030d/miden/tests/integration/stdlib/math/ext5_scalar.rs#L92-L132
-export.mont_mul.8
+export.mont_mul.32
     dup
     loc_store.0
-    loc_storew.1
+    loc_storew.4
     dropw
 
-    loc_storew.2
+    loc_storew.8
     dropw
 
     push.0.0
     movup.3
     movup.3
-    loc_storew.3
+    loc_storew.12
     dropw # cached (a0, a1, a2, a3, a4, a5, a6, a7, a8, a9)
 
     # when i = 0
@@ -272,15 +272,15 @@ export.mont_mul.8
     loc_load.0
     u32wrapping_mul
     u32wrapping_mul.91978719 # more about this literal constant
-    loc_store.4              # https://github.com/itzmeanjan/miden/blob/e7038e45865a7032a0629346921a77010e82862d/miden/tests/integration/stdlib/math/ext5_scalar.rs#L46-L54
+    loc_store.16             # https://github.com/itzmeanjan/miden/blob/e7038e45865a7032a0629346921a77010e82862d/miden/tests/integration/stdlib/math/ext5_scalar.rs#L46-L54
                              # cached f
 
     push.0.0.0.0
-    loc_loadw.1
+    loc_loadw.4
     dup.4
     u32overflowing_mul
 
-    loc_load.4
+    loc_load.16
     u32overflowing_mul.2492202977
     swap
     movup.3
@@ -299,7 +299,7 @@ export.mont_mul.8
     movup.2
     u32wrapping_add
 
-    loc_load.4
+    loc_load.16
     u32overflowing_mul.3893352854
     swap
     movup.3
@@ -313,7 +313,7 @@ export.mont_mul.8
     swap
     push.0.0.0
     movup.3
-    loc_storew.5
+    loc_storew.20
     dropw
 
     movup.2
@@ -325,7 +325,7 @@ export.mont_mul.8
     movup.2
     u32wrapping_add
 
-    loc_load.4
+    loc_load.16
     u32overflowing_mul.3609501852
     swap
     movup.3
@@ -338,12 +338,12 @@ export.mont_mul.8
     u32wrapping_add3
     swap
     push.0.0.0.0
-    loc_loadw.5
+    loc_loadw.20
     swap
     drop
     movup.3
     swap
-    loc_storew.5
+    loc_storew.20
     dropw
 
     movup.2
@@ -355,7 +355,7 @@ export.mont_mul.8
     movup.2
     u32wrapping_add
 
-    loc_load.4
+    loc_load.16
     u32overflowing_mul.3901250617
     swap
     movup.3
@@ -368,16 +368,16 @@ export.mont_mul.8
     u32wrapping_add3
     swap
     push.0.0.0.0
-    loc_loadw.5
+    loc_loadw.20
     movup.2
     drop
     movup.3
     movdn.2
-    loc_storew.5
+    loc_storew.20
     dropw
 
     push.0.0.0.0
-    loc_loadw.2
+    loc_loadw.8
     movup.5
     movup.5
     movup.2
@@ -389,7 +389,7 @@ export.mont_mul.8
     movup.2
     u32wrapping_add
 
-    loc_load.4
+    loc_load.16
     u32overflowing_mul.3484943929
     swap
     movup.3
@@ -402,10 +402,10 @@ export.mont_mul.8
     u32wrapping_add3
     swap
     push.0.0.0.0
-    loc_loadw.5
+    loc_loadw.20
     movup.3
     drop
-    loc_storew.5
+    loc_storew.20
     dropw
 
     movup.2
@@ -417,7 +417,7 @@ export.mont_mul.8
     movup.2
     u32wrapping_add
 
-    loc_load.4
+    loc_load.16
     u32overflowing_mul.2147483622
     swap
     movup.3
@@ -431,7 +431,7 @@ export.mont_mul.8
     swap
     push.0.0.0
     movup.3
-    loc_storew.6
+    loc_storew.24
     dropw
 
     movup.2
@@ -443,7 +443,7 @@ export.mont_mul.8
     movup.2
     u32wrapping_add
 
-    loc_load.4
+    loc_load.16
     u32overflowing_mul.22
     swap
     movup.3
@@ -456,12 +456,12 @@ export.mont_mul.8
     u32wrapping_add3
     swap
     push.0.0.0.0
-    loc_loadw.6
+    loc_loadw.24
     swap
     drop
     movup.3
     swap
-    loc_storew.6
+    loc_storew.24
     dropw
 
     movup.2
@@ -473,7 +473,7 @@ export.mont_mul.8
     movup.2
     u32wrapping_add
 
-    loc_load.4
+    loc_load.16
     u32overflowing_mul.2147483633
     swap
     movup.3
@@ -486,16 +486,16 @@ export.mont_mul.8
     u32wrapping_add3
     swap
     push.0.0.0.0
-    loc_loadw.6
+    loc_loadw.24
     movup.2
     drop
     movup.3
     movdn.2
-    loc_storew.6
+    loc_storew.24
     dropw
 
     push.0.0.0.0
-    loc_loadw.3
+    loc_loadw.12
     movup.3
     movup.3
     drop
@@ -512,7 +512,7 @@ export.mont_mul.8
     movup.2
     u32wrapping_add
 
-    loc_load.4
+    loc_load.16
     u32overflowing_mul.2147483655
     swap
     movup.3
@@ -525,10 +525,10 @@ export.mont_mul.8
     u32wrapping_add3
     swap
     push.0.0.0.0
-    loc_loadw.6
+    loc_loadw.24
     movup.3
     drop
-    loc_storew.6
+    loc_storew.24
     dropw
 
     movup.2
@@ -540,7 +540,7 @@ export.mont_mul.8
     movup.2
     u32wrapping_add
 
-    loc_load.4
+    loc_load.16
     u32overflowing_mul.2147483645
     swap
     movup.3
@@ -558,7 +558,7 @@ export.mont_mul.8
     movup.4
     u32wrapping_add
     swap
-    loc_storew.7
+    loc_storew.28
     dropw
 
     # when i = 1
@@ -567,15 +567,15 @@ export.mont_mul.8
     loc_load.0
     u32wrapping_mul
     push.0.0.0.0
-    loc_loadw.5
+    loc_loadw.20
     dup
     movup.5
     u32wrapping_add
     u32wrapping_mul.91978719
-    loc_store.4
+    loc_store.16
 
     push.0.0.0.0
-    loc_loadw.1
+    loc_loadw.4
     dup.8
     u32overflowing_mul
     swap
@@ -584,7 +584,7 @@ export.mont_mul.8
     movup.2
     u32wrapping_add
 
-    loc_load.4
+    loc_load.16
     u32overflowing_mul.2492202977
     swap
     movup.3
@@ -607,7 +607,7 @@ export.mont_mul.8
     movup.3
     u32wrapping_add3
 
-    loc_load.4
+    loc_load.16
     u32overflowing_mul.3893352854
     swap
     movup.3
@@ -621,7 +621,7 @@ export.mont_mul.8
     swap
     push.0.0.0
     movup.3
-    loc_storew.5
+    loc_storew.20
     dropw
 
     movup.2
@@ -637,7 +637,7 @@ export.mont_mul.8
     movup.3
     u32wrapping_add3
 
-    loc_load.4
+    loc_load.16
     u32overflowing_mul.3609501852
     swap
     movup.3
@@ -650,12 +650,12 @@ export.mont_mul.8
     u32wrapping_add3
     swap
     push.0.0.0.0
-    loc_loadw.5
+    loc_loadw.20
     swap
     drop
     movup.3
     swap
-    loc_storew.5
+    loc_storew.20
     dropw
 
     movup.2
@@ -671,7 +671,7 @@ export.mont_mul.8
     movup.3
     u32wrapping_add3
 
-    loc_load.4
+    loc_load.16
     u32overflowing_mul.3901250617
     swap
     movup.3
@@ -684,18 +684,18 @@ export.mont_mul.8
     u32wrapping_add3
     swap
     push.0.0.0.0
-    loc_loadw.5
+    loc_loadw.20
     movup.2
     drop
     movup.3
     movdn.2
-    loc_storew.5
+    loc_storew.20
     dropw
 
     push.0.0.0.0.0.0.0.0
-    loc_loadw.6
+    loc_loadw.24
     swapw
-    loc_loadw.2
+    loc_loadw.8
     movup.9
     movup.9
 
@@ -712,7 +712,7 @@ export.mont_mul.8
     movup.3
     u32wrapping_add3
 
-    loc_load.4
+    loc_load.16
     u32overflowing_mul.3484943929
     swap
     movup.3
@@ -725,10 +725,10 @@ export.mont_mul.8
     u32wrapping_add3
     swap
     push.0.0.0.0
-    loc_loadw.5
+    loc_loadw.20
     movup.3
     drop
-    loc_storew.5
+    loc_storew.20
     dropw
 
     movup.2
@@ -744,7 +744,7 @@ export.mont_mul.8
     movup.3
     u32wrapping_add3
 
-    loc_load.4
+    loc_load.16
     u32overflowing_mul.2147483622
     swap
     movup.3
@@ -758,7 +758,7 @@ export.mont_mul.8
     swap
     push.0.0.0
     movup.3
-    loc_storew.6
+    loc_storew.24
     dropw
 
     movup.2
@@ -774,7 +774,7 @@ export.mont_mul.8
     movup.3
     u32wrapping_add3
 
-    loc_load.4
+    loc_load.16
     u32overflowing_mul.22
     swap
     movup.3
@@ -787,12 +787,12 @@ export.mont_mul.8
     u32wrapping_add3
     swap
     push.0.0.0.0
-    loc_loadw.6
+    loc_loadw.24
     swap
     drop
     movup.3
     swap
-    loc_storew.6
+    loc_storew.24
     dropw
 
     movup.2
@@ -808,7 +808,7 @@ export.mont_mul.8
     movup.3
     u32wrapping_add3
 
-    loc_load.4
+    loc_load.16
     u32overflowing_mul.2147483633
     swap
     movup.3
@@ -821,18 +821,18 @@ export.mont_mul.8
     u32wrapping_add3
     swap
     push.0.0.0.0
-    loc_loadw.6
+    loc_loadw.24
     movup.2
     drop
     movup.3
     movdn.2
-    loc_storew.6
+    loc_storew.24
     dropw
 
     push.0.0.0.0.0.0.0.0
-    loc_loadw.7
+    loc_loadw.28
     swapw
-    loc_loadw.3
+    loc_loadw.12
     movup.3
     movup.3
     drop
@@ -857,7 +857,7 @@ export.mont_mul.8
     movup.3
     u32wrapping_add3
 
-    loc_load.4
+    loc_load.16
     u32overflowing_mul.2147483655
     swap
     movup.3
@@ -870,10 +870,10 @@ export.mont_mul.8
     u32wrapping_add3
     swap
     push.0.0.0.0
-    loc_loadw.6
+    loc_loadw.24
     movup.3
     drop
-    loc_storew.6
+    loc_storew.24
     dropw
 
     movup.2
@@ -889,7 +889,7 @@ export.mont_mul.8
     movup.3
     u32wrapping_add3
 
-    loc_load.4
+    loc_load.16
     u32overflowing_mul.2147483645
     swap
     movup.3
@@ -906,7 +906,7 @@ export.mont_mul.8
     push.0.0
     movup.3
     movup.3
-    loc_storew.7
+    loc_storew.28
     dropw
     drop
 
@@ -916,15 +916,15 @@ export.mont_mul.8
     loc_load.0
     u32wrapping_mul
     push.0.0.0.0
-    loc_loadw.5
+    loc_loadw.20
     dup
     movup.5
     u32wrapping_add
     u32wrapping_mul.91978719
-    loc_store.4
+    loc_store.16
 
     push.0.0.0.0
-    loc_loadw.1
+    loc_loadw.4
     dup.8
     u32overflowing_mul
     swap
@@ -933,7 +933,7 @@ export.mont_mul.8
     movup.2
     u32wrapping_add
 
-    loc_load.4
+    loc_load.16
     u32overflowing_mul.2492202977
     swap
     movup.3
@@ -956,7 +956,7 @@ export.mont_mul.8
     movup.3
     u32wrapping_add3
 
-    loc_load.4
+    loc_load.16
     u32overflowing_mul.3893352854
     swap
     movup.3
@@ -970,7 +970,7 @@ export.mont_mul.8
     swap
     push.0.0.0
     movup.3
-    loc_storew.5
+    loc_storew.20
     dropw
 
     movup.2
@@ -986,7 +986,7 @@ export.mont_mul.8
     movup.3
     u32wrapping_add3
 
-    loc_load.4
+    loc_load.16
     u32overflowing_mul.3609501852
     swap
     movup.3
@@ -999,12 +999,12 @@ export.mont_mul.8
     u32wrapping_add3
     swap
     push.0.0.0.0
-    loc_loadw.5
+    loc_loadw.20
     swap
     drop
     movup.3
     swap
-    loc_storew.5
+    loc_storew.20
     dropw
 
     movup.2
@@ -1020,7 +1020,7 @@ export.mont_mul.8
     movup.3
     u32wrapping_add3
 
-    loc_load.4
+    loc_load.16
     u32overflowing_mul.3901250617
     swap
     movup.3
@@ -1033,18 +1033,18 @@ export.mont_mul.8
     u32wrapping_add3
     swap
     push.0.0.0.0
-    loc_loadw.5
+    loc_loadw.20
     movup.2
     drop
     movup.3
     movdn.2
-    loc_storew.5
+    loc_storew.20
     dropw
 
     push.0.0.0.0.0.0.0.0
-    loc_loadw.6
+    loc_loadw.24
     swapw
-    loc_loadw.2
+    loc_loadw.8
     movup.9
     movup.9
 
@@ -1061,7 +1061,7 @@ export.mont_mul.8
     movup.3
     u32wrapping_add3
 
-    loc_load.4
+    loc_load.16
     u32overflowing_mul.3484943929
     swap
     movup.3
@@ -1074,10 +1074,10 @@ export.mont_mul.8
     u32wrapping_add3
     swap
     push.0.0.0.0
-    loc_loadw.5
+    loc_loadw.20
     movup.3
     drop
-    loc_storew.5
+    loc_storew.20
     dropw
 
     movup.2
@@ -1093,7 +1093,7 @@ export.mont_mul.8
     movup.3
     u32wrapping_add3
 
-    loc_load.4
+    loc_load.16
     u32overflowing_mul.2147483622
     swap
     movup.3
@@ -1107,7 +1107,7 @@ export.mont_mul.8
     swap
     push.0.0.0
     movup.3
-    loc_storew.6
+    loc_storew.24
     dropw
 
     movup.2
@@ -1123,7 +1123,7 @@ export.mont_mul.8
     movup.3
     u32wrapping_add3
 
-    loc_load.4
+    loc_load.16
     u32overflowing_mul.22
     swap
     movup.3
@@ -1136,12 +1136,12 @@ export.mont_mul.8
     u32wrapping_add3
     swap
     push.0.0.0.0
-    loc_loadw.6
+    loc_loadw.24
     swap
     drop
     movup.3
     swap
-    loc_storew.6
+    loc_storew.24
     dropw
 
     movup.2
@@ -1157,7 +1157,7 @@ export.mont_mul.8
     movup.3
     u32wrapping_add3
 
-    loc_load.4
+    loc_load.16
     u32overflowing_mul.2147483633
     swap
     movup.3
@@ -1170,18 +1170,18 @@ export.mont_mul.8
     u32wrapping_add3
     swap
     push.0.0.0.0
-    loc_loadw.6
+    loc_loadw.24
     movup.2
     drop
     movup.3
     movdn.2
-    loc_storew.6
+    loc_storew.24
     dropw
 
     push.0.0.0.0.0.0.0.0
-    loc_loadw.7
+    loc_loadw.28
     swapw
-    loc_loadw.3
+    loc_loadw.12
     movup.3
     movup.3
     drop
@@ -1206,7 +1206,7 @@ export.mont_mul.8
     movup.3
     u32wrapping_add3
 
-    loc_load.4
+    loc_load.16
     u32overflowing_mul.2147483655
     swap
     movup.3
@@ -1219,10 +1219,10 @@ export.mont_mul.8
     u32wrapping_add3
     swap
     push.0.0.0.0
-    loc_loadw.6
+    loc_loadw.24
     movup.3
     drop
-    loc_storew.6
+    loc_storew.24
     dropw
 
     movup.2
@@ -1238,7 +1238,7 @@ export.mont_mul.8
     movup.3
     u32wrapping_add3
 
-    loc_load.4
+    loc_load.16
     u32overflowing_mul.2147483645
     swap
     movup.3
@@ -1255,7 +1255,7 @@ export.mont_mul.8
     push.0.0
     movup.3
     movup.3
-    loc_storew.7
+    loc_storew.28
     dropw
     drop
 
@@ -1265,15 +1265,15 @@ export.mont_mul.8
     loc_load.0
     u32wrapping_mul
     push.0.0.0.0
-    loc_loadw.5
+    loc_loadw.20
     dup
     movup.5
     u32wrapping_add
     u32wrapping_mul.91978719
-    loc_store.4
+    loc_store.16
 
     push.0.0.0.0
-    loc_loadw.1
+    loc_loadw.4
     dup.8
     u32overflowing_mul
     swap
@@ -1282,7 +1282,7 @@ export.mont_mul.8
     movup.2
     u32wrapping_add
 
-    loc_load.4
+    loc_load.16
     u32overflowing_mul.2492202977
     swap
     movup.3
@@ -1305,7 +1305,7 @@ export.mont_mul.8
     movup.3
     u32wrapping_add3
 
-    loc_load.4
+    loc_load.16
     u32overflowing_mul.3893352854
     swap
     movup.3
@@ -1319,7 +1319,7 @@ export.mont_mul.8
     swap
     push.0.0.0
     movup.3
-    loc_storew.5
+    loc_storew.20
     dropw
 
     movup.2
@@ -1335,7 +1335,7 @@ export.mont_mul.8
     movup.3
     u32wrapping_add3
 
-    loc_load.4
+    loc_load.16
     u32overflowing_mul.3609501852
     swap
     movup.3
@@ -1348,12 +1348,12 @@ export.mont_mul.8
     u32wrapping_add3
     swap
     push.0.0.0.0
-    loc_loadw.5
+    loc_loadw.20
     swap
     drop
     movup.3
     swap
-    loc_storew.5
+    loc_storew.20
     dropw
 
     movup.2
@@ -1369,7 +1369,7 @@ export.mont_mul.8
     movup.3
     u32wrapping_add3
 
-    loc_load.4
+    loc_load.16
     u32overflowing_mul.3901250617
     swap
     movup.3
@@ -1382,18 +1382,18 @@ export.mont_mul.8
     u32wrapping_add3
     swap
     push.0.0.0.0
-    loc_loadw.5
+    loc_loadw.20
     movup.2
     drop
     movup.3
     movdn.2
-    loc_storew.5
+    loc_storew.20
     dropw
 
     push.0.0.0.0.0.0.0.0
-    loc_loadw.6
+    loc_loadw.24
     swapw
-    loc_loadw.2
+    loc_loadw.8
     movup.9
     movup.9
 
@@ -1410,7 +1410,7 @@ export.mont_mul.8
     movup.3
     u32wrapping_add3
 
-    loc_load.4
+    loc_load.16
     u32overflowing_mul.3484943929
     swap
     movup.3
@@ -1423,10 +1423,10 @@ export.mont_mul.8
     u32wrapping_add3
     swap
     push.0.0.0.0
-    loc_loadw.5
+    loc_loadw.20
     movup.3
     drop
-    loc_storew.5
+    loc_storew.20
     dropw
 
     movup.2
@@ -1442,7 +1442,7 @@ export.mont_mul.8
     movup.3
     u32wrapping_add3
 
-    loc_load.4
+    loc_load.16
     u32overflowing_mul.2147483622
     swap
     movup.3
@@ -1456,7 +1456,7 @@ export.mont_mul.8
     swap
     push.0.0.0
     movup.3
-    loc_storew.6
+    loc_storew.24
     dropw
 
     movup.2
@@ -1472,7 +1472,7 @@ export.mont_mul.8
     movup.3
     u32wrapping_add3
 
-    loc_load.4
+    loc_load.16
     u32overflowing_mul.22
     swap
     movup.3
@@ -1485,12 +1485,12 @@ export.mont_mul.8
     u32wrapping_add3
     swap
     push.0.0.0.0
-    loc_loadw.6
+    loc_loadw.24
     swap
     drop
     movup.3
     swap
-    loc_storew.6
+    loc_storew.24
     dropw
 
     movup.2
@@ -1506,7 +1506,7 @@ export.mont_mul.8
     movup.3
     u32wrapping_add3
 
-    loc_load.4
+    loc_load.16
     u32overflowing_mul.2147483633
     swap
     movup.3
@@ -1519,18 +1519,18 @@ export.mont_mul.8
     u32wrapping_add3
     swap
     push.0.0.0.0
-    loc_loadw.6
+    loc_loadw.24
     movup.2
     drop
     movup.3
     movdn.2
-    loc_storew.6
+    loc_storew.24
     dropw
 
     push.0.0.0.0.0.0.0.0
-    loc_loadw.7
+    loc_loadw.28
     swapw
-    loc_loadw.3
+    loc_loadw.12
     movup.3
     movup.3
     drop
@@ -1555,7 +1555,7 @@ export.mont_mul.8
     movup.3
     u32wrapping_add3
 
-    loc_load.4
+    loc_load.16
     u32overflowing_mul.2147483655
     swap
     movup.3
@@ -1568,10 +1568,10 @@ export.mont_mul.8
     u32wrapping_add3
     swap
     push.0.0.0.0
-    loc_loadw.6
+    loc_loadw.24
     movup.3
     drop
-    loc_storew.6
+    loc_storew.24
     dropw
 
     movup.2
@@ -1587,7 +1587,7 @@ export.mont_mul.8
     movup.3
     u32wrapping_add3
 
-    loc_load.4
+    loc_load.16
     u32overflowing_mul.2147483645
     swap
     movup.3
@@ -1604,7 +1604,7 @@ export.mont_mul.8
     push.0.0
     movup.3
     movup.3
-    loc_storew.7
+    loc_storew.28
     dropw
     drop
 
@@ -1614,15 +1614,15 @@ export.mont_mul.8
     loc_load.0
     u32wrapping_mul
     push.0.0.0.0
-    loc_loadw.5
+    loc_loadw.20
     dup
     movup.5
     u32wrapping_add
     u32wrapping_mul.91978719
-    loc_store.4
+    loc_store.16
 
     push.0.0.0.0
-    loc_loadw.1
+    loc_loadw.4
     dup.8
     u32overflowing_mul
     swap
@@ -1631,7 +1631,7 @@ export.mont_mul.8
     movup.2
     u32wrapping_add
 
-    loc_load.4
+    loc_load.16
     u32overflowing_mul.2492202977
     swap
     movup.3
@@ -1654,7 +1654,7 @@ export.mont_mul.8
     movup.3
     u32wrapping_add3
 
-    loc_load.4
+    loc_load.16
     u32overflowing_mul.3893352854
     swap
     movup.3
@@ -1668,7 +1668,7 @@ export.mont_mul.8
     swap
     push.0.0.0
     movup.3
-    loc_storew.5
+    loc_storew.20
     dropw
 
     movup.2
@@ -1684,7 +1684,7 @@ export.mont_mul.8
     movup.3
     u32wrapping_add3
 
-    loc_load.4
+    loc_load.16
     u32overflowing_mul.3609501852
     swap
     movup.3
@@ -1697,12 +1697,12 @@ export.mont_mul.8
     u32wrapping_add3
     swap
     push.0.0.0.0
-    loc_loadw.5
+    loc_loadw.20
     swap
     drop
     movup.3
     swap
-    loc_storew.5
+    loc_storew.20
     dropw
 
     movup.2
@@ -1718,7 +1718,7 @@ export.mont_mul.8
     movup.3
     u32wrapping_add3
 
-    loc_load.4
+    loc_load.16
     u32overflowing_mul.3901250617
     swap
     movup.3
@@ -1731,18 +1731,18 @@ export.mont_mul.8
     u32wrapping_add3
     swap
     push.0.0.0.0
-    loc_loadw.5
+    loc_loadw.20
     movup.2
     drop
     movup.3
     movdn.2
-    loc_storew.5
+    loc_storew.20
     dropw
 
     push.0.0.0.0.0.0.0.0
-    loc_loadw.6
+    loc_loadw.24
     swapw
-    loc_loadw.2
+    loc_loadw.8
     movup.9
     movup.9
 
@@ -1759,7 +1759,7 @@ export.mont_mul.8
     movup.3
     u32wrapping_add3
 
-    loc_load.4
+    loc_load.16
     u32overflowing_mul.3484943929
     swap
     movup.3
@@ -1772,10 +1772,10 @@ export.mont_mul.8
     u32wrapping_add3
     swap
     push.0.0.0.0
-    loc_loadw.5
+    loc_loadw.20
     movup.3
     drop
-    loc_storew.5
+    loc_storew.20
     dropw
 
     movup.2
@@ -1791,7 +1791,7 @@ export.mont_mul.8
     movup.3
     u32wrapping_add3
 
-    loc_load.4
+    loc_load.16
     u32overflowing_mul.2147483622
     swap
     movup.3
@@ -1805,7 +1805,7 @@ export.mont_mul.8
     swap
     push.0.0.0
     movup.3
-    loc_storew.6
+    loc_storew.24
     dropw
 
     movup.2
@@ -1821,7 +1821,7 @@ export.mont_mul.8
     movup.3
     u32wrapping_add3
 
-    loc_load.4
+    loc_load.16
     u32overflowing_mul.22
     swap
     movup.3
@@ -1834,12 +1834,12 @@ export.mont_mul.8
     u32wrapping_add3
     swap
     push.0.0.0.0
-    loc_loadw.6
+    loc_loadw.24
     swap
     drop
     movup.3
     swap
-    loc_storew.6
+    loc_storew.24
     dropw
 
     movup.2
@@ -1855,7 +1855,7 @@ export.mont_mul.8
     movup.3
     u32wrapping_add3
 
-    loc_load.4
+    loc_load.16
     u32overflowing_mul.2147483633
     swap
     movup.3
@@ -1868,18 +1868,18 @@ export.mont_mul.8
     u32wrapping_add3
     swap
     push.0.0.0.0
-    loc_loadw.6
+    loc_loadw.24
     movup.2
     drop
     movup.3
     movdn.2
-    loc_storew.6
+    loc_storew.24
     dropw
 
     push.0.0.0.0.0.0.0.0
-    loc_loadw.7
+    loc_loadw.28
     swapw
-    loc_loadw.3
+    loc_loadw.12
     movup.3
     movup.3
     drop
@@ -1904,7 +1904,7 @@ export.mont_mul.8
     movup.3
     u32wrapping_add3
 
-    loc_load.4
+    loc_load.16
     u32overflowing_mul.2147483655
     swap
     movup.3
@@ -1917,10 +1917,10 @@ export.mont_mul.8
     u32wrapping_add3
     swap
     push.0.0.0.0
-    loc_loadw.6
+    loc_loadw.24
     movup.3
     drop
-    loc_storew.6
+    loc_storew.24
     dropw
 
     movup.2
@@ -1936,7 +1936,7 @@ export.mont_mul.8
     movup.3
     u32wrapping_add3
 
-    loc_load.4
+    loc_load.16
     u32overflowing_mul.2147483645
     swap
     movup.3
@@ -1953,7 +1953,7 @@ export.mont_mul.8
     push.0.0
     movup.3
     movup.3
-    loc_storew.7
+    loc_storew.28
     dropw
     drop
 
@@ -1963,15 +1963,15 @@ export.mont_mul.8
     loc_load.0
     u32wrapping_mul
     push.0.0.0.0
-    loc_loadw.5
+    loc_loadw.20
     dup
     movup.5
     u32wrapping_add
     u32wrapping_mul.91978719
-    loc_store.4
+    loc_store.16
 
     push.0.0.0.0
-    loc_loadw.1
+    loc_loadw.4
     dup.8
     u32overflowing_mul
     swap
@@ -1980,7 +1980,7 @@ export.mont_mul.8
     movup.2
     u32wrapping_add
 
-    loc_load.4
+    loc_load.16
     u32overflowing_mul.2492202977
     swap
     movup.3
@@ -2003,7 +2003,7 @@ export.mont_mul.8
     movup.3
     u32wrapping_add3
 
-    loc_load.4
+    loc_load.16
     u32overflowing_mul.3893352854
     swap
     movup.3
@@ -2017,7 +2017,7 @@ export.mont_mul.8
     swap
     push.0.0.0
     movup.3
-    loc_storew.5
+    loc_storew.20
     dropw
 
     movup.2
@@ -2033,7 +2033,7 @@ export.mont_mul.8
     movup.3
     u32wrapping_add3
 
-    loc_load.4
+    loc_load.16
     u32overflowing_mul.3609501852
     swap
     movup.3
@@ -2046,12 +2046,12 @@ export.mont_mul.8
     u32wrapping_add3
     swap
     push.0.0.0.0
-    loc_loadw.5
+    loc_loadw.20
     swap
     drop
     movup.3
     swap
-    loc_storew.5
+    loc_storew.20
     dropw
 
     movup.2
@@ -2067,7 +2067,7 @@ export.mont_mul.8
     movup.3
     u32wrapping_add3
 
-    loc_load.4
+    loc_load.16
     u32overflowing_mul.3901250617
     swap
     movup.3
@@ -2080,18 +2080,18 @@ export.mont_mul.8
     u32wrapping_add3
     swap
     push.0.0.0.0
-    loc_loadw.5
+    loc_loadw.20
     movup.2
     drop
     movup.3
     movdn.2
-    loc_storew.5
+    loc_storew.20
     dropw
 
     push.0.0.0.0.0.0.0.0
-    loc_loadw.6
+    loc_loadw.24
     swapw
-    loc_loadw.2
+    loc_loadw.8
     movup.9
     movup.9
 
@@ -2108,7 +2108,7 @@ export.mont_mul.8
     movup.3
     u32wrapping_add3
 
-    loc_load.4
+    loc_load.16
     u32overflowing_mul.3484943929
     swap
     movup.3
@@ -2121,10 +2121,10 @@ export.mont_mul.8
     u32wrapping_add3
     swap
     push.0.0.0.0
-    loc_loadw.5
+    loc_loadw.20
     movup.3
     drop
-    loc_storew.5
+    loc_storew.20
     dropw
 
     movup.2
@@ -2140,7 +2140,7 @@ export.mont_mul.8
     movup.3
     u32wrapping_add3
 
-    loc_load.4
+    loc_load.16
     u32overflowing_mul.2147483622
     swap
     movup.3
@@ -2154,7 +2154,7 @@ export.mont_mul.8
     swap
     push.0.0.0
     movup.3
-    loc_storew.6
+    loc_storew.24
     dropw
 
     movup.2
@@ -2170,7 +2170,7 @@ export.mont_mul.8
     movup.3
     u32wrapping_add3
 
-    loc_load.4
+    loc_load.16
     u32overflowing_mul.22
     swap
     movup.3
@@ -2183,12 +2183,12 @@ export.mont_mul.8
     u32wrapping_add3
     swap
     push.0.0.0.0
-    loc_loadw.6
+    loc_loadw.24
     swap
     drop
     movup.3
     swap
-    loc_storew.6
+    loc_storew.24
     dropw
 
     movup.2
@@ -2204,7 +2204,7 @@ export.mont_mul.8
     movup.3
     u32wrapping_add3
 
-    loc_load.4
+    loc_load.16
     u32overflowing_mul.2147483633
     swap
     movup.3
@@ -2217,18 +2217,18 @@ export.mont_mul.8
     u32wrapping_add3
     swap
     push.0.0.0.0
-    loc_loadw.6
+    loc_loadw.24
     movup.2
     drop
     movup.3
     movdn.2
-    loc_storew.6
+    loc_storew.24
     dropw
 
     push.0.0.0.0.0.0.0.0
-    loc_loadw.7
+    loc_loadw.28
     swapw
-    loc_loadw.3
+    loc_loadw.12
     movup.3
     movup.3
     drop
@@ -2253,7 +2253,7 @@ export.mont_mul.8
     movup.3
     u32wrapping_add3
 
-    loc_load.4
+    loc_load.16
     u32overflowing_mul.2147483655
     swap
     movup.3
@@ -2266,10 +2266,10 @@ export.mont_mul.8
     u32wrapping_add3
     swap
     push.0.0.0.0
-    loc_loadw.6
+    loc_loadw.24
     movup.3
     drop
-    loc_storew.6
+    loc_storew.24
     dropw
 
     movup.2
@@ -2285,7 +2285,7 @@ export.mont_mul.8
     movup.3
     u32wrapping_add3
 
-    loc_load.4
+    loc_load.16
     u32overflowing_mul.2147483645
     swap
     movup.3
@@ -2302,7 +2302,7 @@ export.mont_mul.8
     push.0.0
     movup.3
     movup.3
-    loc_storew.7
+    loc_storew.28
     dropw
     drop
 
@@ -2312,15 +2312,15 @@ export.mont_mul.8
     loc_load.0
     u32wrapping_mul
     push.0.0.0.0
-    loc_loadw.5
+    loc_loadw.20
     dup
     movup.5
     u32wrapping_add
     u32wrapping_mul.91978719
-    loc_store.4
+    loc_store.16
 
     push.0.0.0.0
-    loc_loadw.1
+    loc_loadw.4
     dup.8
     u32overflowing_mul
     swap
@@ -2329,7 +2329,7 @@ export.mont_mul.8
     movup.2
     u32wrapping_add
 
-    loc_load.4
+    loc_load.16
     u32overflowing_mul.2492202977
     swap
     movup.3
@@ -2352,7 +2352,7 @@ export.mont_mul.8
     movup.3
     u32wrapping_add3
 
-    loc_load.4
+    loc_load.16
     u32overflowing_mul.3893352854
     swap
     movup.3
@@ -2366,7 +2366,7 @@ export.mont_mul.8
     swap
     push.0.0.0
     movup.3
-    loc_storew.5
+    loc_storew.20
     dropw
 
     movup.2
@@ -2382,7 +2382,7 @@ export.mont_mul.8
     movup.3
     u32wrapping_add3
 
-    loc_load.4
+    loc_load.16
     u32overflowing_mul.3609501852
     swap
     movup.3
@@ -2395,12 +2395,12 @@ export.mont_mul.8
     u32wrapping_add3
     swap
     push.0.0.0.0
-    loc_loadw.5
+    loc_loadw.20
     swap
     drop
     movup.3
     swap
-    loc_storew.5
+    loc_storew.20
     dropw
 
     movup.2
@@ -2416,7 +2416,7 @@ export.mont_mul.8
     movup.3
     u32wrapping_add3
 
-    loc_load.4
+    loc_load.16
     u32overflowing_mul.3901250617
     swap
     movup.3
@@ -2429,18 +2429,18 @@ export.mont_mul.8
     u32wrapping_add3
     swap
     push.0.0.0.0
-    loc_loadw.5
+    loc_loadw.20
     movup.2
     drop
     movup.3
     movdn.2
-    loc_storew.5
+    loc_storew.20
     dropw
 
     push.0.0.0.0.0.0.0.0
-    loc_loadw.6
+    loc_loadw.24
     swapw
-    loc_loadw.2
+    loc_loadw.8
     movup.9
     movup.9
 
@@ -2457,7 +2457,7 @@ export.mont_mul.8
     movup.3
     u32wrapping_add3
 
-    loc_load.4
+    loc_load.16
     u32overflowing_mul.3484943929
     swap
     movup.3
@@ -2470,10 +2470,10 @@ export.mont_mul.8
     u32wrapping_add3
     swap
     push.0.0.0.0
-    loc_loadw.5
+    loc_loadw.20
     movup.3
     drop
-    loc_storew.5
+    loc_storew.20
     dropw
 
     movup.2
@@ -2489,7 +2489,7 @@ export.mont_mul.8
     movup.3
     u32wrapping_add3
 
-    loc_load.4
+    loc_load.16
     u32overflowing_mul.2147483622
     swap
     movup.3
@@ -2503,7 +2503,7 @@ export.mont_mul.8
     swap
     push.0.0.0
     movup.3
-    loc_storew.6
+    loc_storew.24
     dropw
 
     movup.2
@@ -2519,7 +2519,7 @@ export.mont_mul.8
     movup.3
     u32wrapping_add3
 
-    loc_load.4
+    loc_load.16
     u32overflowing_mul.22
     swap
     movup.3
@@ -2532,12 +2532,12 @@ export.mont_mul.8
     u32wrapping_add3
     swap
     push.0.0.0.0
-    loc_loadw.6
+    loc_loadw.24
     swap
     drop
     movup.3
     swap
-    loc_storew.6
+    loc_storew.24
     dropw
 
     movup.2
@@ -2553,7 +2553,7 @@ export.mont_mul.8
     movup.3
     u32wrapping_add3
 
-    loc_load.4
+    loc_load.16
     u32overflowing_mul.2147483633
     swap
     movup.3
@@ -2566,18 +2566,18 @@ export.mont_mul.8
     u32wrapping_add3
     swap
     push.0.0.0.0
-    loc_loadw.6
+    loc_loadw.24
     movup.2
     drop
     movup.3
     movdn.2
-    loc_storew.6
+    loc_storew.24
     dropw
 
     push.0.0.0.0.0.0.0.0
-    loc_loadw.7
+    loc_loadw.28
     swapw
-    loc_loadw.3
+    loc_loadw.12
     movup.3
     movup.3
     drop
@@ -2602,7 +2602,7 @@ export.mont_mul.8
     movup.3
     u32wrapping_add3
 
-    loc_load.4
+    loc_load.16
     u32overflowing_mul.2147483655
     swap
     movup.3
@@ -2615,10 +2615,10 @@ export.mont_mul.8
     u32wrapping_add3
     swap
     push.0.0.0.0
-    loc_loadw.6
+    loc_loadw.24
     movup.3
     drop
-    loc_storew.6
+    loc_storew.24
     dropw
 
     movup.2
@@ -2634,7 +2634,7 @@ export.mont_mul.8
     movup.3
     u32wrapping_add3
 
-    loc_load.4
+    loc_load.16
     u32overflowing_mul.2147483645
     swap
     movup.3
@@ -2651,7 +2651,7 @@ export.mont_mul.8
     push.0.0
     movup.3
     movup.3
-    loc_storew.7
+    loc_storew.28
     dropw
     drop
 
@@ -2661,15 +2661,15 @@ export.mont_mul.8
     loc_load.0
     u32wrapping_mul
     push.0.0.0.0
-    loc_loadw.5
+    loc_loadw.20
     dup
     movup.5
     u32wrapping_add
     u32wrapping_mul.91978719
-    loc_store.4
+    loc_store.16
 
     push.0.0.0.0
-    loc_loadw.1
+    loc_loadw.4
     dup.8
     u32overflowing_mul
     swap
@@ -2678,7 +2678,7 @@ export.mont_mul.8
     movup.2
     u32wrapping_add
 
-    loc_load.4
+    loc_load.16
     u32overflowing_mul.2492202977
     swap
     movup.3
@@ -2701,7 +2701,7 @@ export.mont_mul.8
     movup.3
     u32wrapping_add3
 
-    loc_load.4
+    loc_load.16
     u32overflowing_mul.3893352854
     swap
     movup.3
@@ -2715,7 +2715,7 @@ export.mont_mul.8
     swap
     push.0.0.0
     movup.3
-    loc_storew.5
+    loc_storew.20
     dropw
 
     movup.2
@@ -2731,7 +2731,7 @@ export.mont_mul.8
     movup.3
     u32wrapping_add3
 
-    loc_load.4
+    loc_load.16
     u32overflowing_mul.3609501852
     swap
     movup.3
@@ -2744,12 +2744,12 @@ export.mont_mul.8
     u32wrapping_add3
     swap
     push.0.0.0.0
-    loc_loadw.5
+    loc_loadw.20
     swap
     drop
     movup.3
     swap
-    loc_storew.5
+    loc_storew.20
     dropw
 
     movup.2
@@ -2765,7 +2765,7 @@ export.mont_mul.8
     movup.3
     u32wrapping_add3
 
-    loc_load.4
+    loc_load.16
     u32overflowing_mul.3901250617
     swap
     movup.3
@@ -2778,18 +2778,18 @@ export.mont_mul.8
     u32wrapping_add3
     swap
     push.0.0.0.0
-    loc_loadw.5
+    loc_loadw.20
     movup.2
     drop
     movup.3
     movdn.2
-    loc_storew.5
+    loc_storew.20
     dropw
 
     push.0.0.0.0.0.0.0.0
-    loc_loadw.6
+    loc_loadw.24
     swapw
-    loc_loadw.2
+    loc_loadw.8
     movup.9
     movup.9
 
@@ -2806,7 +2806,7 @@ export.mont_mul.8
     movup.3
     u32wrapping_add3
 
-    loc_load.4
+    loc_load.16
     u32overflowing_mul.3484943929
     swap
     movup.3
@@ -2819,10 +2819,10 @@ export.mont_mul.8
     u32wrapping_add3
     swap
     push.0.0.0.0
-    loc_loadw.5
+    loc_loadw.20
     movup.3
     drop
-    loc_storew.5
+    loc_storew.20
     dropw
 
     movup.2
@@ -2838,7 +2838,7 @@ export.mont_mul.8
     movup.3
     u32wrapping_add3
 
-    loc_load.4
+    loc_load.16
     u32overflowing_mul.2147483622
     swap
     movup.3
@@ -2852,7 +2852,7 @@ export.mont_mul.8
     swap
     push.0.0.0
     movup.3
-    loc_storew.6
+    loc_storew.24
     dropw
 
     movup.2
@@ -2868,7 +2868,7 @@ export.mont_mul.8
     movup.3
     u32wrapping_add3
 
-    loc_load.4
+    loc_load.16
     u32overflowing_mul.22
     swap
     movup.3
@@ -2881,12 +2881,12 @@ export.mont_mul.8
     u32wrapping_add3
     swap
     push.0.0.0.0
-    loc_loadw.6
+    loc_loadw.24
     swap
     drop
     movup.3
     swap
-    loc_storew.6
+    loc_storew.24
     dropw
 
     movup.2
@@ -2902,7 +2902,7 @@ export.mont_mul.8
     movup.3
     u32wrapping_add3
 
-    loc_load.4
+    loc_load.16
     u32overflowing_mul.2147483633
     swap
     movup.3
@@ -2915,18 +2915,18 @@ export.mont_mul.8
     u32wrapping_add3
     swap
     push.0.0.0.0
-    loc_loadw.6
+    loc_loadw.24
     movup.2
     drop
     movup.3
     movdn.2
-    loc_storew.6
+    loc_storew.24
     dropw
 
     push.0.0.0.0.0.0.0.0
-    loc_loadw.7
+    loc_loadw.28
     swapw
-    loc_loadw.3
+    loc_loadw.12
     movup.3
     movup.3
     drop
@@ -2951,7 +2951,7 @@ export.mont_mul.8
     movup.3
     u32wrapping_add3
 
-    loc_load.4
+    loc_load.16
     u32overflowing_mul.2147483655
     swap
     movup.3
@@ -2964,10 +2964,10 @@ export.mont_mul.8
     u32wrapping_add3
     swap
     push.0.0.0.0
-    loc_loadw.6
+    loc_loadw.24
     movup.3
     drop
-    loc_storew.6
+    loc_storew.24
     dropw
 
     movup.2
@@ -2983,7 +2983,7 @@ export.mont_mul.8
     movup.3
     u32wrapping_add3
 
-    loc_load.4
+    loc_load.16
     u32overflowing_mul.2147483645
     swap
     movup.3
@@ -3000,7 +3000,7 @@ export.mont_mul.8
     push.0.0
     movup.3
     movup.3
-    loc_storew.7
+    loc_storew.28
     dropw
     drop
 
@@ -3010,15 +3010,15 @@ export.mont_mul.8
     loc_load.0
     u32wrapping_mul
     push.0.0.0.0
-    loc_loadw.5
+    loc_loadw.20
     dup
     movup.5
     u32wrapping_add
     u32wrapping_mul.91978719
-    loc_store.4
+    loc_store.16
 
     push.0.0.0.0
-    loc_loadw.1
+    loc_loadw.4
     dup.8
     u32overflowing_mul
     swap
@@ -3027,7 +3027,7 @@ export.mont_mul.8
     movup.2
     u32wrapping_add
 
-    loc_load.4
+    loc_load.16
     u32overflowing_mul.2492202977
     swap
     movup.3
@@ -3050,7 +3050,7 @@ export.mont_mul.8
     movup.3
     u32wrapping_add3
 
-    loc_load.4
+    loc_load.16
     u32overflowing_mul.3893352854
     swap
     movup.3
@@ -3064,7 +3064,7 @@ export.mont_mul.8
     swap
     push.0.0.0
     movup.3
-    loc_storew.5
+    loc_storew.20
     dropw
 
     movup.2
@@ -3080,7 +3080,7 @@ export.mont_mul.8
     movup.3
     u32wrapping_add3
 
-    loc_load.4
+    loc_load.16
     u32overflowing_mul.3609501852
     swap
     movup.3
@@ -3093,12 +3093,12 @@ export.mont_mul.8
     u32wrapping_add3
     swap
     push.0.0.0.0
-    loc_loadw.5
+    loc_loadw.20
     swap
     drop
     movup.3
     swap
-    loc_storew.5
+    loc_storew.20
     dropw
 
     movup.2
@@ -3114,7 +3114,7 @@ export.mont_mul.8
     movup.3
     u32wrapping_add3
 
-    loc_load.4
+    loc_load.16
     u32overflowing_mul.3901250617
     swap
     movup.3
@@ -3127,18 +3127,18 @@ export.mont_mul.8
     u32wrapping_add3
     swap
     push.0.0.0.0
-    loc_loadw.5
+    loc_loadw.20
     movup.2
     drop
     movup.3
     movdn.2
-    loc_storew.5
+    loc_storew.20
     dropw
 
     push.0.0.0.0.0.0.0.0
-    loc_loadw.6
+    loc_loadw.24
     swapw
-    loc_loadw.2
+    loc_loadw.8
     movup.9
     movup.9
 
@@ -3155,7 +3155,7 @@ export.mont_mul.8
     movup.3
     u32wrapping_add3
 
-    loc_load.4
+    loc_load.16
     u32overflowing_mul.3484943929
     swap
     movup.3
@@ -3168,10 +3168,10 @@ export.mont_mul.8
     u32wrapping_add3
     swap
     push.0.0.0.0
-    loc_loadw.5
+    loc_loadw.20
     movup.3
     drop
-    loc_storew.5
+    loc_storew.20
     dropw
 
     movup.2
@@ -3187,7 +3187,7 @@ export.mont_mul.8
     movup.3
     u32wrapping_add3
 
-    loc_load.4
+    loc_load.16
     u32overflowing_mul.2147483622
     swap
     movup.3
@@ -3201,7 +3201,7 @@ export.mont_mul.8
     swap
     push.0.0.0
     movup.3
-    loc_storew.6
+    loc_storew.24
     dropw
 
     movup.2
@@ -3217,7 +3217,7 @@ export.mont_mul.8
     movup.3
     u32wrapping_add3
 
-    loc_load.4
+    loc_load.16
     u32overflowing_mul.22
     swap
     movup.3
@@ -3230,12 +3230,12 @@ export.mont_mul.8
     u32wrapping_add3
     swap
     push.0.0.0.0
-    loc_loadw.6
+    loc_loadw.24
     swap
     drop
     movup.3
     swap
-    loc_storew.6
+    loc_storew.24
     dropw
 
     movup.2
@@ -3251,7 +3251,7 @@ export.mont_mul.8
     movup.3
     u32wrapping_add3
 
-    loc_load.4
+    loc_load.16
     u32overflowing_mul.2147483633
     swap
     movup.3
@@ -3264,18 +3264,18 @@ export.mont_mul.8
     u32wrapping_add3
     swap
     push.0.0.0.0
-    loc_loadw.6
+    loc_loadw.24
     movup.2
     drop
     movup.3
     movdn.2
-    loc_storew.6
+    loc_storew.24
     dropw
 
     push.0.0.0.0.0.0.0.0
-    loc_loadw.7
+    loc_loadw.28
     swapw
-    loc_loadw.3
+    loc_loadw.12
     movup.3
     movup.3
     drop
@@ -3300,7 +3300,7 @@ export.mont_mul.8
     movup.3
     u32wrapping_add3
 
-    loc_load.4
+    loc_load.16
     u32overflowing_mul.2147483655
     swap
     movup.3
@@ -3313,10 +3313,10 @@ export.mont_mul.8
     u32wrapping_add3
     swap
     push.0.0.0.0
-    loc_loadw.6
+    loc_loadw.24
     movup.3
     drop
-    loc_storew.6
+    loc_storew.24
     dropw
 
     movup.2
@@ -3332,7 +3332,7 @@ export.mont_mul.8
     movup.3
     u32wrapping_add3
 
-    loc_load.4
+    loc_load.16
     u32overflowing_mul.2147483645
     swap
     movup.3
@@ -3349,7 +3349,7 @@ export.mont_mul.8
     push.0.0
     movup.3
     movup.3
-    loc_storew.7
+    loc_storew.28
     dropw
     drop
 
@@ -3359,15 +3359,15 @@ export.mont_mul.8
     loc_load.0
     u32wrapping_mul
     push.0.0.0.0
-    loc_loadw.5
+    loc_loadw.20
     dup
     movup.5
     u32wrapping_add
     u32wrapping_mul.91978719
-    loc_store.4
+    loc_store.16
 
     push.0.0.0.0
-    loc_loadw.1
+    loc_loadw.4
     dup.8
     u32overflowing_mul
     swap
@@ -3376,7 +3376,7 @@ export.mont_mul.8
     movup.2
     u32wrapping_add
 
-    loc_load.4
+    loc_load.16
     u32overflowing_mul.2492202977
     swap
     movup.3
@@ -3399,7 +3399,7 @@ export.mont_mul.8
     movup.3
     u32wrapping_add3
 
-    loc_load.4
+    loc_load.16
     u32overflowing_mul.3893352854
     swap
     movup.3
@@ -3413,7 +3413,7 @@ export.mont_mul.8
     swap
     push.0.0.0
     movup.3
-    loc_storew.5
+    loc_storew.20
     dropw
 
     movup.2
@@ -3429,7 +3429,7 @@ export.mont_mul.8
     movup.3
     u32wrapping_add3
 
-    loc_load.4
+    loc_load.16
     u32overflowing_mul.3609501852
     swap
     movup.3
@@ -3442,12 +3442,12 @@ export.mont_mul.8
     u32wrapping_add3
     swap
     push.0.0.0.0
-    loc_loadw.5
+    loc_loadw.20
     swap
     drop
     movup.3
     swap
-    loc_storew.5
+    loc_storew.20
     dropw
 
     movup.2
@@ -3463,7 +3463,7 @@ export.mont_mul.8
     movup.3
     u32wrapping_add3
 
-    loc_load.4
+    loc_load.16
     u32overflowing_mul.3901250617
     swap
     movup.3
@@ -3476,18 +3476,18 @@ export.mont_mul.8
     u32wrapping_add3
     swap
     push.0.0.0.0
-    loc_loadw.5
+    loc_loadw.20
     movup.2
     drop
     movup.3
     movdn.2
-    loc_storew.5
+    loc_storew.20
     dropw
 
     push.0.0.0.0.0.0.0.0
-    loc_loadw.6
+    loc_loadw.24
     swapw
-    loc_loadw.2
+    loc_loadw.8
     movup.9
     movup.9
 
@@ -3504,7 +3504,7 @@ export.mont_mul.8
     movup.3
     u32wrapping_add3
 
-    loc_load.4
+    loc_load.16
     u32overflowing_mul.3484943929
     swap
     movup.3
@@ -3517,10 +3517,10 @@ export.mont_mul.8
     u32wrapping_add3
     swap
     push.0.0.0.0
-    loc_loadw.5
+    loc_loadw.20
     movup.3
     drop
-    loc_storew.5
+    loc_storew.20
     dropw
 
     movup.2
@@ -3536,7 +3536,7 @@ export.mont_mul.8
     movup.3
     u32wrapping_add3
 
-    loc_load.4
+    loc_load.16
     u32overflowing_mul.2147483622
     swap
     movup.3
@@ -3550,7 +3550,7 @@ export.mont_mul.8
     swap
     push.0.0.0
     movup.3
-    loc_storew.6
+    loc_storew.24
     dropw
 
     movup.2
@@ -3566,7 +3566,7 @@ export.mont_mul.8
     movup.3
     u32wrapping_add3
 
-    loc_load.4
+    loc_load.16
     u32overflowing_mul.22
     swap
     movup.3
@@ -3579,12 +3579,12 @@ export.mont_mul.8
     u32wrapping_add3
     swap
     push.0.0.0.0
-    loc_loadw.6
+    loc_loadw.24
     swap
     drop
     movup.3
     swap
-    loc_storew.6
+    loc_storew.24
     dropw
 
     movup.2
@@ -3600,7 +3600,7 @@ export.mont_mul.8
     movup.3
     u32wrapping_add3
 
-    loc_load.4
+    loc_load.16
     u32overflowing_mul.2147483633
     swap
     movup.3
@@ -3613,18 +3613,18 @@ export.mont_mul.8
     u32wrapping_add3
     swap
     push.0.0.0.0
-    loc_loadw.6
+    loc_loadw.24
     movup.2
     drop
     movup.3
     movdn.2
-    loc_storew.6
+    loc_storew.24
     dropw
 
     push.0.0.0.0.0.0.0.0
-    loc_loadw.7
+    loc_loadw.28
     swapw
-    loc_loadw.3
+    loc_loadw.12
     movup.3
     movup.3
     drop
@@ -3649,7 +3649,7 @@ export.mont_mul.8
     movup.3
     u32wrapping_add3
 
-    loc_load.4
+    loc_load.16
     u32overflowing_mul.2147483655
     swap
     movup.3
@@ -3662,10 +3662,10 @@ export.mont_mul.8
     u32wrapping_add3
     swap
     push.0.0.0.0
-    loc_loadw.6
+    loc_loadw.24
     movup.3
     drop
-    loc_storew.6
+    loc_storew.24
     dropw
 
     movup.2
@@ -3681,7 +3681,7 @@ export.mont_mul.8
     movup.3
     u32wrapping_add3
 
-    loc_load.4
+    loc_load.16
     u32overflowing_mul.2147483645
     swap
     movup.3
@@ -3698,7 +3698,7 @@ export.mont_mul.8
     push.0.0
     movup.3
     movup.3
-    loc_storew.7
+    loc_storew.28
     dropw
     drop
 
@@ -3706,11 +3706,11 @@ export.mont_mul.8
     #
     # this will be used when executing https://github.com/itzmeanjan/miden/blob/6a611e693601577864da3e43e745525b83c0030d/miden/tests/integration/stdlib/math/ext5_scalar.rs#L131
     push.0.0.0.0.0.0.0.0.0.0.0.0
-    loc_loadw.7
+    loc_loadw.28
     swapw
-    loc_loadw.6
+    loc_loadw.24
     movupw.2
-    loc_loadw.5
+    loc_loadw.20
     movup.11
     movup.11
     drop
@@ -3723,11 +3723,11 @@ export.mont_mul.8
     #
     # this will be used when executing https://github.com/itzmeanjan/miden/blob/6a611e693601577864da3e43e745525b83c0030d/miden/tests/integration/stdlib/math/ext5_scalar.rs#L130
     push.0.0.0.0.0.0.0.0.0.0.0.0
-    loc_loadw.7
+    loc_loadw.28
     swapw
-    loc_loadw.6
+    loc_loadw.24
     movupw.2
-    loc_loadw.5
+    loc_loadw.20
     movup.11
     movup.11
     drop
@@ -3803,28 +3803,28 @@ end
 #! Note, if input operand is 0, then multiplicative inverse can't be computed, which is why output result is also 0.
 #!
 #! Adapted from equivalent Rust implementation https://github.com/itzmeanjan/miden/blob/6a611e693601577864da3e43e745525b83c0030d/miden/tests/integration/stdlib/math/ext5_scalar.rs#L162-L176
-export.inv.6
+export.inv.24
     # cache result initial value 1 ( in Montgomery form )
     push.0.0.4.4294967281.29.4294967251.50.1620046732.787433356.1370930886.803228882.3605528638
     loc_storew.0
     dropw
-    loc_storew.1
+    loc_storew.4
     dropw
-    loc_storew.2
+    loc_storew.8
     dropw
 
     # compute Montgomery form of base and cache it
     #
     # note, base ( i.e. input operand ) is expected in radix-2^32 form
     exec.to_mont
-    loc_storew.3
+    loc_storew.12
     dropw
-    loc_storew.4
+    loc_storew.16
     dropw
     push.0.0
     movup.3
     movup.3
-    loc_storew.5
+    loc_storew.20
     dropw
 
     push.2492202975.3893352854.3609501852.3901250617.3484943929.2147483622.22.2147483633.2147483655.2147483645
@@ -3833,9 +3833,9 @@ export.inv.6
         repeat.32
             # bring res back to stack
             push.0.0.0.0.0.0.0.0.0.0.0.0
-            loc_loadw.2
+            loc_loadw.8
             swapw
-            loc_loadw.1
+            loc_loadw.4
             movupw.2
             loc_loadw.0
             movup.11
@@ -3848,12 +3848,12 @@ export.inv.6
             # write res back to memory
             loc_storew.0
             dropw
-            loc_storew.1
+            loc_storew.4
             dropw
             push.0.0
             movup.3
             movup.3
-            loc_storew.2
+            loc_storew.8
             dropw
 
             dup
@@ -3861,11 +3861,11 @@ export.inv.6
             if.true
                 # bring base back to stack
                 push.0.0.0.0.0.0.0.0.0.0.0.0
-                loc_loadw.5
+                loc_loadw.20
                 swapw
-                loc_loadw.4
+                loc_loadw.16
                 movupw.2
-                loc_loadw.3
+                loc_loadw.12
                 movup.11
                 movup.11
                 drop
@@ -3873,9 +3873,9 @@ export.inv.6
 
                 # bring res back to stack
                 push.0.0.0.0.0.0.0.0.0.0.0.0
-                loc_loadw.2
+                loc_loadw.8
                 swapw
-                loc_loadw.1
+                loc_loadw.4
                 movupw.2
                 loc_loadw.0
                 movup.11
@@ -3888,12 +3888,12 @@ export.inv.6
                 # write res back to memory
                 loc_storew.0
                 dropw
-                loc_storew.1
+                loc_storew.4
                 dropw
                 push.0.0
                 movup.3
                 movup.3
-                loc_storew.2
+                loc_storew.8
                 dropw
             end
 
@@ -3905,9 +3905,9 @@ export.inv.6
 
     # bring res back to stack
     push.0.0.0.0.0.0.0.0.0.0.0.0
-    loc_loadw.2
+    loc_loadw.8
     swapw
-    loc_loadw.1
+    loc_loadw.4
     movupw.2
     loc_loadw.0
     movup.11
diff --git a/stdlib/asm/math/secp256k1/base_field.masm b/stdlib/asm/math/secp256k1/base_field.masm
index 66b2d7f76f..524da5895c 100644
--- a/stdlib/asm/math/secp256k1/base_field.masm
+++ b/stdlib/asm/math/secp256k1/base_field.masm
@@ -250,10 +250,10 @@ end
 #! while computed c[0..8] will also be in Montgomery form.
 #!
 #! See https://github.com/itzmeanjan/secp256k1/blob/6e5e654823a073add7d62b21ed88e9de9bb06869/field/base_field_utils.py#L101-L222
-export.mul.2
+export.mul.8
   loc_storew.0
   swapw
-  loc_storew.1
+  loc_storew.4
   swapw
 
   exec.u256xu32
@@ -274,7 +274,7 @@ export.mul.2
 
   movup.9
   push.0.0.0.0
-  loc_loadw.1
+  loc_loadw.4
   push.0.0.0.0
   loc_loadw.0
 
@@ -284,7 +284,7 @@ export.mul.2
 
   movup.9
   push.0.0.0.0
-  loc_loadw.1
+  loc_loadw.4
   push.0.0.0.0
   loc_loadw.0
 
@@ -294,7 +294,7 @@ export.mul.2
 
   movup.9
   push.0.0.0.0
-  loc_loadw.1
+  loc_loadw.4
   push.0.0.0.0
   loc_loadw.0
 
@@ -304,7 +304,7 @@ export.mul.2
 
   movup.9
   push.0.0.0.0
-  loc_loadw.1
+  loc_loadw.4
   push.0.0.0.0
   loc_loadw.0
 
@@ -314,7 +314,7 @@ export.mul.2
 
   movup.9
   push.0.0.0.0
-  loc_loadw.1
+  loc_loadw.4
   push.0.0.0.0
   loc_loadw.0
 
@@ -324,7 +324,7 @@ export.mul.2
 
   movup.9
   push.0.0.0.0
-  loc_loadw.1
+  loc_loadw.4
   push.0.0.0.0
   loc_loadw.0
 
@@ -334,7 +334,7 @@ export.mul.2
 
   movup.9
   push.0.0.0.0
-  loc_loadw.1
+  loc_loadw.4
   push.0.0.0.0
   loc_loadw.0
 
@@ -577,18 +577,18 @@ end
 #! inverse can't be computed, which is why output result is also 0.
 #!
 #! See https://github.com/itzmeanjan/secp256k1/blob/37b339db3e03d24c2977399eb8896ef515ebb09b/field/base_field.py#L114-L132
-export.inv.4
+export.inv.16
   # cache result initial value ( = 1, in Montgomery form )
   push.0.0.0.0.0.0.1.977
   loc_storew.0
   dropw
-  loc_storew.1
+  loc_storew.4
   dropw
 
   # cache base
-  loc_storew.2
+  loc_storew.8
   dropw
-  loc_storew.3
+  loc_storew.12
   dropw
 
   push.4294966317.4294967294.4294967295.4294967295.4294967295.4294967295.4294967295.4294967295
@@ -596,7 +596,7 @@ export.inv.4
   repeat.8
     repeat.32
       push.0.0.0.0.0.0.0.0
-      loc_loadw.1
+      loc_loadw.4
       swapw
       loc_loadw.0
 
@@ -604,7 +604,7 @@ export.inv.4
 
       loc_storew.0
       dropw
-      loc_storew.1
+      loc_storew.4
       dropw
 
       dup
@@ -612,13 +612,13 @@ export.inv.4
       if.true
         push.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0
 
-        loc_loadw.3
+        loc_loadw.12
         swapw
-        loc_loadw.2
+        loc_loadw.8
 
         swapdw
 
-        loc_loadw.1
+        loc_loadw.4
         swapw
         loc_loadw.0
 
@@ -626,7 +626,7 @@ export.inv.4
 
         loc_storew.0
         dropw
-        loc_storew.1
+        loc_storew.4
         dropw
       end
 
@@ -637,7 +637,7 @@ export.inv.4
   end
 
   push.0.0.0.0.0.0.0.0
-  loc_loadw.1
+  loc_loadw.4
   swapw
   loc_loadw.0
 end
diff --git a/stdlib/asm/math/secp256k1/group.masm b/stdlib/asm/math/secp256k1/group.masm
index d92994f713..cc94611465 100644
--- a/stdlib/asm/math/secp256k1/group.masm
+++ b/stdlib/asm/math/secp256k1/group.masm
@@ -35,7 +35,7 @@ use.std::math::secp256k1::base_field
 #! Stack at end of execution of routine looks like
 #!
 #!   [x3_addr[0..4], x3_addr[4..8], y3_addr[0..4], y3_addr[4..8], z3_addr[0..4], z3_addr[4..8]]
-export.double.12
+export.double.48
   dup.3
   push.0.0.0.0
   movup.4
@@ -52,7 +52,7 @@ export.double.12
 
   loc_storew.0
   swapw
-  loc_storew.1
+  loc_storew.4
   swapw             # cache t0
 
   dupw.1
@@ -70,9 +70,9 @@ export.double.12
 
   exec.base_field::add # = z3
 
-  loc_storew.2
+  loc_storew.8
   dropw
-  loc_storew.3
+  loc_storew.12
   dropw             # cache z3
 
   dup.5
@@ -95,9 +95,9 @@ export.double.12
 
   exec.base_field::mul # = t1
 
-  loc_storew.4
+  loc_storew.16
   dropw
-  loc_storew.5
+  loc_storew.20
   dropw             # cache t1
 
   dup.5
@@ -119,61 +119,61 @@ export.double.12
 
   exec.base_field::mul # = t2
 
-  loc_storew.6
+  loc_storew.24
   swapw
-  loc_storew.7    # cache t2
+  loc_storew.28    # cache t2
   swapw
 
   push.0.0.0.0
-  loc_loadw.3
+  loc_loadw.12
   push.0.0.0.0
-  loc_loadw.2     # = z3
+  loc_loadw.8     # = z3
 
   exec.base_field::mul # = x3
 
-  loc_storew.8
+  loc_storew.32
   dropw
-  loc_storew.9
+  loc_storew.36
   dropw             # cache x3
 
   push.0.0.0.0
-  loc_loadw.7
+  loc_loadw.28
   push.0.0.0.0
-  loc_loadw.6     # = t2
+  loc_loadw.24     # = t2
 
   push.0.0.0.0
-  loc_loadw.1
+  loc_loadw.4
   push.0.0.0.0
   loc_loadw.0     # = t0
 
   exec.base_field::add # = y3
 
-  loc_storew.10
+  loc_storew.40
   dropw
-  loc_storew.11
+  loc_storew.44
   dropw           # cache y3
 
   push.0.0.0.0
-  loc_loadw.5
+  loc_loadw.20
   push.0.0.0.0
-  loc_loadw.4     # = t1
+  loc_loadw.16     # = t1
 
   push.0.0.0.0
-  loc_loadw.3
+  loc_loadw.12
   push.0.0.0.0
-  loc_loadw.2     # = z3
+  loc_loadw.8     # = z3
 
   exec.base_field::mul # = z3
 
-  loc_storew.2
+  loc_storew.8
   dropw
-  loc_storew.3
+  loc_storew.12
   dropw             # cache z3
 
   push.0.0.0.0
-  loc_loadw.7
+  loc_loadw.28
   push.0.0.0.0
-  loc_loadw.6     # = t2
+  loc_loadw.24     # = t2
 
   dupw.1
   dupw.1            # repeated t2
@@ -181,14 +181,14 @@ export.double.12
   exec.base_field::add # = t1
 
   push.0.0.0.0
-  loc_loadw.7
+  loc_loadw.28
   push.0.0.0.0
-  loc_loadw.6     # = t2
+  loc_loadw.24     # = t2
 
   exec.base_field::add # = t2
 
   push.0.0.0.0
-  loc_loadw.1
+  loc_loadw.4
   push.0.0.0.0
   loc_loadw.0     # = t0
 
@@ -196,26 +196,26 @@ export.double.12
 
   loc_storew.0
   swapw
-  loc_storew.1
+  loc_storew.4
   swapw             # cache t0
 
   push.0.0.0.0
-  loc_loadw.11
+  loc_loadw.44
   push.0.0.0.0
-  loc_loadw.10    # = y3
+  loc_loadw.40    # = y3
 
   exec.base_field::mul # = y3
 
   push.0.0.0.0
-  loc_loadw.9
+  loc_loadw.36
   push.0.0.0.0
-  loc_loadw.8     # = x3
+  loc_loadw.32     # = x3
 
   exec.base_field::add # = y3
 
-  loc_storew.10
+  loc_storew.40
   dropw
-  loc_storew.11
+  loc_storew.44
   dropw            # cache y3
 
   dup.3
@@ -239,7 +239,7 @@ export.double.12
   exec.base_field::mul # = t1
 
   push.0.0.0.0
-  loc_loadw.1
+  loc_loadw.4
   push.0.0.0.0
   loc_loadw.0     # = t0
 
@@ -250,9 +250,9 @@ export.double.12
 
   exec.base_field::add # = x3
 
-  loc_storew.8
+  loc_storew.32
   dropw
-  loc_storew.9
+  loc_storew.36
   dropw             # cache x3
 
   dropw
@@ -261,42 +261,42 @@ export.double.12
 
   dup
   push.0.0.0.0
-  loc_loadw.8
+  loc_loadw.32
   movup.4
   mem_storew
   dropw              # write x3[0..4] to memory
 
   dup.1
   push.0.0.0.0
-  loc_loadw.9
+  loc_loadw.36
   movup.4
   mem_storew
   dropw              # write x3[4..8] to memory
 
   dup.2
   push.0.0.0.0
-  loc_loadw.10
+  loc_loadw.40
   movup.4
   mem_storew
   dropw              # write y3[0..4] to memory
 
   dup.3
   push.0.0.0.0
-  loc_loadw.11
+  loc_loadw.44
   movup.4
   mem_storew
   dropw              # write y3[4..8] to memory
 
   dup.4
   push.0.0.0.0
-  loc_loadw.2
+  loc_loadw.8
   movup.4
   mem_storew
   dropw              # write z3[0..4] to memory
 
   dup.5
   push.0.0.0.0
-  loc_loadw.3
+  loc_loadw.12
   movup.4
   mem_storew
   dropw              # write z3[4..8] to memory
@@ -333,7 +333,7 @@ end
 #! Stack at end of execution of routine looks like
 #!
 #!   [x3_addr[0..4], x3_addr[4..8], y3_addr[0..4], y3_addr[4..8], z3_addr[0..4], z3_addr[4..8]]
-export.add.16
+export.add.64
   dup.6
   dup.8
 
@@ -360,7 +360,7 @@ export.add.16
 
   loc_storew.0
   dropw
-  loc_storew.1
+  loc_storew.4
   dropw        # cache t0
 
   dup.8
@@ -387,9 +387,9 @@ export.add.16
 
   exec.base_field::mul # = t1
 
-  loc_storew.2
+  loc_storew.8
   dropw
-  loc_storew.3
+  loc_storew.12
   dropw        # cache t1
 
   dup.10
@@ -416,9 +416,9 @@ export.add.16
 
   exec.base_field::mul # = t2
 
-  loc_storew.4
+  loc_storew.16
   dropw
-  loc_storew.5
+  loc_storew.20
   dropw        # cache t2
 
   dup.2
@@ -445,9 +445,9 @@ export.add.16
 
   exec.base_field::add # = t3
 
-  loc_storew.6
+  loc_storew.24
   dropw
-  loc_storew.7
+  loc_storew.28
   dropw        # cache t3
 
   dup.8
@@ -476,39 +476,39 @@ export.add.16
   exec.base_field::add # = t4
 
   push.0.0.0.0
-  loc_loadw.7
+  loc_loadw.28
   push.0.0.0.0
-  loc_loadw.6 # t3 loaded back
+  loc_loadw.24 # t3 loaded back
 
   exec.base_field::mul # = t3
 
-  loc_storew.6
+  loc_storew.24
   dropw
-  loc_storew.7
+  loc_storew.28
   dropw        # cache t3
 
   push.0.0.0.0
-  loc_loadw.3
+  loc_loadw.12
   push.0.0.0.0
-  loc_loadw.2 # t1 loaded back
+  loc_loadw.8 # t1 loaded back
 
   push.0.0.0.0
-  loc_loadw.1
+  loc_loadw.4
   push.0.0.0.0
   loc_loadw.0 # t0 loaded back
 
   exec.base_field::add # = t4
 
   push.0.0.0.0
-  loc_loadw.7
+  loc_loadw.28
   push.0.0.0.0
-  loc_loadw.6 # t3 loaded back
+  loc_loadw.24 # t3 loaded back
 
   exec.base_field::sub # = t3
 
-  loc_storew.6
+  loc_storew.24
   dropw
-  loc_storew.7
+  loc_storew.28
   dropw        # cache t3
 
   dup.2
@@ -535,9 +535,9 @@ export.add.16
 
   exec.base_field::add # = t4
 
-  loc_storew.8
+  loc_storew.32
   dropw
-  loc_storew.9
+  loc_storew.36
   dropw        # cache t4
 
   dup.11
@@ -568,39 +568,39 @@ export.add.16
   exec.base_field::add # = x3
 
   push.0.0.0.0
-  loc_loadw.9
+  loc_loadw.36
   push.0.0.0.0
-  loc_loadw.8 # t4 loaded back
+  loc_loadw.32 # t4 loaded back
 
   exec.base_field::mul # = t4
 
-  loc_storew.8
+  loc_storew.32
   dropw
-  loc_storew.9
+  loc_storew.36
   dropw        # cache t4
 
   push.0.0.0.0
-  loc_loadw.5
+  loc_loadw.20
   push.0.0.0.0
-  loc_loadw.4 # t2 loaded back
+  loc_loadw.16 # t2 loaded back
 
   push.0.0.0.0
-  loc_loadw.3
+  loc_loadw.12
   push.0.0.0.0
-  loc_loadw.2 # t1 loaded back
+  loc_loadw.8 # t1 loaded back
 
   exec.base_field::add # = x3
 
   push.0.0.0.0
-  loc_loadw.9
+  loc_loadw.36
   push.0.0.0.0
-  loc_loadw.8 # t4 loaded back
+  loc_loadw.32 # t4 loaded back
 
   exec.base_field::sub # = t4
 
-  loc_storew.8
+  loc_storew.32
   dropw
-  loc_storew.9
+  loc_storew.36
   dropw        # cache t4
 
   dup.4
@@ -627,9 +627,9 @@ export.add.16
 
   exec.base_field::add # = x3
 
-  loc_storew.10
+  loc_storew.40
   dropw
-  loc_storew.11
+  loc_storew.44
   dropw       # cache x3
 
   dup.10
@@ -658,43 +658,43 @@ export.add.16
   exec.base_field::add # = y3
 
   push.0.0.0.0
-  loc_loadw.11
+  loc_loadw.44
   push.0.0.0.0
-  loc_loadw.10 # x3 loaded back
+  loc_loadw.40 # x3 loaded back
 
   exec.base_field::mul # = x3
 
-  loc_storew.10
+  loc_storew.40
   dropw
-  loc_storew.11
+  loc_storew.44
   dropw       # cache x3
 
   push.0.0.0.0
-  loc_loadw.5
+  loc_loadw.20
   push.0.0.0.0
-  loc_loadw.4 # t2 loaded back
+  loc_loadw.16 # t2 loaded back
 
   push.0.0.0.0
-  loc_loadw.1
+  loc_loadw.4
   push.0.0.0.0
   loc_loadw.0 # t0 loaded back
 
   exec.base_field::add # = y3
 
   push.0.0.0.0
-  loc_loadw.11
+  loc_loadw.44
   push.0.0.0.0
-  loc_loadw.10 # x3 loaded back
+  loc_loadw.40 # x3 loaded back
 
   exec.base_field::sub # = y3
 
-  loc_storew.12
+  loc_storew.48
   dropw
-  loc_storew.13
+  loc_storew.52
   dropw       # cache y3
 
   push.0.0.0.0
-  loc_loadw.1
+  loc_loadw.4
   push.0.0.0.0
   loc_loadw.0 # t0 loaded back
 
@@ -703,13 +703,13 @@ export.add.16
 
   exec.base_field::add # = x3
 
-  loc_storew.10
+  loc_storew.40
   swapw
-  loc_storew.11
+  loc_storew.44
   swapw # cache x3
 
   push.0.0.0.0
-  loc_loadw.1
+  loc_loadw.4
   push.0.0.0.0
   loc_loadw.0 # t0 loaded back
 
@@ -717,153 +717,153 @@ export.add.16
 
   loc_storew.0
   dropw
-  loc_storew.1
+  loc_storew.4
   dropw        # cache t0
 
   push.0.0.0.0
   push.0.0.21.20517 # b3 on stack top
 
   push.0.0.0.0
-  loc_loadw.5
+  loc_loadw.20
   push.0.0.0.0
-  loc_loadw.4 # t2 loaded back
+  loc_loadw.16 # t2 loaded back
 
   exec.base_field::mul # = t2
 
-  loc_storew.4
+  loc_storew.16
   swapw
-  loc_storew.5
+  loc_storew.20
   swapw # cache t2
 
   push.0.0.0.0
-  loc_loadw.3
+  loc_loadw.12
   push.0.0.0.0
-  loc_loadw.2 # t1 loaded back
+  loc_loadw.8 # t1 loaded back
 
   exec.base_field::add # = z3
 
-  loc_storew.14
+  loc_storew.56
   dropw
-  loc_storew.15
+  loc_storew.60
   dropw       # cache z3
 
   push.0.0.0.0
-  loc_loadw.5
+  loc_loadw.20
   push.0.0.0.0
-  loc_loadw.4 # t2 loaded back
+  loc_loadw.16 # t2 loaded back
 
   push.0.0.0.0
-  loc_loadw.3
+  loc_loadw.12
   push.0.0.0.0
-  loc_loadw.2 # t1 loaded back
+  loc_loadw.8 # t1 loaded back
 
   exec.base_field::sub # = t1
 
-  loc_storew.2
+  loc_storew.8
   dropw
-  loc_storew.3
+  loc_storew.12
   dropw        # cache t1
 
   push.0.0.0.0
   push.0.0.21.20517 # b3 on stack top
 
   push.0.0.0.0
-  loc_loadw.13
+  loc_loadw.52
   push.0.0.0.0
-  loc_loadw.12 # y3 loaded back
+  loc_loadw.48 # y3 loaded back
 
   exec.base_field::mul # = y3
 
-  loc_storew.12
+  loc_storew.48
   swapw
-  loc_storew.13
+  loc_storew.52
   swapw # cache y3
 
   push.0.0.0.0
-  loc_loadw.9
+  loc_loadw.36
   push.0.0.0.0
-  loc_loadw.8 # t4 loaded back
+  loc_loadw.32 # t4 loaded back
 
   exec.base_field::mul # = x3
 
-  loc_storew.10
+  loc_storew.40
   dropw
-  loc_storew.11
+  loc_storew.44
   dropw       # cache x3
 
   push.0.0.0.0
-  loc_loadw.3
+  loc_loadw.12
   push.0.0.0.0
-  loc_loadw.2 # t1 loaded back
+  loc_loadw.8 # t1 loaded back
 
   push.0.0.0.0
-  loc_loadw.7
+  loc_loadw.28
   push.0.0.0.0
-  loc_loadw.6 # t3 loaded back
+  loc_loadw.24 # t3 loaded back
 
   exec.base_field::mul # = t2
 
   push.0.0.0.0
-  loc_loadw.11
+  loc_loadw.44
   push.0.0.0.0
-  loc_loadw.10 # x3 loaded back
+  loc_loadw.40 # x3 loaded back
 
   exec.base_field::neg
   exec.base_field::add # = x3
 
-  loc_storew.10
+  loc_storew.40
   dropw
-  loc_storew.11
+  loc_storew.44
   dropw       # cache x3
 
   push.0.0.0.0
-  loc_loadw.1
+  loc_loadw.4
   push.0.0.0.0
   loc_loadw.0 # t0 loaded back
 
   push.0.0.0.0
-  loc_loadw.13
+  loc_loadw.52
   push.0.0.0.0
-  loc_loadw.12 # y3 loaded back
+  loc_loadw.48 # y3 loaded back
 
   exec.base_field::mul # = y3
 
-  loc_storew.12
+  loc_storew.48
   dropw
-  loc_storew.13
+  loc_storew.52
   dropw       # cache y3
 
   push.0.0.0.0
-  loc_loadw.15
+  loc_loadw.60
   push.0.0.0.0
-  loc_loadw.14 # z3 loaded back
+  loc_loadw.56 # z3 loaded back
 
   push.0.0.0.0
-  loc_loadw.3
+  loc_loadw.12
   push.0.0.0.0
-  loc_loadw.2 # t1 loaded back
+  loc_loadw.8 # t1 loaded back
 
   exec.base_field::mul # = t1
 
   push.0.0.0.0
-  loc_loadw.13
+  loc_loadw.52
   push.0.0.0.0
-  loc_loadw.12 # y3 loaded back
+  loc_loadw.48 # y3 loaded back
 
   exec.base_field::add # = y3
 
-  loc_storew.12
+  loc_storew.48
   dropw
-  loc_storew.13
+  loc_storew.52
   dropw       # cache y3
 
   push.0.0.0.0
-  loc_loadw.7
+  loc_loadw.28
   push.0.0.0.0
-  loc_loadw.6 # t3 loaded back
+  loc_loadw.24 # t3 loaded back
 
   push.0.0.0.0
-  loc_loadw.1
+  loc_loadw.4
   push.0.0.0.0
   loc_loadw.0 # t0 loaded back
 
@@ -871,31 +871,31 @@ export.add.16
 
   loc_storew.0
   dropw
-  loc_storew.1
+  loc_storew.4
   dropw        # cache t0
 
   push.0.0.0.0
-  loc_loadw.9
+  loc_loadw.36
   push.0.0.0.0
-  loc_loadw.8 # t4 loaded back
+  loc_loadw.32 # t4 loaded back
 
   push.0.0.0.0
-  loc_loadw.15
+  loc_loadw.60
   push.0.0.0.0
-  loc_loadw.14 # z3 loaded back
+  loc_loadw.56 # z3 loaded back
 
   exec.base_field::mul # = z3
 
   push.0.0.0.0
-  loc_loadw.1
+  loc_loadw.4
   push.0.0.0.0
   loc_loadw.0 # t0 loaded back
 
   exec.base_field::add # = z3
 
-  loc_storew.14
+  loc_storew.56
   dropw
-  loc_storew.15
+  loc_storew.60
   dropw       # cache z3
 
   dropw
@@ -903,37 +903,37 @@ export.add.16
   dropw
 
   push.0.0.0.0
-  loc_loadw.10
+  loc_loadw.40
   dup.4
   mem_storew
   dropw              # write x3[0..4] to memory
 
   push.0.0.0.0
-  loc_loadw.11
+  loc_loadw.44
   dup.5
   mem_storew
   dropw              # write x3[4..8] to memory
 
   push.0.0.0.0
-  loc_loadw.12
+  loc_loadw.48
   dup.6
   mem_storew
   dropw              # write y3[0..4] to memory
 
   push.0.0.0.0
-  loc_loadw.13
+  loc_loadw.52
   dup.7
   mem_storew
   dropw              # write y3[4..8] to memory
 
   push.0.0.0.0
-  loc_loadw.14
+  loc_loadw.56
   dup.8
   mem_storew
   dropw              # write z3[0..4] to memory
 
   push.0.0.0.0
-  loc_loadw.15
+  loc_loadw.60
   dup.9
   mem_storew
   dropw              # write z3[4..8] to memory
@@ -977,7 +977,7 @@ end
 #!
 #! If base point being multiplied is secp256k1 curve generator point, one should use `gen_point` routine,
 #! which is almost 2x faster !
-export.mul.18
+export.mul.72
   # initialize `base`
   push.0.0.0.0
 
@@ -987,41 +987,41 @@ export.mul.18
 
   movup.4
   mem_loadw
-  loc_storew.1
+  loc_storew.4
 
   movup.4
   mem_loadw
-  loc_storew.2
+  loc_storew.8
 
   movup.4
   mem_loadw
-  loc_storew.3
+  loc_storew.12
 
   movup.4
   mem_loadw
-  loc_storew.4
+  loc_storew.16
 
   movup.4
   mem_loadw
-  loc_storew.5
+  loc_storew.20
 
   dropw
 
   # initialize `res` ( with group identity )
   # See https://github.com/itzmeanjan/secp256k1/blob/d23ea7d/point.py#L40-L45
   push.0.0.0.0
-  loc_storew.6
-  loc_storew.7
+  loc_storew.24
+  loc_storew.28
   dropw
 
   push.0.0.1.977
-  loc_storew.8
+  loc_storew.32
   dropw
   push.0.0.0.0
-  loc_storew.9
+  loc_storew.36
 
-  loc_storew.10
-  loc_storew.11
+  loc_storew.40
+  loc_storew.44
 
   dropw
 
@@ -1033,27 +1033,27 @@ export.mul.18
 
       if.true
         # res = base + res
-        locaddr.17
-        locaddr.16
-        locaddr.15
-        locaddr.14
-        locaddr.13
-        locaddr.12
+        locaddr.68
+        locaddr.64
+        locaddr.60
+        locaddr.56
+        locaddr.52
+        locaddr.48
 
         # res
-        locaddr.11
-        locaddr.10
-        locaddr.9
-        locaddr.8
-        locaddr.7
-        locaddr.6
+        locaddr.44
+        locaddr.40
+        locaddr.36
+        locaddr.32
+        locaddr.28
+        locaddr.24
 
         # base
-        locaddr.5
+        locaddr.20
+        locaddr.16
+        locaddr.12
+        locaddr.8
         locaddr.4
-        locaddr.3
-        locaddr.2
-        locaddr.1
         locaddr.0
 
         exec.add
@@ -1063,45 +1063,45 @@ export.mul.18
 
         movup.4
         mem_loadw
-        loc_storew.6
+        loc_storew.24
 
         movup.4
         mem_loadw
-        loc_storew.7
+        loc_storew.28
 
         movup.4
         mem_loadw
-        loc_storew.8
+        loc_storew.32
 
         movup.4
         mem_loadw
-        loc_storew.9
+        loc_storew.36
 
         movup.4
         mem_loadw
-        loc_storew.10
+        loc_storew.40
 
         movup.4
         mem_loadw
-        loc_storew.11
+        loc_storew.44
 
         dropw
       end
 
       # base = base + base
-      locaddr.17
-      locaddr.16
-      locaddr.15
-      locaddr.14
-      locaddr.13
-      locaddr.12
+      locaddr.68
+      locaddr.64
+      locaddr.60
+      locaddr.56
+      locaddr.52
+      locaddr.48
 
       # base
-      locaddr.5
+      locaddr.20
+      locaddr.16
+      locaddr.12
+      locaddr.8
       locaddr.4
-      locaddr.3
-      locaddr.2
-      locaddr.1
       locaddr.0
 
       exec.double
@@ -1115,23 +1115,23 @@ export.mul.18
 
       movup.4
       mem_loadw
-      loc_storew.1
+      loc_storew.4
 
       movup.4
       mem_loadw
-      loc_storew.2
+      loc_storew.8
 
       movup.4
       mem_loadw
-      loc_storew.3
+      loc_storew.12
 
       movup.4
       mem_loadw
-      loc_storew.4
+      loc_storew.16
 
       movup.4
       mem_loadw
-      loc_storew.5
+      loc_storew.20
 
       dropw
 
@@ -1144,27 +1144,27 @@ export.mul.18
   # write resulting point to provided output memory addresses
   push.0.0.0.0
 
-  loc_loadw.6
+  loc_loadw.24
   dup.4
   mem_storew
 
-  loc_loadw.7
+  loc_loadw.28
   dup.5
   mem_storew
 
-  loc_loadw.8
+  loc_loadw.32
   dup.6
   mem_storew
 
-  loc_loadw.9
+  loc_loadw.36
   dup.7
   mem_storew
 
-  loc_loadw.10
+  loc_loadw.40
   dup.8
   mem_storew
 
-  loc_loadw.11
+  loc_loadw.44
   dup.9
   mem_storew
 
@@ -1205,33 +1205,33 @@ end
 #!
 #! Note, this routine is a specialised instantiation of secp256k1 point multiplication, where we know what the base
 #! point is, so we enjoy faster computation ( because all point doublings can be precomputed, saving us 256 point doublings ! ).
-export.gen_mul.20
+export.gen_mul.80
   # identity point of group (0, 1, 0) in projective coordinate
   # see https://github.com/itzmeanjan/secp256k1/blob/d23ea7d/point.py#L40-L45
   push.0.0.0.0
   loc_storew.0
   dropw
   push.0.0.0.0
-  loc_storew.1
+  loc_storew.4
   dropw        # init & cache res_X
 
   push.0.0.1.977
-  loc_storew.2
+  loc_storew.8
   dropw
   push.0.0.0.0
-  loc_storew.3
+  loc_storew.12
   dropw         # init & cache res_Y
 
   push.0.0.0.0
-  loc_storew.4
+  loc_storew.16
   dropw
   push.0.0.0.0
-  loc_storew.5
+  loc_storew.20
   dropw         # init & cache res_Z
 
-  loc_storew.18
+  loc_storew.72
   dropw
-  loc_storew.19
+  loc_storew.76
   dropw
 
   # push (2^255)G into stack
@@ -3286,48 +3286,48 @@ export.gen_mul.20
     repeat.4
       repeat.32
         push.0.0.0.0
-        loc_loadw.18
+        loc_loadw.72
         dup
         push.1
         u32and
         movdn.4
         u32shr.1
-        loc_storew.18
+        loc_storew.72
         dropw
 
         if.true
-          loc_storew.12
+          loc_storew.48
           dropw
-          loc_storew.13
+          loc_storew.52
           dropw
-          loc_storew.14
+          loc_storew.56
           dropw
-          loc_storew.15
+          loc_storew.60
           dropw
-          loc_storew.16
+          loc_storew.64
           dropw
-          loc_storew.17
+          loc_storew.68
           dropw
 
-          locaddr.11
-          locaddr.10
-          locaddr.9
-          locaddr.8
-          locaddr.7
-          locaddr.6
-
-          locaddr.17
+          locaddr.44
+          locaddr.40
+          locaddr.36
+          locaddr.32
+          locaddr.28
+          locaddr.24
+
+          locaddr.68
+          locaddr.64
+          locaddr.60
+          locaddr.56
+          locaddr.52
+          locaddr.48
+
+          locaddr.20
           locaddr.16
-          locaddr.15
-          locaddr.14
-          locaddr.13
           locaddr.12
-
-          locaddr.5
+          locaddr.8
           locaddr.4
-          locaddr.3
-          locaddr.2
-          locaddr.1
           locaddr.0
 
           exec.add
@@ -3335,20 +3335,20 @@ export.gen_mul.20
           drop
           drop
 
-          loc_loadw.6
+          loc_loadw.24
           loc_storew.0
-          loc_loadw.7
-          loc_storew.1
+          loc_loadw.28
+          loc_storew.4
 
-          loc_loadw.8
-          loc_storew.2
-          loc_loadw.9
-          loc_storew.3
+          loc_loadw.32
+          loc_storew.8
+          loc_loadw.36
+          loc_storew.12
 
-          loc_loadw.10
-          loc_storew.4
-          loc_loadw.11
-          loc_storew.5
+          loc_loadw.40
+          loc_storew.16
+          loc_loadw.44
+          loc_storew.20
 
           dropw
         else
@@ -3359,15 +3359,15 @@ export.gen_mul.20
       end
 
       push.0.0.0.0
-      loc_loadw.18
+      loc_loadw.72
       movdn.3
-      loc_storew.18
+      loc_storew.72
       dropw
     end
 
     push.0.0.0.0
-    loc_loadw.19
-    loc_storew.18
+    loc_loadw.76
+    loc_storew.72
     dropw
   end
 
@@ -3380,35 +3380,35 @@ export.gen_mul.20
 
   dup.1
   push.0.0.0.0
-  loc_loadw.1
+  loc_loadw.4
   movup.4
   mem_storew
   dropw              # write x[4..8] to memory
 
   dup.2
   push.0.0.0.0
-  loc_loadw.2
+  loc_loadw.8
   movup.4
   mem_storew
   dropw              # write y[0..4] to memory
 
   dup.3
   push.0.0.0.0
-  loc_loadw.3
+  loc_loadw.12
   movup.4
   mem_storew
   dropw              # write y[4..8] to memory
 
   dup.4
   push.0.0.0.0
-  loc_loadw.4
+  loc_loadw.16
   movup.4
   mem_storew
   dropw              # write z[0..4] to memory
 
   dup.5
   push.0.0.0.0
-  loc_loadw.5
+  loc_loadw.20
   movup.4
   mem_storew
   dropw              # write z[4..8] to memory
diff --git a/stdlib/asm/math/secp256k1/scalar_field.masm b/stdlib/asm/math/secp256k1/scalar_field.masm
index 4e5a7cc44a..a94c4bdd55 100644
--- a/stdlib/asm/math/secp256k1/scalar_field.masm
+++ b/stdlib/asm/math/secp256k1/scalar_field.masm
@@ -250,10 +250,10 @@ end
 #! while computed c[0..8] will also be in Montgomery form.
 #!
 #! See https://github.com/itzmeanjan/secp256k1/blob/6e5e654823a073add7d62b21ed88e9de9bb06869/field/scalar_field_utils.py#L101-L225
-export.mul.2
+export.mul.8
   loc_storew.0
   swapw
-  loc_storew.1
+  loc_storew.4
   swapw
 
   exec.u256xu32
@@ -274,7 +274,7 @@ export.mul.2
 
   movup.9
   push.0.0.0.0
-  loc_loadw.1
+  loc_loadw.4
   push.0.0.0.0
   loc_loadw.0
 
@@ -284,7 +284,7 @@ export.mul.2
 
   movup.9
   push.0.0.0.0
-  loc_loadw.1
+  loc_loadw.4
   push.0.0.0.0
   loc_loadw.0
 
@@ -294,7 +294,7 @@ export.mul.2
 
   movup.9
   push.0.0.0.0
-  loc_loadw.1
+  loc_loadw.4
   push.0.0.0.0
   loc_loadw.0
 
@@ -304,7 +304,7 @@ export.mul.2
 
   movup.9
   push.0.0.0.0
-  loc_loadw.1
+  loc_loadw.4
   push.0.0.0.0
   loc_loadw.0
 
@@ -314,7 +314,7 @@ export.mul.2
 
   movup.9
   push.0.0.0.0
-  loc_loadw.1
+  loc_loadw.4
   push.0.0.0.0
   loc_loadw.0
 
@@ -324,7 +324,7 @@ export.mul.2
 
   movup.9
   push.0.0.0.0
-  loc_loadw.1
+  loc_loadw.4
   push.0.0.0.0
   loc_loadw.0
 
@@ -334,7 +334,7 @@ export.mul.2
 
   movup.9
   push.0.0.0.0
-  loc_loadw.1
+  loc_loadw.4
   push.0.0.0.0
   loc_loadw.0
 
@@ -437,18 +437,18 @@ end
 #! inverse can't be computed, which is why output result is also 0.
 #!
 #! See https://github.com/itzmeanjan/secp256k1/blob/37b339db3e03d24c2977399eb8896ef515ebb09b/field/scalar_field.py#L118-L136
-export.inv.4
+export.inv.16
   # cache result initial value ( = 1, in Montgomery form )
   push.0.0.0.1.1162945305.1354194884.1076732275.801750719
   loc_storew.0
   dropw
-  loc_storew.1
+  loc_storew.4
   dropw
 
   # cache base
-  loc_storew.2
+  loc_storew.8
   dropw
-  loc_storew.3
+  loc_storew.12
   dropw
 
   push.3493216575.3218235020.2940772411.3132021990.4294967294.4294967295.4294967295.4294967295
@@ -456,7 +456,7 @@ export.inv.4
   repeat.8
     repeat.32
       push.0.0.0.0.0.0.0.0
-      loc_loadw.1
+      loc_loadw.4
       swapw
       loc_loadw.0
 
@@ -464,7 +464,7 @@ export.inv.4
 
       loc_storew.0
       dropw
-      loc_storew.1
+      loc_storew.4
       dropw
 
       dup
@@ -472,13 +472,13 @@ export.inv.4
       if.true
         push.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0
 
-        loc_loadw.3
+        loc_loadw.12
         swapw
-        loc_loadw.2
+        loc_loadw.8
 
         swapdw
 
-        loc_loadw.1
+        loc_loadw.4
         swapw
         loc_loadw.0
 
@@ -486,7 +486,7 @@ export.inv.4
 
         loc_storew.0
         dropw
-        loc_storew.1
+        loc_storew.4
         dropw
       end
 
@@ -497,7 +497,7 @@ export.inv.4
   end
 
   push.0.0.0.0.0.0.0.0
-  loc_loadw.1
+  loc_loadw.4
   swapw
   loc_loadw.0
 end
diff --git a/stdlib/asm/math/u256.masm b/stdlib/asm/math/u256.masm
index 7598009cd3..6038ab940c 100644
--- a/stdlib/asm/math/u256.masm
+++ b/stdlib/asm/math/u256.masm
@@ -246,31 +246,31 @@ end
 #! Stack transition looks as follows:
 #! [b7, b6, b5, b4, b3, b2, b1, b0, a7, a6, a5, a4, a3, a2, a1, a0, ...] -> [c7, c6, c5, c4, c3, c2, c1, c0, ...]
 #! where c = (a * b) % 2^256, and a0, b0, and c0 are least significant 32-bit limbs of a, b, and c respectively.
-export.mul_unsafe.6
+export.mul_unsafe.24
     # Memory storing setup
     loc_storew.0
     dropw
     # b[5-8] at 0
-    loc_storew.1
+    loc_storew.4
-    # b[0-4] at 1
+    # b[0-4] at 4
     push.0 dropw
     # b[0] at top of stack, followed by a[0-7]
     movdn.8
-    loc_storew.2
+    loc_storew.8
-    # a[0-4] at 2
+    # a[0-4] at 8
     swapw
-    loc_storew.3
+    loc_storew.12
-    # a[5-8] at 3
+    # a[5-8] at 12
     padw
-    loc_storew.4
-    loc_storew.5
+    loc_storew.16
+    loc_storew.20
-    # p at 4 and 5
+    # p at 16 and 20
 
     # b[0]
     dropw
     swapw
     push.0.0.0.0
-    loc_loadw.4
+    loc_loadw.16
     movdnw.2
     movup.12
 
@@ -279,10 +279,10 @@ export.mul_unsafe.6
     movdn.9
     movdn.9
     swapw
-    loc_storew.4
+    loc_storew.16
     dropw
     push.0.0.0.0
-    loc_loadw.5
+    loc_loadw.20
     swapw
     movup.9
     movup.9
@@ -315,21 +315,21 @@ export.mul_unsafe.6
     exec.mulstep
 
     drop
-    loc_storew.5
+    loc_storew.20
     dropw
 
     # b[1]
     push.0.0.0.0
-    loc_loadw.4
+    loc_loadw.16
     push.0.0.0.0
-    loc_loadw.5
+    loc_loadw.20
     movup.7
     dropw
     push.0.0.0.0
-    loc_loadw.3 push.0.0.0.0
-    loc_loadw.2 # load the xs
+    loc_loadw.12 push.0.0.0.0
+    loc_loadw.8 # load the xs
     push.0.0.0.0
-    loc_loadw.1
+    loc_loadw.4
     movup.2
     movdn.3
     push.0 dropw # only need b[1]
@@ -341,15 +341,15 @@ export.mul_unsafe.6
     swapw
     movdn.3
     push.0.0.0.0
-    loc_loadw.4
+    loc_loadw.16
     push.0 dropw # only need p[0]
     movdn.3
     # save p[0-3] to memory, not needed any more
-    loc_storew.4
+    loc_storew.16
     dropw
 
     push.0.0.0.0
-    loc_loadw.5
+    loc_loadw.20
     movup.3
     drop
     swapw
@@ -379,22 +379,22 @@ export.mul_unsafe.6
     drop
     swap
     drop
-    loc_storew.5
+    loc_storew.20
     dropw
 
     # b[2]
     push.0.0.0.0
-    loc_loadw.4
+    loc_loadw.16
     push.0.0.0.0
-    loc_loadw.5
+    loc_loadw.20
     movup.7
     movup.7
     dropw
     push.0.0.0.0
-    loc_loadw.3 push.0.0.0.0
-    loc_loadw.2 # load the xs
+    loc_loadw.12 push.0.0.0.0
+    loc_loadw.8 # load the xs
     push.0.0.0.0
-    loc_loadw.1
+    loc_loadw.4
     swap
     movdn.3
     push.0 dropw # only need b[1]
@@ -407,15 +407,15 @@ export.mul_unsafe.6
     movdn.3
     movdn.3
     push.0.0.0.0
-    loc_loadw.4
+    loc_loadw.16
     drop drop
     movdn.3
     movdn.3
-    loc_storew.4
+    loc_storew.16
     dropw
 
     push.0.0.0.0
-    loc_loadw.5
+    loc_loadw.20
     movup.3
     movup.3
     drop
@@ -440,23 +440,23 @@ export.mul_unsafe.6
     swap drop
     movdn.3
     drop drop drop
-    loc_storew.5
+    loc_storew.20
     dropw
 
     # b[3]
     push.0.0.0.0
-    loc_loadw.4
+    loc_loadw.16
     push.0.0.0.0
-    loc_loadw.5
+    loc_loadw.20
 
     movup.7 movup.7 movup.7
     dropw
     push.0.0.0.0
-    loc_loadw.3 push.0.0.0.0
-    loc_loadw.2
+    loc_loadw.12 push.0.0.0.0
+    loc_loadw.8
 
     push.0.0.0.0
-    loc_loadw.1
+    loc_loadw.4
     movdn.3
     push.0 dropw
 
@@ -468,14 +468,14 @@ export.mul_unsafe.6
     swapw
     movup.3
     push.0.0.0.0
-    loc_loadw.4
+    loc_loadw.16
     drop
     movup.3
 
-    loc_storew.4
+    loc_storew.16
     dropw
     push.0.0.0.0
-    loc_loadw.5
+    loc_loadw.20
     movdn.3
     push.0 dropw
     swapw
@@ -494,8 +494,8 @@ export.mul_unsafe.6
 
     # b[4]
     push.0.0.0.0
-    loc_loadw.3 push.0.0.0.0
-    loc_loadw.2 # load the xs
+    loc_loadw.12 push.0.0.0.0
+    loc_loadw.8 # load the xs
     # OPTIM: don't need a[4-7], but can't use mulstep4 if we don't load
 
     push.0.0.0.0
@@ -507,7 +507,7 @@ export.mul_unsafe.6
 
     # b[5]
     push.0.0.0.0
-    loc_loadw.3
+    loc_loadw.12
     push.0.0.0.0
     loc_loadw.0
     movup.2 movdn.3
@@ -537,7 +537,7 @@ export.mul_unsafe.6
 
     # b[6]
     push.0.0.0.0
-    loc_loadw.3
+    loc_loadw.12
     push.0.0.0.0
     loc_loadw.0
     swap
@@ -561,7 +561,7 @@ export.mul_unsafe.6
 
     # b[7]
     push.0.0.0.0
-    loc_loadw.3
+    loc_loadw.12
     push.0.0.0.0
     loc_loadw.0
 
@@ -576,6 +576,6 @@ export.mul_unsafe.6
     drop drop drop
 
     push.0.0.0.0
-    loc_loadw.4
+    loc_loadw.16
     swapw
 end
diff --git a/stdlib/asm/sys.masm b/stdlib/asm/sys.masm
index e4d4cf55d7..ad0d273b40 100644
--- a/stdlib/asm/sys.masm
+++ b/stdlib/asm/sys.masm
@@ -7,7 +7,7 @@
 #! Output: Stack with only the original top 16 elements.
 #!
 #! Cycles: 17 + 11 * overflow_words, where `overflow_words` is the number of words needed to drop.
-export.truncate_stack.1
+export.truncate_stack.4
     # save the first word to memory and bring elements to be dropped to the top of the stack
     loc_storew.0 dropw movupw.3
     # => [X, B, C, D, ...]
diff --git a/stdlib/tests/crypto/falcon.rs b/stdlib/tests/crypto/falcon.rs
index 57cc538199..233ad9a7e6 100644
--- a/stdlib/tests/crypto/falcon.rs
+++ b/stdlib/tests/crypto/falcon.rs
@@ -207,7 +207,7 @@ fn test_falcon512_probabilistic_product_failure() {
     expect_exec_error_matches!(
         test,
         ExecutionError::FailedAssertion{ clk, err_code, err_msg }
-        if clk == RowIndex::from(18841) && err_code == 0 && err_msg.is_none()
+        if clk == RowIndex::from(18843) && err_code == 0 && err_msg.is_none()
     );
 }
 
diff --git a/stdlib/tests/crypto/fri/remainder.rs b/stdlib/tests/crypto/fri/remainder.rs
index 6542ece5f9..24f732e5c4 100644
--- a/stdlib/tests/crypto/fri/remainder.rs
+++ b/stdlib/tests/crypto/fri/remainder.rs
@@ -17,7 +17,7 @@ fn test_decorator_ext2intt(in_poly_len: usize, blowup: usize) {
     assert!((blowup > 0) && blowup.is_power_of_two());
 
     let eval_len = in_poly_len * blowup;
-    let eval_mem_req = (eval_len * 2) / 4;
+    let eval_mem_req = eval_len * 2;
     let out_mem_req = (in_poly_len * 2) / 4;
 
     let poly = rand_vector::<QuadFelt>(in_poly_len);
@@ -70,7 +70,7 @@ fn test_decorator_ext2intt(in_poly_len: usize, blowup: usize) {
     end
     ",
         eval_mem_req,
-        eval_mem_req - 1,
+        eval_mem_req - 4,
         eval_mem_req,
         eval_len,
         in_poly_len,
@@ -98,8 +98,8 @@ fn test_verify_remainder_64() {
         "
     use.std::crypto::fri::ext2fri
 
-    proc.helper.36
-        locaddr.35
+    proc.helper.144
+        locaddr.140
         repeat.36
             movdn.4
             dup.4
@@ -141,8 +141,8 @@ fn test_verify_remainder_32() {
         "
     use.std::crypto::fri::ext2fri
 
-    proc.helper.18
-        locaddr.17
+    proc.helper.72
+        locaddr.68
         repeat.18
             movdn.4
             dup.4
diff --git a/stdlib/tests/math/secp256k1/group.rs b/stdlib/tests/math/secp256k1/group.rs
index 9837028184..28832fc8c4 100644
--- a/stdlib/tests/math/secp256k1/group.rs
+++ b/stdlib/tests/math/secp256k1/group.rs
@@ -15,44 +15,44 @@ fn test_secp256k1_point_doubling(src: Point, dst: Point) {
     # Given a point of secp256k1 elliptic curve, this routine first computes
     # point doubling of that point in projective coordinate & then asserts
     # each coordinate limb-by-limb for ensuring correctness.
-    proc.point_doubling_test_wrapper.12
+    proc.point_doubling_test_wrapper.48
         # push X -coordinate to memory
         push.{}.{}.{}.{}
         loc_storew.0
         dropw
         push.{}.{}.{}.{}
-        loc_storew.1
+        loc_storew.4
         dropw
 
         # push Y -coordinate to memory
         push.{}.{}.{}.{}
-        loc_storew.2
+        loc_storew.8
         dropw
         push.{}.{}.{}.{}
-        loc_storew.3
+        loc_storew.12
         dropw
 
         # push Z -coordinate to memory
         push.{}.{}.{}.{}
-        loc_storew.4
+        loc_storew.16
         dropw
         push.{}.{}.{}.{}
-        loc_storew.5
+        loc_storew.20
         dropw
 
         # input/ output memory addresses for point doubling purpose
-        locaddr.11
-        locaddr.10
-        locaddr.9
+        locaddr.44
+        locaddr.40
+        locaddr.36
+        locaddr.32
+        locaddr.28
+        locaddr.24
+
+        locaddr.20
+        locaddr.16
+        locaddr.12
         locaddr.8
-        locaddr.7
-        locaddr.6
-
-        locaddr.5
         locaddr.4
-        locaddr.3
-        locaddr.2
-        locaddr.1
         locaddr.0
 
         # elliptic curve point doubling
@@ -211,75 +211,75 @@ fn test_secp256k1_point_addition(src0: Point, src1: Point, dst: Point) {
     # Given two points of secp256k1 elliptic curve ( twice ), this routine first computes
     # point addition of them in projective coordinate & then asserts each coordinate
     # limb-by-limb for ensuring correctness.
-    proc.point_addition_test_wrapper.18
+    proc.point_addition_test_wrapper.72
         # push X1 -coordinate to memory
         push.{}.{}.{}.{}
         loc_storew.0
         dropw
         push.{}.{}.{}.{}
-        loc_storew.1
+        loc_storew.4
         dropw
 
         # push Y1 -coordinate to memory
         push.{}.{}.{}.{}
-        loc_storew.2
+        loc_storew.8
         dropw
         push.{}.{}.{}.{}
-        loc_storew.3
+        loc_storew.12
         dropw
 
         # push Z1 -coordinate to memory
         push.{}.{}.{}.{}
-        loc_storew.4
+        loc_storew.16
         dropw
         push.{}.{}.{}.{}
-        loc_storew.5
+        loc_storew.20
         dropw
 
         # push X2 -coordinate to memory
         push.{}.{}.{}.{}
-        loc_storew.6
+        loc_storew.24
         dropw
         push.{}.{}.{}.{}
-        loc_storew.7
+        loc_storew.28
         dropw
 
         # push Y2 -coordinate to memory
         push.{}.{}.{}.{}
-        loc_storew.8
+        loc_storew.32
         dropw
         push.{}.{}.{}.{}
-        loc_storew.9
+        loc_storew.36
         dropw
 
         # push Z2 -coordinate to memory
         push.{}.{}.{}.{}
-        loc_storew.10
+        loc_storew.40
         dropw
         push.{}.{}.{}.{}
-        loc_storew.11
+        loc_storew.44
         dropw
 
         # input/ output memory addresses for point doubling purpose
-        locaddr.17
+        locaddr.68
+        locaddr.64
+        locaddr.60
+        locaddr.56
+        locaddr.52
+        locaddr.48
+
+        locaddr.44
+        locaddr.40
+        locaddr.36
+        locaddr.32
+        locaddr.28
+        locaddr.24
+
+        locaddr.20
         locaddr.16
-        locaddr.15
-        locaddr.14
-        locaddr.13
         locaddr.12
-
-        locaddr.11
-        locaddr.10
-        locaddr.9
         locaddr.8
-        locaddr.7
-        locaddr.6
-
-        locaddr.5
         locaddr.4
-        locaddr.3
-        locaddr.2
-        locaddr.1
         locaddr.0
 
         # elliptic curve point addition
@@ -465,12 +465,12 @@ fn test_secp256k1_point_multiplication(src_point: Point, scalar: FieldElement, d
     # the EC point with provided scalar and then asserts for correctness with known answer.
     proc.point_multiplication_test_wrapper.12
         # resulting point
-        locaddr.11
-        locaddr.10
-        locaddr.9
-        locaddr.8
-        locaddr.7
-        locaddr.6
+        locaddr.44
+        locaddr.40
+        locaddr.36
+        locaddr.32
+        locaddr.28
+        locaddr.24
 
         # scalar
         push.{}.{}.{}.{}
@@ -482,30 +482,30 @@ fn test_secp256k1_point_multiplication(src_point: Point, scalar: FieldElement, d
         dropw
 
         push.{}.{}.{}.{}
-        loc_storew.1
+        loc_storew.4
         dropw
 
         push.{}.{}.{}.{}
-        loc_storew.2
+        loc_storew.8
         dropw
 
         push.{}.{}.{}.{}
-        loc_storew.3
+        loc_storew.12
         dropw
 
         push.{}.{}.{}.{}
-        loc_storew.4
+        loc_storew.16
         dropw
 
         push.{}.{}.{}.{}
-        loc_storew.5
+        loc_storew.20
         dropw
 
-        locaddr.5
+        locaddr.20
+        locaddr.16
+        locaddr.12
+        locaddr.8
         locaddr.4
-        locaddr.3
-        locaddr.2
-        locaddr.1
         locaddr.0
 
         # elliptic curve point multiplication
@@ -671,14 +671,14 @@ fn test_secp256k1_generator_multiplication(scalar: FieldElement, point: Point) {
     # Given a 256 -bit scalar in radix-2^32 form ( i.e. 8 limbs, each of 32 -bit width ),
     # this routine first multiplies the secp256k1 generator point with provided scalar and
     # then asserts for correctness with known answer.
-    proc.generator_multiplication_test_wrapper.12
+    proc.generator_multiplication_test_wrapper.48
         # resulting point
-        locaddr.11
-        locaddr.10
-        locaddr.9
-        locaddr.8
-        locaddr.7
-        locaddr.6
+        locaddr.44
+        locaddr.40
+        locaddr.36
+        locaddr.32
+        locaddr.28
+        locaddr.24
 
         # scalar
         push.{}.{}.{}.{}
diff --git a/test-utils/src/test_builders.rs b/test-utils/src/test_builders.rs
index 2f8ea1f79d..661fdade14 100644
--- a/test-utils/src/test_builders.rs
+++ b/test-utils/src/test_builders.rs
@@ -18,7 +18,7 @@
 macro_rules! build_op_test {
     ($op_str:expr) => {{
         let source = format!("
-proc.truncate_stack.1
+proc.truncate_stack.4
     loc_storew.0 dropw movupw.3
     sdepth neq.16
     while.true
@@ -35,7 +35,7 @@ begin {} exec.truncate_stack end",
     }};
     ($op_str:expr, $($tail:tt)+) => {{
         let source = format!("
-proc.truncate_stack.1
+proc.truncate_stack.4
     loc_storew.0 dropw movupw.3
     sdepth neq.16
     while.true

From 07aab2507c260588a0f5d2019e19fe9412de2b5c Mon Sep 17 00:00:00 2001
From: Philippe Laferriere <plafer@protonmail.com>
Date: Wed, 8 Jan 2025 08:56:13 -0500
Subject: [PATCH 16/19] more PR fixes

---
 air/src/trace/mod.rs                    |  2 +-
 assembly/src/assembler/mod.rs           | 14 ++---
 assembly/src/assembler/procedure.rs     | 14 ++++-
 assembly/src/errors.rs                  |  9 ----
 miden/masm-examples/debug/debug.masm    | 11 ++--
 miden/src/cli/debug/executor.rs         |  2 +-
 miden/src/main.rs                       |  2 +-
 miden/src/repl/mod.rs                   |  2 +-
 miden/src/{helpers.rs => utils.rs}      |  0
 processor/src/chiplets/aux_trace/mod.rs | 69 +++++++++++++++----------
 processor/src/chiplets/memory/mod.rs    |  2 +-
 processor/src/errors.rs                 |  6 ++-
 12 files changed, 76 insertions(+), 57 deletions(-)
 rename miden/src/{helpers.rs => utils.rs} (100%)

diff --git a/air/src/trace/mod.rs b/air/src/trace/mod.rs
index 81f17c8278..6c2c109268 100644
--- a/air/src/trace/mod.rs
+++ b/air/src/trace/mod.rs
@@ -19,7 +19,7 @@ pub const MIN_TRACE_LEN: usize = 64;
 // ------------------------------------------------------------------------------------------------
 
 //      system          decoder           stack      range checks       chiplets
-//    (8 columns)     (24 columns)    (19 columns)    (3 columns)     (18 columns)
+//    (8 columns)     (24 columns)    (19 columns)    (2 columns)     (18 columns)
 // ├───────────────┴───────────────┴───────────────┴───────────────┴─────────────────┤
 
 pub const SYS_TRACE_OFFSET: usize = 0;
diff --git a/assembly/src/assembler/mod.rs b/assembly/src/assembler/mod.rs
index 1985241864..af4de9de06 100644
--- a/assembly/src/assembler/mod.rs
+++ b/assembly/src/assembler/mod.rs
@@ -567,17 +567,11 @@ impl Assembler {
     ) -> Result<Procedure, Report> {
         // Make sure the current procedure context is available during codegen
         let gid = proc_ctx.id();
-        let num_locals = proc_ctx.num_locals();
 
-        // Locals are forced to be a multiple of the word size to properly support reading and
-        // writing words.
-        if num_locals % WORD_SIZE as u16 != 0 {
-            return Err(AssemblyError::InvalidNumLocals {
-                span: proc_ctx.span(),
-                source_file: proc_ctx.source_manager().get(proc_ctx.span().source_id()).ok(),
-                num_locals,
-            })?;
-        }
+        // We expect the number of locals to be a multiple of the word size, having been rounded up
+        // if necessary.
+        let num_locals = proc_ctx.num_locals();
+        assert_eq!(num_locals % WORD_SIZE as u16, 0);
 
         let wrapper_proc = self.module_graph.get_procedure_unsafe(gid);
         let proc = wrapper_proc.unwrap_ast().unwrap_procedure();
diff --git a/assembly/src/assembler/procedure.rs b/assembly/src/assembler/procedure.rs
index 167a25806e..754eb6a342 100644
--- a/assembly/src/assembler/procedure.rs
+++ b/assembly/src/assembler/procedure.rs
@@ -44,8 +44,11 @@ impl ProcedureContext {
         }
     }
 
+    /// Sets the number of locals to allocate for the procedure.
+    ///
+    /// The number of locals is rounded up to the nearest multiple of 4.
     pub fn with_num_locals(mut self, num_locals: u16) -> Self {
-        self.num_locals = num_locals;
+        self.num_locals = round_up_to_multiple_of_4(num_locals);
         self
     }
 
@@ -55,6 +58,15 @@ impl ProcedureContext {
     }
 }
 
+#[inline(always)]
+fn round_up_to_multiple_of_4(value: u16) -> u16 {
+    // For example, if value = 4,5,6,7
+    // value + 3 = 7,8,9,10
+    // value + 3 & !3 = 4,8,8,8 (&!3 clears the last two bits)
+    // as desired.
+    (value + 3) & !3
+}
+
 // ------------------------------------------------------------------------------------------------
 /// Public accessors
 impl ProcedureContext {
diff --git a/assembly/src/errors.rs b/assembly/src/errors.rs
index ddda42c160..06ff9ab3b9 100644
--- a/assembly/src/errors.rs
+++ b/assembly/src/errors.rs
@@ -61,15 +61,6 @@ pub enum AssemblyError {
         source_file: Option<Arc<SourceFile>>,
         callee: QualifiedProcedureName,
     },
-    #[error("invalid number of declared local variables for procedure: {num_locals}")]
-    #[diagnostic(help("the number of local variables must be a multiple of 4"))]
-    InvalidNumLocals {
-        #[label]
-        span: SourceSpan,
-        #[source_code]
-        source_file: Option<Arc<SourceFile>>,
-        num_locals: u16,
-    },
     #[error("invalid local word index: {local_addr}")]
     #[diagnostic(help("the index to a local word must be a multiple of 4"))]
     InvalidLocalWordIndex {
diff --git a/miden/masm-examples/debug/debug.masm b/miden/masm-examples/debug/debug.masm
index 54649b27c6..a79f24b018 100644
--- a/miden/masm-examples/debug/debug.masm
+++ b/miden/masm-examples/debug/debug.masm
@@ -1,8 +1,8 @@
-proc.foo.3
+proc.foo.2
     push.11
     loc_store.0
     push.101
-    loc_store.4
+    loc_store.1
 
     debug.local
     debug.local.1
@@ -12,11 +12,11 @@ proc.foo.3
     # will fail: debug.local.1.65540
 end
 
-proc.bar.4
+proc.bar.2
     push.21
     loc_store.0
     push.121
-    loc_store.4
+    loc_store.1
     debug.local
     debug.local.2
 end
@@ -39,4 +39,7 @@ begin
 
     exec.foo
     exec.bar
+    
+    # Clean stack
+    dropw
 end
diff --git a/miden/src/cli/debug/executor.rs b/miden/src/cli/debug/executor.rs
index 7c3012d4b3..c068175fc2 100644
--- a/miden/src/cli/debug/executor.rs
+++ b/miden/src/cli/debug/executor.rs
@@ -3,7 +3,7 @@ use std::sync::Arc;
 use miden_vm::{DefaultHost, MemAdviceProvider, Program, StackInputs, VmState, VmStateIterator};
 
 use super::DebugCommand;
-use crate::helpers::print_mem_address;
+use crate::utils::print_mem_address;
 
 /// Holds debugger state and iterator used for debugging.
 pub struct DebugExecutor {
diff --git a/miden/src/main.rs b/miden/src/main.rs
index b03d3c9202..f5ead91f0c 100644
--- a/miden/src/main.rs
+++ b/miden/src/main.rs
@@ -10,7 +10,7 @@ mod cli;
 mod repl;
 mod tools;
 
-pub(crate) mod helpers;
+pub(crate) mod utils;
 
 /// Root CLI struct
 #[derive(Parser, Debug)]
diff --git a/miden/src/repl/mod.rs b/miden/src/repl/mod.rs
index ed3335c688..037df6e0fd 100644
--- a/miden/src/repl/mod.rs
+++ b/miden/src/repl/mod.rs
@@ -6,7 +6,7 @@ use processor::ContextId;
 use rustyline::{error::ReadlineError, DefaultEditor};
 use stdlib::StdLibrary;
 
-use crate::helpers::print_mem_address;
+use crate::utils::print_mem_address;
 
 // This work is in continuation to the amazing work done by team `Scribe`
 // [here](https://github.com/ControlCplusControlV/Scribe/blob/main/transpiler/src/repl.rs#L8)
diff --git a/miden/src/helpers.rs b/miden/src/utils.rs
similarity index 100%
rename from miden/src/helpers.rs
rename to miden/src/utils.rs
diff --git a/processor/src/chiplets/aux_trace/mod.rs b/processor/src/chiplets/aux_trace/mod.rs
index 483d04f78a..7c6df3e841 100644
--- a/processor/src/chiplets/aux_trace/mod.rs
+++ b/processor/src/chiplets/aux_trace/mod.rs
@@ -360,7 +360,7 @@ fn build_control_block_request<E: FieldElement<BaseField = Felt>>(
     let header =
         alphas[0] + alphas[1].mul_base(Felt::from(transition_label)) + alphas[2].mul_base(addr_nxt);
 
-    header + build_value(&alphas[8..16], &decoder_hasher_state) + alphas[5].mul_base(op_code_felt)
+    header + build_value(&alphas[8..16], decoder_hasher_state) + alphas[5].mul_base(op_code_felt)
 }
 
 /// Builds requests made on a `DYN` or `DYNCALL` operation.
@@ -431,7 +431,7 @@ fn build_span_block_request<E: FieldElement<BaseField = Felt>>(
         alphas[0] + alphas[1].mul_base(Felt::from(transition_label)) + alphas[2].mul_base(addr_nxt);
 
     let state = main_trace.decoder_hasher_state(row);
-    header + build_value(&alphas[8..16], &state)
+    header + build_value(&alphas[8..16], state)
 }
 
 /// Builds requests made to the hasher chiplet at the start of a respan block.
@@ -451,7 +451,7 @@ fn build_respan_block_request<E: FieldElement<BaseField = Felt>>(
 
     let state = main_trace.decoder_hasher_state(row);
 
-    header + build_value(&alphas[8..16], &state)
+    header + build_value(&alphas[8..16], state)
 }
 
 /// Builds requests made to the hasher chiplet at the end of a block.
@@ -468,7 +468,7 @@ fn build_end_block_request<E: FieldElement<BaseField = Felt>>(
         alphas[0] + alphas[1].mul_base(Felt::from(transition_label)) + alphas[2].mul_base(addr);
 
     let state = main_trace.decoder_hasher_state(row);
-    let digest = &state[..4];
+    let digest: [Felt; 4] = state[..4].try_into().unwrap();
 
     header + build_value(&alphas[8..12], digest)
 }
@@ -486,7 +486,7 @@ fn build_bitwise_request<E: FieldElement<BaseField = Felt>>(
     let b = main_trace.stack_element(0, row);
     let z = main_trace.stack_element(0, row + 1);
 
-    alphas[0] + build_value(&alphas[1..5], &[op_label, a, b, z])
+    alphas[0] + build_value(&alphas[1..5], [op_label, a, b, z])
 }
 
 /// Builds `MSTREAM` requests made to the memory chiplet.
@@ -797,20 +797,26 @@ where
         // v_all = v_h + v_a + v_b + v_c
         if selector1 == ONE && selector2 == ZERO && selector3 == ZERO {
             let header = alphas[0]
-                + build_value(&alphas[1..4], &[transition_label, Felt::from(row + 1), node_index]);
+                + build_value(&alphas[1..4], [transition_label, Felt::from(row + 1), node_index]);
 
-            multiplicand = header + build_value(alphas_state, &state);
+            multiplicand = header + build_value(alphas_state, state);
         }
 
         // f_mp or f_mv or f_mu == 1
         // v_leaf = v_h + (1 - b) * v_b + b * v_d
         if selector1 == ONE && !(selector2 == ZERO && selector3 == ZERO) {
             let header = alphas[0]
-                + build_value(&alphas[1..4], &[transition_label, Felt::from(row + 1), node_index]);
+                + build_value(&alphas[1..4], [transition_label, Felt::from(row + 1), node_index]);
 
             let bit = (node_index.as_int() & 1) as u8;
-            let left_word = build_value(&alphas_state[DIGEST_RANGE], &state[DIGEST_RANGE]);
-            let right_word = build_value(&alphas_state[DIGEST_RANGE], &state[DIGEST_RANGE.end..]);
+            let left_word = build_value::<_, 4>(
+                &alphas_state[DIGEST_RANGE],
+                state[DIGEST_RANGE].try_into().unwrap(),
+            );
+            let right_word = build_value::<_, 4>(
+                &alphas_state[DIGEST_RANGE],
+                state[DIGEST_RANGE.end..].try_into().unwrap(),
+            );
 
             multiplicand = header + E::from(1 - bit).mul(left_word) + E::from(bit).mul(right_word);
         }
@@ -827,32 +833,39 @@ where
         // v_res = v_h + v_b;
         if selector1 == ZERO && selector2 == ZERO && selector3 == ZERO {
             let header = alphas[0]
-                + build_value(&alphas[1..4], &[transition_label, Felt::from(row + 1), node_index]);
+                + build_value(&alphas[1..4], [transition_label, Felt::from(row + 1), node_index]);
 
-            multiplicand = header + build_value(&alphas_state[DIGEST_RANGE], &state[DIGEST_RANGE]);
+            multiplicand = header
+                + build_value::<_, 4>(
+                    &alphas_state[DIGEST_RANGE],
+                    state[DIGEST_RANGE].try_into().unwrap(),
+                );
         }
 
         // f_sout == 1
         // v_all = v_h + v_a + v_b + v_c
         if selector1 == ZERO && selector2 == ZERO && selector3 == ONE {
             let header = alphas[0]
-                + build_value(&alphas[1..4], &[transition_label, Felt::from(row + 1), node_index]);
+                + build_value(&alphas[1..4], [transition_label, Felt::from(row + 1), node_index]);
 
-            multiplicand = header + build_value(alphas_state, &state);
+            multiplicand = header + build_value(alphas_state, state);
         }
 
         // f_abp == 1
         // v_abp = v_h + v_b' + v_c' - v_b - v_c
         if selector1 == ONE && selector2 == ZERO && selector3 == ZERO {
             let header = alphas[0]
-                + build_value(&alphas[1..4], &[transition_label, Felt::from(row + 1), node_index]);
+                + build_value(&alphas[1..4], [transition_label, Felt::from(row + 1), node_index]);
 
             let state_nxt = main_trace.chiplet_hasher_state(row + 1);
 
             // build the value from the hasher state's just right after the absorption of new
             // elements.
-            let next_state_value =
-                build_value(&alphas_state[CAPACITY_LEN..], &state_nxt[CAPACITY_LEN..]);
+            const SIZE: usize = STATE_WIDTH - CAPACITY_LEN;
+            let next_state_value = build_value::<_, SIZE>(
+                &alphas_state[CAPACITY_LEN..],
+                state_nxt[CAPACITY_LEN..].try_into().unwrap(),
+            );
 
             multiplicand = header + next_state_value;
         }
@@ -873,7 +886,7 @@ where
         let b = main_trace.chiplet_bitwise_b(row);
         let z = main_trace.chiplet_bitwise_z(row);
 
-        alphas[0] + build_value(&alphas[1..5], &[op_label, a, b, z])
+        alphas[0] + build_value(&alphas[1..5], [op_label, a, b, z])
     } else {
         E::ONE
     }
@@ -899,7 +912,7 @@ where
             word + idx1.mul_small(2) + idx0
         };
 
-        alphas[0] + build_value(&alphas[1..5], &[op_label, ctx, address, clk])
+        alphas[0] + build_value(&alphas[1..5], [op_label, ctx, address, clk])
     };
 
     if is_word_access == MEMORY_ACCESS_ELEMENT {
@@ -925,7 +938,7 @@ where
         let value2 = main_trace.chiplet_memory_value_2(row);
         let value3 = main_trace.chiplet_memory_value_3(row);
 
-        header + build_value(&alphas[5..9], &[value0, value1, value2, value3])
+        header + build_value(&alphas[5..9], [value0, value1, value2, value3])
     } else {
         panic!("Invalid memory element/word column value: {is_word_access}");
     }
@@ -943,7 +956,7 @@ where
     let root2 = main_trace.chiplet_kernel_root_2(row);
     let root3 = main_trace.chiplet_kernel_root_3(row);
 
-    let v = alphas[0] + build_value(&alphas[1..6], &[op_label, root0, root1, root2, root3]);
+    let v = alphas[0] + build_value(&alphas[1..6], [op_label, root0, root1, root2, root3]);
 
     let kernel_chiplet_selector = main_trace.chiplet_selector_4(row);
     v.mul_base(kernel_chiplet_selector) + E::from(ONE - kernel_chiplet_selector)
@@ -955,11 +968,15 @@ where
 /// Reduces a slice of elements to a single field element in the field specified by E using a slice
 /// of alphas of matching length. This can be used to build the value for a single word or for an
 /// entire [HasherState].
-fn build_value<E: FieldElement<BaseField = Felt>>(alphas: &[E], elements: &[Felt]) -> E {
+#[inline(always)]
+fn build_value<E: FieldElement<BaseField = Felt>, const N: usize>(
+    alphas: &[E],
+    elements: [Felt; N],
+) -> E {
     debug_assert_eq!(alphas.len(), elements.len());
     let mut value = E::ZERO;
-    for (&alpha, &element) in alphas.iter().zip(elements.iter()) {
-        value += alpha.mul_base(element);
+    for i in 0..N {
+        value += alphas[i].mul_base(elements[i]);
     }
     value
 }
@@ -1027,7 +1044,7 @@ fn compute_mem_request_element<E: FieldElement<BaseField = Felt>>(
     let ctx = main_trace.ctx(row);
     let clk = main_trace.clk(row);
 
-    alphas[0] + build_value(&alphas[1..6], &[Felt::from(op_label), ctx, addr, clk, element])
+    alphas[0] + build_value(&alphas[1..6], [Felt::from(op_label), ctx, addr, clk, element])
 }
 
 /// Computes a memory request for a read or write of a word.
@@ -1046,6 +1063,6 @@ fn compute_mem_request_word<E: FieldElement<BaseField = Felt>>(
     alphas[0]
         + build_value(
             &alphas[1..9],
-            &[Felt::from(op_label), ctx, addr, clk, word[0], word[1], word[2], word[3]],
+            [Felt::from(op_label), ctx, addr, clk, word[0], word[1], word[2], word[3]],
         )
 }
diff --git a/processor/src/chiplets/memory/mod.rs b/processor/src/chiplets/memory/mod.rs
index f24b5b592f..c0a4e78283 100644
--- a/processor/src/chiplets/memory/mod.rs
+++ b/processor/src/chiplets/memory/mod.rs
@@ -125,7 +125,7 @@ impl Memory {
         match self.trace.get(&ctx) {
             Some(segment) => segment
                 .get_word(addr)
-                .map_err(|_| ExecutionError::UnalignedMemoryWordAccess { addr, ctx }),
+                .map_err(|_| ExecutionError::MemoryUnalignedWordAccessNoClk { addr, ctx }),
             None => Ok(None),
         }
     }
diff --git a/processor/src/errors.rs b/processor/src/errors.rs
index 902b1df115..58009209e3 100644
--- a/processor/src/errors.rs
+++ b/processor/src/errors.rs
@@ -96,6 +96,10 @@ pub enum ExecutionError {
         "word memory access at address {addr} in context {ctx} is unaligned at clock cycle {clk}"
     )]
     MemoryUnalignedWordAccess { addr: u32, ctx: ContextId, clk: Felt },
+    // Note: we need this version as well to handle advice provider calls, which don't have
+    // access to the clock.
+    #[error("word access at memory address {addr} in context {ctx} is unaligned")]
+    MemoryUnalignedWordAccessNoClk { addr: u32, ctx: ContextId },
     #[error("merkle path verification failed for value {value} at index {index} in the Merkle tree with root {root} (error code: {err_code})", 
       value = to_hex(Felt::elements_as_bytes(value)),
       root = to_hex(root.as_bytes()),
@@ -133,8 +137,6 @@ pub enum ExecutionError {
       hex = to_hex(.0.as_bytes())
     )]
     SyscallTargetNotInKernel(Digest),
-    #[error("word access at memory address {addr} in context {ctx} is unaligned")]
-    UnalignedMemoryWordAccess { addr: u32, ctx: ContextId },
 }
 
 impl From<Ext2InttError> for ExecutionError {

From 9883cb0a6fee251a465606731edc8661fad4c6c4 Mon Sep 17 00:00:00 2001
From: Philippe Laferriere <plafer@protonmail.com>
Date: Thu, 9 Jan 2025 14:21:18 -0500
Subject: [PATCH 17/19] more PR fixes

---
 air/src/constraints/chiplets/memory/mod.rs | 95 +++++++---------------
 air/src/constraints/chiplets/mod.rs        |  6 +-
 assembly/src/assembler/procedure.rs        | 13 +--
 assembly/src/tests.rs                      |  4 +-
 processor/src/chiplets/aux_trace/mod.rs    |  4 +-
 processor/src/chiplets/memory/mod.rs       |  2 +-
 stdlib/asm/crypto/stark/constants.masm     |  6 +-
 stdlib/asm/crypto/stark/random_coin.masm   |  2 +-
 stdlib/tests/mem/mod.rs                    |  7 +-
 9 files changed, 47 insertions(+), 92 deletions(-)

diff --git a/air/src/constraints/chiplets/memory/mod.rs b/air/src/constraints/chiplets/memory/mod.rs
index d91a1fa805..0a7d0048ad 100644
--- a/air/src/constraints/chiplets/memory/mod.rs
+++ b/air/src/constraints/chiplets/memory/mod.rs
@@ -54,7 +54,7 @@ pub fn get_transition_constraint_count() -> usize {
 /// The flags are:
 /// - `memory_flag_all_rows`: a flag that is set to 1 when the current row is part of the memory
 ///   chiplet,
-/// - `memory_flag_no_last_row`: a flag that is set to 1 when the current row is part of the memory
+/// - `memory_flag_not_last_row`: a flag that is set to 1 when the current row is part of the memory
 ///   chiplet, but excludes the last row of the chiplet,
 /// - `memory_flag_first_row`: a flag that is set to 1 when the *next* row is the first row of the
 ///   memory chiplet.
@@ -62,24 +62,24 @@ pub fn enforce_constraints<E: FieldElement>(
     frame: &EvaluationFrame<E>,
     result: &mut [E],
     memory_flag_all_rows: E,
-    memory_flag_no_last_row: E,
+    memory_flag_not_last_row: E,
     memory_flag_first_row: E,
 ) {
     // Constrain the binary columns.
     let mut index = enforce_binary_columns(frame, result, memory_flag_all_rows);
 
     // Constrain the values in the d inverse column.
-    index += enforce_d_inv(frame, &mut result[index..], memory_flag_no_last_row);
+    index += enforce_d_inv(frame, &mut result[index..], memory_flag_not_last_row);
 
-    // Enforce values in ctx, addr, clk transition correctly.
-    index += enforce_delta(frame, &mut result[index..], memory_flag_no_last_row);
+    // Enforce values in ctx, word_addr, clk transition correctly.
+    index += enforce_delta(frame, &mut result[index..], memory_flag_not_last_row);
 
     // Enforce the correct value for the f_scw flag.
     index +=
-        enforce_flag_same_context_and_word(frame, &mut result[index..], memory_flag_no_last_row);
+        enforce_flag_same_context_and_word(frame, &mut result[index..], memory_flag_not_last_row);
 
     // Constrain the memory values.
-    enforce_values(frame, &mut result[index..], memory_flag_no_last_row, memory_flag_first_row);
+    enforce_values(frame, &mut result[index..], memory_flag_not_last_row, memory_flag_first_row);
 }
 
 // TRANSITION CONSTRAINT HELPERS
@@ -104,19 +104,19 @@ fn enforce_binary_columns<E: FieldElement>(
 fn enforce_d_inv<E: FieldElement>(
     frame: &EvaluationFrame<E>,
     result: &mut [E],
-    memory_flag_no_last_row: E,
+    memory_flag_not_last_row: E,
 ) -> usize {
     let constraint_count = 4;
 
     // n0 is binary
-    result[0] = memory_flag_no_last_row * is_binary(frame.n0());
+    result[0] = memory_flag_not_last_row * is_binary(frame.n0());
     // when the context changes, n0 should be set to 1.
-    result[1] = memory_flag_no_last_row * frame.not_n0() * frame.ctx_change();
+    result[1] = memory_flag_not_last_row * frame.not_n0() * frame.ctx_change();
     // when n0 is 0, n1 is binary.
-    result[2] = memory_flag_no_last_row * frame.not_n0() * is_binary(frame.n1());
+    result[2] = memory_flag_not_last_row * frame.not_n0() * is_binary(frame.n1());
     // when n0 and n1 are 0, then `word_addr` doesn't change.
     result[3] =
-        memory_flag_no_last_row * frame.not_n0() * frame.not_n1() * frame.word_addr_change();
+        memory_flag_not_last_row * frame.not_n0() * frame.not_n1() * frame.word_addr_change();
 
     constraint_count
 }
@@ -126,21 +126,21 @@ fn enforce_d_inv<E: FieldElement>(
 fn enforce_delta<E: FieldElement>(
     frame: &EvaluationFrame<E>,
     result: &mut [E],
-    memory_flag_no_last_row: E,
+    memory_flag_not_last_row: E,
 ) -> usize {
     let constraint_count = 1;
 
     // If the context changed, include the difference.
-    result[0] = memory_flag_no_last_row * frame.n0() * frame.ctx_change();
-    // If the context is the same, include the word difference if it changed or else include the
-    // clock change.
+    result[0] = memory_flag_not_last_row * frame.n0() * frame.ctx_change();
+    // If the context is the same, include the word address difference if it changed or else include
+    // the clock change.
     result.agg_constraint(
         0,
-        memory_flag_no_last_row * frame.not_n0(),
+        memory_flag_not_last_row * frame.not_n0(),
         frame.n1() * frame.word_addr_change() + frame.not_n1() * frame.clk_change(),
     );
     // Always subtract the delta. It should offset the other changes.
-    result[0] -= memory_flag_no_last_row * frame.delta_next();
+    result[0] -= memory_flag_not_last_row * frame.delta_next();
 
     constraint_count
 }
@@ -150,9 +150,9 @@ fn enforce_delta<E: FieldElement>(
 fn enforce_flag_same_context_and_word<E: FieldElement>(
     frame: &EvaluationFrame<E>,
     result: &mut [E],
-    memory_flag_no_last_row: E,
+    memory_flag_not_last_row: E,
 ) -> usize {
-    result[0] = memory_flag_no_last_row
+    result[0] = memory_flag_not_last_row
         * (frame.f_scw_next() - binary_not(frame.n0() + frame.not_n0() * frame.n1()));
 
     1
@@ -163,14 +163,12 @@ fn enforce_flag_same_context_and_word<E: FieldElement>(
 ///
 /// The constraints on the values depend on a few factors:
 /// - When in the first row of a new context or word, any of the 4 values of the word that are not
-///   written to must be set to 0.
-///   - This is because the memory is initialized to 0 when a new context or word is started.
+///   written to must be set to 0. This is because the memory is initialized to 0 when a new context
+///   or word is started.
 /// - When we remain in the same context and word, then this is when we want to enforce the "memory
 ///   property" that what was previously written must be read. Therefore, the values that are not
-///   being written need to be equal to the values in the previous row (i.e. previously written, or
-///   initialized to 0).
-///   - The implication is that in a given evaluation frame, we always constrain the "next" value,
-///     since that constraint depends on the "current" value.
+///   being written need to be equal to the values in the previous row (i.e. were either previously
+///   written or are still initialized to 0).
 fn enforce_values<E: FieldElement>(
     frame: &EvaluationFrame<E>,
     result: &mut [E],
@@ -196,7 +194,8 @@ fn enforce_values<E: FieldElement>(
         let f3 = frame.idx1_next() * frame.idx0_next();
 
         let c_i = |f_i| {
-            // z_i is set to 1 when `v'[i]` is not being accessed.
+            // when we are operating on elements, z_i is set to 1 for all `i` which are not being
+            // accessed; otherwise it is set to 0.
             let z_i = binary_not(frame.is_word_access_next()) * binary_not(f_i);
             let is_read_next = frame.is_read_next();
 
@@ -213,15 +212,15 @@ fn enforce_values<E: FieldElement>(
     result[2] = memory_flag_first_row * c2 * frame.v_next(2);
     result[3] = memory_flag_first_row * c3 * frame.v_next(3);
 
-    // non-first row, new word or context constraints: when row' is a new word/ctx, and v'[i] is
-    // not written to, then v'[i] must be 0.
+    // non-first row, new word address or context constraints: when row' is a new word address/ctx,
+    // and v'[i] is not written to, then v'[i] must be 0.
     result[4] = memory_flag_no_last * binary_not(frame.f_scw_next()) * c0 * frame.v_next(0);
     result[5] = memory_flag_no_last * binary_not(frame.f_scw_next()) * c1 * frame.v_next(1);
     result[6] = memory_flag_no_last * binary_not(frame.f_scw_next()) * c2 * frame.v_next(2);
     result[7] = memory_flag_no_last * binary_not(frame.f_scw_next()) * c3 * frame.v_next(3);
 
-    // non-first row, same word or context constraints: when row' is in the same word/ctx, and
-    // v'[i] is not written to, then v'[i] must be equal to v[i].
+    // non-first row, same word address and context constraints: when row' is in the same word
+    // address/ctx, and v'[i] is not written to, then v'[i] must be equal to v[i].
     result[8] = memory_flag_no_last * frame.f_scw_next() * c0 * (frame.v_next(0) - frame.v(0));
     result[9] = memory_flag_no_last * frame.f_scw_next() * c1 * (frame.v_next(1) - frame.v(1));
     result[10] = memory_flag_no_last * frame.f_scw_next() * c2 * (frame.v_next(2) - frame.v(2));
@@ -254,12 +253,6 @@ trait EvaluationFrameExt<E: FieldElement> {
     ///
     /// 0: element, 1: word
     fn is_word_access_next(&self) -> E;
-    /// The current context value.
-    #[allow(dead_code)]
-    fn ctx(&self) -> E;
-    /// The current address.
-    #[allow(dead_code)]
-    fn word_next(&self) -> E;
     /// The 0'th bit of the index of the memory address in the current word.
     fn idx0(&self) -> E;
     /// The 0'th bit of the index of the memory address in the next word.
@@ -268,12 +261,6 @@ trait EvaluationFrameExt<E: FieldElement> {
     fn idx1(&self) -> E;
     /// The 1st bit of the index of the memory address in the next word.
     fn idx1_next(&self) -> E;
-    /// The current clock cycle.
-    #[allow(dead_code)]
-    fn clk(&self) -> E;
-    /// The next clock cycle.
-    #[allow(dead_code)]
-    fn clk_next(&self) -> E;
     /// The value from the specified index of the values (0, 1, 2, 3) in the current row.
     fn v(&self, index: usize) -> E;
     /// The value from the specified index of the values (0, 1, 2, 3) in the next row.
@@ -308,7 +295,7 @@ trait EvaluationFrameExt<E: FieldElement> {
     fn not_n1(&self) -> E;
     /// The difference between the next context and the current context.
     fn ctx_change(&self) -> E;
-    /// The difference between the next address and the current address.
+    /// The difference between the next word address and the current word address.
     fn word_addr_change(&self) -> E;
     /// The difference between the next clock value and the current one, minus 1.
     fn clk_change(&self) -> E;
@@ -339,16 +326,6 @@ impl<E: FieldElement> EvaluationFrameExt<E> for &EvaluationFrame<E> {
         self.next()[MEMORY_IS_WORD_ACCESS_COL_IDX]
     }
 
-    #[inline(always)]
-    fn ctx(&self) -> E {
-        self.current()[MEMORY_CTX_COL_IDX]
-    }
-
-    #[inline(always)]
-    fn word_next(&self) -> E {
-        self.next()[MEMORY_WORD_COL_IDX]
-    }
-
     #[inline(always)]
     fn idx0(&self) -> E {
         self.current()[MEMORY_IDX0_COL_IDX]
@@ -369,16 +346,6 @@ impl<E: FieldElement> EvaluationFrameExt<E> for &EvaluationFrame<E> {
         self.next()[MEMORY_IDX1_COL_IDX]
     }
 
-    #[inline(always)]
-    fn clk(&self) -> E {
-        self.current()[MEMORY_CLK_COL_IDX]
-    }
-
-    #[inline(always)]
-    fn clk_next(&self) -> E {
-        self.next()[MEMORY_CLK_COL_IDX]
-    }
-
     #[inline(always)]
     fn v(&self, index: usize) -> E {
         self.current()[MEMORY_V_COL_RANGE.start + index]
diff --git a/air/src/constraints/chiplets/mod.rs b/air/src/constraints/chiplets/mod.rs
index d29b8000e4..3ecbcae95d 100644
--- a/air/src/constraints/chiplets/mod.rs
+++ b/air/src/constraints/chiplets/mod.rs
@@ -91,7 +91,7 @@ pub fn enforce_constraints<E: FieldElement<BaseField = Felt>>(
         frame,
         &mut result[constraint_offset..],
         frame.memory_flag(),
-        frame.memory_flag_no_last(),
+        frame.memory_flag_not_last_row(),
         frame.memory_flag_first_row(),
     );
 }
@@ -155,7 +155,7 @@ trait EvaluationFrameExt<E: FieldElement> {
 
     /// Flag to indicate whether the current row of the frame is in the memory portion of the
     /// Chiplets trace, except for the last memory chiplet row.
-    fn memory_flag_no_last(&self) -> E;
+    fn memory_flag_not_last_row(&self) -> E;
 
     /// Flag to indicate whether the next row of the frame is in the memory portion of the Chiplets
     /// trace.
@@ -195,7 +195,7 @@ impl<E: FieldElement> EvaluationFrameExt<E> for &EvaluationFrame<E> {
     }
 
     #[inline(always)]
-    fn memory_flag_no_last(&self) -> E {
+    fn memory_flag_not_last_row(&self) -> E {
         self.s(0) * self.s(1) * binary_not(self.s_next(2))
     }
 
diff --git a/assembly/src/assembler/procedure.rs b/assembly/src/assembler/procedure.rs
index 754eb6a342..a2dbf74d6c 100644
--- a/assembly/src/assembler/procedure.rs
+++ b/assembly/src/assembler/procedure.rs
@@ -1,6 +1,6 @@
 use alloc::sync::Arc;
 
-use vm_core::mast::MastNodeId;
+use vm_core::{mast::MastNodeId, WORD_SIZE};
 
 use super::GlobalProcedureIndex;
 use crate::{
@@ -48,7 +48,7 @@ impl ProcedureContext {
     ///
     /// The number of locals is rounded up to the nearest multiple of 4.
     pub fn with_num_locals(mut self, num_locals: u16) -> Self {
-        self.num_locals = round_up_to_multiple_of_4(num_locals);
+        self.num_locals = num_locals.next_multiple_of(WORD_SIZE as u16);
         self
     }
 
@@ -58,15 +58,6 @@ impl ProcedureContext {
     }
 }
 
-#[inline(always)]
-fn round_up_to_multiple_of_4(value: u16) -> u16 {
-    // For example, if value = 4,5,6,7
-    // value + 3 = 7,8,9,10
-    // value + 3 & !3 = 4,8,8,8 (&!3 clears the last two bits)
-    // as desired.
-    (value + 3) & !3
-}
-
 // ------------------------------------------------------------------------------------------------
 /// Public accessors
 impl ProcedureContext {
diff --git a/assembly/src/tests.rs b/assembly/src/tests.rs
index 20383751ff..e62802cce9 100644
--- a/assembly/src/tests.rs
+++ b/assembly/src/tests.rs
@@ -842,7 +842,7 @@ fn mem_operations_with_constants() -> TestResult {
     const.GLOBAL_STOREW_PTR={GLOBAL_STOREW_PTR}
     const.GLOBAL_LOADW_PTR={GLOBAL_LOADW_PTR}
 
-    proc.test_const_loc.24
+    proc.test_const_loc.12
         # constant should resolve using locaddr operation
         locaddr.PROC_LOC_STORE_PTR
 
@@ -885,7 +885,7 @@ fn mem_operations_with_constants() -> TestResult {
         &context,
         format!(
             "\
-    proc.test_const_loc.24
+    proc.test_const_loc.12
         # constant should resolve using locaddr operation
         locaddr.{PROC_LOC_STORE_PTR}
 
diff --git a/processor/src/chiplets/aux_trace/mod.rs b/processor/src/chiplets/aux_trace/mod.rs
index 7c6df3e841..b591844f10 100644
--- a/processor/src/chiplets/aux_trace/mod.rs
+++ b/processor/src/chiplets/aux_trace/mod.rs
@@ -965,9 +965,7 @@ where
 // HELPER FUNCTIONS
 // ================================================================================================
 
-/// Reduces a slice of elements to a single field element in the field specified by E using a slice
-/// of alphas of matching length. This can be used to build the value for a single word or for an
-/// entire [HasherState].
+/// Runs an inner product between the alphas and the elements.
 #[inline(always)]
 fn build_value<E: FieldElement<BaseField = Felt>, const N: usize>(
     alphas: &[E],
diff --git a/processor/src/chiplets/memory/mod.rs b/processor/src/chiplets/memory/mod.rs
index c0a4e78283..630c69b92d 100644
--- a/processor/src/chiplets/memory/mod.rs
+++ b/processor/src/chiplets/memory/mod.rs
@@ -104,7 +104,7 @@ impl Memory {
         self.num_trace_rows
     }
 
-    /// Returns a word located at the specified context/address, or None if the address hasn't
+    /// Returns the element located at the specified context/address, or None if the address hasn't
     /// been accessed previously.
     ///
     /// Unlike read() which modifies the memory access trace, this method returns the value at the
diff --git a/stdlib/asm/crypto/stark/constants.masm b/stdlib/asm/crypto/stark/constants.masm
index a77b487884..e6ff602f13 100644
--- a/stdlib/asm/crypto/stark/constants.masm
+++ b/stdlib/asm/crypto/stark/constants.masm
@@ -12,7 +12,7 @@ const.NUM_CONSTRAINT_COMPOSITION_COEF_ROUNDED_UP_TO_FOUR=232
 
 # Number of coefficients corresponds to "number of main & aux columns" + 8,
 # where "8" is the number of columns needed to store the constraint composition polynomial.
-const.NUM_DEEP_COMPOSITION_COEF_MULTIPLIED_BY_TWO_ROUNDED_UP_TO_FOUR=88
+const.NUM_DEEP_COMPOSITION_COEF_ROUNDED_UP_TO_FOUR=88
 
 # Number of random extension field coefficients related to the auxiliary trace (i.e. the alphas)
 const.NUM_AUX_TRACE_COEFS=16
@@ -185,8 +185,8 @@ export.num_constraint_composition_coef_multiplied_by_two_and_rounded_up_to_4
     push.NUM_CONSTRAINT_COMPOSITION_COEF_ROUNDED_UP_TO_FOUR
 end
 
-export.num_deep_composition_coef_multiplied_by_two_and_rounded_up_to_4
-    push.NUM_DEEP_COMPOSITION_COEF_MULTIPLIED_BY_TWO_ROUNDED_UP_TO_FOUR
+export.num_deep_composition_coef_rounded_up_to_4
+    push.NUM_DEEP_COMPOSITION_COEF_ROUNDED_UP_TO_FOUR
 end
 
 export.public_inputs_ptr
diff --git a/stdlib/asm/crypto/stark/random_coin.masm b/stdlib/asm/crypto/stark/random_coin.masm
index 3e94abb234..5489e4115f 100644
--- a/stdlib/asm/crypto/stark/random_coin.masm
+++ b/stdlib/asm/crypto/stark/random_coin.masm
@@ -483,7 +483,7 @@ end
 #! Cycles: 1624
 export.generate_deep_composition_random_coefficients
     # note that 88 is the next number after 86 divisible by 4
-    exec.constants::num_deep_composition_coef_multiplied_by_two_and_rounded_up_to_4
+    exec.constants::num_deep_composition_coef_rounded_up_to_4
     swap
     exec.generate_random_coefficients_pad
     #=> [...]
diff --git a/stdlib/tests/mem/mod.rs b/stdlib/tests/mem/mod.rs
index f5efa22b14..9d3fbbb906 100644
--- a/stdlib/tests/mem/mod.rs
+++ b/stdlib/tests/mem/mod.rs
@@ -100,15 +100,14 @@ fn test_pipe_double_words_to_memory() {
         use.std::sys
 
         begin
-            push.{}         # end_addr
-            push.{}         # write_addr
+            push.{end_addr}
+            push.{start_addr}
             padw padw padw  # hasher state
 
             exec.mem::pipe_double_words_to_memory
 
             exec.sys::truncate_stack
-        end",
-        end_addr, start_addr,
+        end"
     );
 
     let operand_stack = &[];

From 868bf1e78567539d2b1ae4cc42bb5b8edd788c9e Mon Sep 17 00:00:00 2001
From: Philippe Laferriere <plafer@protonmail.com>
Date: Fri, 10 Jan 2025 11:13:24 -0500
Subject: [PATCH 18/19] fix rounding up of locals

---
 assembly/src/assembler/instruction/env_ops.rs      |  2 +-
 assembly/src/assembler/instruction/mem_ops.rs      | 14 +++++++++++---
 assembly/src/assembler/mod.rs                      |  8 +++-----
 assembly/src/assembler/procedure.rs                |  6 ++----
 .../tests/integration/operations/io_ops/env_ops.rs |  6 ++++--
 5 files changed, 21 insertions(+), 15 deletions(-)

diff --git a/assembly/src/assembler/instruction/env_ops.rs b/assembly/src/assembler/instruction/env_ops.rs
index 2026af71f5..8688bc89ce 100644
--- a/assembly/src/assembler/instruction/env_ops.rs
+++ b/assembly/src/assembler/instruction/env_ops.rs
@@ -44,7 +44,7 @@ pub fn locaddr(
     index: u16,
     proc_ctx: &ProcedureContext,
 ) -> Result<(), AssemblyError> {
-    local_to_absolute_addr(block_builder, index, proc_ctx.num_locals())
+    local_to_absolute_addr(block_builder, index, proc_ctx.num_locals(), true)
 }
 
 /// Appends CALLER operation to the span which puts the hash of the function which initiated the
diff --git a/assembly/src/assembler/instruction/mem_ops.rs b/assembly/src/assembler/instruction/mem_ops.rs
index 98b5c5c9f1..0ef2568f71 100644
--- a/assembly/src/assembler/instruction/mem_ops.rs
+++ b/assembly/src/assembler/instruction/mem_ops.rs
@@ -33,7 +33,7 @@ pub fn mem_read(
     if let Some(addr) = addr {
         if is_local {
             let num_locals = proc_ctx.num_locals();
-            local_to_absolute_addr(block_builder, addr as u16, num_locals)?;
+            local_to_absolute_addr(block_builder, addr as u16, num_locals, is_single)?;
         } else {
             push_u32_value(block_builder, addr);
         }
@@ -81,7 +81,7 @@ pub fn mem_write_imm(
     is_single: bool,
 ) -> Result<(), AssemblyError> {
     if is_local {
-        local_to_absolute_addr(block_builder, addr as u16, proc_ctx.num_locals())?;
+        local_to_absolute_addr(block_builder, addr as u16, proc_ctx.num_locals(), is_single)?;
     } else {
         push_u32_value(block_builder, addr);
     }
@@ -113,6 +113,7 @@ pub fn local_to_absolute_addr(
     block_builder: &mut BasicBlockBuilder,
     index_of_local: u16,
     num_proc_locals: u16,
+    is_single: bool,
 ) -> Result<(), AssemblyError> {
     if num_proc_locals == 0 {
         return Err(AssemblyError::Other(
@@ -124,7 +125,14 @@ pub fn local_to_absolute_addr(
         ));
     }
 
-    let max = num_proc_locals - 1;
+    // If a single local value is being accessed, then the index can take the full range 
+    // [0, num_proc_locals - 1]. Otherwise, the index can take the range [0, num_proc_locals - 4] 
+    // to account for the fact that a full word is being accessed.
+    let max = if is_single {
+        num_proc_locals - 1
+    } else {
+        num_proc_locals - 4
+    };
     validate_param(index_of_local, 0..=max)?;
 
     // Local values are placed under the frame pointer, so we need to calculate the offset of the
diff --git a/assembly/src/assembler/mod.rs b/assembly/src/assembler/mod.rs
index af4de9de06..67337da077 100644
--- a/assembly/src/assembler/mod.rs
+++ b/assembly/src/assembler/mod.rs
@@ -568,19 +568,17 @@ impl Assembler {
         // Make sure the current procedure context is available during codegen
         let gid = proc_ctx.id();
 
-        // We expect the number of locals to be a multiple of the word size, having been rounded up
-        // if necessary.
         let num_locals = proc_ctx.num_locals();
-        assert_eq!(num_locals % WORD_SIZE as u16, 0);
 
         let wrapper_proc = self.module_graph.get_procedure_unsafe(gid);
         let proc = wrapper_proc.unwrap_ast().unwrap_procedure();
         let proc_body_id = if num_locals > 0 {
             // For procedures with locals, we need to update fmp register before and after the
             // procedure body is executed. Specifically:
-            // - to allocate procedure locals we need to increment fmp by the number of locals, and
+            // - to allocate procedure locals we need to increment fmp by the number of locals
+            //   (rounded up to the next multiple of the word size), and
             // - to deallocate procedure locals we need to decrement it by the same amount.
-            let locals_frame = Felt::from(num_locals);
+            let locals_frame = Felt::from(num_locals.next_multiple_of(WORD_SIZE as u16));
             let wrapper = BodyWrapper {
                 prologue: vec![Operation::Push(locals_frame), Operation::FmpUpdate],
                 epilogue: vec![Operation::Push(-locals_frame), Operation::FmpUpdate],
diff --git a/assembly/src/assembler/procedure.rs b/assembly/src/assembler/procedure.rs
index a2dbf74d6c..591cd5ac76 100644
--- a/assembly/src/assembler/procedure.rs
+++ b/assembly/src/assembler/procedure.rs
@@ -1,6 +1,6 @@
 use alloc::sync::Arc;
 
-use vm_core::{mast::MastNodeId, WORD_SIZE};
+use vm_core::mast::MastNodeId;
 
 use super::GlobalProcedureIndex;
 use crate::{
@@ -45,10 +45,8 @@ impl ProcedureContext {
     }
 
     /// Sets the number of locals to allocate for the procedure.
-    ///
-    /// The number of locals is rounded up to the nearest multiple of 4.
     pub fn with_num_locals(mut self, num_locals: u16) -> Self {
-        self.num_locals = num_locals.next_multiple_of(WORD_SIZE as u16);
+        self.num_locals = num_locals;
         self
     }
 
diff --git a/miden/tests/integration/operations/io_ops/env_ops.rs b/miden/tests/integration/operations/io_ops/env_ops.rs
index bd3552ca37..a74508c2c1 100644
--- a/miden/tests/integration/operations/io_ops/env_ops.rs
+++ b/miden/tests/integration/operations/io_ops/env_ops.rs
@@ -49,7 +49,7 @@ fn sdepth() {
 fn locaddr() {
     // --- locaddr returns expected address -------------------------------------------------------
     let source = "
-        proc.foo.8
+        proc.foo.5
             locaddr.0
             locaddr.4
         end
@@ -59,7 +59,9 @@ fn locaddr() {
         end";
 
     let test = build_test!(source, &[10]);
-    test.expect_stack(&[FMP_MIN + 4, FMP_MIN, 10]);
+    // Note: internally, the 5 declared locals are rounded up to 8 to keep the frame word-aligned,
+    // so the local addresses are offset from 8 rather than 5.
+    test.expect_stack(&[FMP_MIN + 7, FMP_MIN + 3, 10]);
 
     // --- accessing mem via locaddr updates the correct variables --------------------------------
     let source = "

From 90a979e2171cb00c6ae45c0f01a4d802fb8dce7d Mon Sep 17 00:00:00 2001
From: Philippe Laferriere <plafer@protonmail.com>
Date: Fri, 10 Jan 2025 11:26:09 -0500
Subject: [PATCH 19/19] PR fixes

---
 air/src/constraints/chiplets/memory/mod.rs    |  3 +--
 assembly/src/assembler/instruction/mem_ops.rs |  4 ++--
 stdlib/asm/crypto/dsa/rpo_falcon512.masm      | 12 ++++++------
 3 files changed, 9 insertions(+), 10 deletions(-)

diff --git a/air/src/constraints/chiplets/memory/mod.rs b/air/src/constraints/chiplets/memory/mod.rs
index 0a7d0048ad..729dd3138c 100644
--- a/air/src/constraints/chiplets/memory/mod.rs
+++ b/air/src/constraints/chiplets/memory/mod.rs
@@ -194,8 +194,7 @@ fn enforce_values<E: FieldElement>(
         let f3 = frame.idx1_next() * frame.idx0_next();
 
         let c_i = |f_i| {
-            // when we are operating on elements, z_i is set to 1 for all `i` which are not being
-            // accessed; otherwise it is set to 0.
+            // z_i is set to 1 when we are operating on elements but not the i-th element
             let z_i = binary_not(frame.is_word_access_next()) * binary_not(f_i);
             let is_read_next = frame.is_read_next();
 
diff --git a/assembly/src/assembler/instruction/mem_ops.rs b/assembly/src/assembler/instruction/mem_ops.rs
index 0ef2568f71..c3eaa36332 100644
--- a/assembly/src/assembler/instruction/mem_ops.rs
+++ b/assembly/src/assembler/instruction/mem_ops.rs
@@ -125,8 +125,8 @@ pub fn local_to_absolute_addr(
         ));
     }
 
-    // If a single local value is being accessed, then the index can take the full range 
-    // [0, num_proc_locals - 1]. Otherwise, the index can take the range [0, num_proc_locals - 4] 
+    // If a single local value is being accessed, then the index can take the full range
+    // [0, num_proc_locals - 1]. Otherwise, the index can take the range [0, num_proc_locals - 4]
     // to account for the fact that a full word is being accessed.
     let max = if is_single {
         num_proc_locals - 1
diff --git a/stdlib/asm/crypto/dsa/rpo_falcon512.masm b/stdlib/asm/crypto/dsa/rpo_falcon512.masm
index a5fa681363..e142df9ac4 100644
--- a/stdlib/asm/crypto/dsa/rpo_falcon512.masm
+++ b/stdlib/asm/crypto/dsa/rpo_falcon512.masm
@@ -75,7 +75,7 @@ end
 #! Input: [c_ptr, MSG, NONCE1, NONCE0, ...]
 #! Output: [...]
 #!
-#! Cycles: 1327
+#! Cycles: ~1400
 export.hash_to_point.8
     # Move pointer out of the way
     movdn.12
@@ -118,7 +118,7 @@ end
 #! Input: [tau1, tau0, tau_ptr, ...]
 #! Output: [tau_ptr + 513*4, ...]
 #!
-#! Cycles: 8323
+#! Cycles: ~8900
 export.powers_of_tau
 
     # 1) Save tau^0 i.e. (0, 1)
@@ -150,7 +150,7 @@ end
 #! Input: [ptr, ...]
 #! Output: [...]
 #!
-#! Cycles: 2607
+#! Cycles: ~3100
 export.set_to_zero
     padw
     repeat.512
@@ -484,7 +484,7 @@ end
 #! Input: [pi_ptr, ...]
 #! Output: [norm_sq(s1), ...]
 #!
-#! Cycles: 58888
+#! Cycles: ~59000
 export.compute_s1_norm_sq
     repeat.128
         # 1) Load the next 4 * 3 coefficients
@@ -561,7 +561,7 @@ end
 #! Input: [s2_ptr, ...]
 #! Output: [norm_sq(s2), ...]
 #!
-#! Cycles: 13322
+#! Cycles: ~13500
 export.compute_s2_norm_sq
     repeat.128
         padw
@@ -592,7 +592,7 @@ end
 #! Input: [PK, MSG, ...]
 #! Output: [...]
 #!
-#! Cycles: ~ 92029
+#! Cycles: ~ 90400
 export.verify.6660
 
     # 1) Generate a Falcon signature using the secret key associated to PK on message MSG.