exec: fix kernel launch id in instructions

romnn · Mar 4, 2024 · 970c1af · 970c1af
1 parent 41a888a
commit 970c1af
Show file tree

Hide file tree

Showing 7 changed files with 79 additions and 19 deletions.
diff --git a/benchmarks/src/lib.rs b/benchmarks/src/lib.rs
@@ -8,6 +8,7 @@
 )]
 // #![allow(warnings)]
 
+pub mod babelstream;
 pub mod matrixmul;
 pub mod pchase;
 pub mod simple_matrixmul;

diff --git a/exec/src/kernel.rs b/exec/src/kernel.rs
@@ -40,6 +40,7 @@ pub struct ThreadIndex {
     pub kernel_launch_id: u64,
     pub block_idx: trace_model::Dim,
     pub block_dim: trace_model::Dim,
+    pub grid_dim: trace_model::Dim,
     pub thread_idx: trace_model::Dim,
 }
 

diff --git a/exec/src/tracegen.rs b/exec/src/tracegen.rs
@@ -119,6 +119,11 @@ pub struct WarpId {
     pub warp_id_in_block: usize,
 }
 
+struct SharedMemAllocation {
+    start_addr: u64,
+    num_bytes: u64,
+}
+
 pub type WarpInstructionTraces = [Vec<model::ThreadInstruction>; WARP_SIZE as usize];
 
 pub struct Tracer {
@@ -130,6 +135,7 @@ pub struct Tracer {
     traced_instructions: std::sync::Mutex<HashMap<WarpId, WarpInstructionTraces>>,
     kernel_launch_id: atomic::AtomicU64,
     commands: std::sync::Mutex<Vec<trace_model::command::Command>>,
+    shared_mem_allocations: std::sync::Mutex<Vec<SharedMemAllocation>>,
 }
 
 impl Tracer {
@@ -140,6 +146,7 @@ impl Tracer {
             traced_instructions: std::sync::Mutex::new(HashMap::new()),
             kernel_launch_id: atomic::AtomicU64::new(0),
             commands: std::sync::Mutex::new(Vec::new()),
+            shared_mem_allocations: std::sync::Mutex::new(Vec::new()),
         })
     }
 }
@@ -316,6 +323,7 @@ impl Tracer {
                                     block_id: block_id.clone(),
                                     block_idx: block_id.to_dim(),
                                     block_dim: block_size.clone(),
+                                    grid_dim: grid.clone(),
                                     thread_idx: warp_thread_idx.to_dim(),
                                 };
 
@@ -521,17 +529,24 @@ impl TraceGenerator for Tracer {
         *offset = addr + num_bytes;
         *offset = utils::next_multiple(*offset, ALIGNMENT_BYTES);
 
-        self.commands
-            .lock()
-            .unwrap()
-            .push(trace_model::command::Command::MemAlloc(
-                trace_model::command::MemAlloc {
-                    allocation_name: options.name,
-                    device_ptr: base_addr + addr,
-                    fill_l2: options.fill_l2,
+        if options.mem_space == model::MemorySpace::Local {
+            // TODO: use a rangemap for this?
+            self.shared_mem_allocations
+                .lock()
+                .unwrap()
+                .push(SharedMemAllocation {
+                    start_addr: base_addr + addr,
                     num_bytes,
-                },
-            ));
+                });
+        } else {
+            let cmd = trace_model::command::Command::MemAlloc(trace_model::command::MemAlloc {
+                allocation_name: options.name,
+                device_ptr: base_addr + addr,
+                fill_l2: options.fill_l2,
+                num_bytes,
+            });
+            self.commands.lock().unwrap().push(cmd);
+        }
 
         DevicePtr {
             inner: value,
@@ -586,7 +601,7 @@ impl TraceGenerator for Tracer {
                 cuda_ctx: 0,
                 device_id: 0,
                 sm_id: 0,
-                kernel_id: 0,
+                kernel_id: kernel_launch_id,
                 block_id: block_id.clone().into(),
                 warp_id_in_sm: warp_id_in_block as u32,
                 warp_id_in_block: warp_id_in_block as u32,
@@ -767,6 +782,16 @@ impl TraceGenerator for Tracer {
             });
         }
 
+        let shared_mem_bytes: u32 = self
+            .shared_mem_allocations
+            .lock()
+            .unwrap()
+            .iter()
+            .map(|alloc| alloc.num_bytes)
+            .sum::<u64>()
+            .try_into()
+            .unwrap();
+
         let trace = trace_model::MemAccessTrace(trace);
         let launch_config = trace_model::command::KernelLaunch {
             mangled_name: kernel_name.clone(),
@@ -775,7 +800,7 @@ impl TraceGenerator for Tracer {
             id: kernel_launch_id,
             grid,
             block: block_size,
-            shared_mem_bytes: 0,
+            shared_mem_bytes,
             num_registers: 0,
             binary_version: 61,
             stream_id: 0,

diff --git a/src/cache/data.rs b/src/cache/data.rs
@@ -941,8 +941,10 @@ where
             WritePolicy::WRITE_BACK => Self::write_hit_write_back,
             WritePolicy::WRITE_THROUGH => unimplemented!("WritePolicy::WRITE_THROUGH"),
             WritePolicy::WRITE_EVICT => unimplemented!("WritePolicy::WRITE_EVICT"),
-            WritePolicy::LOCAL_WRITE_BACK_GLOBAL_WRITE_THROUGH => unimplemented!("WritePolicy::LOCAL_WB_GLOBAL_WT"),
-            // WritePolicy::LOCAL_WB_GLOBAL_WT => Self::write_hit_global_write_evict_local_write_back,
+            // WritePolicy::LOCAL_WRITE_BACK_GLOBAL_WRITE_THROUGH => unimplemented!("WritePolicy::LOCAL_WB_GLOBAL_WT"),
+            // we only observe this policy being used in exec-driven mode,
+            // where the L1 has write hits
+            WritePolicy::LOCAL_WRITE_BACK_GLOBAL_WRITE_THROUGH => Self::write_hit_global_write_evict_local_write_back,
         };
         (func)(self, addr, cache_index, fetch, time, events, probe_status)
     }

diff --git a/src/core.rs b/src/core.rs
@@ -976,8 +976,11 @@ where
 
             let kernel_id = warp.kernel_id;
             let block_hw_id = warp.block_id as usize;
-            debug_assert!(block_hw_id < self.active_threads_per_hardware_block.len(),);
+            debug_assert!(block_hw_id < self.active_threads_per_hardware_block.len());
 
+            // TODO: let's maybe move all these more expensive checks
+            // into the issue stage?
+            // using the scoreboard here just for that does not really make sense...
             let has_pending_writes = !self.scoreboard.pending_writes(warp_id).is_empty();
 
             let warp_completed = warp.hardware_done() && !has_pending_writes && !warp.done_exit();
@@ -2724,6 +2727,12 @@ where
 pub fn warp_inst_complete(instr: &mut WarpInstruction, stats: &mut stats::PerKernel) {
     let kernel_stats = stats.get_mut(Some(instr.kernel_launch_id as usize));
     kernel_stats.sim.instructions += instr.active_thread_count() as u64;
+    // log::error!(
+    //     "kernel {}: warp inst {} completed. instructions={}",
+    //     instr.kernel_launch_id,
+    //     &instr,
+    //     kernel_stats.sim.instructions
+    // );
     // crate::WIP_STATS.lock().warp_instructions += 1;
 }
 

diff --git a/src/instruction.rs b/src/instruction.rs
@@ -542,6 +542,7 @@ impl WarpInstruction {
             _ => {}
         }
 
+        assert_eq!(trace.kernel_id, launch_config.id);
         Self {
             uid: 0,
             warp_id: trace.warp_id_in_block as usize,
@@ -941,6 +942,9 @@ impl WarpInstruction {
             }
             Some(MemorySpace::Global | MemorySpace::Local) => {
                 let access_kind = self.access_kind().expect("has access kind");
+                if self.memory_space == Some(MemorySpace::Local) {
+                    panic!("have local");
+                }
                 if config.coalescing_arch as usize >= 13 {
                     if self.is_atomic() {
                         // memory_coalescing_arch_atomic(is_write, access_type);
@@ -1013,6 +1017,8 @@ impl WarpInstruction {
             crate::mem_sub_partition::SECTOR_SIZE as u64,
             "require sector segment size for sectored L1"
         );
+
+        // todo: change this back to 32
         let subwarp_size = config.warp_size / warp_parts;
         log::trace!(
             "memory_coalescing_arch {:?}: segment size={} subwarp size={}",
@@ -1024,6 +1030,7 @@ impl WarpInstruction {
         // let mut accesses: Vec<MemAccess> = Vec::new();
         // let mut accesses: SmallVec<[MemAccess; 32]> = SmallVec::new();
 
+        // todo: warp parts should be 1
         for subwarp in 0..warp_parts {
             // let mut subwarp_transactions: HashMap<address, TransactionInfo> = HashMap::new();
             use vec_collections::VecMap;
@@ -1134,7 +1141,7 @@ impl WarpInstruction {
             //     subwarp_accesses,
             // );
 
-            if true || log::log_enabled!(log::Level::Trace) {
+            if log::log_enabled!(log::Level::Warn) {
                 let allocations = allocations.read();
                 for (i, (block_addr, subwarp_access)) in subwarp_accesses.iter().enumerate() {
                     let (last_block_addr, _) = subwarp_accesses[i.saturating_sub(1)];
@@ -1164,14 +1171,21 @@ impl WarpInstruction {
                         .join("|");
 
                     let rel_block_addr = allocations
-                        .get(&block_addr)
+                        .get(block_addr)
                         .map(|allocation| block_addr - allocation.start_addr);
 
+                    let addr =
+                        *block_addr + subwarp_access.byte_mask.first_one().unwrap_or(0) as u64;
+                    let rel_addr = allocations
+                        .get(&addr)
+                        .map(|allocation| addr - allocation.start_addr);
+
                     log::warn!(
-                        " [{: >2}] {:>18} {:>6} ({}{:<4}): chunk={:>4} floats={} activemask={}",
+                        " [{: >2}] rel={:>6} block={:>18} ({}{:<4}): chunk={:>4} floats={} activemask={}",
                         i,
+                        rel_addr.unwrap_or(0),
                         block_addr,
-                        rel_block_addr.unwrap_or(0),
+                        // rel_block_addr.unwrap_or(0),
                         if diff < 0 { "-" } else { "+" },
                         diff.abs(),
                         subwarp_access.chunk_mask[..4].to_bit_string(),

diff --git a/src/mem_fetch.rs b/src/mem_fetch.rs
@@ -679,3 +679,11 @@ impl MemFetch {
         }
     }
 }
+
+#[cfg(test)]
+pub mod tests {
+    #[test]
+    fn mem_fetch_size() {
+        assert_eq!(std::mem::size_of::<super::MemFetch>(), 0);
+    }
+}