Skip to content

Commit

Permalink
exec: fix kernel launch id in instructions
Browse files Browse the repository at this point in the history
  • Loading branch information
romnn committed Mar 4, 2024
1 parent 41a888a commit 970c1af
Show file tree
Hide file tree
Showing 7 changed files with 79 additions and 19 deletions.
1 change: 1 addition & 0 deletions benchmarks/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
)]
// #![allow(warnings)]

pub mod babelstream;
pub mod matrixmul;
pub mod pchase;
pub mod simple_matrixmul;
Expand Down
1 change: 1 addition & 0 deletions exec/src/kernel.rs
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,7 @@ pub struct ThreadIndex {
pub kernel_launch_id: u64,
pub block_idx: trace_model::Dim,
pub block_dim: trace_model::Dim,
pub grid_dim: trace_model::Dim,
pub thread_idx: trace_model::Dim,
}

Expand Down
49 changes: 37 additions & 12 deletions exec/src/tracegen.rs
Original file line number Diff line number Diff line change
Expand Up @@ -119,6 +119,11 @@ pub struct WarpId {
pub warp_id_in_block: usize,
}

/// Record of one allocation that the tracer keeps in its own list
/// (`Tracer::shared_mem_allocations`) rather than emitting as a `MemAlloc` command.
///
/// Entries are pushed by the allocation path when
/// `options.mem_space == model::MemorySpace::Local`; the `num_bytes` of all
/// entries are later summed to fill `shared_mem_bytes` of the kernel launch.
///
/// NOTE(review): the name says "shared" while the push site tests for
/// `MemorySpace::Local` — confirm which memory space is intended.
struct SharedMemAllocation {
/// Start address of the allocation (`base_addr + addr` at the push site).
start_addr: u64,
/// Size of the allocation in bytes.
num_bytes: u64,
}

pub type WarpInstructionTraces = [Vec<model::ThreadInstruction>; WARP_SIZE as usize];

pub struct Tracer {
Expand All @@ -130,6 +135,7 @@ pub struct Tracer {
traced_instructions: std::sync::Mutex<HashMap<WarpId, WarpInstructionTraces>>,
kernel_launch_id: atomic::AtomicU64,
commands: std::sync::Mutex<Vec<trace_model::command::Command>>,
shared_mem_allocations: std::sync::Mutex<Vec<SharedMemAllocation>>,
}

impl Tracer {
Expand All @@ -140,6 +146,7 @@ impl Tracer {
traced_instructions: std::sync::Mutex::new(HashMap::new()),
kernel_launch_id: atomic::AtomicU64::new(0),
commands: std::sync::Mutex::new(Vec::new()),
shared_mem_allocations: std::sync::Mutex::new(Vec::new()),
})
}
}
Expand Down Expand Up @@ -316,6 +323,7 @@ impl Tracer {
block_id: block_id.clone(),
block_idx: block_id.to_dim(),
block_dim: block_size.clone(),
grid_dim: grid.clone(),
thread_idx: warp_thread_idx.to_dim(),
};

Expand Down Expand Up @@ -521,17 +529,24 @@ impl TraceGenerator for Tracer {
*offset = addr + num_bytes;
*offset = utils::next_multiple(*offset, ALIGNMENT_BYTES);

self.commands
.lock()
.unwrap()
.push(trace_model::command::Command::MemAlloc(
trace_model::command::MemAlloc {
allocation_name: options.name,
device_ptr: base_addr + addr,
fill_l2: options.fill_l2,
if options.mem_space == model::MemorySpace::Local {
// TODO: use a rangemap for this?
self.shared_mem_allocations
.lock()
.unwrap()
.push(SharedMemAllocation {
start_addr: base_addr + addr,
num_bytes,
},
));
});
} else {
let cmd = trace_model::command::Command::MemAlloc(trace_model::command::MemAlloc {
allocation_name: options.name,
device_ptr: base_addr + addr,
fill_l2: options.fill_l2,
num_bytes,
});
self.commands.lock().unwrap().push(cmd);
}

DevicePtr {
inner: value,
Expand Down Expand Up @@ -586,7 +601,7 @@ impl TraceGenerator for Tracer {
cuda_ctx: 0,
device_id: 0,
sm_id: 0,
kernel_id: 0,
kernel_id: kernel_launch_id,
block_id: block_id.clone().into(),
warp_id_in_sm: warp_id_in_block as u32,
warp_id_in_block: warp_id_in_block as u32,
Expand Down Expand Up @@ -767,6 +782,16 @@ impl TraceGenerator for Tracer {
});
}

let shared_mem_bytes: u32 = self
.shared_mem_allocations
.lock()
.unwrap()
.iter()
.map(|alloc| alloc.num_bytes)
.sum::<u64>()
.try_into()
.unwrap();

let trace = trace_model::MemAccessTrace(trace);
let launch_config = trace_model::command::KernelLaunch {
mangled_name: kernel_name.clone(),
Expand All @@ -775,7 +800,7 @@ impl TraceGenerator for Tracer {
id: kernel_launch_id,
grid,
block: block_size,
shared_mem_bytes: 0,
shared_mem_bytes,
num_registers: 0,
binary_version: 61,
stream_id: 0,
Expand Down
6 changes: 4 additions & 2 deletions src/cache/data.rs
Original file line number Diff line number Diff line change
Expand Up @@ -941,8 +941,10 @@ where
WritePolicy::WRITE_BACK => Self::write_hit_write_back,
WritePolicy::WRITE_THROUGH => unimplemented!("WritePolicy::WRITE_THROUGH"),
WritePolicy::WRITE_EVICT => unimplemented!("WritePolicy::WRITE_EVICT"),
WritePolicy::LOCAL_WRITE_BACK_GLOBAL_WRITE_THROUGH => unimplemented!("WritePolicy::LOCAL_WB_GLOBAL_WT"),
// WritePolicy::LOCAL_WB_GLOBAL_WT => Self::write_hit_global_write_evict_local_write_back,
// WritePolicy::LOCAL_WRITE_BACK_GLOBAL_WRITE_THROUGH => unimplemented!("WritePolicy::LOCAL_WB_GLOBAL_WT"),
// this policy is only observed in exec-driven mode,
// where the L1 has write hits
WritePolicy::LOCAL_WRITE_BACK_GLOBAL_WRITE_THROUGH => Self::write_hit_global_write_evict_local_write_back,
};
(func)(self, addr, cache_index, fetch, time, events, probe_status)
}
Expand Down
11 changes: 10 additions & 1 deletion src/core.rs
Original file line number Diff line number Diff line change
Expand Up @@ -976,8 +976,11 @@ where

let kernel_id = warp.kernel_id;
let block_hw_id = warp.block_id as usize;
debug_assert!(block_hw_id < self.active_threads_per_hardware_block.len(),);
debug_assert!(block_hw_id < self.active_threads_per_hardware_block.len());

// TODO: let's maybe move all these more expensive checks
// into the issue stage?
// using the scoreboard here just for that does not really make sense...
let has_pending_writes = !self.scoreboard.pending_writes(warp_id).is_empty();

let warp_completed = warp.hardware_done() && !has_pending_writes && !warp.done_exit();
Expand Down Expand Up @@ -2724,6 +2727,12 @@ where
/// Accounts for a completed warp instruction in the per-kernel statistics.
///
/// The statistics entry is selected by the instruction's `kernel_launch_id`,
/// and the instruction's active thread count is added to that entry's
/// `sim.instructions` counter.
pub fn warp_inst_complete(instr: &mut WarpInstruction, stats: &mut stats::PerKernel) {
    let launch_id = instr.kernel_launch_id as usize;
    let per_kernel = stats.get_mut(Some(launch_id));
    let completed_threads = instr.active_thread_count() as u64;
    per_kernel.sim.instructions += completed_threads;
}

Expand Down
22 changes: 18 additions & 4 deletions src/instruction.rs
Original file line number Diff line number Diff line change
Expand Up @@ -542,6 +542,7 @@ impl WarpInstruction {
_ => {}
}

assert_eq!(trace.kernel_id, launch_config.id);
Self {
uid: 0,
warp_id: trace.warp_id_in_block as usize,
Expand Down Expand Up @@ -941,6 +942,9 @@ impl WarpInstruction {
}
Some(MemorySpace::Global | MemorySpace::Local) => {
let access_kind = self.access_kind().expect("has access kind");
if self.memory_space == Some(MemorySpace::Local) {
panic!("have local");
}
if config.coalescing_arch as usize >= 13 {
if self.is_atomic() {
// memory_coalescing_arch_atomic(is_write, access_type);
Expand Down Expand Up @@ -1013,6 +1017,8 @@ impl WarpInstruction {
crate::mem_sub_partition::SECTOR_SIZE as u64,
"require sector segment size for sectored L1"
);

// todo: change this back to 32
let subwarp_size = config.warp_size / warp_parts;
log::trace!(
"memory_coalescing_arch {:?}: segment size={} subwarp size={}",
Expand All @@ -1024,6 +1030,7 @@ impl WarpInstruction {
// let mut accesses: Vec<MemAccess> = Vec::new();
// let mut accesses: SmallVec<[MemAccess; 32]> = SmallVec::new();

// todo: warp parts should be 1
for subwarp in 0..warp_parts {
// let mut subwarp_transactions: HashMap<address, TransactionInfo> = HashMap::new();
use vec_collections::VecMap;
Expand Down Expand Up @@ -1134,7 +1141,7 @@ impl WarpInstruction {
// subwarp_accesses,
// );

if true || log::log_enabled!(log::Level::Trace) {
if log::log_enabled!(log::Level::Warn) {
let allocations = allocations.read();
for (i, (block_addr, subwarp_access)) in subwarp_accesses.iter().enumerate() {
let (last_block_addr, _) = subwarp_accesses[i.saturating_sub(1)];
Expand Down Expand Up @@ -1164,14 +1171,21 @@ impl WarpInstruction {
.join("|");

let rel_block_addr = allocations
.get(&block_addr)
.get(block_addr)
.map(|allocation| block_addr - allocation.start_addr);

let addr =
*block_addr + subwarp_access.byte_mask.first_one().unwrap_or(0) as u64;
let rel_addr = allocations
.get(&addr)
.map(|allocation| addr - allocation.start_addr);

log::warn!(
" [{: >2}] {:>18} {:>6} ({}{:<4}): chunk={:>4} floats={} activemask={}",
" [{: >2}] rel={:>6} block={:>18} ({}{:<4}): chunk={:>4} floats={} activemask={}",
i,
rel_addr.unwrap_or(0),
block_addr,
rel_block_addr.unwrap_or(0),
// rel_block_addr.unwrap_or(0),
if diff < 0 { "-" } else { "+" },
diff.abs(),
subwarp_access.chunk_mask[..4].to_bit_string(),
Expand Down
8 changes: 8 additions & 0 deletions src/mem_fetch.rs
Original file line number Diff line number Diff line change
Expand Up @@ -679,3 +679,11 @@ impl MemFetch {
}
}
}

#[cfg(test)]
pub mod tests {
    /// Size probe for `MemFetch`.
    ///
    /// The expected value of `0` only holds for a zero-sized type, so for any
    /// other layout this assertion fails and its message reports the actual
    /// size in bytes. Because of that the test is ignored by default so it
    /// does not break regular test runs; execute it on demand with
    /// `cargo test -- --ignored mem_fetch_size`.
    ///
    /// NOTE(review): once the intended size budget for `MemFetch` is known,
    /// replace `0` with that value and drop the `#[ignore]`.
    #[test]
    #[ignore = "size probe: fails on purpose to report size_of::<MemFetch>()"]
    fn mem_fetch_size() {
        assert_eq!(std::mem::size_of::<super::MemFetch>(), 0);
    }
}

0 comments on commit 970c1af

Please sign in to comment.