diff --git a/src/gemm.rs b/src/gemm.rs
index 8caf1996..ea67af50 100644
--- a/src/gemm.rs
+++ b/src/gemm.rs
@@ -1054,6 +1054,7 @@ mod tests {
     use std::error::Error;
     use std::mem::MaybeUninit;
     use std::ops::Range;
+    use std::time::Instant;
 
     use rten_bench::run_bench;
     use rten_tensor::prelude::*;
@@ -1775,8 +1776,6 @@ mod tests {
         Ok(())
     }
 
-    use crate::timer::Timer;
-
     // Run with `cargo test --release bench_gemm -- --nocapture --ignored`
     #[test]
    #[ignore]
@@ -1863,12 +1862,11 @@ mod tests {
             Tensor::rand(&[k, n], &mut rng)
         };
 
-        let mut t = Timer::new();
-        t.start();
+        let start = Instant::now();
         for _i in 0..iters {
             run_gemm(&mut result, &a, &b, 1., 0., None, None);
         }
-        t.end();
+        let duration = start.elapsed();
         // Calculate throughput. For comparison, the theoretical maximum
         // GFLOPS for a single core (`RAYON_NUM_THREADS=1`) can be computed
         // with:
@@ -1887,8 +1885,9 @@ mod tests {
         // `fma_units` is 2. For a 3.4Ghz CPU this would give a max
         // theoretical peak of 3.4 * 8 * 2 * 2 = 108.8 GFLOPS.
 
-        let flops = (2 * m * n * k * iters as usize) as f32 / t.elapsed_secs();
+        let flops = (2 * m * n * k * iters as usize) as f32 / duration.as_secs_f32();
         let gflops = flops / (10f32).powi(9);
+        let duration_ms = duration.as_secs_f64() * 1000.0;
 
         println!(
             "m {} n {} k {} iters {}. Duration {}ms ({}ms/iter). GFLOPS {}",
@@ -1896,8 +1895,8 @@ mod tests {
             n,
             k,
             iters,
-            t.elapsed_ms(),
-            t.elapsed_ms() / iters as f32,
+            duration_ms,
+            duration_ms / iters as f64,
             gflops,
         );
     }
diff --git a/src/graph.rs b/src/graph.rs
index 892d464e..09d13513 100644
--- a/src/graph.rs
+++ b/src/graph.rs
@@ -2,6 +2,7 @@ use std::error::Error;
 use std::fmt;
 use std::iter::zip;
 use std::sync::{Arc, Mutex};
+use std::time::{Duration, Instant};
 
 use rten_tensor::prelude::*;
 use rten_tensor::{DynLayout, Tensor, TensorView};
@@ -18,7 +19,6 @@ use crate::env::env_flag;
 use crate::ops::{Input, InputList, InputOrOutput, OpError, Operator, Output, OutputList};
 use crate::tensor_pool::TensorPool;
 use crate::threading;
-use crate::timer::Timer;
 use crate::timing::{InputShape, RunTiming, TimingRecord, TimingSort};
 
 /// Represents the size of a dimension of a runtime-provided value, such as
@@ -533,11 +533,6 @@ impl Graph {
     ) -> Result<Vec<Output>, RunError> {
         let opts = opts.unwrap_or_default();
 
-        let mut run_timer = Timer::new();
-        if opts.timing {
-            run_timer.start();
-        }
-
         let mut temp_values: FxHashMap<NodeId, Output> = FxHashMap::default();
 
         // Extract all the owned tensor inputs into the temp value map.
@@ -604,7 +599,8 @@ impl Graph {
         } else {
             Vec::new()
         };
-        let mut alloc_timer = Timer::new();
+
+        let mut op_start = Instant::now();
 
         for (step, &op_node_id) in plan.iter().enumerate() {
             let Some(Node::Operator(op_node)) = self.nodes.get(op_node_id) else {
@@ -612,10 +608,6 @@ impl Graph {
                     "operator node not found".to_string(),
                 ));
             };
-            let mut op_timer = Timer::new();
-            if record_timing {
-                op_timer.start();
-            }
 
             // Choose the input that we'll try to modify in-place to avoid
             // allocating a new buffer for the output. This will be passed as
@@ -713,20 +705,12 @@ impl Graph {
             };
             std::mem::drop(op_inputs);
 
-            // Record timings. We do this even if the operation failed so logs
-            // will contain details of the failed operation in the event of an
-            // error.
-            if record_timing {
-                op_timer.end();
-                op_timing_records.push(TimingRecord {
-                    name: op_node.operator.name(),
-                    input_shapes: input_shapes.clone(),
-                    elapsed_micros: op_timer.elapsed_micros(),
-                    node_name: op_node.name.as_deref().unwrap_or(""),
-                });
-            }
+            // Print verbose logs if enabled. This is done before checking the
+            // op's result, so logs will contain details of the failed operation
+            // in the event of an error.
             if opts.verbose {
-                self.print_op_timing(step, op_node, &op_result, &op_timer, &input_shapes);
+                let op_duration = op_start.elapsed();
+                self.print_op_timing(step, op_node, &op_result, op_duration, &input_shapes);
             }
 
             // Extract outputs or fail if an error occurred.
@@ -750,7 +734,6 @@ impl Graph {
             );
 
             // Remove temporary values that are no longer needed
-            record_timing.then(|| alloc_timer.start());
             for node_id in op_node.inputs.iter().filter_map(|node| *node) {
                 let rc = temp_value_refcount.dec(node_id);
                 if rc == Some(0) {
@@ -759,19 +742,23 @@ impl Graph {
                     }
                 }
             }
-            record_timing.then(|| alloc_timer.end());
+
+            if record_timing {
+                let op_end = Instant::now();
+                let op_duration = op_end - op_start;
+                op_start = op_end;
+
+                op_timing_records.push(TimingRecord {
+                    name: op_node.operator.name(),
+                    input_shapes,
+                    elapsed: op_duration,
+                    node_name: op_node.name.as_deref().unwrap_or(""),
+                });
+            }
         }
 
         if opts.timing {
-            run_timer.end();
-            self.print_run_timing(
-                plan,
-                &pool,
-                run_timer,
-                alloc_timer,
-                &op_timing_records,
-                &opts,
-            );
+            self.print_run_timing(plan, &pool, &op_timing_records, &opts);
         }
 
         // Return the requested outputs
@@ -796,7 +783,7 @@ impl Graph {
         step: usize,
         op_node: &OperatorNode,
         op_result: &Result<OutputList, OpError>,
-        op_timer: &Timer,
+        op_duration: Duration,
         input_shapes: &[InputShape],
     ) {
         println!(
@@ -819,7 +806,7 @@ impl Graph {
             }
         }
 
-        println!(" time: {}ms", op_timer.elapsed_ms());
+        println!(" time: {}ms", op_duration.as_secs_f64() * 1000.0);
     }
 
     /// Print a profiling summary at the end of the run.
@@ -827,15 +814,15 @@ impl Graph {
         &self,
         plan: &[NodeId],
         pool: &TensorPool,
-        run_timer: Timer,
-        alloc_timer: Timer,
         op_timing_records: &[TimingRecord],
         opts: &RunOptions,
     ) {
+        let run_duration: Duration = op_timing_records.iter().map(|r| r.elapsed).sum();
+        let run_duration_ms = run_duration.as_secs_f64() * 1000.0;
         println!(
-            "Graph run of {} ops finished in {}ms",
+            "Graph run of {} ops finished in {:.3}ms",
             plan.len(),
-            run_timer.elapsed_ms()
+            run_duration_ms,
         );
         println!(
             "Pool allocs {} hits {}",
@@ -844,8 +831,7 @@ impl Graph {
         );
         let timing = RunTiming {
             records: op_timing_records,
-            alloc_time: alloc_timer.elapsed_ms(),
-            total_time: run_timer.elapsed_ms(),
+            total_time: run_duration,
         };
         print!(
             "{}",
diff --git a/src/lib.rs b/src/lib.rs
index 3dbbb1c1..5aae556b 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -104,7 +104,6 @@ mod optimize;
 mod slice_reductions;
 mod tensor_pool;
 mod threading;
-mod timer;
 mod timing;
 
 #[cfg(feature = "wasm_api")]
@@ -122,7 +121,6 @@ pub use model_metadata::ModelMetadata;
 pub use ops::{FloatOperators, Input, InputOrOutput, Operators, Output};
 pub use tensor_pool::{ExtractBuffer, PoolRef, TensorPool};
 pub use threading::{thread_pool, ThreadPool};
-pub use timer::Timer;
 pub use timing::TimingSort;
 
 #[allow(dead_code, unused_imports)]
diff --git a/src/timer.rs b/src/timer.rs
deleted file mode 100644
index fcd25a20..00000000
--- a/src/timer.rs
+++ /dev/null
@@ -1,54 +0,0 @@
-use std::time::Instant;
-
-/// Utility for recording the cumulative time spent in an operation.
-#[doc(hidden)] // Not intended for external use
-pub struct Timer {
-    start: Option<Instant>,
-    elapsed: u64,
-}
-
-impl Timer {
-    /// Create a new, inactive timer with zero elapsed time
-    pub fn new() -> Timer {
-        Timer {
-            start: None,
-            elapsed: 0,
-        }
-    }
-
-    /// Start the timer, or reset it if already active
-    pub fn start(&mut self) {
-        self.start = Some(Instant::now());
-    }
-
-    /// Stop active timer and add elapsed time to the total returned by `elapsed`
-    pub fn end(&mut self) {
-        if let Some(start) = self.start {
-            self.elapsed += start.elapsed().as_micros() as u64;
-        }
-    }
-
-    /// Return the cumulative elapsed time between calls to `start` and `end`
-    /// in milliseconds.
-    pub fn elapsed_ms(&self) -> f32 {
-        (self.elapsed as f32) / 1000.0
-    }
-
-    /// Return the cumulative elapsed time between calls to `start` and `end`
-    /// in microseconds.
-    pub fn elapsed_micros(&self) -> f32 {
-        self.elapsed as f32
-    }
-
-    /// Return the cumulative elapsed time between calls to `start` and `end`
-    /// in seconds.
-    pub fn elapsed_secs(&self) -> f32 {
-        self.elapsed as f32 / 1_000_000.0
-    }
-}
-
-impl Default for Timer {
-    fn default() -> Self {
-        Self::new()
-    }
-}
diff --git a/src/timing.rs b/src/timing.rs
index 01cb2809..b42824d8 100644
--- a/src/timing.rs
+++ b/src/timing.rs
@@ -1,5 +1,6 @@
 use std::collections::HashMap;
 use std::fmt;
+use std::time::Duration;
 
 use smallvec::SmallVec;
 
@@ -100,19 +101,11 @@ pub struct RunTiming<'a> {
     /// Records of graph step execution times
     pub records: &'a [TimingRecord<'a>],
 
-    /// Total time spent in memory allocations or de-allocations that were
-    /// tracked.
-    pub alloc_time: f32,
-
-    /// Total time for the graph run.
-    pub total_time: f32,
+    /// Total time for the graph run
+    pub total_time: Duration,
 }
 
 impl<'a> RunTiming<'a> {
-    fn total_op_time(&self) -> f32 {
-        self.records.iter().map(|tr| tr.elapsed_micros).sum::<f32>() / 1000.0
-    }
-
     /// Return a struct that formats output with the given options.
     pub fn display(&self, sort: TimingSort, include_shapes: bool) -> impl fmt::Display + '_ {
         FormattedRunTiming {
@@ -136,8 +129,8 @@ struct TimingByShapeRecord {
     /// Formatted representation of input shapes
     shape: String,
 
-    /// Total elapsed time for this operator with a given shape
-    total_ms: f32,
+    /// Total elapsed time for this operator
+    total_duration: Duration,
 
     /// Number of times the operator was run with this shape
     count: usize,
@@ -182,15 +175,14 @@ impl Table for TimingByShapeTable {
 
     fn cell(&self, row: usize, col: usize) -> String {
         let row = self.rows.get(row).expect("invalid row");
+        let total_ms = row.total_duration.as_secs_f64() * 1000.0;
+        let total_ns = total_ms * 1_000_000.0;
         match col {
             0 => row.shape.clone(),
             1 => row.count.to_string(),
-            2 => format!("{:.3}", row.total_ms / row.count as f32),
-            3 => format!("{:.3}", row.total_ms),
-            4 => format!(
-                "{:.3}",
-                (row.total_ms * 1_000_000.0) / (row.input_elements * row.count) as f32
-            ),
+            2 => format!("{:.3}", total_ms / row.count as f64),
+            3 => format!("{:.3}", total_ms),
+            4 => format!("{:.3}", total_ns / (row.input_elements * row.count) as f64),
             5 => row.node_name.clone(),
             _ => panic!("invalid column"),
         }
@@ -255,20 +247,20 @@ impl<'a> FormattedRunTiming<'a> {
                     })
                     .sum::<usize>();
                 let (cum_time, count, _, _) = timings.entry(formatted_shapes).or_insert((
-                    0.,
+                    Duration::ZERO,
                     0,
                     input_elements,
                     record.node_name,
                 ));
-                *cum_time += record.elapsed_micros / 1000.0;
+                *cum_time += record.elapsed;
                 *count += 1;
                 timings
             })
             .into_iter()
             .map(
-                |(shape, (total_ms, count, input_elements, node_name))| TimingByShapeRecord {
+                |(shape, (total, count, input_elements, node_name))| TimingByShapeRecord {
                     shape,
-                    total_ms,
+                    total_duration: total,
                     count,
                     input_elements,
                     node_name: node_name.to_string(),
@@ -276,7 +268,7 @@ impl<'a> FormattedRunTiming<'a> {
             )
             .collect();
 
-        time_by_shape_rows.sort_by(|a, b| a.total_ms.total_cmp(&b.total_ms).reverse());
+        time_by_shape_rows.sort_by(|a, b| a.total_duration.cmp(&b.total_duration).reverse());
 
         TimingByShapeTable {
             rows: time_by_shape_rows,
@@ -291,33 +283,27 @@ impl<'a> fmt::Display for FormattedRunTiming<'a> {
             .records
             .iter()
             .fold(HashMap::new(), |mut timings, record| {
-                let total_op_time = timings.entry(record.name).or_insert(0.);
-                *total_op_time += record.elapsed_micros / 1000.0;
+                let total_op_time = timings.entry(record.name).or_insert(Duration::ZERO);
+                *total_op_time += record.elapsed;
                 timings
             })
             .into_iter()
             .collect();
 
-        // Add `[Other]` for all unaccounted time.
-        let total_op_time = self.timing.total_op_time();
-        op_timings.push((
-            "[Other]",
-            self.timing.total_time - total_op_time - self.timing.alloc_time,
-        ));
-        op_timings.push(("[Mem alloc/free]", self.timing.alloc_time));
-
         op_timings.sort_by(|(a_name, a_time), (b_name, b_time)| match self.sort {
             TimingSort::ByName => a_name.cmp(b_name),
-            TimingSort::ByTime => a_time.total_cmp(b_time).reverse(),
+            TimingSort::ByTime => a_time.cmp(b_time).reverse(),
         });
 
         let rows: Vec<_> = op_timings
-            .iter()
+            .into_iter()
             .map(|(op_name, op_total_time)| {
-                let run_percent = (*op_total_time / self.timing.total_time) * 100.;
+                let op_total_time_ms = op_total_time.as_secs_f64() * 1000.0;
+                let total_time_ms = self.timing.total_time.as_secs_f64() * 1000.0;
+                let run_percent = (op_total_time_ms / total_time_ms) * 100.;
                 [
                     op_name.to_string(),
-                    format!("{:.2}ms", op_total_time),
+                    format!("{:.2}ms", op_total_time_ms),
                     format!("({:.2}%)", run_percent),
                 ]
             })
@@ -362,8 +348,8 @@ pub struct TimingRecord<'a> {
     /// Shapes of the operator's inputs
     pub input_shapes: Vec<InputShape>,
 
-    /// Execution time of this step in microseconds
-    pub elapsed_micros: f32,
+    /// Execution time of this step
+    pub elapsed: Duration,
 }
 
 /// Specifies sort order for graph run timings.
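
Note for reviewers: the timing pattern this patch adopts can be sketched outside the crate. The example below is illustrative only; `StepTiming` and the operator names are hypothetical placeholders, not rten types or APIs. It shows a start timestamp taken with `Instant`, per-step elapsed time stored as a `Duration`, and a run total obtained by summing the per-step durations, which is how `print_run_timing` derives its total after this change.

    use std::time::{Duration, Instant};

    // Hypothetical stand-in for a per-step timing record (not rten's TimingRecord).
    struct StepTiming {
        name: &'static str,
        elapsed: Duration,
    }

    fn main() {
        let mut records: Vec<StepTiming> = Vec::new();

        // Take a timestamp before each step and turn the gap into a Duration
        // once the step (including its cleanup) has finished, mirroring the
        // op_start / op_end handling in graph.rs above.
        let mut step_start = Instant::now();
        for (i, name) in ["MatMul", "Relu", "Softmax"].into_iter().enumerate() {
            // Stand-in for running an operator.
            std::thread::sleep(Duration::from_millis(5 * (i as u64 + 1)));

            let step_end = Instant::now();
            records.push(StepTiming {
                name,
                elapsed: step_end - step_start,
            });
            step_start = step_end;
        }

        // The run total is the sum of per-step durations; milliseconds are
        // only produced at display time.
        let total: Duration = records.iter().map(|r| r.elapsed).sum();
        for r in &records {
            let ms = r.elapsed.as_secs_f64() * 1000.0;
            let pct = 100.0 * r.elapsed.as_secs_f64() / total.as_secs_f64();
            println!("{:<8} {:8.3}ms ({:5.1}%)", r.name, ms, pct);
        }
        println!("total    {:8.3}ms", total.as_secs_f64() * 1000.0);
    }

Keeping elapsed time as a `Duration` and converting to milliseconds only when formatting (`as_secs_f64() * 1000.0`) avoids the unit juggling and precision loss of storing `f32` microseconds, which appears to be the motivation visible in the diff.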