Revise timing metrics collection to avoid unaccounted-for time

In the GPT-2 medium model, about 6% of the runtime for each step was
reported as "[Other]", which is the difference between the total time
for the run and the sum of individual operator timings. Upon analysis,
this difference turned out to be a combination of:

 1. The overhead of getting the time with `Instant::now`
 2. Rounding and accumulation errors due to storing timings in various
    resolutions and using single-precision float operations to sum timings
 3. Setup time spent in `Graph::run_plan` before the start of the main
    loop

According to samply, the `Instant::now` overhead on macOS (Intel) is roughly
30% rdtsc plus fences and 70% call overhead.
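
As a rough, standalone way to reproduce this kind of measurement, a
micro-benchmark of the per-call cost of `Instant::now` could look like the
sketch below. It is not part of this commit, and the iteration count and the
use of `std::hint::black_box` are arbitrary choices.

```rust
use std::hint::black_box;
use std::time::Instant;

fn main() {
    // Arbitrary number of clock reads to average over.
    const ITERS: u32 = 10_000_000;

    let start = Instant::now();
    for _ in 0..ITERS {
        // black_box keeps the compiler from optimizing the call away.
        black_box(Instant::now());
    }
    let elapsed = start.elapsed();

    println!(
        "Instant::now: ~{:.1} ns/call averaged over {} calls",
        elapsed.as_nanos() as f64 / f64::from(ITERS),
        ITERS
    );
}
```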

This commit addresses these as follows:

 - The number of `Instant::now` calls has been reduced from 2 to 1 per step,
   with each step's timing now covering the whole step rather than just the
   call to `Operator::run`. This means that output processing for a step is
   now accounted for in the time for that step.
 - Operator timings are now represented and summed as `Duration` values. These
   are integer counts of nanoseconds, which avoids rounding and float
   accumulation errors (see the sketch below).
 - The run timing is now reported as the sum of operator timings, excluding
   setup before the main loop.

The `Timer` utility type has been removed, as it is no longer used following
these changes.
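
For illustration only, here is a minimal sketch of the per-step timing pattern
described above: one `Instant::now` call per step, with each step's duration
taken as the difference from the previous clock reading and totals accumulated
as `Duration`. The `Step` trait and the record vector are hypothetical
stand-ins, not the types used in `Graph::run_plan`.

```rust
use std::time::{Duration, Instant};

// Hypothetical stand-in for an operator step; not an rten type.
trait Step {
    fn name(&self) -> &str;
    fn run(&self);
}

fn run_with_timing(steps: &[Box<dyn Step>]) -> Duration {
    let mut records: Vec<(String, Duration)> = Vec::new();

    // A single clock read per step: the end of one step doubles as the
    // start of the next, so output handling and cleanup after `run` are
    // charged to the step that produced them.
    let mut step_start = Instant::now();
    for step in steps {
        step.run();

        let step_end = Instant::now();
        records.push((step.name().to_string(), step_end - step_start));
        step_start = step_end;
    }

    // Durations are integer nanosecond counts, so summing them introduces
    // no float rounding, and the reported total equals the sum of the
    // per-step timings by construction.
    records.iter().map(|(_, d)| *d).sum()
}
```

The real loop additionally records input shapes and node names per step; the
sketch keeps only the timing structure.
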
robertknight committed Jun 29, 2024
1 parent 9eea36d commit 4cb16a1
Showing 5 changed files with 61 additions and 146 deletions.
15 changes: 7 additions & 8 deletions src/gemm.rs
@@ -1054,6 +1054,7 @@ mod tests {
use std::error::Error;
use std::mem::MaybeUninit;
use std::ops::Range;
use std::time::Instant;

use rten_bench::run_bench;
use rten_tensor::prelude::*;
@@ -1775,8 +1776,6 @@ mod tests {
Ok(())
}

use crate::timer::Timer;

// Run with `cargo test --release bench_gemm -- --nocapture --ignored`
#[test]
#[ignore]
@@ -1863,12 +1862,11 @@ mod tests {
Tensor::rand(&[k, n], &mut rng)
};

let mut t = Timer::new();
t.start();
let start = Instant::now();
for _i in 0..iters {
run_gemm(&mut result, &a, &b, 1., 0., None, None);
}
t.end();
let duration = start.elapsed();

// Calculate throughput. For comparison, the theoretical maximum
// GFLOPS for a single core (`RAYON_NUM_THREADS=1`) can be computed
@@ -1887,17 +1885,18 @@
// `fma_units` is 2. For a 3.4Ghz CPU this would give a max
// theoretical peak of 3.4 * 8 * 2 * 2 = 108.8 GFLOPS.

let flops = (2 * m * n * k * iters as usize) as f32 / t.elapsed_secs();
let flops = (2 * m * n * k * iters as usize) as f32 / duration.as_secs_f32();
let gflops = flops / (10f32).powi(9);
let duration_ms = duration.as_secs_f64() * 1000.0;

println!(
"m {} n {} k {} iters {}. Duration {}ms ({}ms/iter). GFLOPS {}",
m,
n,
k,
iters,
t.elapsed_ms(),
t.elapsed_ms() / iters as f32,
duration_ms,
duration_ms / iters as f64,
gflops,
);
}
72 changes: 29 additions & 43 deletions src/graph.rs
@@ -2,6 +2,7 @@ use std::error::Error;
use std::fmt;
use std::iter::zip;
use std::sync::{Arc, Mutex};
use std::time::{Duration, Instant};

use rten_tensor::prelude::*;
use rten_tensor::{DynLayout, Tensor, TensorView};
@@ -18,7 +19,6 @@ use crate::env::env_flag;
use crate::ops::{Input, InputList, InputOrOutput, OpError, Operator, Output, OutputList};
use crate::tensor_pool::TensorPool;
use crate::threading;
use crate::timer::Timer;
use crate::timing::{InputShape, RunTiming, TimingRecord, TimingSort};

/// Represents the size of a dimension of a runtime-provided value, such as
@@ -533,11 +533,6 @@ impl Graph {
) -> Result<Vec<Output>, RunError> {
let opts = opts.unwrap_or_default();

let mut run_timer = Timer::new();
if opts.timing {
run_timer.start();
}

let mut temp_values: FxHashMap<NodeId, Output> = FxHashMap::default();

// Extract all the owned tensor inputs into the temp value map.
@@ -604,18 +599,15 @@
} else {
Vec::new()
};
let mut alloc_timer = Timer::new();

let mut op_start = Instant::now();

for (step, &op_node_id) in plan.iter().enumerate() {
let Some(Node::Operator(op_node)) = self.nodes.get(op_node_id) else {
return Err(RunError::PlanningError(
"operator node not found".to_string(),
));
};
let mut op_timer = Timer::new();
if record_timing {
op_timer.start();
}

// Choose the input that we'll try to modify in-place to avoid
// allocating a new buffer for the output. This will be passed as
@@ -713,20 +705,12 @@
};
std::mem::drop(op_inputs);

// Record timings. We do this even if the operation failed so logs
// will contain details of the failed operation in the event of an
// error.
if record_timing {
op_timer.end();
op_timing_records.push(TimingRecord {
name: op_node.operator.name(),
input_shapes: input_shapes.clone(),
elapsed_micros: op_timer.elapsed_micros(),
node_name: op_node.name.as_deref().unwrap_or(""),
});
}
// Print verbose logs if enabled. This is done before checking the
// op's result, so logs will contain details of the failed operation
// in the event of an error.
if opts.verbose {
self.print_op_timing(step, op_node, &op_result, &op_timer, &input_shapes);
let op_duration = op_start.elapsed();
self.print_op_timing(step, op_node, &op_result, op_duration, &input_shapes);
}

// Extract outputs or fail if an error occurred.
@@ -750,7 +734,6 @@
);

// Remove temporary values that are no longer needed
record_timing.then(|| alloc_timer.start());
for node_id in op_node.inputs.iter().filter_map(|node| *node) {
let rc = temp_value_refcount.dec(node_id);
if rc == Some(0) {
@@ -759,19 +742,23 @@
}
}
}
record_timing.then(|| alloc_timer.end());

if record_timing {
let op_end = Instant::now();
let op_duration = op_end - op_start;
op_start = op_end;

op_timing_records.push(TimingRecord {
name: op_node.operator.name(),
input_shapes,
elapsed: op_duration,
node_name: op_node.name.as_deref().unwrap_or(""),
});
}
}

if opts.timing {
run_timer.end();
self.print_run_timing(
plan,
&pool,
run_timer,
alloc_timer,
&op_timing_records,
&opts,
);
self.print_run_timing(plan, &pool, &op_timing_records, &opts);
}

// Return the requested outputs
@@ -796,7 +783,7 @@
step: usize,
op_node: &OperatorNode,
op_result: &Result<OutputList, OpError>,
op_timer: &Timer,
op_duration: Duration,
input_shapes: &[InputShape],
) {
println!(
@@ -819,23 +806,23 @@
}
}

println!(" time: {}ms", op_timer.elapsed_ms());
println!(" time: {}ms", op_duration.as_secs_f64() * 1000.0);
}

/// Print a profiling summary at the end of the run.
fn print_run_timing(
&self,
plan: &[NodeId],
pool: &TensorPool,
run_timer: Timer,
alloc_timer: Timer,
op_timing_records: &[TimingRecord],
opts: &RunOptions,
) {
let run_duration: Duration = op_timing_records.iter().map(|r| r.elapsed).sum();
let run_duration_ms = run_duration.as_secs_f64() * 1000.0;
println!(
"Graph run of {} ops finished in {}ms",
"Graph run of {} ops finished in {:.3}ms",
plan.len(),
run_timer.elapsed_ms()
run_duration_ms,
);
println!(
"Pool allocs {} hits {}",
Expand All @@ -844,8 +831,7 @@ impl Graph {
);
let timing = RunTiming {
records: op_timing_records,
alloc_time: alloc_timer.elapsed_ms(),
total_time: run_timer.elapsed_ms(),
total_time: run_duration,
};
print!(
"{}",
2 changes: 0 additions & 2 deletions src/lib.rs
@@ -104,7 +104,6 @@ mod optimize;
mod slice_reductions;
mod tensor_pool;
mod threading;
mod timer;
mod timing;

#[cfg(feature = "wasm_api")]
@@ -122,7 +121,6 @@ pub use model_metadata::ModelMetadata;
pub use ops::{FloatOperators, Input, InputOrOutput, Operators, Output};
pub use tensor_pool::{ExtractBuffer, PoolRef, TensorPool};
pub use threading::{thread_pool, ThreadPool};
pub use timer::Timer;
pub use timing::TimingSort;

#[allow(dead_code, unused_imports)]
54 changes: 0 additions & 54 deletions src/timer.rs

This file was deleted.
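
The deleted file's contents are not shown here. Judging only from how `Timer`
was used in the removed code (`new`, `start`, `end`, `elapsed_secs`,
`elapsed_ms`, `elapsed_micros`), it was a small start/stop stopwatch along the
lines of the sketch below; this is a guess at its shape, not the actual
deleted implementation.

```rust
use std::time::{Duration, Instant};

/// Hypothetical reconstruction of the removed `Timer` type, inferred from
/// its call sites; the real src/timer.rs may have differed in detail.
pub struct Timer {
    start: Option<Instant>,
    elapsed: Duration,
}

impl Timer {
    pub fn new() -> Timer {
        Timer {
            start: None,
            elapsed: Duration::ZERO,
        }
    }

    /// Begin (or resume) timing.
    pub fn start(&mut self) {
        self.start = Some(Instant::now());
    }

    /// Stop timing and add the elapsed interval to the running total.
    pub fn end(&mut self) {
        if let Some(start) = self.start.take() {
            self.elapsed += start.elapsed();
        }
    }

    pub fn elapsed_secs(&self) -> f32 {
        self.elapsed.as_secs_f32()
    }

    pub fn elapsed_ms(&self) -> f32 {
        self.elapsed_secs() * 1000.0
    }

    pub fn elapsed_micros(&self) -> f32 {
        self.elapsed_secs() * 1_000_000.0
    }
}
```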

