Revise timing metrics collection to avoid unaccounted-for time

In the GPT-2 medium model, about 6% of the runtime for each step was
reported as "[Other]", which is the difference between the total time
for the run and the sum of individual operator timings. Upon analysis,
this difference turned out to be a combination of:

 1. The overhead of getting the time with `Instant::now`
 2. Rounding and accumulation errors due to storing timings in various
    resolutions and using single-precision float operations to sum timings
 3. Setup time spent in `Graph::run_plan` before the start of the main
    loop

According to samply, the `Instant::now` overhead on macOS (Intel) is roughly
30% rdtsc plus fences and 70% call overhead.
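
As a rough, standalone way to reproduce this kind of measurement, a
micro-benchmark of the per-call cost of `Instant::now` could look like the
sketch below. It is not part of this commit, and the iteration count and the
use of `std::hint::black_box` are arbitrary choices.

```rust
use std::hint::black_box;
use std::time::Instant;

fn main() {
    // Arbitrary number of clock reads to average over.
    const ITERS: u32 = 10_000_000;

    let start = Instant::now();
    for _ in 0..ITERS {
        // black_box keeps the compiler from optimizing the call away.
        black_box(Instant::now());
    }
    let elapsed = start.elapsed();

    println!(
        "Instant::now: ~{:.1} ns/call averaged over {} calls",
        elapsed.as_nanos() as f64 / f64::from(ITERS),
        ITERS
    );
}
```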

This commit addresses these as follows:

 - The number of `Instant::now` calls has been reduced from 2 to 1 per step,
   with each step's timing now covering the whole step rather than just the
   call to `Operator::run`. This means that output processing for a step is
   now accounted for in the time for that step.
 - Operator timings are now represented and summed as `Duration` values. These
   are integer counts of nanoseconds, which avoids rounding and float
   accumulation errors (see the sketch below).
 - The run timing is now reported as the sum of operator timings, excluding
   setup before the main loop.

The `Timer` utility type has been removed, as it is no longer used following
these changes.
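
For illustration only, here is a minimal sketch of the per-step timing pattern
described above: one `Instant::now` call per step, with each step's duration
taken as the difference from the previous clock reading and totals accumulated
as `Duration`. The `Step` trait and the record vector are hypothetical
stand-ins, not the types used in `Graph::run_plan`.

```rust
use std::time::{Duration, Instant};

// Hypothetical stand-in for an operator step; not an rten type.
trait Step {
    fn name(&self) -> &str;
    fn run(&self);
}

fn run_with_timing(steps: &[Box<dyn Step>]) -> Duration {
    let mut records: Vec<(String, Duration)> = Vec::new();

    // A single clock read per step: the end of one step doubles as the
    // start of the next, so output handling and cleanup after `run` are
    // charged to the step that produced them.
    let mut step_start = Instant::now();
    for step in steps {
        step.run();

        let step_end = Instant::now();
        records.push((step.name().to_string(), step_end - step_start));
        step_start = step_end;
    }

    // Durations are integer nanosecond counts, so summing them introduces
    // no float rounding, and the reported total equals the sum of the
    // per-step timings by construction.
    records.iter().map(|(_, d)| *d).sum()
}
```

The real loop additionally records input shapes and node names per step; the
sketch keeps only the timing structure.
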
robertknight committed Jun 29, 2024
1 parent 9eea36d commit 4cb16a1
Showing 5 changed files with 61 additions and 146 deletions.
15 changes: 7 additions & 8 deletions src/gemm.rs
@@ -1054,6 +1054,7 @@ mod tests {
use std::error::Error;
use std::mem::MaybeUninit;
use std::ops::Range;
use std::time::Instant;

use rten_bench::run_bench;
use rten_tensor::prelude::*;
@@ -1775,8 +1776,6 @@ mod tests {
Ok(())
}

use crate::timer::Timer;

// Run with `cargo test --release bench_gemm -- --nocapture --ignored`
#[test]
#[ignore]
@@ -1863,12 +1862,11 @@ mod tests {
Tensor::rand(&[k, n], &mut rng)
};

let mut t = Timer::new();
t.start();
let start = Instant::now();
for _i in 0..iters {
run_gemm(&mut result, &a, &b, 1., 0., None, None);
}
t.end();
let duration = start.elapsed();

// Calculate throughput. For comparison, the theoretical maximum
// GFLOPS for a single core (`RAYON_NUM_THREADS=1`) can be computed
@@ -1887,17 +1885,18 @@
// `fma_units` is 2. For a 3.4Ghz CPU this would give a max
// theoretical peak of 3.4 * 8 * 2 * 2 = 108.8 GFLOPS.

let flops = (2 * m * n * k * iters as usize) as f32 / t.elapsed_secs();
let flops = (2 * m * n * k * iters as usize) as f32 / duration.as_secs_f32();
let gflops = flops / (10f32).powi(9);
let duration_ms = duration.as_secs_f64() * 1000.0;

println!(
"m {} n {} k {} iters {}. Duration {}ms ({}ms/iter). GFLOPS {}",
m,
n,
k,
iters,
t.elapsed_ms(),
t.elapsed_ms() / iters as f32,
duration_ms,
duration_ms / iters as f64,
gflops,
);
}
72 changes: 29 additions & 43 deletions src/graph.rs
@@ -2,6 +2,7 @@ use std::error::Error;
use std::fmt;
use std::iter::zip;
use std::sync::{Arc, Mutex};
use std::time::{Duration, Instant};

use rten_tensor::prelude::*;
use rten_tensor::{DynLayout, Tensor, TensorView};
@@ -18,7 +19,6 @@ use crate::env::env_flag;
use crate::ops::{Input, InputList, InputOrOutput, OpError, Operator, Output, OutputList};
use crate::tensor_pool::TensorPool;
use crate::threading;
use crate::timer::Timer;
use crate::timing::{InputShape, RunTiming, TimingRecord, TimingSort};

/// Represents the size of a dimension of a runtime-provided value, such as
@@ -533,11 +533,6 @@ impl Graph {
) -> Result<Vec<Output>, RunError> {
let opts = opts.unwrap_or_default();

let mut run_timer = Timer::new();
if opts.timing {
run_timer.start();
}

let mut temp_values: FxHashMap<NodeId, Output> = FxHashMap::default();

// Extract all the owned tensor inputs into the temp value map.
@@ -604,18 +599,15 @@
} else {
Vec::new()
};
let mut alloc_timer = Timer::new();

let mut op_start = Instant::now();

for (step, &op_node_id) in plan.iter().enumerate() {
let Some(Node::Operator(op_node)) = self.nodes.get(op_node_id) else {
return Err(RunError::PlanningError(
"operator node not found".to_string(),
));
};
let mut op_timer = Timer::new();
if record_timing {
op_timer.start();
}

// Choose the input that we'll try to modify in-place to avoid
// allocating a new buffer for the output. This will be passed as
@@ -713,20 +705,12 @@
};
std::mem::drop(op_inputs);

// Record timings. We do this even if the operation failed so logs
// will contain details of the failed operation in the event of an
// error.
if record_timing {
op_timer.end();
op_timing_records.push(TimingRecord {
name: op_node.operator.name(),
input_shapes: input_shapes.clone(),
elapsed_micros: op_timer.elapsed_micros(),
node_name: op_node.name.as_deref().unwrap_or(""),
});
}
// Print verbose logs if enabled. This is done before checking the
// op's result, so logs will contain details of the failed operation
// in the event of an error.
if opts.verbose {
self.print_op_timing(step, op_node, &op_result, &op_timer, &input_shapes);
let op_duration = op_start.elapsed();
self.print_op_timing(step, op_node, &op_result, op_duration, &input_shapes);
}

// Extract outputs or fail if an error occurred.
@@ -750,7 +734,6 @@
);

// Remove temporary values that are no longer needed
record_timing.then(|| alloc_timer.start());
for node_id in op_node.inputs.iter().filter_map(|node| *node) {
let rc = temp_value_refcount.dec(node_id);
if rc == Some(0) {
@@ -759,19 +742,23 @@
}
}
}
record_timing.then(|| alloc_timer.end());

if record_timing {
let op_end = Instant::now();
let op_duration = op_end - op_start;
op_start = op_end;

op_timing_records.push(TimingRecord {
name: op_node.operator.name(),
input_shapes,
elapsed: op_duration,
node_name: op_node.name.as_deref().unwrap_or(""),
});
}
}

if opts.timing {
run_timer.end();
self.print_run_timing(
plan,
&pool,
run_timer,
alloc_timer,
&op_timing_records,
&opts,
);
self.print_run_timing(plan, &pool, &op_timing_records, &opts);
}

// Return the requested outputs
@@ -796,7 +783,7 @@
step: usize,
op_node: &OperatorNode,
op_result: &Result<OutputList, OpError>,
op_timer: &Timer,
op_duration: Duration,
input_shapes: &[InputShape],
) {
println!(
@@ -819,23 +806,23 @@
}
}

println!(" time: {}ms", op_timer.elapsed_ms());
println!(" time: {}ms", op_duration.as_secs_f64() * 1000.0);
}

/// Print a profiling summary at the end of the run.
fn print_run_timing(
&self,
plan: &[NodeId],
pool: &TensorPool,
run_timer: Timer,
alloc_timer: Timer,
op_timing_records: &[TimingRecord],
opts: &RunOptions,
) {
let run_duration: Duration = op_timing_records.iter().map(|r| r.elapsed).sum();
let run_duration_ms = run_duration.as_secs_f64() * 1000.0;
println!(
"Graph run of {} ops finished in {}ms",
"Graph run of {} ops finished in {:.3}ms",
plan.len(),
run_timer.elapsed_ms()
run_duration_ms,
);
println!(
"Pool allocs {} hits {}",
Expand All @@ -844,8 +831,7 @@ impl Graph {
);
let timing = RunTiming {
records: op_timing_records,
alloc_time: alloc_timer.elapsed_ms(),
total_time: run_timer.elapsed_ms(),
total_time: run_duration,
};
print!(
"{}",
2 changes: 0 additions & 2 deletions src/lib.rs
@@ -104,7 +104,6 @@ mod optimize;
mod slice_reductions;
mod tensor_pool;
mod threading;
mod timer;
mod timing;

#[cfg(feature = "wasm_api")]
@@ -122,7 +121,6 @@ pub use model_metadata::ModelMetadata;
pub use ops::{FloatOperators, Input, InputOrOutput, Operators, Output};
pub use tensor_pool::{ExtractBuffer, PoolRef, TensorPool};
pub use threading::{thread_pool, ThreadPool};
pub use timer::Timer;
pub use timing::TimingSort;

#[allow(dead_code, unused_imports)]
54 changes: 0 additions & 54 deletions src/timer.rs

This file was deleted.
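
The deleted file's contents are not shown here. Judging only from how `Timer`
was used in the removed code (`new`, `start`, `end`, `elapsed_secs`,
`elapsed_ms`, `elapsed_micros`), it was a small start/stop stopwatch along the
lines of the sketch below; this is a guess at its shape, not the actual
deleted implementation.

```rust
use std::time::{Duration, Instant};

/// Hypothetical reconstruction of the removed `Timer` type, inferred from
/// its call sites; the real src/timer.rs may have differed in detail.
pub struct Timer {
    start: Option<Instant>,
    elapsed: Duration,
}

impl Timer {
    pub fn new() -> Timer {
        Timer {
            start: None,
            elapsed: Duration::ZERO,
        }
    }

    /// Begin (or resume) timing.
    pub fn start(&mut self) {
        self.start = Some(Instant::now());
    }

    /// Stop timing and add the elapsed interval to the running total.
    pub fn end(&mut self) {
        if let Some(start) = self.start.take() {
            self.elapsed += start.elapsed();
        }
    }

    pub fn elapsed_secs(&self) -> f32 {
        self.elapsed.as_secs_f32()
    }

    pub fn elapsed_ms(&self) -> f32 {
        self.elapsed_secs() * 1000.0
    }

    pub fn elapsed_micros(&self) -> f32 {
        self.elapsed_secs() * 1_000_000.0
    }
}
```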

