Revise timing metrics collection to avoid unaccounted-for time #254

Merged · 1 commit · Jun 29, 2024
src/gemm.rs: 7 additions & 8 deletions
@@ -1054,6 +1054,7 @@ mod tests {
use std::error::Error;
use std::mem::MaybeUninit;
use std::ops::Range;
use std::time::Instant;

use rten_bench::run_bench;
use rten_tensor::prelude::*;
@@ -1775,8 +1776,6 @@ mod tests {
Ok(())
}

use crate::timer::Timer;

// Run with `cargo test --release bench_gemm -- --nocapture --ignored`
#[test]
#[ignore]
@@ -1863,12 +1862,11 @@ mod tests {
Tensor::rand(&[k, n], &mut rng)
};

let mut t = Timer::new();
t.start();
let start = Instant::now();
for _i in 0..iters {
run_gemm(&mut result, &a, &b, 1., 0., None, None);
}
t.end();
let duration = start.elapsed();

// Calculate throughput. For comparison, the theoretical maximum
// GFLOPS for a single core (`RAYON_NUM_THREADS=1`) can be computed
@@ -1887,17 +1885,18 @@
// `fma_units` is 2. For a 3.4Ghz CPU this would give a max
// theoretical peak of 3.4 * 8 * 2 * 2 = 108.8 GFLOPS.

let flops = (2 * m * n * k * iters as usize) as f32 / t.elapsed_secs();
let flops = (2 * m * n * k * iters as usize) as f32 / duration.as_secs_f32();
let gflops = flops / (10f32).powi(9);
let duration_ms = duration.as_secs_f64() * 1000.0;

println!(
"m {} n {} k {} iters {}. Duration {}ms ({}ms/iter). GFLOPS {}",
m,
n,
k,
iters,
t.elapsed_ms(),
t.elapsed_ms() / iters as f32,
duration_ms,
duration_ms / iters as f64,
gflops,
);
}
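
The throughput math in the benchmark above reduces to a handful of lines. The sketch below is a standalone illustration of the same `Instant`-based measurement; `measure_gflops` and the naive triple loop standing in for `run_gemm` are hypothetical, not the crate's benchmark code.

```rust
use std::time::{Duration, Instant};

/// Time `iters` runs of `gemm` and convert the result to GFLOPS, assuming an
/// [m, k] x [k, n] matrix multiply, i.e. roughly 2 * m * n * k floating point
/// ops per run (one multiply plus one add per accumulation step).
fn measure_gflops(m: usize, n: usize, k: usize, iters: usize, mut gemm: impl FnMut()) -> f64 {
    let start = Instant::now();
    for _ in 0..iters {
        gemm();
    }
    let duration: Duration = start.elapsed();
    let flops = (2 * m * n * k * iters) as f64 / duration.as_secs_f64();
    flops / 1e9
}

fn main() {
    // Naive triple loop standing in for `run_gemm`, just so the example runs.
    let (m, n, k) = (64, 64, 64);
    let (a, b) = (vec![1.0f32; m * k], vec![1.0f32; k * n]);
    let mut c = vec![0.0f32; m * n];
    let gflops = measure_gflops(m, n, k, 10, || {
        for i in 0..m {
            for j in 0..n {
                c[i * n + j] = (0..k).map(|x| a[i * k + x] * b[x * n + j]).sum();
            }
        }
    });

    // Theoretical single-core peak is clock (GHz) * SIMD f32 lanes * FMA units * 2,
    // since each FMA counts as two FLOPs, e.g. 3.4 * 8 * 2 * 2 = 108.8 GFLOPS.
    println!("measured {gflops:.2} GFLOPS with the naive loop");
}
```
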
src/graph.rs: 29 additions & 43 deletions
@@ -2,6 +2,7 @@ use std::error::Error;
use std::fmt;
use std::iter::zip;
use std::sync::{Arc, Mutex};
use std::time::{Duration, Instant};

use rten_tensor::prelude::*;
use rten_tensor::{DynLayout, Tensor, TensorView};
@@ -18,7 +19,6 @@ use crate::env::env_flag;
use crate::ops::{Input, InputList, InputOrOutput, OpError, Operator, Output, OutputList};
use crate::tensor_pool::TensorPool;
use crate::threading;
use crate::timer::Timer;
use crate::timing::{InputShape, RunTiming, TimingRecord, TimingSort};

/// Represents the size of a dimension of a runtime-provided value, such as
@@ -533,11 +533,6 @@
) -> Result<Vec<Output>, RunError> {
let opts = opts.unwrap_or_default();

let mut run_timer = Timer::new();
if opts.timing {
run_timer.start();
}

let mut temp_values: FxHashMap<NodeId, Output> = FxHashMap::default();

// Extract all the owned tensor inputs into the temp value map.
@@ -604,18 +599,15 @@
} else {
Vec::new()
};
let mut alloc_timer = Timer::new();

let mut op_start = Instant::now();

for (step, &op_node_id) in plan.iter().enumerate() {
let Some(Node::Operator(op_node)) = self.nodes.get(op_node_id) else {
return Err(RunError::PlanningError(
"operator node not found".to_string(),
));
};
let mut op_timer = Timer::new();
if record_timing {
op_timer.start();
}

// Choose the input that we'll try to modify in-place to avoid
// allocating a new buffer for the output. This will be passed as
@@ -713,20 +705,12 @@
};
std::mem::drop(op_inputs);

// Record timings. We do this even if the operation failed so logs
// will contain details of the failed operation in the event of an
// error.
if record_timing {
op_timer.end();
op_timing_records.push(TimingRecord {
name: op_node.operator.name(),
input_shapes: input_shapes.clone(),
elapsed_micros: op_timer.elapsed_micros(),
node_name: op_node.name.as_deref().unwrap_or(""),
});
}
// Print verbose logs if enabled. This is done before checking the
// op's result, so logs will contain details of the failed operation
// in the event of an error.
if opts.verbose {
self.print_op_timing(step, op_node, &op_result, &op_timer, &input_shapes);
let op_duration = op_start.elapsed();
self.print_op_timing(step, op_node, &op_result, op_duration, &input_shapes);
}

// Extract outputs or fail if an error occurred.
@@ -750,7 +734,6 @@
);

// Remove temporary values that are no longer needed
record_timing.then(|| alloc_timer.start());
for node_id in op_node.inputs.iter().filter_map(|node| *node) {
let rc = temp_value_refcount.dec(node_id);
if rc == Some(0) {
@@ -759,19 +742,23 @@
}
}
}
record_timing.then(|| alloc_timer.end());

if record_timing {
let op_end = Instant::now();
let op_duration = op_end - op_start;
op_start = op_end;

op_timing_records.push(TimingRecord {
name: op_node.operator.name(),
input_shapes,
elapsed: op_duration,
node_name: op_node.name.as_deref().unwrap_or(""),
});
}
}

if opts.timing {
run_timer.end();
self.print_run_timing(
plan,
&pool,
run_timer,
alloc_timer,
&op_timing_records,
&opts,
);
self.print_run_timing(plan, &pool, &op_timing_records, &opts);
}

// Return the requested outputs
@@ -796,7 +783,7 @@
step: usize,
op_node: &OperatorNode,
op_result: &Result<OutputList, OpError>,
op_timer: &Timer,
op_duration: Duration,
input_shapes: &[InputShape],
) {
println!(
@@ -819,23 +806,23 @@
}
}

println!(" time: {}ms", op_timer.elapsed_ms());
println!(" time: {}ms", op_duration.as_secs_f64() * 1000.0);
}

/// Print a profiling summary at the end of the run.
fn print_run_timing(
&self,
plan: &[NodeId],
pool: &TensorPool,
run_timer: Timer,
alloc_timer: Timer,
op_timing_records: &[TimingRecord],
opts: &RunOptions,
) {
let run_duration: Duration = op_timing_records.iter().map(|r| r.elapsed).sum();
let run_duration_ms = run_duration.as_secs_f64() * 1000.0;
println!(
"Graph run of {} ops finished in {}ms",
"Graph run of {} ops finished in {:.3}ms",
plan.len(),
run_timer.elapsed_ms()
run_duration_ms,
);
println!(
"Pool allocs {} hits {}",
@@ -844,8 +831,7 @@
);
let timing = RunTiming {
records: op_timing_records,
alloc_time: alloc_timer.elapsed_ms(),
total_time: run_timer.elapsed_ms(),
total_time: run_duration,
};
print!(
"{}",
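
The bookkeeping pattern above is the heart of the change: a single `Instant` is taken before the first operator, and after each operator (and the deallocation of its no-longer-needed inputs) the elapsed `Duration` since the previous timestamp is recorded for that operator, with the same timestamp reused as the start of the next one. The reported total is then just the sum of the per-op durations, so no time goes unaccounted for and the separate `alloc_timer` becomes unnecessary. A minimal standalone sketch of the pattern, with `run_ops` and a simplified `TimingRecord` invented for illustration:

```rust
use std::time::{Duration, Instant};

// Simplified stand-in for the crate's timing record; the real one also
// carries input shapes and node names.
struct TimingRecord {
    name: &'static str,
    elapsed: Duration,
}

fn run_ops(ops: &[(&'static str, fn())]) -> Vec<TimingRecord> {
    let mut records = Vec::new();
    let mut op_start = Instant::now();

    for &(name, op) in ops {
        op(); // run the operator
        // ... outputs are extracted and unneeded inputs freed here ...

        // Attribute everything since the previous timestamp to this op, then
        // reuse the same timestamp as the start of the next op, so the
        // records tile the whole run with no gaps.
        let op_end = Instant::now();
        records.push(TimingRecord { name, elapsed: op_end - op_start });
        op_start = op_end;
    }
    records
}

fn main() {
    fn matmul_op() {} // stand-ins for real operators
    fn relu_op() {}

    let ops: Vec<(&'static str, fn())> =
        vec![("MatMul", matmul_op as fn()), ("Relu", relu_op as fn())];
    let records = run_ops(&ops);
    for r in &records {
        println!("{}: {:.3}ms", r.name, r.elapsed.as_secs_f64() * 1000.0);
    }
    // The total run time is simply the sum of the per-op durations.
    let total: Duration = records.iter().map(|r| r.elapsed).sum();
    println!("{} ops in {:.3}ms", records.len(), total.as_secs_f64() * 1000.0);
}
```
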
src/lib.rs: 0 additions & 2 deletions
@@ -104,7 +104,6 @@ mod optimize;
mod slice_reductions;
mod tensor_pool;
mod threading;
mod timer;
mod timing;

#[cfg(feature = "wasm_api")]
@@ -122,7 +121,6 @@ pub use model_metadata::ModelMetadata;
pub use ops::{FloatOperators, Input, InputOrOutput, Operators, Output};
pub use tensor_pool::{ExtractBuffer, PoolRef, TensorPool};
pub use threading::{thread_pool, ThreadPool};
pub use timer::Timer;
pub use timing::TimingSort;

#[allow(dead_code, unused_imports)]
src/timer.rs: 0 additions & 54 deletions

This file was deleted.
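
For context, the deleted `Timer` was a small start/stop stopwatch whose role is now filled directly by `std::time::Instant` and `Duration`. Judging only from the call sites removed in this PR (`new`, `start`, `end`, `elapsed_secs`, `elapsed_ms`, `elapsed_micros`), a roughly equivalent type would look like the following hypothetical reconstruction; it is not the actual deleted source.

```rust
use std::time::{Duration, Instant};

/// Hypothetical reconstruction of the removed stopwatch, built on std::time.
#[derive(Default)]
struct Timer {
    start: Option<Instant>,
    elapsed: Duration,
}

impl Timer {
    fn new() -> Timer {
        Timer::default()
    }

    fn start(&mut self) {
        self.start = Some(Instant::now());
    }

    fn end(&mut self) {
        if let Some(start) = self.start.take() {
            self.elapsed += start.elapsed();
        }
    }

    fn elapsed_secs(&self) -> f32 {
        self.elapsed.as_secs_f32()
    }

    fn elapsed_ms(&self) -> f32 {
        self.elapsed.as_secs_f32() * 1000.0
    }

    fn elapsed_micros(&self) -> f32 {
        self.elapsed.as_secs_f32() * 1_000_000.0
    }
}

fn main() {
    let mut t = Timer::new();
    t.start();
    std::thread::sleep(Duration::from_millis(5));
    t.end();
    println!("{:.1}ms ({:.0}us)", t.elapsed_ms(), t.elapsed_micros());
}
```

Since `Instant` and `Duration` already provide this state tracking, the wrapper could be removed outright.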
