diff --git a/src/gemm.rs b/src/gemm.rs
index 8caf1996..ea67af50 100644
--- a/src/gemm.rs
+++ b/src/gemm.rs
@@ -1054,6 +1054,7 @@ mod tests {
     use std::error::Error;
     use std::mem::MaybeUninit;
     use std::ops::Range;
+    use std::time::Instant;
 
     use rten_bench::run_bench;
     use rten_tensor::prelude::*;
@@ -1775,8 +1776,6 @@ mod tests {
         Ok(())
     }
 
-    use crate::timer::Timer;
-
     // Run with `cargo test --release bench_gemm -- --nocapture --ignored`
     #[test]
    #[ignore]
@@ -1863,12 +1862,11 @@ mod tests {
             Tensor::rand(&[k, n], &mut rng)
         };
 
-        let mut t = Timer::new();
-        t.start();
+        let start = Instant::now();
         for _i in 0..iters {
             run_gemm(&mut result, &a, &b, 1., 0., None, None);
         }
-        t.end();
+        let duration = start.elapsed();
         // Calculate throughput. For comparison, the theoretical maximum
         // GFLOPS for a single core (`RAYON_NUM_THREADS=1`) can be computed
         // with:
@@ -1887,8 +1885,9 @@ mod tests {
         // `fma_units` is 2. For a 3.4Ghz CPU this would give a max
         // theoretical peak of 3.4 * 8 * 2 * 2 = 108.8 GFLOPS.
 
-        let flops = (2 * m * n * k * iters as usize) as f32 / t.elapsed_secs();
+        let flops = (2 * m * n * k * iters as usize) as f32 / duration.as_secs_f32();
         let gflops = flops / (10f32).powi(9);
+        let duration_ms = duration.as_secs_f64() * 1000.0;
 
         println!(
             "m {} n {} k {} iters {}. Duration {}ms ({}ms/iter). GFLOPS {}",
@@ -1896,8 +1895,8 @@ mod tests {
             n,
             k,
             iters,
-            t.elapsed_ms(),
-            t.elapsed_ms() / iters as f32,
+            duration_ms,
+            duration_ms / iters as f64,
             gflops,
         );
     }
diff --git a/src/graph.rs b/src/graph.rs
index 892d464e..09d13513 100644
--- a/src/graph.rs
+++ b/src/graph.rs
@@ -2,6 +2,7 @@ use std::error::Error;
 use std::fmt;
 use std::iter::zip;
 use std::sync::{Arc, Mutex};
+use std::time::{Duration, Instant};
 
 use rten_tensor::prelude::*;
 use rten_tensor::{DynLayout, Tensor, TensorView};
@@ -18,7 +19,6 @@ use crate::env::env_flag;
 use crate::ops::{Input, InputList, InputOrOutput, OpError, Operator, Output, OutputList};
 use crate::tensor_pool::TensorPool;
 use crate::threading;
-use crate::timer::Timer;
 use crate::timing::{InputShape, RunTiming, TimingRecord, TimingSort};
 
 /// Represents the size of a dimension of a runtime-provided value, such as
@@ -533,11 +533,6 @@ impl Graph {
     ) -> Result<Vec<Output>, RunError> {
         let opts = opts.unwrap_or_default();
 
-        let mut run_timer = Timer::new();
-        if opts.timing {
-            run_timer.start();
-        }
-
         let mut temp_values: FxHashMap<NodeId, Output> = FxHashMap::default();
 
         // Extract all the owned tensor inputs into the temp value map.
@@ -604,7 +599,8 @@ impl Graph {
         } else {
             Vec::new()
         };
-        let mut alloc_timer = Timer::new();
+
+        let mut op_start = Instant::now();
 
         for (step, &op_node_id) in plan.iter().enumerate() {
             let Some(Node::Operator(op_node)) = self.nodes.get(op_node_id) else {
@@ -612,10 +608,6 @@ impl Graph {
                     "operator node not found".to_string(),
                 ));
             };
-            let mut op_timer = Timer::new();
-            if record_timing {
-                op_timer.start();
-            }
 
             // Choose the input that we'll try to modify in-place to avoid
             // allocating a new buffer for the output. This will be passed as
@@ -713,20 +705,12 @@ impl Graph {
             };
             std::mem::drop(op_inputs);
 
-            // Record timings. We do this even if the operation failed so logs
-            // will contain details of the failed operation in the event of an
-            // error.
-            if record_timing {
-                op_timer.end();
-                op_timing_records.push(TimingRecord {
-                    name: op_node.operator.name(),
-                    input_shapes: input_shapes.clone(),
-                    elapsed_micros: op_timer.elapsed_micros(),
-                    node_name: op_node.name.as_deref().unwrap_or(""),
-                });
-            }
+            // Print verbose logs if enabled. This is done before checking the
+            // op's result, so logs will contain details of the failed operation
+            // in the event of an error.
             if opts.verbose {
-                self.print_op_timing(step, op_node, &op_result, &op_timer, &input_shapes);
+                let op_duration = op_start.elapsed();
+                self.print_op_timing(step, op_node, &op_result, op_duration, &input_shapes);
             }
 
             // Extract outputs or fail if an error occurred.
@@ -750,7 +734,6 @@ impl Graph {
             );
 
             // Remove temporary values that are no longer needed
-            record_timing.then(|| alloc_timer.start());
             for node_id in op_node.inputs.iter().filter_map(|node| *node) {
                 let rc = temp_value_refcount.dec(node_id);
                 if rc == Some(0) {
@@ -759,19 +742,23 @@ impl Graph {
                     }
                 }
             }
-            record_timing.then(|| alloc_timer.end());
+
+            if record_timing {
+                let op_end = Instant::now();
+                let op_duration = op_end - op_start;
+                op_start = op_end;
+
+                op_timing_records.push(TimingRecord {
+                    name: op_node.operator.name(),
+                    input_shapes,
+                    elapsed: op_duration,
+                    node_name: op_node.name.as_deref().unwrap_or(""),
+                });
+            }
         }
 
         if opts.timing {
-            run_timer.end();
-            self.print_run_timing(
-                plan,
-                &pool,
-                run_timer,
-                alloc_timer,
-                &op_timing_records,
-                &opts,
-            );
+            self.print_run_timing(plan, &pool, &op_timing_records, &opts);
         }
 
         // Return the requested outputs
@@ -796,7 +783,7 @@ impl Graph {
         step: usize,
         op_node: &OperatorNode,
         op_result: &Result<OutputList, OpError>,
-        op_timer: &Timer,
+        op_duration: Duration,
         input_shapes: &[InputShape],
     ) {
         println!(
@@ -819,7 +806,7 @@ impl Graph {
             }
         }
 
-        println!(" time: {}ms", op_timer.elapsed_ms());
+        println!(" time: {}ms", op_duration.as_secs_f64() * 1000.0);
     }
 
     /// Print a profiling summary at the end of the run.
@@ -827,15 +814,15 @@ impl Graph {
         &self,
         plan: &[NodeId],
         pool: &TensorPool,
-        run_timer: Timer,
-        alloc_timer: Timer,
         op_timing_records: &[TimingRecord],
         opts: &RunOptions,
     ) {
+        let run_duration: Duration = op_timing_records.iter().map(|r| r.elapsed).sum();
+        let run_duration_ms = run_duration.as_secs_f64() * 1000.0;
         println!(
-            "Graph run of {} ops finished in {}ms",
+            "Graph run of {} ops finished in {:.3}ms",
             plan.len(),
-            run_timer.elapsed_ms()
+            run_duration_ms,
         );
         println!(
             "Pool allocs {} hits {}",
@@ -844,8 +831,7 @@ impl Graph {
         );
         let timing = RunTiming {
             records: op_timing_records,
-            alloc_time: alloc_timer.elapsed_ms(),
-            total_time: run_timer.elapsed_ms(),
+            total_time: run_duration,
         };
         print!(
             "{}",
diff --git a/src/lib.rs b/src/lib.rs
index 3dbbb1c1..5aae556b 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -104,7 +104,6 @@ mod optimize;
 mod slice_reductions;
 mod tensor_pool;
 mod threading;
-mod timer;
 mod timing;
 
 #[cfg(feature = "wasm_api")]
@@ -122,7 +121,6 @@ pub use model_metadata::ModelMetadata;
 pub use ops::{FloatOperators, Input, InputOrOutput, Operators, Output};
 pub use tensor_pool::{ExtractBuffer, PoolRef, TensorPool};
 pub use threading::{thread_pool, ThreadPool};
-pub use timer::Timer;
 pub use timing::TimingSort;
 
 #[allow(dead_code, unused_imports)]
diff --git a/src/timer.rs b/src/timer.rs
deleted file mode 100644
index fcd25a20..00000000
--- a/src/timer.rs
+++ /dev/null
@@ -1,54 +0,0 @@
-use std::time::Instant;
-
-/// Utility for recording the cumulative time spent in an operation.
-#[doc(hidden)] // Not intended for external use
-pub struct Timer {
-    start: Option<Instant>,
-    elapsed: u64,
-}
-
-impl Timer {
-    /// Create a new, inactive timer with zero elapsed time
-    pub fn new() -> Timer {
-        Timer {
-            start: None,
-            elapsed: 0,
-        }
-    }
-
-    /// Start the timer, or reset it if already active
-    pub fn start(&mut self) {
-        self.start = Some(Instant::now());
-    }
-
-    /// Stop active timer and add elapsed time to the total returned by `elapsed`
-    pub fn end(&mut self) {
-        if let Some(start) = self.start {
-            self.elapsed += start.elapsed().as_micros() as u64;
-        }
-    }
-
-    /// Return the cumulative elapsed time between calls to `start` and `end`
-    /// in milliseconds.
-    pub fn elapsed_ms(&self) -> f32 {
-        (self.elapsed as f32) / 1000.0
-    }
-
-    /// Return the cumulative elapsed time between calls to `start` and `end`
-    /// in microseconds.
-    pub fn elapsed_micros(&self) -> f32 {
-        self.elapsed as f32
-    }
-
-    /// Return the cumulative elapsed time between calls to `start` and `end`
-    /// in seconds.
-    pub fn elapsed_secs(&self) -> f32 {
-        self.elapsed as f32 / 1_000_000.0
-    }
-}
-
-impl Default for Timer {
-    fn default() -> Self {
-        Self::new()
-    }
-}
diff --git a/src/timing.rs b/src/timing.rs
index 01cb2809..b42824d8 100644
--- a/src/timing.rs
+++ b/src/timing.rs
@@ -1,5 +1,6 @@
 use std::collections::HashMap;
 use std::fmt;
+use std::time::Duration;
 
 use smallvec::SmallVec;
 
@@ -100,19 +101,11 @@ pub struct RunTiming<'a> {
     /// Records of graph step execution times
     pub records: &'a [TimingRecord<'a>],
 
-    /// Total time spent in memory allocations or de-allocations that were
-    /// tracked.
-    pub alloc_time: f32,
-
-    /// Total time for the graph run.
-    pub total_time: f32,
+    /// Total time for the graph run
+    pub total_time: Duration,
 }
 
 impl<'a> RunTiming<'a> {
-    fn total_op_time(&self) -> f32 {
-        self.records.iter().map(|tr| tr.elapsed_micros).sum::<f32>() / 1000.0
-    }
-
     /// Return a struct that formats output with the given options.
     pub fn display(&self, sort: TimingSort, include_shapes: bool) -> impl fmt::Display + '_ {
         FormattedRunTiming {
@@ -136,8 +129,8 @@ struct TimingByShapeRecord {
     /// Formatted representation of input shapes
     shape: String,
 
-    /// Total elapsed time for this operator with a given shape
-    total_ms: f32,
+    /// Total elapsed time for this operator
+    total_duration: Duration,
 
     /// Number of times the operator was run with this shape
     count: usize,
@@ -182,15 +175,14 @@ impl Table for TimingByShapeTable {
 
     fn cell(&self, row: usize, col: usize) -> String {
         let row = self.rows.get(row).expect("invalid row");
+        let total_ms = row.total_duration.as_secs_f64() * 1000.0;
+        let total_ns = total_ms * 1_000_000.0;
         match col {
             0 => row.shape.clone(),
             1 => row.count.to_string(),
-            2 => format!("{:.3}", row.total_ms / row.count as f32),
-            3 => format!("{:.3}", row.total_ms),
-            4 => format!(
-                "{:.3}",
-                (row.total_ms * 1_000_000.0) / (row.input_elements * row.count) as f32
-            ),
+            2 => format!("{:.3}", total_ms / row.count as f64),
+            3 => format!("{:.3}", total_ms),
+            4 => format!("{:.3}", total_ns / (row.input_elements * row.count) as f64),
             5 => row.node_name.clone(),
             _ => panic!("invalid column"),
         }
@@ -255,20 +247,20 @@ impl<'a> FormattedRunTiming<'a> {
                     })
                     .sum::<usize>();
                 let (cum_time, count, _, _) = timings.entry(formatted_shapes).or_insert((
-                    0.,
+                    Duration::ZERO,
                     0,
                     input_elements,
                     record.node_name,
                 ));
-                *cum_time += record.elapsed_micros / 1000.0;
+                *cum_time += record.elapsed;
                 *count += 1;
                 timings
             })
             .into_iter()
             .map(
-                |(shape, (total_ms, count, input_elements, node_name))| TimingByShapeRecord {
+                |(shape, (total, count, input_elements, node_name))| TimingByShapeRecord {
                     shape,
-                    total_ms,
+                    total_duration: total,
                     count,
                     input_elements,
                     node_name: node_name.to_string(),
@@ -276,7 +268,7 @@ impl<'a> FormattedRunTiming<'a> {
             )
             .collect();
 
-        time_by_shape_rows.sort_by(|a, b| a.total_ms.total_cmp(&b.total_ms).reverse());
+        time_by_shape_rows.sort_by(|a, b| a.total_duration.cmp(&b.total_duration).reverse());
 
         TimingByShapeTable {
             rows: time_by_shape_rows,
@@ -291,33 +283,27 @@ impl<'a> fmt::Display for FormattedRunTiming<'a> {
             .records
             .iter()
             .fold(HashMap::new(), |mut timings, record| {
-                let total_op_time = timings.entry(record.name).or_insert(0.);
-                *total_op_time += record.elapsed_micros / 1000.0;
+                let total_op_time = timings.entry(record.name).or_insert(Duration::ZERO);
+                *total_op_time += record.elapsed;
                 timings
             })
             .into_iter()
             .collect();
 
-        // Add `[Other]` for all unaccounted time.
-        let total_op_time = self.timing.total_op_time();
-        op_timings.push((
-            "[Other]",
-            self.timing.total_time - total_op_time - self.timing.alloc_time,
-        ));
-        op_timings.push(("[Mem alloc/free]", self.timing.alloc_time));
-
         op_timings.sort_by(|(a_name, a_time), (b_name, b_time)| match self.sort {
             TimingSort::ByName => a_name.cmp(b_name),
-            TimingSort::ByTime => a_time.total_cmp(b_time).reverse(),
+            TimingSort::ByTime => a_time.cmp(b_time).reverse(),
         });
 
         let rows: Vec<_> = op_timings
-            .iter()
+            .into_iter()
             .map(|(op_name, op_total_time)| {
-                let run_percent = (*op_total_time / self.timing.total_time) * 100.;
+                let op_total_time_ms = op_total_time.as_secs_f64() * 1000.0;
+                let total_time_ms = self.timing.total_time.as_secs_f64() * 1000.0;
+                let run_percent = (op_total_time_ms / total_time_ms) * 100.;
                 [
                     op_name.to_string(),
-                    format!("{:.2}ms", op_total_time),
+                    format!("{:.2}ms", op_total_time_ms),
                     format!("({:.2}%)", run_percent),
                 ]
             })
@@ -362,8 +348,8 @@ pub struct TimingRecord<'a> {
     /// Shapes of the operator's inputs
     pub input_shapes: Vec<InputShape>,
 
-    /// Execution time of this step in microseconds
-    pub elapsed_micros: f32,
+    /// Execution time of this step
+    pub elapsed: Duration,
 }
 
 /// Specifies sort order for graph run timings.
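
Note for reviewers: the timing pattern this patch adopts can be sketched outside the crate. The example below is illustrative only; `StepTiming` and the operator names are hypothetical placeholders, not rten types or APIs. It shows a start timestamp taken with `Instant`, per-step elapsed time stored as a `Duration`, and a run total obtained by summing the per-step durations, which is how `print_run_timing` derives its total after this change.

    use std::time::{Duration, Instant};

    // Hypothetical stand-in for a per-step timing record (not rten's TimingRecord).
    struct StepTiming {
        name: &'static str,
        elapsed: Duration,
    }

    fn main() {
        let mut records: Vec<StepTiming> = Vec::new();

        // Take a timestamp before each step and turn the gap into a Duration
        // once the step (including its cleanup) has finished, mirroring the
        // op_start / op_end handling in graph.rs above.
        let mut step_start = Instant::now();
        for (i, name) in ["MatMul", "Relu", "Softmax"].into_iter().enumerate() {
            // Stand-in for running an operator.
            std::thread::sleep(Duration::from_millis(5 * (i as u64 + 1)));

            let step_end = Instant::now();
            records.push(StepTiming {
                name,
                elapsed: step_end - step_start,
            });
            step_start = step_end;
        }

        // The run total is the sum of per-step durations; milliseconds are
        // only produced at display time.
        let total: Duration = records.iter().map(|r| r.elapsed).sum();
        for r in &records {
            let ms = r.elapsed.as_secs_f64() * 1000.0;
            let pct = 100.0 * r.elapsed.as_secs_f64() / total.as_secs_f64();
            println!("{:<8} {:8.3}ms ({:5.1}%)", r.name, ms, pct);
        }
        println!("total    {:8.3}ms", total.as_secs_f64() * 1000.0);
    }

Keeping elapsed time as a `Duration` and converting to milliseconds only when formatting (`as_secs_f64() * 1000.0`) avoids the unit juggling and precision loss of storing `f32` microseconds, which appears to be the motivation visible in the diff.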