diff --git a/src/rust/engine/process_execution/src/cache.rs b/src/rust/engine/process_execution/src/cache.rs index 57564e65fa4..359592585f0 100644 --- a/src/rust/engine/process_execution/src/cache.rs +++ b/src/rust/engine/process_execution/src/cache.rs @@ -1,4 +1,5 @@ use std::sync::Arc; +use std::time::Instant; use async_trait::async_trait; use bazel_protos::gen::build::bazel::remote::execution::v2 as remexec; @@ -10,7 +11,7 @@ use prost::Message; use serde::{Deserialize, Serialize}; use sharded_lmdb::ShardedLmdb; use store::Store; -use workunit_store::{with_workunit, Level, Metric, WorkunitMetadata}; +use workunit_store::{with_workunit, Level, Metric, ObservationMetric, WorkunitMetadata}; use crate::{ Context, FallibleProcessResultWithPlatform, MultiPlatformProcess, Platform, Process, @@ -59,6 +60,7 @@ impl crate::CommandRunner for CommandRunner { req: MultiPlatformProcess, context: Context, ) -> Result { + let cache_lookup_start = Instant::now(); let context2 = context.clone(); let cache_read_future = async move { context @@ -76,9 +78,19 @@ impl crate::CommandRunner for CommandRunner { let command_runner = self.clone(); match self.lookup(key).await { Ok(Some(result)) if result.exit_code == 0 || cache_failures => { + let lookup_elapsed = cache_lookup_start.elapsed(); context .workunit_store .increment_counter(Metric::LocalCacheRequestsCached, 1); + if let Some(time_saved) = result.metadata.time_saved_from_cache(lookup_elapsed) { + let time_saved = time_saved.as_millis() as u64; + context + .workunit_store + .increment_counter(Metric::LocalCacheTotalTimeSavedMs, time_saved); + context + .workunit_store + .record_observation(ObservationMetric::LocalCacheTimeSavedMs, time_saved); + } return Ok(result); } Err(err) => { diff --git a/src/rust/engine/process_execution/src/lib.rs b/src/rust/engine/process_execution/src/lib.rs index 917333c97be..1114b700e0a 100644 --- a/src/rust/engine/process_execution/src/lib.rs +++ b/src/rust/engine/process_execution/src/lib.rs @@ -370,6 +370,30 @@ impl ProcessResultMetadata { pub fn new(total_elapsed: Option) -> Self { ProcessResultMetadata { total_elapsed } } + + /// How much faster a cache hit was than running the process again. + /// + /// This includes the overhead of setting up and cleaning up the process for execution, and it + /// should include all overhead for the cache lookup. + /// + /// If the cache hit was slower than the original process, we return 0. Note that the cache hit + /// may still have been faster than rerunning the process a second time, e.g. if speculation + /// is used and the cache hit completed before the rerun; still, we cannot know how long the + /// second run would have taken, so the best we can do is report 0. + /// + /// If the original process's execution time was not recorded, we return None because we + /// cannot make a meaningful comparison. + pub fn time_saved_from_cache( + &self, + cache_lookup: std::time::Duration, + ) -> Option { + self.total_elapsed.and_then(|original_process| { + let original_process: std::time::Duration = original_process.into(); + original_process + .checked_sub(cache_lookup) + .or_else(|| Some(std::time::Duration::new(0, 0))) + }) + } } impl From for ProcessResultMetadata { diff --git a/src/rust/engine/process_execution/src/remote_cache.rs b/src/rust/engine/process_execution/src/remote_cache.rs index 7ae1f00448d..3507f2f0594 100644 --- a/src/rust/engine/process_execution/src/remote_cache.rs +++ b/src/rust/engine/process_execution/src/remote_cache.rs @@ -2,6 +2,7 @@ use std::collections::{BTreeMap, HashSet, VecDeque}; use std::ffi::OsString; use std::path::Component; use std::sync::Arc; +use std::time::Instant; use async_trait::async_trait; use bazel_protos::gen::build::bazel::remote::execution::v2 as remexec; @@ -14,7 +15,7 @@ use remexec::action_cache_client::ActionCacheClient; use remexec::{ActionResult, Command, FileNode, Tree}; use store::Store; use tonic::transport::Channel; -use workunit_store::{with_workunit, Level, Metric, WorkunitMetadata}; +use workunit_store::{with_workunit, Level, Metric, ObservationMetric, WorkunitMetadata}; use crate::remote::make_execute_request; use crate::{ @@ -391,6 +392,7 @@ impl crate::CommandRunner for CommandRunner { req: MultiPlatformProcess, context: Context, ) -> Result { + let cache_lookup_start = Instant::now(); // Construct the REv2 ExecuteRequest and related data for this execution request. let request = self .extract_compatible_request(&req) @@ -452,7 +454,17 @@ impl crate::CommandRunner for CommandRunner { tokio::select! { cache_result = cache_read_future => { if let Some(cached_response) = cache_result { + let lookup_elapsed = cache_lookup_start.elapsed(); context.workunit_store.increment_counter(Metric::RemoteCacheSpeculationRemoteCompletedFirst, 1); + if let Some(time_saved) = cached_response.metadata.time_saved_from_cache(lookup_elapsed) { + let time_saved = time_saved.as_millis() as u64; + context + .workunit_store + .increment_counter(Metric::RemoteCacheTotalTimeSavedMs, time_saved); + context + .workunit_store + .record_observation(ObservationMetric::RemoteCacheTimeSavedMs, time_saved); + } return Ok(cached_response); } else { // Note that we don't increment a counter here, as there is nothing of note in this diff --git a/src/rust/engine/process_execution/src/tests.rs b/src/rust/engine/process_execution/src/tests.rs index a8ec6b999c5..4a9e9113e28 100644 --- a/src/rust/engine/process_execution/src/tests.rs +++ b/src/rust/engine/process_execution/src/tests.rs @@ -88,3 +88,21 @@ fn process_result_metadata_to_and_from_executed_action_metadata() { let process_result_missing: ExecutedActionMetadata = ProcessResultMetadata::default().into(); assert_eq!(process_result_missing, ExecutedActionMetadata::default()); } + +#[test] +fn process_result_metadata_time_saved_from_cache() { + let metadata = ProcessResultMetadata::new(Some(concrete_time::Duration::new(5, 150))); + let time_saved = metadata.time_saved_from_cache(Duration::new(1, 100)); + assert_eq!(time_saved, Some(Duration::new(4, 50))); + + // If the cache lookup took more time than the process, we return 0. + let metadata = ProcessResultMetadata::new(Some(concrete_time::Duration::new(1, 0))); + let time_saved = metadata.time_saved_from_cache(Duration::new(5, 0)); + assert_eq!(time_saved, Some(Duration::new(0, 0))); + + // If the original process time wasn't recorded, we can't compute the time saved. + assert_eq!( + ProcessResultMetadata::default().time_saved_from_cache(Duration::new(1, 100)), + None + ); +} diff --git a/src/rust/engine/workunit_store/src/metrics.rs b/src/rust/engine/workunit_store/src/metrics.rs index e124e45f75a..e3349431ec3 100644 --- a/src/rust/engine/workunit_store/src/metrics.rs +++ b/src/rust/engine/workunit_store/src/metrics.rs @@ -48,6 +48,9 @@ pub enum Metric { LocalCacheRequestsUncached, LocalCacheReadErrors, LocalCacheWriteErrors, + /// The total time saved (in milliseconds) thanks to local cache hits instead of running the + /// processes directly. + LocalCacheTotalTimeSavedMs, LocalExecutionRequests, RemoteCacheRequests, RemoteCacheRequestsCached, @@ -58,6 +61,9 @@ pub enum Metric { RemoteCacheWriteFinished, RemoteCacheSpeculationLocalCompletedFirst, RemoteCacheSpeculationRemoteCompletedFirst, + /// The total time saved (in milliseconds) thanks to remote cache hits instead of running the + /// processes directly. + RemoteCacheTotalTimeSavedMs, RemoteExecutionErrors, RemoteExecutionRequests, RemoteExecutionRPCErrors, @@ -81,4 +87,10 @@ pub enum ObservationMetric { LocalStoreReadBlobSize, RemoteExecutionRPCFirstResponseTime, RemoteStoreTimeToFirstByte, + /// The time saved (in milliseconds) thanks to a local cache hit instead of running the process + /// directly. + LocalCacheTimeSavedMs, + /// The time saved (in milliseconds) thanks to a remote cache hit instead of running the process + /// directly. + RemoteCacheTimeSavedMs, }