Add metrics for time saved from local and remote caching #11601

Merged: 6 commits, Feb 25, 2021
14 changes: 13 additions & 1 deletion src/rust/engine/process_execution/src/cache.rs
@@ -1,4 +1,5 @@
use std::sync::Arc;
use std::time::Instant;

use async_trait::async_trait;
use bazel_protos::gen::build::bazel::remote::execution::v2 as remexec;
@@ -10,7 +11,7 @@ use prost::Message;
use serde::{Deserialize, Serialize};
use sharded_lmdb::ShardedLmdb;
use store::Store;
use workunit_store::{with_workunit, Level, Metric, WorkunitMetadata};
use workunit_store::{with_workunit, Level, Metric, ObservationMetric, WorkunitMetadata};

use crate::{
  Context, FallibleProcessResultWithPlatform, MultiPlatformProcess, Platform, Process,
@@ -59,6 +60,7 @@ impl crate::CommandRunner for CommandRunner {
    req: MultiPlatformProcess,
    context: Context,
  ) -> Result<FallibleProcessResultWithPlatform, String> {
    let cache_lookup_start = Instant::now();
    let context2 = context.clone();
    let cache_read_future = async move {
      context
@@ -76,9 +78,19 @@ impl crate::CommandRunner for CommandRunner {
    let command_runner = self.clone();
    match self.lookup(key).await {
      Ok(Some(result)) if result.exit_code == 0 || cache_failures => {
        let lookup_elapsed = cache_lookup_start.elapsed();
        context
          .workunit_store
          .increment_counter(Metric::LocalCacheRequestsCached, 1);
        if let Some(time_saved) = result.metadata.time_saved_from_cache(lookup_elapsed) {
          let time_saved = time_saved.as_millis() as u64;
          context
            .workunit_store
            .increment_counter(Metric::LocalCacheTotalTimeSavedMs, time_saved);
          context
            .workunit_store
            .record_observation(ObservationMetric::LocalCacheTimeSavedMs, time_saved);
        }
        return Ok(result);
      }
      Err(err) => {
24 changes: 24 additions & 0 deletions src/rust/engine/process_execution/src/lib.rs
@@ -370,6 +370,30 @@ impl ProcessResultMetadata {
  pub fn new(total_elapsed: Option<Duration>) -> Self {
    ProcessResultMetadata { total_elapsed }
  }

  /// How much faster a cache hit was than running the process again.
  ///
  /// The saving includes the overhead of setting up and cleaning up the process for execution;
  /// the given `cache_lookup` duration should include all overhead for the cache lookup.
  ///
  /// If the cache hit was slower than the original process, we return 0. Note that the cache hit
  /// may still have been faster than rerunning the process a second time, e.g. if speculation
  /// is used and the cache hit completed before the rerun; still, we cannot know how long the
  /// second run would have taken, so the best we can do is report 0.
  ///
  /// If the original process's execution time was not recorded, we return None because we
  /// cannot make a meaningful comparison.
  pub fn time_saved_from_cache(
    &self,
    cache_lookup: std::time::Duration,
  ) -> Option<std::time::Duration> {
    self.total_elapsed.and_then(|original_process| {
      let original_process: std::time::Duration = original_process.into();
      original_process
        .checked_sub(cache_lookup)
        .or_else(|| Some(std::time::Duration::new(0, 0)))
    })
  }
}

impl From<ExecutedActionMetadata> for ProcessResultMetadata {
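Taken together, the cache.rs and lib.rs hunks above implement a single flow: note when the cache lookup started, and on a hit subtract the lookup time from the stored execution time, clamping at zero and converting to milliseconds. A minimal, editorial std-only sketch of that flow (time_saved_ms is a hypothetical free function standing in for ProcessResultMetadata::time_saved_from_cache plus its call site; the workunit-store counter and observation calls are elided):

use std::time::{Duration, Instant};

// Hypothetical stand-in for the method above plus the millisecond conversion done
// at the call sites in cache.rs and remote_cache.rs.
fn time_saved_ms(original_elapsed: Option<Duration>, cache_lookup_start: Instant) -> Option<u64> {
  let lookup_elapsed = cache_lookup_start.elapsed();
  original_elapsed.map(|original| {
    original
      .checked_sub(lookup_elapsed)            // None if the lookup took longer than the original run...
      .unwrap_or_else(|| Duration::new(0, 0)) // ...which is reported as 0 saved.
      .as_millis() as u64
  })
}

fn main() {
  let cache_lookup_start = Instant::now();
  // ... the cache lookup would happen here ...
  // A hit whose original run took 5s reports roughly 5000ms minus the lookup time.
  println!("{:?}", time_saved_ms(Some(Duration::new(5, 0)), cache_lookup_start));
  // If the original execution time was never recorded, nothing is recorded for the metric.
  assert_eq!(time_saved_ms(None, cache_lookup_start), None);
}

The same shape appears in the remote_cache.rs hunk below, just with the remote-cache metric names.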
14 changes: 13 additions & 1 deletion src/rust/engine/process_execution/src/remote_cache.rs
@@ -2,6 +2,7 @@ use std::collections::{BTreeMap, HashSet, VecDeque};
use std::ffi::OsString;
use std::path::Component;
use std::sync::Arc;
use std::time::Instant;

use async_trait::async_trait;
use bazel_protos::gen::build::bazel::remote::execution::v2 as remexec;
@@ -14,7 +15,7 @@ use remexec::action_cache_client::ActionCacheClient;
use remexec::{ActionResult, Command, FileNode, Tree};
use store::Store;
use tonic::transport::Channel;
use workunit_store::{with_workunit, Level, Metric, WorkunitMetadata};
use workunit_store::{with_workunit, Level, Metric, ObservationMetric, WorkunitMetadata};

use crate::remote::make_execute_request;
use crate::{
@@ -391,6 +392,7 @@ impl crate::CommandRunner for CommandRunner {
    req: MultiPlatformProcess,
    context: Context,
  ) -> Result<FallibleProcessResultWithPlatform, String> {
    let cache_lookup_start = Instant::now();
    // Construct the REv2 ExecuteRequest and related data for this execution request.
    let request = self
      .extract_compatible_request(&req)
@@ -452,7 +454,17 @@ impl crate::CommandRunner for CommandRunner {
    tokio::select! {
      cache_result = cache_read_future => {
        if let Some(cached_response) = cache_result {
          let lookup_elapsed = cache_lookup_start.elapsed();
          context.workunit_store.increment_counter(Metric::RemoteCacheSpeculationRemoteCompletedFirst, 1);
          if let Some(time_saved) = cached_response.metadata.time_saved_from_cache(lookup_elapsed) {
            let time_saved = time_saved.as_millis() as u64;
            context
              .workunit_store
              .increment_counter(Metric::RemoteCacheTotalTimeSavedMs, time_saved);
Comment on lines +461 to +463
Member: FWIW, hdrhistogram has a sum method, which should be roughly equivalent here. It won't have perfect accuracy, but it might still be worth it from a clarity/redundancy perspective to avoid double storing things.

Contributor Author: Interesting, I don't see that with the Python library though: https://github.com/HdrHistogram/HdrHistogram_py/blob/master/hdrh/histogram.py

So IIUC, we would need to add special-casing code that sums the histogram in Rust and then exposes it over FFI to Python. The double counting seems cleaner than that.

Contributor: The Python HdrHistogram exposes an iterator (get_recorded_iterator, I believe) over all values. You could just sum over that, right?

Contributor Author: Sure, but that's still adding special-cased code to our stats_aggregator.py, and it loses precision.

A major use case I'm trying to facilitate: Pants users have reported in the past that they were trying to work out how much time was saved from caching compared to the cost of uploading/downloading the cache in CI. I want to make it possible to answer that question without users needing to add custom code, e.g. by putting this advice in our Pants and CI docs page.

            context
              .workunit_store
              .record_observation(ObservationMetric::RemoteCacheTimeSavedMs, time_saved);
          }
          return Ok(cached_response);
        } else {
          // Note that we don't increment a counter here, as there is nothing of note in this
18 changes: 18 additions & 0 deletions src/rust/engine/process_execution/src/tests.rs
@@ -88,3 +88,21 @@ fn process_result_metadata_to_and_from_executed_action_metadata() {
  let process_result_missing: ExecutedActionMetadata = ProcessResultMetadata::default().into();
  assert_eq!(process_result_missing, ExecutedActionMetadata::default());
}

#[test]
fn process_result_metadata_time_saved_from_cache() {
  let metadata = ProcessResultMetadata::new(Some(concrete_time::Duration::new(5, 150)));
  let time_saved = metadata.time_saved_from_cache(Duration::new(1, 100));
  assert_eq!(time_saved, Some(Duration::new(4, 50)));

  // If the cache lookup took more time than the process, we return 0.
  let metadata = ProcessResultMetadata::new(Some(concrete_time::Duration::new(1, 0)));
  let time_saved = metadata.time_saved_from_cache(Duration::new(5, 0));
  assert_eq!(time_saved, Some(Duration::new(0, 0)));

  // If the original process time wasn't recorded, we can't compute the time saved.
  assert_eq!(
    ProcessResultMetadata::default().time_saved_from_cache(Duration::new(1, 100)),
    None
  );
}
Comment on lines +103 to +107
Member: Should we consider storing a 0 here instead (similar to the negative case)? The idea would be to always have an entry in this histogram when we end up using the cache, even if we can't report a benefit.

Contributor Author: I don't think so - wouldn't that skew the data? Let's say we have 50 cache hits, only 30 of which had the original execution time recorded. That means that 40% of our histogram entries will be 0, which will skew the data to look like the caching is not very helpful.

FWIW, local execution should always be storing the original execution time. The only reason we might not have the execution time is that we can't guarantee remote execution has set the ExecutedActionMetadata. But cache.rs and remote_cache.rs only ever deal with local process executions, so in practice those codepaths will always have access to the original time.

Member:

> I don't think so - wouldn't that skew the data? Let's say we have 50 cache hits, only 30 of which had the original execution time recorded. That means that 40% of our histogram entries will be 0, which will skew the data to look like the caching is not very helpful.

It wouldn't skew the data... it would make it more accurate, if anything: if there are a bunch of zeros being recorded, we need to expose that, as it could be a problem in one of a few different places.

> It's possible to be negative if the current process run takes longer than the previous one did.
>
> When would that happen?

In exactly the case I've described, I think. You ran the process in the past: it took 1.5 seconds to run. You're running it again, and racing it against a cache lookup. For some reason, when you run the process again, it takes a lot longer to run than it did the first time... say 3 seconds. Meanwhile, your cache lookup also takes a lot longer, but still wins the race in 2 seconds. You end up with a time saved value of 1.5 - 2 = -0.5 seconds.
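(Editorial aside, not part of the original thread: a minimal std-only illustration of how that negative case collapses to zero via the checked_sub call in the lib.rs hunk above.)

use std::time::Duration;

fn main() {
  // Original run: 1.5s. Winning cache lookup: 2s.
  let original = Duration::from_millis(1500);
  let lookup = Duration::from_millis(2000);

  // Duration::checked_sub returns None rather than a negative Duration,
  // so the metric falls back to reporting a saving of 0.
  let saved = original.checked_sub(lookup).unwrap_or_else(|| Duration::new(0, 0));
  assert_eq!(saved, Duration::new(0, 0));
}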

Contributor Author (Eric-Arellano, Feb 25, 2021): Okay, so there are two different scenarios, and I think we want to handle them differently:

  1. The original time was not recorded. As explained above, this should never happen, because local execution should always have the timing recorded, and cache.rs and remote_cache.rs solely work with local processes, not remote execution. We're only handling the missing-data case for type safety - if you want, we could error or use Result in this case, so that we eagerly know when the original process time wasn't recorded? I'm not comfortable with using 0 to represent this case.
  2. The cache hit was slower than the original process, but still faster than rerunning the process (via speculation). This is the negative-numbers case. We, by definition, cannot know how slow the process would be on the second run, so the best we can do is represent it with a 0. That sounds sensible to me.

So, we could change the modeling of .time_saved_from_cache() to be Result<Duration, String>, where a Duration of 0 means the cache hit was slower than the original process. Or stick with Option<Duration> and use 0. Wdyt?

Member:

> The original time was not recorded. As explained above, this should never happen, because local execution should always have the timing recorded, and cache.rs and remote_cache.rs solely work with local processes, not remote execution. We're only handling the missing-data case for type safety - if you want, we could error or use Result in this case, so that we eagerly know when the original process time wasn't recorded? I'm not comfortable with using 0 to represent this case.

It can happen in the context of remote execution, where a server does not return the value. We shouldn't error for a server that cannot return us metrics, but we should do something to indicate that the server is reporting bad metrics.

> So, we could change the modeling of .time_saved_from_cache() to be Result<Duration, String>, where a Duration of 0 means the cache hit was slower than the original process. Or stick with Option<Duration> and use 0. Wdyt?

Warnings would be annoying, and we'd probably need to add a way to disable them. We could put it at debug level, but then there's a fair chance we'd never notice it. So yeah, I'd still vote for 0 here, as it exposes the issue iff someone is actually looking at these metrics... but debug-level logging wouldn't be the end of the world.

> That sounds sensible to me.

👍

Contributor Author:

> It can happen in the context of remote execution, where a server does not return the value.

Yes, but cache.rs and remote_cache.rs never will consume the results of remote execution, right? I'm only talking about case #1 here.

K, I'll change case #2 to use 0, but keep Option to represent case #1.

Member:

> Yes, but cache.rs and remote_cache.rs never will consume the results of remote execution, right? I'm only talking about case #1 here.

They will if a local cache is wrapped around remote execution, which is common.

Contributor Author:

> They will if a local cache is wrapped around remote execution, which is common.

I'm not familiar with this workflow - when does this happen?

Member:

// Possibly either add the remote execution runner or the remote cache runner.
// `global_options.py` already validates that both are not set at the same time.
let maybe_remote_enabled_command_runner: Box<dyn CommandRunner> =
  if remoting_opts.execution_enable {
    Box::new(BoundedCommandRunner::new(
      Box::new(process_execution::remote::CommandRunner::new(
        // No problem unwrapping here because the global options validation
        // requires the remoting_opts.execution_server be present when
        // remoting_opts.execution_enable is set.
        &remoting_opts.execution_address.clone().unwrap(),
        remoting_opts.store_addresses.clone(),
        process_execution_metadata.clone(),
        root_ca_certs.clone(),
        remoting_opts.execution_headers.clone(),
        full_store.clone(),
        // TODO if we ever want to configure the remote platform to be something else we
        // need to take an option all the way down here and into the remote::CommandRunner struct.
        Platform::Linux,
        remoting_opts.execution_overall_deadline,
        Duration::from_millis(100),
      )?),
      exec_strategy_opts.remote_parallelism,
    ))
  } else if remote_caching_used {
    let action_cache_address = remote_store_addresses
      .first()
      .ok_or_else(|| "At least one remote store must be specified".to_owned())?;
    Box::new(process_execution::remote_cache::CommandRunner::new(
      local_command_runner.into(),
      process_execution_metadata.clone(),
      executor.clone(),
      full_store.clone(),
      action_cache_address.as_str(),
      root_ca_certs.clone(),
      remoting_opts.store_headers.clone(),
      Platform::current()?,
      exec_strategy_opts.remote_cache_read,
      exec_strategy_opts.remote_cache_write,
      remoting_opts.cache_eager_fetch,
    )?)
  } else {
    local_command_runner
  };

// Possibly use the local cache runner, regardless of remote execution/caching.
let maybe_local_cached_command_runner = if exec_strategy_opts.use_local_cache {
  let process_execution_store = ShardedLmdb::new(
    local_store_dir.join("processes"),
    2 * DEFAULT_LOCAL_STORE_GC_TARGET_BYTES,
    executor.clone(),
    DEFAULT_LEASE_TIME,
  )
  .map_err(|err| format!("Could not initialize store for process cache: {:?}", err))?;
  Box::new(process_execution::cache::CommandRunner::new(
    maybe_remote_enabled_command_runner.into(),
    process_execution_store,
    full_store.clone(),
    process_execution_metadata.clone(),
  ))
} else {
  maybe_remote_enabled_command_runner
};

maybe_local_cached_command_runner is wrapped around maybe_remote_enabled_command_runner.

12 changes: 12 additions & 0 deletions src/rust/engine/workunit_store/src/metrics.rs
@@ -48,6 +48,9 @@ pub enum Metric {
LocalCacheRequestsUncached,
LocalCacheReadErrors,
LocalCacheWriteErrors,
/// The total time saved (in milliseconds) thanks to local cache hits instead of running the
/// processes directly.
LocalCacheTotalTimeSavedMs,
LocalExecutionRequests,
RemoteCacheRequests,
RemoteCacheRequestsCached,
@@ -58,6 +61,9 @@ pub enum Metric {
RemoteCacheWriteFinished,
RemoteCacheSpeculationLocalCompletedFirst,
RemoteCacheSpeculationRemoteCompletedFirst,
/// The total time saved (in milliseconds) thanks to remote cache hits instead of running the
/// processes directly.
RemoteCacheTotalTimeSavedMs,
RemoteExecutionErrors,
RemoteExecutionRequests,
RemoteExecutionRPCErrors,
@@ -81,4 +87,10 @@ pub enum ObservationMetric {
LocalStoreReadBlobSize,
RemoteExecutionRPCFirstResponseTime,
RemoteStoreTimeToFirstByte,
/// The time saved (in milliseconds) thanks to a local cache hit instead of running the process
/// directly.
LocalCacheTimeSavedMs,
/// The time saved (in milliseconds) thanks to a remote cache hit instead of running the process
/// directly.
RemoteCacheTimeSavedMs,
}
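The counter and observation variants expose the same data in two shapes: the TotalTimeSavedMs counters keep a cheap running sum, while the per-hit TimeSavedMs observations feed histograms from which percentiles can be derived. A rough, editorial std-only sketch of that relationship (the struct and field names are illustrative only, not Pants APIs):

// Illustrative only: a counter accumulates a total, while observations keep every
// per-hit value so a histogram and percentiles can be computed later.
struct CacheMetrics {
  local_cache_total_time_saved_ms: u64, // counter (Metric::LocalCacheTotalTimeSavedMs)
  local_cache_time_saved_ms: Vec<u64>,  // observations (ObservationMetric::LocalCacheTimeSavedMs)
}

impl CacheMetrics {
  fn record_hit(&mut self, time_saved_ms: u64) {
    self.local_cache_total_time_saved_ms += time_saved_ms;
    self.local_cache_time_saved_ms.push(time_saved_ms);
  }
}

fn main() {
  let mut metrics = CacheMetrics {
    local_cache_total_time_saved_ms: 0,
    local_cache_time_saved_ms: Vec::new(),
  };
  for saved in [1200_u64, 0, 450] {
    metrics.record_hit(saved);
  }
  assert_eq!(metrics.local_cache_total_time_saved_ms, 1650);
  assert_eq!(metrics.local_cache_time_saved_ms.len(), 3);
}

This is the double-storing trade-off discussed in the first review thread above: the sum could in principle be recovered from the histogram, at the cost of some precision and extra aggregation code.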