Add metrics for time saved from local and remote caching #11601

Merged: 6 commits, Feb 25, 2021
14 changes: 13 additions & 1 deletion src/rust/engine/process_execution/src/cache.rs
@@ -1,4 +1,5 @@
use std::sync::Arc;
use std::time::Instant;

use async_trait::async_trait;
use bazel_protos::gen::build::bazel::remote::execution::v2 as remexec;
@@ -10,7 +11,7 @@ use prost::Message;
use serde::{Deserialize, Serialize};
use sharded_lmdb::ShardedLmdb;
use store::Store;
use workunit_store::{with_workunit, Level, Metric, WorkunitMetadata};
use workunit_store::{with_workunit, Level, Metric, ObservationMetric, WorkunitMetadata};

use crate::{
  Context, FallibleProcessResultWithPlatform, MultiPlatformProcess, Platform, Process,
@@ -59,6 +60,7 @@ impl crate::CommandRunner for CommandRunner {
    req: MultiPlatformProcess,
    context: Context,
  ) -> Result<FallibleProcessResultWithPlatform, String> {
    let cache_lookup_start = Instant::now();
    let context2 = context.clone();
    let cache_read_future = async move {
      context
@@ -76,9 +78,19 @@ impl crate::CommandRunner for CommandRunner {
    let command_runner = self.clone();
    match self.lookup(key).await {
      Ok(Some(result)) if result.exit_code == 0 || cache_failures => {
        let lookup_elapsed = cache_lookup_start.elapsed();
        context
          .workunit_store
          .increment_counter(Metric::LocalCacheRequestsCached, 1);
        if let Some(time_saved) = result.metadata.time_saved_from_cache(lookup_elapsed) {
          let time_saved = time_saved.as_millis() as u64;
          context
            .workunit_store
            .increment_counter(Metric::LocalCacheTotalTimeSavedMs, time_saved);
          context
            .workunit_store
            .record_observation(ObservationMetric::LocalCacheTimeSavedMs, time_saved);
        }
        return Ok(result);
      }
      Err(err) => {
24 changes: 24 additions & 0 deletions src/rust/engine/process_execution/src/lib.rs
@@ -370,6 +370,30 @@ impl ProcessResultMetadata {
  pub fn new(total_elapsed: Option<Duration>) -> Self {
    ProcessResultMetadata { total_elapsed }
  }

  /// How much faster a cache hit was than running the process again.
  ///
  /// The saving includes the overhead of setting up and cleaning up the process for execution;
  /// the given `cache_lookup` duration should include all overhead for the cache lookup.
  ///
  /// If the cache hit was slower than the original process, we return 0. Note that the cache hit
  /// may still have been faster than rerunning the process a second time, e.g. if speculation
  /// is used and the cache hit completed before the rerun; still, we cannot know how long the
  /// second run would have taken, so the best we can do is report 0.
  ///
  /// If the original process's execution time was not recorded, we return None because we
  /// cannot make a meaningful comparison.
  pub fn time_saved_from_cache(
    &self,
    cache_lookup: std::time::Duration,
  ) -> Option<std::time::Duration> {
    self.total_elapsed.and_then(|original_process| {
      let original_process: std::time::Duration = original_process.into();
      original_process
        .checked_sub(cache_lookup)
        .or_else(|| Some(std::time::Duration::new(0, 0)))
    })
  }
}

impl From<ExecutedActionMetadata> for ProcessResultMetadata {
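Taken together, the cache.rs and lib.rs hunks above implement a single flow: note when the cache lookup started, and on a hit subtract the lookup time from the stored execution time, clamping at zero and converting to milliseconds. A minimal, editorial std-only sketch of that flow (time_saved_ms is a hypothetical free function standing in for ProcessResultMetadata::time_saved_from_cache plus its call site; the workunit-store counter and observation calls are elided):

use std::time::{Duration, Instant};

// Hypothetical stand-in for the method above plus the millisecond conversion done
// at the call sites in cache.rs and remote_cache.rs.
fn time_saved_ms(original_elapsed: Option<Duration>, cache_lookup_start: Instant) -> Option<u64> {
  let lookup_elapsed = cache_lookup_start.elapsed();
  original_elapsed.map(|original| {
    original
      .checked_sub(lookup_elapsed)            // None if the lookup took longer than the original run...
      .unwrap_or_else(|| Duration::new(0, 0)) // ...which is reported as 0 saved.
      .as_millis() as u64
  })
}

fn main() {
  let cache_lookup_start = Instant::now();
  // ... the cache lookup would happen here ...
  // A hit whose original run took 5s reports roughly 5000ms minus the lookup time.
  println!("{:?}", time_saved_ms(Some(Duration::new(5, 0)), cache_lookup_start));
  // If the original execution time was never recorded, nothing is recorded for the metric.
  assert_eq!(time_saved_ms(None, cache_lookup_start), None);
}

The same shape appears in the remote_cache.rs hunk below, just with the remote-cache metric names.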
14 changes: 13 additions & 1 deletion src/rust/engine/process_execution/src/remote_cache.rs
@@ -2,6 +2,7 @@ use std::collections::{BTreeMap, HashSet, VecDeque};
use std::ffi::OsString;
use std::path::Component;
use std::sync::Arc;
use std::time::Instant;

use async_trait::async_trait;
use bazel_protos::gen::build::bazel::remote::execution::v2 as remexec;
@@ -14,7 +15,7 @@ use remexec::action_cache_client::ActionCacheClient;
use remexec::{ActionResult, Command, FileNode, Tree};
use store::Store;
use tonic::transport::Channel;
use workunit_store::{with_workunit, Level, Metric, WorkunitMetadata};
use workunit_store::{with_workunit, Level, Metric, ObservationMetric, WorkunitMetadata};

use crate::remote::make_execute_request;
use crate::{
@@ -391,6 +392,7 @@ impl crate::CommandRunner for CommandRunner {
    req: MultiPlatformProcess,
    context: Context,
  ) -> Result<FallibleProcessResultWithPlatform, String> {
    let cache_lookup_start = Instant::now();
    // Construct the REv2 ExecuteRequest and related data for this execution request.
    let request = self
      .extract_compatible_request(&req)
@@ -452,7 +454,17 @@ impl crate::CommandRunner for CommandRunner {
    tokio::select! {
      cache_result = cache_read_future => {
        if let Some(cached_response) = cache_result {
          let lookup_elapsed = cache_lookup_start.elapsed();
          context.workunit_store.increment_counter(Metric::RemoteCacheSpeculationRemoteCompletedFirst, 1);
          if let Some(time_saved) = cached_response.metadata.time_saved_from_cache(lookup_elapsed) {
            let time_saved = time_saved.as_millis() as u64;
            context
              .workunit_store
              .increment_counter(Metric::RemoteCacheTotalTimeSavedMs, time_saved);
Comment on lines +461 to +463
Member: FWIW, hdrhistogram has a sum method, which should be roughly equivalent here. It won't have perfect accuracy, but it might still be worth it from a clarity/redundancy perspective to avoid double storing things.

Contributor Author: Interesting, I don't see that with the Python library though: https://github.com/HdrHistogram/HdrHistogram_py/blob/master/hdrh/histogram.py

So IIUC, we would need to add special-casing code that sums the histogram in Rust and then exposes it over FFI to Python. The double counting seems cleaner than that.

Contributor: The Python HdrHistogram exposes an iterator (get_recorded_iterator, I believe) over all values. You could just sum over that, right?

Contributor Author: Sure, but that's still adding special-cased code to our stats_aggregator.py, and it loses precision.

A major use case I'm trying to facilitate: Pants users have reported in the past that they were trying to work out how much time was saved from caching compared to the cost of uploading/downloading the cache in CI. I want to make it possible to answer that question without users needing to add custom code, e.g. by putting this advice in our Pants and CI docs page.

            context
              .workunit_store
              .record_observation(ObservationMetric::RemoteCacheTimeSavedMs, time_saved);
          }
          return Ok(cached_response);
        } else {
          // Note that we don't increment a counter here, as there is nothing of note in this
18 changes: 18 additions & 0 deletions src/rust/engine/process_execution/src/tests.rs
@@ -88,3 +88,21 @@ fn process_result_metadata_to_and_from_executed_action_metadata() {
  let process_result_missing: ExecutedActionMetadata = ProcessResultMetadata::default().into();
  assert_eq!(process_result_missing, ExecutedActionMetadata::default());
}

#[test]
fn process_result_metadata_time_saved_from_cache() {
  let metadata = ProcessResultMetadata::new(Some(concrete_time::Duration::new(5, 150)));
  let time_saved = metadata.time_saved_from_cache(Duration::new(1, 100));
  assert_eq!(time_saved, Some(Duration::new(4, 50)));

  // If the cache lookup took more time than the process, we return 0.
  let metadata = ProcessResultMetadata::new(Some(concrete_time::Duration::new(1, 0)));
  let time_saved = metadata.time_saved_from_cache(Duration::new(5, 0));
  assert_eq!(time_saved, Some(Duration::new(0, 0)));

  // If the original process time wasn't recorded, we can't compute the time saved.
  assert_eq!(
    ProcessResultMetadata::default().time_saved_from_cache(Duration::new(1, 100)),
    None
  );
}
Comment on lines +103 to +107
Member: Should we consider storing a 0 here instead (similar to the negative case)? The idea would be to always have an entry in this histogram when we end up using the cache, even if we can't report a benefit.

Contributor Author: I don't think so - wouldn't that skew the data? Let's say we have 50 cache hits, only 30 of which had the original execution time recorded. That means that 40% of our histogram entries will be 0, which will skew the data to look like the caching is not very helpful.

FWIW, local execution should always be storing the original execution time. The only reason we might not have the execution time is that we can't guarantee remote execution has set the ExecutedActionMetadata. But cache.rs and remote_cache.rs only ever deal with local process executions, so in practice those codepaths will always have access to the original time.

Member:

> I don't think so - wouldn't that skew the data? Let's say we have 50 cache hits, only 30 of which had the original execution time recorded. That means that 40% of our histogram entries will be 0, which will skew the data to look like the caching is not very helpful.

It wouldn't skew the data... it would make it more accurate, if anything: if there are a bunch of zeros being recorded, we need to expose that, as it could be a problem in one of a few different places.

> It's possible to be negative if the current process run takes longer than the previous one did.
>
> When would that happen?

In exactly the case I've described, I think. You ran the process in the past: it took 1.5 seconds to run. You're running it again, and racing it against a cache lookup. For some reason, when you run the process again, it takes a lot longer to run than it did the first time... say 3 seconds. Meanwhile, your cache lookup also takes a lot longer, but still wins the race in 2 seconds. You end up with a time saved value of 1.5 - 2 = -0.5 seconds.
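(Editorial aside, not part of the original thread: a minimal std-only illustration of how that negative case collapses to zero via the checked_sub call in the lib.rs hunk above.)

use std::time::Duration;

fn main() {
  // Original run: 1.5s. Winning cache lookup: 2s.
  let original = Duration::from_millis(1500);
  let lookup = Duration::from_millis(2000);

  // Duration::checked_sub returns None rather than a negative Duration,
  // so the metric falls back to reporting a saving of 0.
  let saved = original.checked_sub(lookup).unwrap_or_else(|| Duration::new(0, 0));
  assert_eq!(saved, Duration::new(0, 0));
}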

Contributor Author (Eric-Arellano, Feb 25, 2021): Okay, so there are two different scenarios, and I think we want to handle them differently:

  1. The original time was not recorded. As explained above, this should never happen, because local execution should always have the timing recorded, and cache.rs and remote_cache.rs solely work with local processes, not remote execution. We're only handling the missing-data case for type safety - if you want, we could error or use Result in this case, so that we eagerly know when the original process time wasn't recorded? I'm not comfortable with using 0 to represent this case.
  2. The cache hit was slower than the original process, but still faster than rerunning the process (via speculation). This is the negative-numbers case. We, by definition, cannot know how slow the process would be on the second run, so the best we can do is represent it with a 0. That sounds sensible to me.

So, we could change the modeling of .time_saved_from_cache() to be Result<Duration, String>, where a Duration of 0 means the cache hit was slower than the original process. Or stick with Option<Duration> and use 0. Wdyt?

Member:

> The original time was not recorded. As explained above, this should never happen, because local execution should always have the timing recorded, and cache.rs and remote_cache.rs solely work with local processes, not remote execution. We're only handling the missing-data case for type safety - if you want, we could error or use Result in this case, so that we eagerly know when the original process time wasn't recorded? I'm not comfortable with using 0 to represent this case.

It can happen in the context of remote execution, where a server does not return the value. We shouldn't error for a server that cannot return us metrics, but we should do something to indicate that the server is reporting bad metrics.

> So, we could change the modeling of .time_saved_from_cache() to be Result<Duration, String>, where a Duration of 0 means the cache hit was slower than the original process. Or stick with Option<Duration> and use 0. Wdyt?

Warnings would be annoying, and we'd probably need to add a way to disable them. We could put it at debug level, but then there's a fair chance we'd never notice it. So yeah, I'd still vote for 0 here, as it exposes the issue iff someone is actually looking at these metrics... but debug-level logging wouldn't be the end of the world.

> That sounds sensible to me.

👍

Contributor Author:

> It can happen in the context of remote execution, where a server does not return the value.

Yes, but cache.rs and remote_cache.rs never will consume the results of remote execution, right? I'm only talking about case #1 here.

K, I'll change case #2 to use 0, but keep Option to represent case #1.

Member:

> Yes, but cache.rs and remote_cache.rs never will consume the results of remote execution, right? I'm only talking about case #1 here.

They will if a local cache is wrapped around remote execution, which is common.

Contributor Author:

> They will if a local cache is wrapped around remote execution, which is common.

I'm not familiar with this workflow - when does this happen?

Member:

// Possibly either add the remote execution runner or the remote cache runner.
// `global_options.py` already validates that both are not set at the same time.
let maybe_remote_enabled_command_runner: Box<dyn CommandRunner> =
  if remoting_opts.execution_enable {
    Box::new(BoundedCommandRunner::new(
      Box::new(process_execution::remote::CommandRunner::new(
        // No problem unwrapping here because the global options validation
        // requires the remoting_opts.execution_server be present when
        // remoting_opts.execution_enable is set.
        &remoting_opts.execution_address.clone().unwrap(),
        remoting_opts.store_addresses.clone(),
        process_execution_metadata.clone(),
        root_ca_certs.clone(),
        remoting_opts.execution_headers.clone(),
        full_store.clone(),
        // TODO if we ever want to configure the remote platform to be something else we
        // need to take an option all the way down here and into the remote::CommandRunner struct.
        Platform::Linux,
        remoting_opts.execution_overall_deadline,
        Duration::from_millis(100),
      )?),
      exec_strategy_opts.remote_parallelism,
    ))
  } else if remote_caching_used {
    let action_cache_address = remote_store_addresses
      .first()
      .ok_or_else(|| "At least one remote store must be specified".to_owned())?;
    Box::new(process_execution::remote_cache::CommandRunner::new(
      local_command_runner.into(),
      process_execution_metadata.clone(),
      executor.clone(),
      full_store.clone(),
      action_cache_address.as_str(),
      root_ca_certs.clone(),
      remoting_opts.store_headers.clone(),
      Platform::current()?,
      exec_strategy_opts.remote_cache_read,
      exec_strategy_opts.remote_cache_write,
      remoting_opts.cache_eager_fetch,
    )?)
  } else {
    local_command_runner
  };

// Possibly use the local cache runner, regardless of remote execution/caching.
let maybe_local_cached_command_runner = if exec_strategy_opts.use_local_cache {
  let process_execution_store = ShardedLmdb::new(
    local_store_dir.join("processes"),
    2 * DEFAULT_LOCAL_STORE_GC_TARGET_BYTES,
    executor.clone(),
    DEFAULT_LEASE_TIME,
  )
  .map_err(|err| format!("Could not initialize store for process cache: {:?}", err))?;
  Box::new(process_execution::cache::CommandRunner::new(
    maybe_remote_enabled_command_runner.into(),
    process_execution_store,
    full_store.clone(),
    process_execution_metadata.clone(),
  ))
} else {
  maybe_remote_enabled_command_runner
};

maybe_local_cached_command_runner is wrapped around maybe_remote_enabled_command_runner.

12 changes: 12 additions & 0 deletions src/rust/engine/workunit_store/src/metrics.rs
@@ -48,6 +48,9 @@ pub enum Metric {
LocalCacheRequestsUncached,
LocalCacheReadErrors,
LocalCacheWriteErrors,
/// The total time saved (in milliseconds) thanks to local cache hits instead of running the
/// processes directly.
LocalCacheTotalTimeSavedMs,
LocalExecutionRequests,
RemoteCacheRequests,
RemoteCacheRequestsCached,
@@ -58,6 +61,9 @@ pub enum Metric {
RemoteCacheWriteFinished,
RemoteCacheSpeculationLocalCompletedFirst,
RemoteCacheSpeculationRemoteCompletedFirst,
/// The total time saved (in milliseconds) thanks to remote cache hits instead of running the
/// processes directly.
RemoteCacheTotalTimeSavedMs,
RemoteExecutionErrors,
RemoteExecutionRequests,
RemoteExecutionRPCErrors,
@@ -81,4 +87,10 @@ pub enum ObservationMetric {
LocalStoreReadBlobSize,
RemoteExecutionRPCFirstResponseTime,
RemoteStoreTimeToFirstByte,
/// The time saved (in milliseconds) thanks to a local cache hit instead of running the process
/// directly.
LocalCacheTimeSavedMs,
/// The time saved (in milliseconds) thanks to a remote cache hit instead of running the process
/// directly.
RemoteCacheTimeSavedMs,
}
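The counter and observation variants expose the same data in two shapes: the TotalTimeSavedMs counters keep a cheap running sum, while the per-hit TimeSavedMs observations feed histograms from which percentiles can be derived. A rough, editorial std-only sketch of that relationship (the struct and field names are illustrative only, not Pants APIs):

// Illustrative only: a counter accumulates a total, while observations keep every
// per-hit value so a histogram and percentiles can be computed later.
struct CacheMetrics {
  local_cache_total_time_saved_ms: u64, // counter (Metric::LocalCacheTotalTimeSavedMs)
  local_cache_time_saved_ms: Vec<u64>,  // observations (ObservationMetric::LocalCacheTimeSavedMs)
}

impl CacheMetrics {
  fn record_hit(&mut self, time_saved_ms: u64) {
    self.local_cache_total_time_saved_ms += time_saved_ms;
    self.local_cache_time_saved_ms.push(time_saved_ms);
  }
}

fn main() {
  let mut metrics = CacheMetrics {
    local_cache_total_time_saved_ms: 0,
    local_cache_time_saved_ms: Vec::new(),
  };
  for saved in [1200_u64, 0, 450] {
    metrics.record_hit(saved);
  }
  assert_eq!(metrics.local_cache_total_time_saved_ms, 1650);
  assert_eq!(metrics.local_cache_time_saved_ms.len(), 3);
}

This is the double-storing trade-off discussed in the first review thread above: the sum could in principle be recovered from the histogram, at the cost of some precision and extra aggregation code.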