Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

pageserver: expose prometheus metrics for startup time #4893

Merged
merged 14 commits into from
Aug 8, 2023
Merged
49 changes: 28 additions & 21 deletions pageserver/src/bin/pageserver.rs
Original file line number Diff line number Diff line change
Expand Up @@ -9,8 +9,10 @@ use clap::{Arg, ArgAction, Command};
use fail::FailScenario;
use metrics::launch_timestamp::{set_launch_timestamp_metric, LaunchTimestamp};
use pageserver::disk_usage_eviction_task::{self, launch_disk_usage_global_eviction_task};
use pageserver::metrics::{STARTUP_DURATION, STARTUP_IS_LOADING};
use pageserver::task_mgr::WALRECEIVER_RUNTIME;
use remote_storage::GenericRemoteStorage;
use tokio::time::Instant;
use tracing::*;

use metrics::set_build_info_metric;
Expand Down Expand Up @@ -226,6 +228,19 @@ fn start_pageserver(
launch_ts: &'static LaunchTimestamp,
conf: &'static PageServerConf,
) -> anyhow::Result<()> {
// Monotonic time for later calculating startup duration
let started_startup_at = Instant::now();

let startup_checkpoint = move |phase: &str, human_phase: &str| {
let elapsed = started_startup_at.elapsed();
let secs = elapsed.as_secs_f64();
STARTUP_DURATION.with_label_values(&[phase]).set(secs);
info!(
elapsed_millis = elapsed.as_millis(),
"{human_phase} ({secs:.3}s since start)"
)
};

// Print version and launch timestamp to the log,
// and expose them as prometheus metrics.
// A changed version string indicates changed software.
Expand Down Expand Up @@ -335,6 +350,11 @@ fn start_pageserver(
// Set up remote storage client
let remote_storage = create_remote_storage_client(conf)?;

// Up to this point no significant I/O has been done: this should have been fast. Record
// duration prior to starting I/O intensive phase of startup.
startup_checkpoint("initial", "Starting loading tenants");
STARTUP_IS_LOADING.set(1);

// Startup staging or optimizing:
//
// We want to minimize downtime for `page_service` connections, and trying not to overload
Expand All @@ -360,7 +380,6 @@ fn start_pageserver(
};

// Scan the local 'tenants/' directory and start loading the tenants
let init_started_at = std::time::Instant::now();
let shutdown_pageserver = tokio_util::sync::CancellationToken::new();

BACKGROUND_RUNTIME.block_on(mgr::init_tenant_mgr(
Expand All @@ -378,18 +397,13 @@ fn start_pageserver(
let guard = scopeguard::guard_on_success((), |_| tracing::info!("Cancelled before initial load completed"));

init_done_rx.wait().await;
startup_checkpoint("initial_tenant_load", "Initial load completed");
STARTUP_IS_LOADING.set(0);

// initial logical sizes can now start, as they were waiting on init_done_rx.

scopeguard::ScopeGuard::into_inner(guard);

let init_done = std::time::Instant::now();
let elapsed = init_done - init_started_at;

tracing::info!(
elapsed_millis = elapsed.as_millis(),
"Initial load completed"
);

let mut init_sizes_done = std::pin::pin!(init_logical_size_done_rx.wait());

let timeout = conf.background_task_maximum_delay;
Expand All @@ -398,12 +412,7 @@ fn start_pageserver(

let init_sizes_done = match tokio::time::timeout(timeout, &mut init_sizes_done).await {
Ok(_) => {
let now = std::time::Instant::now();
tracing::info!(
from_init_done_millis = (now - init_done).as_millis(),
from_init_millis = (now - init_started_at).as_millis(),
"Initial logical sizes completed"
);
startup_checkpoint("initial_logical_sizes", "Initial logical sizes completed");
None
}
Err(_) => {
Expand All @@ -419,6 +428,7 @@ fn start_pageserver(

// allow background jobs to start
drop(background_jobs_can_start);
startup_checkpoint("background_jobs_can_start", "Starting background jobs");

if let Some(init_sizes_done) = init_sizes_done {
// ending up here is not a bug; at the latest logical sizes will be queried by
Expand All @@ -428,14 +438,11 @@ fn start_pageserver(

scopeguard::ScopeGuard::into_inner(guard);

let now = std::time::Instant::now();
tracing::info!(
from_init_done_millis = (now - init_done).as_millis(),
from_init_millis = (now - init_started_at).as_millis(),
"Initial logical sizes completed after timeout (background jobs already started)"
);
startup_checkpoint("initial_logical_sizes", "Initial logical sizes completed after timeout (background jobs already started)");

}

startup_checkpoint("complete", "Startup complete");
};

async move {
Expand Down
2 changes: 1 addition & 1 deletion pageserver/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ pub mod disk_usage_eviction_task;
pub mod http;
pub mod import_datadir;
pub mod keyspace;
pub(crate) mod metrics;
pub mod metrics;
pub mod page_cache;
pub mod page_service;
pub mod pgdatadir_mapping;
Expand Down
37 changes: 33 additions & 4 deletions pageserver/src/metrics.rs
Original file line number Diff line number Diff line change
@@ -1,9 +1,9 @@
use metrics::metric_vec_duration::DurationResultObserver;
use metrics::{
register_counter_vec, register_histogram, register_histogram_vec, register_int_counter,
register_int_counter_vec, register_int_gauge, register_int_gauge_vec, register_uint_gauge,
register_uint_gauge_vec, Counter, CounterVec, Histogram, HistogramVec, IntCounter,
IntCounterVec, IntGauge, IntGaugeVec, UIntGauge, UIntGaugeVec,
register_counter_vec, register_gauge_vec, register_histogram, register_histogram_vec,
register_int_counter, register_int_counter_vec, register_int_gauge, register_int_gauge_vec,
register_uint_gauge, register_uint_gauge_vec, Counter, CounterVec, GaugeVec, Histogram,
HistogramVec, IntCounter, IntCounterVec, IntGauge, IntGaugeVec, UIntGauge, UIntGaugeVec,
};
use once_cell::sync::Lazy;
use strum::VariantNames;
Expand Down Expand Up @@ -394,6 +394,35 @@ pub(crate) static UNEXPECTED_ONDEMAND_DOWNLOADS: Lazy<IntCounter> = Lazy::new(||
.expect("failed to define a metric")
});

/// How long did we take to start up? Broken down by labels to describe
/// different phases of startup.
pub static STARTUP_DURATION: Lazy<GaugeVec> = Lazy::new(|| {
register_gauge_vec!(
"pageserver_startup_duration_seconds",
"Time taken by phases of pageserver startup, in seconds",
&["phase"]
)
.expect("Failed to register pageserver_startup_duration_seconds metric")
});

pub static STARTUP_IS_LOADING: Lazy<UIntGauge> = Lazy::new(|| {
register_uint_gauge!(
"pageserver_startup_is_loading",
"1 while in initial startup load of tenants, 0 at other times"
)
.expect("Failed to register pageserver_startup_is_loading")
});

/// How long did tenants take to go from construction to active state?
pub(crate) static TENANT_ACTIVATION: Lazy<Histogram> = Lazy::new(|| {
register_histogram!(
"pageserver_tenant_activation_seconds",
"Time taken by tenants to activate, in seconds",
CRITICAL_OP_BUCKETS.into()
)
.expect("Failed to register pageserver_tenant_activation_seconds metric")
});

/// Each `Timeline`'s [`EVICTIONS_WITH_LOW_RESIDENCE_DURATION`] metric.
#[derive(Debug)]
pub struct EvictionsWithLowResidenceDuration {
Expand Down
3 changes: 3 additions & 0 deletions pageserver/src/tenant.rs
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,7 @@ use crate::config::PageServerConf;
use crate::context::{DownloadBehavior, RequestContext};
use crate::import_datadir;
use crate::is_uninit_mark;
use crate::metrics::TENANT_ACTIVATION;
use crate::metrics::{remove_tenant_metrics, TENANT_STATE_METRIC, TENANT_SYNTHETIC_SIZE_METRIC};
use crate::repository::GcResult;
use crate::task_mgr;
Expand Down Expand Up @@ -1639,6 +1640,8 @@ impl Tenant {
post_state = <&'static str>::from(&*current_state),
"activation attempt finished"
);

TENANT_ACTIVATION.observe(elapsed.as_secs_f64());
});
}
}
Expand Down
41 changes: 40 additions & 1 deletion test_runner/regress/test_pageserver_restart.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@ def test_pageserver_restart(neon_env_builder: NeonEnvBuilder):

env.neon_cli.create_branch("test_pageserver_restart")
endpoint = env.endpoints.create_start("test_pageserver_restart")
pageserver_http = env.pageserver.http_client()

pg_conn = endpoint.connect()
cur = pg_conn.cursor()
Expand Down Expand Up @@ -52,8 +53,11 @@ def test_pageserver_restart(neon_env_builder: NeonEnvBuilder):
# pageserver does if a compute node connects and sends a request for the tenant
# while it's still in Loading state. (It waits for the loading to finish, and then
# processes the request.)
tenant_load_delay_ms = 5000
env.pageserver.stop()
env.pageserver.start(extra_env_vars={"FAILPOINTS": "before-loading-tenant=return(5000)"})
env.pageserver.start(
extra_env_vars={"FAILPOINTS": f"before-loading-tenant=return({tenant_load_delay_ms})"}
)

# Check that it's in Loading state
client = env.pageserver.http_client()
Expand All @@ -65,6 +69,41 @@ def test_pageserver_restart(neon_env_builder: NeonEnvBuilder):
cur.execute("SELECT count(*) FROM foo")
assert cur.fetchone() == (100000,)

# Validate startup time metrics
metrics = pageserver_http.get_metrics()

# Expectation callbacks: arg t is sample value, arg p is the previous phase's sample value
expectations = {
"initial": lambda t, p: True, # make no assumptions about the initial time point, it could be 0 in theory
# Initial tenant load should reflect the delay we injected
"initial_tenant_load": lambda t, p: t >= (tenant_load_delay_ms / 1000.0) and t >= p,
# Subsequent steps should occur in expected order
"initial_logical_sizes": lambda t, p: t > 0 and t >= p,
"background_jobs_can_start": lambda t, p: t > 0 and t >= p,
"complete": lambda t, p: t > 0 and t >= p,
}

prev_value = None
for sample in metrics.query_all("pageserver_startup_duration_seconds"):
labels = dict(sample.labels)
phase = labels["phase"]
log.info(f"metric {phase}={sample.value}")
assert phase in expectations, f"Unexpected phase {phase}"
assert expectations[phase](
sample.value, prev_value
), f"Unexpected value for {phase}: {sample.value}"
prev_value = sample.value

# Startup is complete, this metric should exist but be zero
assert metrics.query_one("pageserver_startup_is_loading").value == 0

# This histogram should have been populated, although we aren't specific about exactly
# which bucket values: just nonzero
assert any(
bucket.value > 0
for bucket in metrics.query_all("pageserver_tenant_activation_seconds_bucket")
)


# Test that repeatedly kills and restarts the page server, while the
# safekeeper and compute node keep running.
Expand Down