neondatabase · jcsp · Aug 8, 2023 · Aug 3, 2023 · Aug 4, 2023 · Aug 4, 2023
@@ -9,8 +9,10 @@ use clap::{Arg, ArgAction, Command};
 use fail::FailScenario;
 use metrics::launch_timestamp::{set_launch_timestamp_metric, LaunchTimestamp};
 use pageserver::disk_usage_eviction_task::{self, launch_disk_usage_global_eviction_task};
+use pageserver::metrics::{STARTUP_DURATION, STARTUP_IS_LOADING};
 use pageserver::task_mgr::WALRECEIVER_RUNTIME;
 use remote_storage::GenericRemoteStorage;
+use tokio::time::Instant;
 use tracing::*;
 
 use metrics::set_build_info_metric;
@@ -226,6 +228,19 @@ fn start_pageserver(
     launch_ts: &'static LaunchTimestamp,
     conf: &'static PageServerConf,
 ) -> anyhow::Result<()> {
+    // Monotonic time for later calculating startup duration
+    let started_startup_at = Instant::now();
+
+    let startup_checkpoint = move |phase: &str, human_phase: &str| {
+        let elapsed = started_startup_at.elapsed();
+        let secs = elapsed.as_secs_f64();
+        STARTUP_DURATION.with_label_values(&[phase]).set(secs);
+        info!(
+            elapsed_millis = elapsed.as_millis(),
+            "{human_phase} ({secs:.3}s since start)"
+        )
+    };
+
     // Print version and launch timestamp to the log,
     // and expose them as prometheus metrics.
     // A changed version string indicates changed software.
@@ -335,6 +350,11 @@ fn start_pageserver(
     // Set up remote storage client
     let remote_storage = create_remote_storage_client(conf)?;
 
+    // Up to this point no significant I/O has been done: this should have been fast.  Record
+    // duration prior to starting I/O intensive phase of startup.
+    startup_checkpoint("initial", "Starting loading tenants");
+    STARTUP_IS_LOADING.set(1);
+
     // Startup staging or optimizing:
     //
     // We want to minimize downtime for `page_service` connections, and trying not to overload
@@ -360,7 +380,6 @@ fn start_pageserver(
     };
 
     // Scan the local 'tenants/' directory and start loading the tenants
-    let init_started_at = std::time::Instant::now();
     let shutdown_pageserver = tokio_util::sync::CancellationToken::new();
 
     BACKGROUND_RUNTIME.block_on(mgr::init_tenant_mgr(
@@ -378,18 +397,13 @@ fn start_pageserver(
             let guard = scopeguard::guard_on_success((), |_| tracing::info!("Cancelled before initial load completed"));
 
             init_done_rx.wait().await;
+            startup_checkpoint("initial_tenant_load", "Initial load completed");
+            STARTUP_IS_LOADING.set(0);
+
             // initial logical sizes can now start, as they were waiting on init_done_rx.
 
             scopeguard::ScopeGuard::into_inner(guard);
 
-            let init_done = std::time::Instant::now();
-            let elapsed = init_done - init_started_at;
-
-            tracing::info!(
-                elapsed_millis = elapsed.as_millis(),
-                "Initial load completed"
-            );
-
             let mut init_sizes_done = std::pin::pin!(init_logical_size_done_rx.wait());
 
             let timeout = conf.background_task_maximum_delay;
@@ -398,12 +412,7 @@ fn start_pageserver(
 
             let init_sizes_done = match tokio::time::timeout(timeout, &mut init_sizes_done).await {
                 Ok(_) => {
-                    let now = std::time::Instant::now();
-                    tracing::info!(
-                        from_init_done_millis = (now - init_done).as_millis(),
-                        from_init_millis = (now - init_started_at).as_millis(),
-                        "Initial logical sizes completed"
-                    );
+                    startup_checkpoint("initial_logical_sizes", "Initial logical sizes completed");
                     None
                 }
                 Err(_) => {
@@ -419,6 +428,7 @@ fn start_pageserver(
 
             // allow background jobs to start
             drop(background_jobs_can_start);
+            startup_checkpoint("background_jobs_can_start", "Starting background jobs");
 
             if let Some(init_sizes_done) = init_sizes_done {
                 // ending up here is not a bug; at the latest logical sizes will be queried by
@@ -428,14 +438,11 @@ fn start_pageserver(
 
                 scopeguard::ScopeGuard::into_inner(guard);
 
-                let now = std::time::Instant::now();
-                tracing::info!(
-                    from_init_done_millis = (now - init_done).as_millis(),
-                    from_init_millis = (now - init_started_at).as_millis(),
-                    "Initial logical sizes completed after timeout (background jobs already started)"
-                );
+                startup_checkpoint("initial_logical_sizes", "Initial logical sizes completed after timeout (background jobs already started)");
 
             }
+
+            startup_checkpoint("complete", "Startup complete");
         };
 
         async move {

@@ -7,7 +7,7 @@ pub mod disk_usage_eviction_task;
 pub mod http;
 pub mod import_datadir;
 pub mod keyspace;
-pub(crate) mod metrics;
+pub mod metrics;
 pub mod page_cache;
 pub mod page_service;
 pub mod pgdatadir_mapping;

@@ -1,9 +1,9 @@
 use metrics::metric_vec_duration::DurationResultObserver;
 use metrics::{
-    register_counter_vec, register_histogram, register_histogram_vec, register_int_counter,
-    register_int_counter_vec, register_int_gauge, register_int_gauge_vec, register_uint_gauge,
-    register_uint_gauge_vec, Counter, CounterVec, Histogram, HistogramVec, IntCounter,
-    IntCounterVec, IntGauge, IntGaugeVec, UIntGauge, UIntGaugeVec,
+    register_counter_vec, register_gauge_vec, register_histogram, register_histogram_vec,
+    register_int_counter, register_int_counter_vec, register_int_gauge, register_int_gauge_vec,
+    register_uint_gauge, register_uint_gauge_vec, Counter, CounterVec, GaugeVec, Histogram,
+    HistogramVec, IntCounter, IntCounterVec, IntGauge, IntGaugeVec, UIntGauge, UIntGaugeVec,
 };
 use once_cell::sync::Lazy;
 use strum::VariantNames;
@@ -394,6 +394,35 @@ pub(crate) static UNEXPECTED_ONDEMAND_DOWNLOADS: Lazy<IntCounter> = Lazy::new(||
     .expect("failed to define a metric")
 });
 
+/// How long did we take to start up?  Broken down by labels to describe
+/// different phases of startup.
+pub static STARTUP_DURATION: Lazy<GaugeVec> = Lazy::new(|| {
+    register_gauge_vec!(
+        "pageserver_startup_duration_seconds",
+        "Time taken by phases of pageserver startup, in seconds",
+        &["phase"]
+    )
+    .expect("Failed to register pageserver_startup_duration_seconds metric")
+});
+
+pub static STARTUP_IS_LOADING: Lazy<UIntGauge> = Lazy::new(|| {
+    register_uint_gauge!(
+        "pageserver_startup_is_loading",
+        "1 while in initial startup load of tenants, 0 at other times"
+    )
+    .expect("Failed to register pageserver_startup_is_loading")
+});
+
+/// How long did tenants take to go from construction to active state?
+pub(crate) static TENANT_ACTIVATION: Lazy<Histogram> = Lazy::new(|| {
+    register_histogram!(
+        "pageserver_tenant_activation_seconds",
+        "Time taken by tenants to activate, in seconds",
+        CRITICAL_OP_BUCKETS.into()
+    )
+    .expect("Failed to register pageserver_tenant_activation_seconds metric")
+});
+
 /// Each `Timeline`'s  [`EVICTIONS_WITH_LOW_RESIDENCE_DURATION`] metric.
 #[derive(Debug)]
 pub struct EvictionsWithLowResidenceDuration {

@@ -56,6 +56,7 @@ use crate::config::PageServerConf;
 use crate::context::{DownloadBehavior, RequestContext};
 use crate::import_datadir;
 use crate::is_uninit_mark;
+use crate::metrics::TENANT_ACTIVATION;
 use crate::metrics::{remove_tenant_metrics, TENANT_STATE_METRIC, TENANT_SYNTHETIC_SIZE_METRIC};
 use crate::repository::GcResult;
 use crate::task_mgr;
@@ -1639,6 +1640,8 @@ impl Tenant {
                     post_state = <&'static str>::from(&*current_state),
                     "activation attempt finished"
                 );
+
+                TENANT_ACTIVATION.observe(elapsed.as_secs_f64());
             });
         }
     }

diff --git a/test_runner/regress/test_pageserver_restart.py b/test_runner/regress/test_pageserver_restart.py
@@ -12,6 +12,7 @@ def test_pageserver_restart(neon_env_builder: NeonEnvBuilder):
 
     env.neon_cli.create_branch("test_pageserver_restart")
     endpoint = env.endpoints.create_start("test_pageserver_restart")
+    pageserver_http = env.pageserver.http_client()
 
     pg_conn = endpoint.connect()
     cur = pg_conn.cursor()
@@ -52,8 +53,11 @@ def test_pageserver_restart(neon_env_builder: NeonEnvBuilder):
     # pageserver does if a compute node connects and sends a request for the tenant
     # while it's still in Loading state. (It waits for the loading to finish, and then
     # processes the request.)
+    tenant_load_delay_ms = 5000
     env.pageserver.stop()
-    env.pageserver.start(extra_env_vars={"FAILPOINTS": "before-loading-tenant=return(5000)"})
+    env.pageserver.start(
+        extra_env_vars={"FAILPOINTS": f"before-loading-tenant=return({tenant_load_delay_ms})"}
+    )
 
     # Check that it's in Loading state
     client = env.pageserver.http_client()
@@ -65,6 +69,41 @@ def test_pageserver_restart(neon_env_builder: NeonEnvBuilder):
     cur.execute("SELECT count(*) FROM foo")
     assert cur.fetchone() == (100000,)
 
+    # Validate startup time metrics
+    metrics = pageserver_http.get_metrics()
+
+    # Expectation callbacks: arg t is sample value, arg p is the previous phase's sample value
+    expectations = {
+        "initial": lambda t, p: True,  # make no assumptions about the initial time point, it could be 0 in theory
+        # Initial tenant load should reflect the delay we injected
+        "initial_tenant_load": lambda t, p: t >= (tenant_load_delay_ms / 1000.0) and t >= p,
+        # Subsequent steps should occur in expected order
+        "initial_logical_sizes": lambda t, p: t > 0 and t >= p,
+        "background_jobs_can_start": lambda t, p: t > 0 and t >= p,
+        "complete": lambda t, p: t > 0 and t >= p,
+    }
+
+    prev_value = None
+    for sample in metrics.query_all("pageserver_startup_duration_seconds"):
+        labels = dict(sample.labels)
+        phase = labels["phase"]
+        log.info(f"metric {phase}={sample.value}")
+        assert phase in expectations, f"Unexpected phase {phase}"
+        assert expectations[phase](
+            sample.value, prev_value
+        ), f"Unexpected value for {phase}: {sample.value}"
+        prev_value = sample.value
+
+    # Startup is complete, this metric should exist but be zero
+    assert metrics.query_one("pageserver_startup_is_loading").value == 0
+
+    # This histogram should have been populated, although we aren't specific about exactly
+    # which bucket values: just nonzero
+    assert any(
+        bucket.value > 0
+        for bucket in metrics.query_all("pageserver_tenant_activation_seconds_bucket")
+    )
+
 
 # Test that repeatedly kills and restarts the page server, while the
 # safekeeper and compute node keep running.